Example #1
    def test_multiple_2d_textures(self):
        mod = SourceModule("""
        texture<float, 2, cudaReadModeElementType> mtx_tex;
        texture<float, 2, cudaReadModeElementType> mtx2_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          dest[row*w+col] =
              tex2D(mtx_tex, row, col)
              +
              tex2D(mtx2_tex, row, col);
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")
        mtx2_tex = mod.get_texref("mtx2_tex")

        shape = (3, 4)
        a = np.random.randn(*shape).astype(np.float32)
        b = np.random.randn(*shape).astype(np.float32)
        drv.matrix_to_texref(a, mtx_tex, order="F")
        drv.matrix_to_texref(b, mtx2_tex, order="F")

        dest = np.zeros(shape, dtype=np.float32)
        copy_texture(drv.Out(dest),
                     block=shape + (1, ),
                     texrefs=[mtx_tex, mtx2_tex])
        assert la.norm(dest - a - b) < 1e-6
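The snippet above assumes a few module-level imports that the excerpt does not show; a minimal sketch of that setup (the exact aliases are an assumption):

# Assumed setup for the test above -- not part of the original excerpt.
import numpy as np
import numpy.linalg as la

import pycuda.autoinit  # noqa: F401 -- creates and activates a CUDA context
import pycuda.driver as drv
from pycuda.compiler import SourceModule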
Example #2
def create_2d_texture(a, module, variable, point_sampling=False):
    a = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.matrix_to_texref(a, out_texref, order='C')
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
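A hedged usage sketch for create_2d_texture(); the module source, texture name and block size below are illustrative assumptions, not taken from the original project:

# Hypothetical usage: compile a module that declares a 2D float texture,
# bind an array to it with create_2d_texture(), and read it back.
import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> my_tex;

__global__ void read_back(float *dest)
{
    int row = threadIdx.y;
    int col = threadIdx.x;
    dest[row * blockDim.x + col] = tex2D(my_tex, col, row);
}
""")

a = numpy.random.rand(4, 4).astype(numpy.float32)
texref = create_2d_texture(a, mod, "my_tex", point_sampling=True)

dest = numpy.zeros_like(a)
mod.get_function("read_back")(cuda.Out(dest),
                              block=(4, 4, 1),
                              texrefs=[texref])
assert numpy.allclose(dest, a)   # point sampling round-trips the matrix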
Example #3
    def copy_texture_memory_args(self, texmem_args):
        """adds texture memory arguments to the most recently compiled module

        :param texmem_args: A dictionary containing the data to be passed to the
            device texture memory. See tune_kernel().
        :type texmem_args: dict
        """

        filter_mode_map = {
            'point': drv.filter_mode.POINT,
            'linear': drv.filter_mode.LINEAR
        }
        address_mode_map = {
            'border': drv.address_mode.BORDER,
            'clamp': drv.address_mode.CLAMP,
            'mirror': drv.address_mode.MIRROR,
            'wrap': drv.address_mode.WRAP
        }

        logging.debug('copy_texture_memory_args called')
        logging.debug('current module: ' + str(self.current_module))
        self.texrefs = []
        for k, v in texmem_args.items():
            tex = self.current_module.get_texref(k)
            self.texrefs.append(tex)

            logging.debug('copying to texture: ' + str(k))
            if not isinstance(v, dict):
                data = v
            else:
                data = v['array']
            logging.debug('texture to be copied: ')
            logging.debug(data.nbytes)
            logging.debug(data.dtype)
            logging.debug(data.flags)

            drv.matrix_to_texref(data, tex, order="C")

            if isinstance(v, dict):
                if 'address_mode' in v and v['address_mode'] is not None:
                    # address_mode is set per axis
                    amode = v['address_mode']
                    if not isinstance(amode, list):
                        amode = [amode] * data.ndim
                    for i, m in enumerate(amode):
                        try:
                            if m is not None:
                                tex.set_address_mode(i, address_mode_map[m])
                        except KeyError:
                            raise ValueError('Unknown address mode: ' + m)
                if 'filter_mode' in v and v['filter_mode'] is not None:
                    fmode = v['filter_mode']
                    try:
                        tex.set_filter_mode(filter_mode_map[fmode])
                    except KeyError:
                        raise ValueError('Unknown filter mode: ' + fmode)
                if 'normalized_coordinates' in v and v[
                        'normalized_coordinates']:
                    tex.set_flags(tex.get_flags()
                                  | drv.TRSF_NORMALIZED_COORDINATES)
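For reference, a hedged illustration of the texmem_args layout this method accepts; the texture names are made up, and the recognised keys ('array', 'address_mode', 'filter_mode', 'normalized_coordinates') are the ones handled in the branches above:

import numpy as np

image = np.random.rand(64, 64).astype(np.float32)
texmem_args = {
    "my_texture": {                          # hypothetical texture name
        "array": image,                      # copied with matrix_to_texref
        "address_mode": ["clamp", "wrap"],   # one entry per axis, or a single string
        "filter_mode": "linear",             # 'point' or 'linear'
        "normalized_coordinates": True,
    },
    "other_texture": image,                  # bare arrays keep default sampling
}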
Example #4
def wrap_cuda_convolution(img, kernel) -> np.ndarray:
    # check kernel and get radius
    kernel_width, kernel_height = np.int32(np.shape(kernel))
    assert kernel_height % 2 and kernel_width % 2, "Kernel shape does not consist of odd numbers!"
    assert kernel_height == kernel_width, "Kernel is not a square!"

    # cast input to float32
    img_in = img.astype(np.float32)
    kernel_cpu = kernel.astype(np.float32)

    # pass data to cuda texture
    cuda.matrix_to_texref(img_in, TEX_IMG, order='C')
    cuda.matrix_to_texref(kernel_cpu, TEX_KERNEL, order='C')

    # setup output
    img_out = np.zeros_like(img, dtype=np.float32)

    # setup grid
    img_height, img_width = np.shape(img_in)
    blocksize = 32
    grid = (int(np.ceil(img_width / blocksize)),
            int(np.ceil(img_height / blocksize)),
            1)

    kernel_radius = kernel_width // 2
    CUDA_CONVOLUTION(np.int32(img_width),
                     np.int32(img_height),
                     np.int32(kernel_radius),
                     cuda.Out(img_out),
                     texrefs=[TEX_IMG, TEX_KERNEL],
                     block=(blocksize, blocksize, 1),
                     grid=grid)

    return img_out
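wrap_cuda_convolution() relies on module-level globals (TEX_IMG, TEX_KERNEL, CUDA_CONVOLUTION) that the excerpt does not define; a hedged sketch of one possible setup, with a naive texture-based convolution kernel standing in for the original:

# Hypothetical module-level setup assumed by wrap_cuda_convolution().
import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

_mod = SourceModule("""
texture<float, 2> tex_img;
texture<float, 2> tex_kernel;

__global__ void convolve(int width, int height, int radius, float *out)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float acc = 0.0f;
    for (int dy = -radius; dy <= radius; ++dy)
        for (int dx = -radius; dx <= radius; ++dx)
            acc += tex2D(tex_img, x + dx, y + dy)           // reads clamp at borders
                 * tex2D(tex_kernel, dx + radius, dy + radius);
    out[y * width + x] = acc;
}
""")

TEX_IMG = _mod.get_texref("tex_img")
TEX_KERNEL = _mod.get_texref("tex_kernel")
CUDA_CONVOLUTION = _mod.get_function("convolve")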
Example #5
    def test_multiple_2d_textures(self):
        mod = SourceModule("""
        texture<float, 2, cudaReadModeElementType> mtx_tex;
        texture<float, 2, cudaReadModeElementType> mtx2_tex;

        __global__ void copy_texture(float *dest)
        {
          int row = threadIdx.x;
          int col = threadIdx.y;
          int w = blockDim.y;
          dest[row*w+col] =
              tex2D(mtx_tex, row, col)
              +
              tex2D(mtx2_tex, row, col);
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")
        mtx2_tex = mod.get_texref("mtx2_tex")

        shape = (3,4)
        a = np.random.randn(*shape).astype(np.float32)
        b = np.random.randn(*shape).astype(np.float32)
        drv.matrix_to_texref(a, mtx_tex, order="F")
        drv.matrix_to_texref(b, mtx2_tex, order="F")

        dest = np.zeros(shape, dtype=np.float32)
        copy_texture(drv.Out(dest),
                block=shape+(1,),
                texrefs=[mtx_tex, mtx2_tex]
                )
        assert la.norm(dest-a-b) < 1e-6
Example #6
def create_2d_texture(a, module, variable, point_sampling=False):
    a = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.matrix_to_texref(a, out_texref, order='C')
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
Example #7
def rotate_image(a, resize=1.5, angle=20., interpolation="linear", blocks=(16, 16, 1)):
    """
    Rotates the array. The new array has the new size and centers the
    picture in the middle.
    
    a             - array (2-dim)
    resize        - new_image w/old_image w
    angle         - degrees to rotate the image
    interpolation - "linear" or None
    blocks        - given to the kernel when run


    returns: a new array with dtype=uint8 containing the rotated image
    """
    angle = angle/180. *pi
    
    # Convert this image to float. Unsigned int texture gave 
    # strange results for me. This conversion is slow though :(
    a = a.astype("float32")

    # Calculate the dimensions of the new image
    calc_x = lambda x, y: (x*a.shape[1]/2.*cos(angle) - y*a.shape[0]/2.*sin(angle))
    calc_y = lambda x, y: (x*a.shape[1]/2.*sin(angle) + y*a.shape[0]/2.*cos(angle))

    xs = [ calc_x(*p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ]
    ys = [ calc_y(*p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ]

    new_image_dim = (
        int(numpy.ceil(max(ys)-min(ys))*resize),
        int(numpy.ceil(max(xs)-min(xs))*resize),
    )
    
    # Now generate the cuda texture
    cuda.matrix_to_texref(a, texref, order="C")
    
    # We could set the next if we wanted to address the image
    # in normalized coordinates ( 0 <= coordinate < 1.)
    # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    if interpolation == "linear":
        texref.set_filter_mode(cuda.filter_mode.LINEAR)

    # Calculate the gridsize. This is entirely given by the size of our image. 
    gridx = new_image_dim[0]//blocks[0] if \
            new_image_dim[0]%blocks[0]==0 else new_image_dim[0]//blocks[0] + 1
    gridy = new_image_dim[1]//blocks[1] if \
            new_image_dim[1]%blocks[1]==0 else new_image_dim[1]//blocks[1] + 1

    # Get the output image
    output = numpy.zeros(new_image_dim,dtype="uint8")
    
    # Call the kernel
    copy_texture_func(
        numpy.float32(resize), numpy.float32(angle),
        numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]),
        numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]),
            cuda.Out(output),texrefs=[texref],block=blocks,grid=(gridx,gridy))
    
    return output
Example #8
def rotate_image(a, resize=1.5, angle=180., interpolation="linear", blocks=(16, 16, 1)):
    """
    Rotates the array. The new array has the new size and centers the
    picture in the middle.

    a             - array (2-dim)
    resize        - new_image w/old_image w
    angle         - degrees to rotate the image
    interpolation - "linear" or None
    blocks        - given to the kernel when run


    returns: a new array with dtype=uint8 containing the rotated image
    """
    angle = angle/180. *pi

    # Convert this image to float. Unsigned int texture gave
    # strange results for me. This conversion is slow though :(
    a = a.astype("float32")

    # Calculate the dimensions of the new image
    calc_x = lambda (x,y): (x*a.shape[1]/2.*cos(angle)-y*a.shape[0]/2.*sin(angle))
    calc_y = lambda (x,y): (x*a.shape[1]/2.*sin(angle)+y*a.shape[0]/2.*cos(angle))

    xs = [ calc_x(p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ]
    ys = [ calc_y(p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ]

    new_image_dim = (
        int(numpy.ceil(max(ys)-min(ys))*resize),
        int(numpy.ceil(max(xs)-min(xs))*resize),
    )

    # Now generate the cuda texture
    cuda.matrix_to_texref(a, texref, order="C")

    # We could set the next if we wanted to address the image
    # in normalized coordinates ( 0 <= coordinate < 1.)
    # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    if interpolation == "linear":
        texref.set_filter_mode(cuda.filter_mode.LINEAR)

    # Calculate the gridsize. This is entirely given by the size of our image.
    gridx = new_image_dim[0]/blocks[0] if \
            new_image_dim[0]%blocks[0]==0 else new_image_dim[0]/blocks[0] +1
    gridy = new_image_dim[1]/blocks[1] if \
            new_image_dim[1]%blocks[1]==0 else new_image_dim[1]/blocks[1] +1

    # Get the output image
    output = numpy.zeros(new_image_dim,dtype="uint8")

    # Call the kernel
    copy_texture_func(
        numpy.float32(resize), numpy.float32(angle),
        numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]),
        numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]),
            cuda.Out(output),texrefs=[texref],block=blocks,grid=(gridx,gridy))

    return output
Example #9
    def copy_texture_memory_args(self, texmem_args):
        """adds texture memory arguments to the most recently compiled module

        :param texmem_args: A dictionary containing the data to be passed to the
            device texture memory. TODO
        """


        filter_mode_map = { 'point': drv.filter_mode.POINT,
                            'linear': drv.filter_mode.LINEAR }
        address_mode_map = { 'border': drv.address_mode.BORDER,
                             'clamp': drv.address_mode.CLAMP,
                             'mirror': drv.address_mode.MIRROR,
                             'wrap': drv.address_mode.WRAP }

        logging.debug('copy_texture_memory_args called')
        logging.debug('current module: ' + str(self.current_module))
        self.texrefs = []
        for k, v in texmem_args.items():
            tex = self.current_module.get_texref(k)
            self.texrefs.append(tex)

            logging.debug('copying to texture: ' + str(k))
            if not isinstance(v, dict):
                data = v
            else:
                data = v['array']
            logging.debug('texture to be copied: ')
            logging.debug(data.nbytes)
            logging.debug(data.dtype)
            logging.debug(data.flags)

            drv.matrix_to_texref(data, tex, order="C")

            if isinstance(v, dict):
                if 'address_mode' in v and v['address_mode'] is not None:
                    # address_mode is set per axis
                    amode = v['address_mode']
                    if not isinstance(amode, list):
                        amode = [ amode ] * data.ndim
                    for i, m in enumerate(amode):
                        try:
                            if m is not None:
                                tex.set_address_mode(i, address_mode_map[m])
                        except KeyError:
                            raise ValueError('Unknown address mode: ' + m)
                if 'filter_mode' in v and v['filter_mode'] is not None:
                    fmode = v['filter_mode']
                    try:
                        tex.set_filter_mode(filter_mode_map[fmode])
                    except KeyError:
                        raise ValueError('Unknown filter mode: ' + fmode)
                if 'normalized_coordinates' in v and v['normalized_coordinates']:
                    tex.set_flags(tex.get_flags() | drv.TRSF_NORMALIZED_COORDINATES)
Example #10
def doit(img):
  width, height = (img.shape[0], img.shape[1])
  
  cuda.matrix_to_texref(img, texref, order="C")
  texref.set_filter_mode(cuda.filter_mode.LINEAR)
  
  gpu_output = np.zeros((width//2, height//2), dtype=np.float32)
  gridsize = (width // blocksize[0], height // blocksize[1])

  downsample_func(cuda.Out(gpu_output), np.int32(width), np.int32(width/2), block=blocksize, grid = gridsize, texrefs=[texref])

  # gpu_output = gpu_output.transpose()

  return gpu_output
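doit() likewise depends on module-level globals (blocksize, texref, downsample_func); a hedged sketch of one possible setup, with a plain 2x2 box average standing in for whatever kernel the original project used (a square image is assumed, since doit() only passes widths):

import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

blocksize = (16, 16, 1)

_mod = SourceModule("""
texture<float, 2> tex;

__global__ void downsample(float *out, int in_width, int out_width)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= out_width || y >= out_width) return;   // square image assumed
    // in_width is unused here; kept only to match the call in doit()

    out[y * out_width + x] = 0.25f *
        (tex2D(tex, 2 * x,     2 * y)     + tex2D(tex, 2 * x + 1, 2 * y) +
         tex2D(tex, 2 * x,     2 * y + 1) + tex2D(tex, 2 * x + 1, 2 * y + 1));
}
""")

texref = _mod.get_texref("tex")
downsample_func = _mod.get_function("downsample")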
Example #11
    def kernel(self):
        density, x, y, z, Qx, Qy, Qz, result = self.born_args
        nx, ny, nz, nqx, nqy, nqz = [
            numpy.int32(len(v)) for v in x, y, z, Qx, Qy, Qz
        ]
        cx, cy, cz, cQx, cQy, cQz, cdensity = [
            gpuarray.to_gpu(v) for v in x, y, z, Qx, Qy, Qz, density
        ]

        cframe = cuda.mem_alloc(result[0].nbytes)
        cuda.matrix_to_texref(nx, texref, order="C")
        texref.set_filter_mode(cuda.filter_mode.LINEAR)
        n = int(1 * nqy * nqz)
        print 'fn in kernel'
        while True:
            try:
                qxi = numpy.int32(self.work_queue.get(block=False))
            except Queue.Empty:
                break

            print "%d of %d on %d\n" % (qxi, nqx, self.gpu),
            cuda_texture_func(nx,
                              ny,
                              nz,
                              nqx,
                              nqy,
                              nqz,
                              cdensity,
                              cx,
                              cy,
                              cz,
                              cQx,
                              cQy,
                              cQz,
                              qxi,
                              cuda.Out(result),
                              texrefs=[texref])

            #self.cudaBorn(nx,ny,nz,nqx,nqy,nqz,
            #cdensity,cx,cy,cz,cQx,cQy,cQz,qxi,cframe,
            #**cuda_partition(n))
            ## Delay fetching result until the kernel is complete
            #cuda_sync()

            ## Fetch result back to the CPU
            #cuda.memcpy_dtoh(result[qxi], cframe)

            print "%d %s\n" % (qxi, ctemp.get())
        del cx, cy, cz, cQx, cQy, cQz, cdensity, cframe
Example #12
    def cuda_interpolate(self, channel, m, size_result):
        cols = size_result[0]
        rows = size_result[1]

        kernel_code = """
		texture<float, 2> tex;
	
		__global__ void interpolation(float *dest, float *m0, float *m1)
		{
			int idx = threadIdx.x + blockDim.x * blockIdx.x;
			int idy = threadIdx.y + blockDim.y * blockIdx.y;
	
			if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
				dest[%(NDIM)s * idx + idy] = tex2D(tex, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
			}
		}
		"""

        kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}
        mod = SourceModule(kernel_code)

        interpolation = mod.get_function("interpolation")
        texref = mod.get_texref("tex")

        channel = channel.astype("float32")
        drv.matrix_to_texref(channel, texref, order="F")
        texref.set_filter_mode(drv.filter_mode.LINEAR)

        bdim = (16, 16, 1)
        dx, mx = divmod(cols, bdim[0])
        dy, my = divmod(rows, bdim[1])

        gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

        dest = np.zeros((rows, cols)).astype("float32")
        m0 = (m[0, :] - 1).astype("float32")
        m1 = (m[1, :] - 1).astype("float32")

        interpolation(drv.Out(dest),
                      drv.In(m0),
                      drv.In(m1),
                      block=bdim,
                      grid=gdim,
                      texrefs=[texref])

        return dest.astype("uint8")
Example #13
def gpu_calc(image, sigma_r, sigma_d):
    N, M = image.shape[0], image.shape[1]
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(N / block_size[0])),
                 int(np.ceil(M / block_size[1])))
    result = np.zeros((N, M), dtype=np.uint32)
    calc = mod.get_function("calc")
    start = time.time()
    tex = mod.get_texref("tex")
    driver.matrix_to_texref(image.astype(np.uint32), tex, order="C")
    calc(driver.Out(result),
         np.int32(N),
         np.int32(M),
         np.float32(sigma_d),
         np.float32(sigma_r),
         block=block_size,
         grid=grid_size,
         texrefs=[tex])
    driver.Context.synchronize()
    end = time.time()
    cv2.imwrite('img_gpu.bmp', result.astype(np.uint8))
    return end - start
Example #14
    def cuda_interpolate3D(self, img, m, size_result):
        cols = size_result[0]
        rows = size_result[1]

        kernel_code = """
		texture<float, 2> texR;
		texture<float, 2> texG;
		texture<float, 2> texB;
	
		__global__ void interpolation(float *dest, float *m0, float *m1)
	    {
			int idx = threadIdx.x + blockDim.x * blockIdx.x;
			int idy = threadIdx.y + blockDim.y * blockIdx.y;
	
			if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
				dest[3*(%(NDIM)s * idx + idy)] = tex2D(texR, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
				dest[3*(%(NDIM)s * idx + idy) + 1] = tex2D(texG, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
				dest[3*(%(NDIM)s * idx + idy) + 2] = tex2D(texB, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
			}
		}
		"""

        kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}
        mod = SourceModule(kernel_code)

        interpolation = mod.get_function("interpolation")
        texrefR = mod.get_texref("texR")
        texrefG = mod.get_texref("texG")
        texrefB = mod.get_texref("texB")

        img = img.astype("float32")
        drv.matrix_to_texref(img[:, :, 0], texrefR, order="F")
        texrefR.set_filter_mode(drv.filter_mode.LINEAR)
        drv.matrix_to_texref(img[:, :, 1], texrefG, order="F")
        texrefG.set_filter_mode(drv.filter_mode.LINEAR)
        drv.matrix_to_texref(img[:, :, 2], texrefB, order="F")
        texrefB.set_filter_mode(drv.filter_mode.LINEAR)

        bdim = (16, 16, 1)
        dx, mx = divmod(cols, bdim[0])
        dy, my = divmod(rows, bdim[1])

        gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

        dest = np.zeros((rows, cols, 3)).astype("float32")
        m0 = (m[0, :] - 1).astype("float32")
        m1 = (m[1, :] - 1).astype("float32")

        interpolation(drv.Out(dest),
                      drv.In(m0),
                      drv.In(m1),
                      block=bdim,
                      grid=gdim,
                      texrefs=[texrefR, texrefG, texrefB])

        return dest.astype("uint8")
Example #15
    def load_texture(self, name, arr):
        '''
        Loads an array into a texture with a name.

        Address by the name in the kernel code.
        '''
        tex = self.mod.get_texref(name)  # x*y*z
        arr = arr.astype('float32')

        if len(arr.shape) == 3:
            carr = arr.copy('F')
            texarray = numpy3d_to_array(carr, 'F')
            tex.set_array(texarray)
        else:
            if len(arr.shape) == 1:
                arr = np.expand_dims(arr, 1)
            tex.set_flags(0)
            cuda.matrix_to_texref(arr, tex, order='F')

        tex.set_address_mode(0, cuda.address_mode.CLAMP)
        tex.set_address_mode(1, cuda.address_mode.CLAMP)
        tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        tex.set_filter_mode(cuda.filter_mode.LINEAR)
        self.textures[name] = tex
Example #16
def trikmeans_gpu(data, clusters, iterations, return_times = 0):
    # trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    # kmeans using triangle inequality algorithm and cuda
    # input arguments are the data, initial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters)
    #
    # The return values are the updated clusters and labels for the data
    
    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    
    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 1
    
    
    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16))
    gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist
    
    #block and grid sizes for the init module
    threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
    blocksize_init = min(512, threads_desired) 
    gridsize_init = 1 + (threads_desired - 1)/blocksize_init
    
    #block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    gridsize_step3 = gridsize_init
    
    #block and grid sizes for the step4 module
    for blocksize_step4_x in range(32, 512, 32):
        if blocksize_step4_x >= nClusters:
            break;
    blocksize_step4_y = min(nDim, 512/blocksize_step4_x)
    gridsize_step4_x = 1 + (nClusters-1)/blocksize_step4_x
    gridsize_step4_y = 1 + (nDim-1)/blocksize_step4_y
    
    #block and grid sizes for the calc_movement module
    blocksize_calcm = blocksize_step4_x
    gridsize_calcm = gridsize_step4_x    
    
    #block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init
    
    
    
    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    mod_ccdist = mods2.get_ccdist_module(nDim, nPts, nClusters, blocksize_ccdist, blocksize_init, 
                                        blocksize_step4_x, blocksize_step4_y, blocksize_step56,
                                        useTextureForData)

    #mod_step56 = mods2.get_step56_module(nDim, nPts, nClusters, blocksize_step56)
    
    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2-t1

    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)
    
    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        gpu_data = gpuarray.to_gpu(data)


    gpu_clusters = gpuarray.to_gpu(clusters)
    gpu_assignments = gpuarray.zeros((nPts,), np.int32)         # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32)   # lower bounds on distance between 
                                                                # point and each cluster
    gpu_upper = gpuarray.zeros((nPts,), np.float32)             # upper bounds on distance between
                                                                # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32)    # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32)    # half distance to closest
    gpu_hdClosest.fill(1.0e10)  # set to large value // **TODO**  get the actual float max
    gpu_badUpper = gpuarray.zeros((nPts,), np.int32)   # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32);
    gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32);
    
    gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32)
    
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2-t1
    
    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.

    t1 = time.time()
    ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
             block = (blocksize_ccdist, 1, 1),
             grid = (gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2-t1
    
    t1 = time.time()
    calc_hdclosest(gpu_ccdist, gpu_hdClosest,
            block = (blocksize_ccdist, 1, 1),
            grid = (gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2-t1
    
    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, 
                gpu_lower, gpu_upper,
                block = (blocksize_init, 1, 1),
                grid = (gridsize_init, 1),
                texrefs=[texrefData])
    else:
        init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, 
                gpu_lower, gpu_upper,
                block = (blocksize_init, 1, 1),
                grid = (gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2-t1

    """    
    print "data"
    print data
    print "gpu_dataout"
    print gpu_dataout
    return 1
    """

    for i in range(iterations):
    
        if i>0:
            t1 = time.time()
            ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
                     block = (blocksize_ccdist, 1, 1),
                     grid = (gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2-t1
            
            t1 = time.time()
            calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                    block = (blocksize_ccdist, 1, 1),
                    grid = (gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2-t1
            
        """
        print "Just before step 3=========================================="
        print "gpu_clusters"
        print gpu_clusters
        print "gpu_ccdist"
        print gpu_ccdist
        print "gpu_hdClosest"
        print gpu_hdClosest
        print "gpu_assignments"
        print gpu_assignments
        print "gpu_lower"
        print gpu_lower
        print "gpu_upper"
        print gpu_upper
        print "gpu_badUpper"
        print gpu_badUpper
        """
        
        t1 = time.time()
        gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                    gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed,
                    block = (blocksize_step3, 1, 1),
                    grid = (gridsize_step3, 1),
                    texrefs=[texrefData])
        else:
            step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                    gpu_lower, gpu_upper, gpu_badUpper,  gpu_cluster_changed,
                    block = (blocksize_step3, 1, 1),
                    grid = (gridsize_step3, 1))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2-t1
        
        """
        print "gpu_cluster_changed"
        print gpu_cluster_changed.get()
        """
        
        """
        print "Just before step 4=========================================="
        print "gpu_assignments"
        print gpu_assignments
        print "gpu_lower"
        print gpu_lower
        print "gpu_upper"
        print gpu_upper
        print "gpu_badUpper"
        print gpu_badUpper
        """        
    
        t1 = time.time()
        
        if useTextureForData:
            step4(gpu_clusters, gpu_clusters2, gpu_assignments, gpu_cluster_movement,
                gpu_cluster_changed,
                block = (blocksize_step4_x, blocksize_step4_y, 1),
                grid = (gridsize_step4_x, gridsize_step4_y),
                texrefs=[texrefData])
        else:
            step4(gpu_data, gpu_clusters, gpu_clusters2, gpu_assignments, gpu_cluster_movement,
                gpu_cluster_changed,
                block = (blocksize_step4_x, blocksize_step4_y, 1),
                grid = (gridsize_step4_x, gridsize_step4_y))
        
        #"""
        calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement,
                block = (blocksize_calcm, 1, 1),
                grid = (gridsize_calcm, 1))
        #"""
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2-t1
        
        """
        print "Just before step 5=========================================="
        print "gpu_cluste_movement"
        print gpu_cluster_movement
        print "gpu_clusters"
        print gpu_clusters2
        """
    
        t1 = time.time() #------------------------------------------------------------------

        if useTextureForData:
            step56(gpu_assignments, gpu_lower, gpu_upper, 
                    gpu_cluster_movement, gpu_badUpper,
                    block = (blocksize_step56, 1, 1),
                    grid = (gridsize_step56, 1),
                    texrefs=[texrefData])
        else:
            step56(gpu_assignments, gpu_lower, gpu_upper, 
                    gpu_cluster_movement, gpu_badUpper,
                    block = (blocksize_step56, 1, 1),
                    grid = (gridsize_step56, 1))
                    
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2-t1 #--------------------------------------------------------------
        
        """
        print "Just after step 6=========================================="
        print "gpu_lower"
        print gpu_lower
        print "gpu_upper"
        print gpu_upper
        print "gpu_badUpper"
        print gpu_badUpper
        """

        #if gpuarray.sum(gpu_cluster_movement).get() < 1.e-7:
            #print "No change in clusters!"
            #break
            
        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp
        
    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
                gpu_clusters.get(), gpu_cluster_movement, \
                data_time, module_time, init_time, \
                ccdist_time/iterations, hdclosest_time/iterations, \
                step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
Example #17
def kmeans_gpu(data, clusters, iterations, return_times=0):
    # kmeans_gpu(data, clusters, iterations) returns (clusters, labels)

    # kmeans using standard algorithm and cuda
    # input arguments are the data, initial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters)
    #
    # The return values are the updated clusters and labels for the data

    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    usePageLockedMemory = 0
    if (nPts > 32768):
        useTextureForData = 0

    # block and grid sizes for the cluster_assign kernel
    threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16)
    blocksize_assign = min(256, threads_desired)
    gridsize_assign = 1 + (threads_desired - 1) / blocksize_assign
    """
    print "\nblocksize_assign =", blocksize_assign
    print "gridsize_assign  =", gridsize_assign
    """

    # block and grid sizes for the cluster_calc kernel
    blocksize_calc = 2
    while (blocksize_calc < min(512, nPts)):
        blocksize_calc *= 2
    maxblocks = 512
    seqcount_calc = 1 + (nPts - 1) / (blocksize_calc * maxblocks)
    gridsize_calc = 1 + (nPts - 1) / (seqcount_calc * blocksize_calc)
    """
    print "blocksize_calc =", blocksize_calc
    print "gridsize_calc  =", gridsize_calc
    print "seqcount_calc  =", seqcount_calc
    """

    blocksize_calc_part2 = 1
    while (blocksize_calc_part2 < gridsize_calc):
        blocksize_calc_part2 *= 2

    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()

    mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters, blocksize_calc,
                                       seqcount_calc, gridsize_calc,
                                       blocksize_calc_part2, useTextureForData,
                                       BOUNDS)

    cuda_assign = mod_cuda.get_function("assign")
    cuda_calc = mod_cuda.get_function("calc")
    cuda_calc_part2 = mod_cuda.get_function("calc_part2")

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2 - t1

    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_cuda.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:, :] = data
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:, :] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)

    gpu_assignments = gpuarray.zeros((nPts, ), np.int32)
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_calc),
                                       np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_calc, ),
                                          np.int32)

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2 - t1

    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    assign_time = 0.
    calc_time = 0.

    for i in range(iterations):

        # assign data to clusters
        t1 = time.time()
        if useTextureForData:
            cuda_assign(gpu_clusters,
                        gpu_assignments,
                        block=(blocksize_assign, 1, 1),
                        grid=(gridsize_assign, 1),
                        texrefs=[texrefData])
        else:
            cuda_assign(gpu_data,
                        gpu_clusters,
                        gpu_assignments,
                        block=(blocksize_assign, 1, 1),
                        grid=(gridsize_assign, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        assign_time += t2 - t1

        # calculate new cluster centers
        t1 = time.time()
        if useTextureForData:
            cuda_calc(gpu_reduction_out,
                      gpu_reduction_counts,
                      gpu_assignments,
                      block=(blocksize_calc, 1, 1),
                      grid=(gridsize_calc, nDim),
                      texrefs=[texrefData])
        else:
            cuda_calc(gpu_data,
                      gpu_reduction_out,
                      gpu_reduction_counts,
                      gpu_assignments,
                      block=(blocksize_calc, 1, 1),
                      grid=(gridsize_calc, nDim))

        cuda_calc_part2(gpu_reduction_out,
                        gpu_reduction_counts,
                        gpu_clusters2,
                        gpu_clusters,
                        block=(blocksize_calc_part2, 1, 1),
                        grid=(1, nDim))

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        calc_time += t2 - t1

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_assignments, gpu_clusters.get(), \
                data_time, module_time, assign_time/iterations, calc_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
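A hedged invocation sketch for kmeans_gpu(), assuming the module-level kernels and BOUNDS it relies on are available; shapes follow the (nDim, nPts) / (nDim, nClusters) convention documented in the comments above:

import numpy as np

nDim, nPts, nClusters = 2, 1024, 3
data = np.random.rand(nDim, nPts).astype(np.float32)
initial = data[:, np.random.choice(nPts, nClusters, replace=False)]

clusters, labels = kmeans_gpu(data, initial, iterations=10)
print(clusters.shape)   # (nDim, nClusters)
print(labels.shape)     # (nPts,)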
Example #18
def main():

    #Initialise InteractionMatrix
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', 'cuda'))

    # Load source code from file
    Source = env.get_template('kernel.cu') #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs= {"params_size":GlobalParams.nbytes,\
                "fitnessparams_size":FitnessParams.nbytes,\
                "gaparams_size":GAParams.nbytes,\
                "genome_bytelength":int(ByteLengthGenome),\
                "genome_bitlength":int(BitLengthGenome),\
                "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\
                "textures":range( 0, NrFitnessFunctionGrids ),\
                "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\
                "with_mixed_crossover":WithMixedCrossover,
                "with_bank_conflict":WithBankConflict,
                "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection,
                "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues,
                "with_uniform_crossover":WithUniformCrossover,
                "with_single_point_crossover":WithSinglePointCrossover,
                "with_surefire_mutation":WithSurefireMutation,
                "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory,
                "ga_threaddimx":int(ThreadDim),
                "glob_nr_tiletypes":int(NrTileTypes),
                "glob_nr_edgetypes":int(NrEdgeTypes),
                "glob_nr_tileorientations":int(NrTileOrientations),
                "fit_dimgridx":int(DimGridX),
                "fit_dimgridy":int(DimGridY),
                "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids),
                "fit_nr_fourpermutations":int(NrFourPermutations),
                "fit_assembly_redundancy":int(NrAssemblyRedundancy),
                "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock),
                "sort_threaddimx":int(Sort_ThreadDimX),
                "glob_nr_genomes":int(NrGenomes),
                "fit_dimthreadx":int(ThreadDimX),
                "fit_dimthready":int(ThreadDimY),
                "fit_dimsubgridx":int(SubgridDimX),
                "fit_dimsubgridy":int(SubgridDimY),
                "fit_nr_subgridsperbank":int(NrSubgridsPerBank),
                "glob_bitlength_edgetype":int(EdgeTypeBitLength)
                }

    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_11", code="sm_11", cache_dir=None)

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
        drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")

    GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);

    #TEST ONLY
    return
    #TEST ONLY

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()

    #execute kernels for specified number of generations
    start.record()
    for gen in range(0, GlobalParamsDict["NrGenerations"]):
        #print "Processing Generation: %d"%(gen)

        #fitness_fnc(*(FitnessKernelParams), block=fit_blocks, grid=fit_grid)

        #Launch CPU processing (should be asynchronous calls)

        sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel

        drv.memcpy_dtoh(ReductionList, ReductionList_h) #Copy from Device to Host and finish sorting
        FitnessSumConst = ReductionList.sum()
        drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory
        drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const

        ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids)

        drv.memcpy_dtoh(Genomes, Genomes_h) #Copy data from GPU
        drv.memcpy_dtoh(FitnessValues, FitnessValues_h)
        drv.memcpy_dtoh(AssembledGrids, AssembledGrids_h)

    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
    print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
    pass
Example #19
def mds(MATRIX_FILE, DIMENSIONS, N_ITERATIONS, IMAGES_EVERY, STATION_BLOCK_SIZE, N_NEARBY_STATIONS, DEBUG_OUTPUT, STATUS_FUNCTION, GRAPH_FUNCTION) :

        print 'Loading matrix...'
        npz = np.load(MATRIX_FILE)
        station_coords = npz['station_coords']
        grid_dim       = npz['grid_dim']
        matrix         = npz['matrix'].astype(np.int32)

        # EVERYTHING SHOULD BE IN FLOAT32 for ease of debugging. even times.
        # Matrix and others should be textures, arrays, or in constant memory, to do caching.
        # As it is, I'm doing explicit caching.

        # force OD matrix symmetry for test
        # THIS was responsible for the coordinate drift!!!
        # need to symmetrize it before copy to device
        matrix = (matrix + matrix.T) / 2

        station_coords_int = station_coords.round().astype(np.int32)

        # to be removed when textures are working
        station_coords_gpu = gpuarray.to_gpu(station_coords_int)
        matrix_gpu = gpuarray.to_gpu(matrix)

        max_x, max_y = grid_dim
        n_gridpoints = int(max_x * max_y)
        n_stations   = len(station_coords)

        cuda_grid_shape = ( int( math.ceil( float(max_x)/CUDA_BLOCK_SHAPE[0] ) ), int( math.ceil( float(max_y)/CUDA_BLOCK_SHAPE[1] ) ) )

        print "\n----PARAMETERS----"
        print "Input file:            ", MATRIX_FILE
        print "Number of stations:    ", n_stations
        print "OD matrix shape:       ", matrix.shape    
        print "Station coords shape:  ", station_coords_int.shape 
        print "Station cache size:    ", N_NEARBY_STATIONS
        print "Map dimensions:        ", grid_dim
        print "Number of map points:  ", n_gridpoints
        print "Target space dimensionality: ", DIMENSIONS
        print "CUDA block dimensions: ", CUDA_BLOCK_SHAPE 
        print "CUDA grid dimensions:  ", cuda_grid_shape

        assert station_coords.shape == (n_stations, 2)
        assert N_NEARBY_STATIONS <= n_stations
        
        #sys.exit()

        # Make and register custom color map for pylab graphs

        cdict = {'red':   ((0.0,  0.0, 0.0),
                           (0.2,  0.0, 0.0),
                           (0.4,  0.9, 0.9),
                           (1.0,  0.0, 0.0)),

                 'green': ((0.0,  0.0, 0.1),
                           (0.05, 0.9, 0.9),
                           (0.1,  0.0, 0.0),
                           (0.4,  0.9, 0.9),
                           (0.6,  0.0, 0.0),
                           (1.0,  0.0, 0.0)),

                 'blue':  ((0.0,  0.0, 0.0),
                           (0.05, 0.0, 0.0),
                           (0.2,  0.9, 0.9),
                           (0.3,  0.0, 0.0),
                           (1.0,  0.0, 0.0))}

        mymap = LinearSegmentedColormap('mymap', cdict)
        mymap.set_over( (1.0, 0.0, 1.0) )
        mymap.set_bad ( (0.0, 0.0, 0.0) )
        pl.plt.register_cmap(cmap=mymap)

        # set up arrays for calculations

        coords_gpu = gpuarray.to_gpu(np.random.random( (max_x, max_y, DIMENSIONS) ).astype(np.float32))   # initialize coordinates to random values in range 0...1
        forces_gpu = gpuarray.zeros( (int(max_x), int(max_y), DIMENSIONS), dtype=np.float32 )             # 3D float32 accumulate forces over one timestep
        weights_gpu = gpuarray.zeros( (int(max_x), int(max_y)),             dtype=np.float32 )             # 2D float32 cell error accumulation
        errors_gpu = gpuarray.zeros( (int(max_x), int(max_y)),             dtype=np.float32 )             # 2D float32 cell error accumulation
        near_stations_gpu = gpuarray.zeros( (cuda_grid_shape[0], cuda_grid_shape[1], N_NEARBY_STATIONS), dtype=np.int32)

        debug_gpu     = gpuarray.zeros( n_gridpoints, dtype = np.int32 )
        debug_img_gpu = gpuarray.zeros_like( errors_gpu )

        print "\n----COMPILATION----"
        # times could be merged into forces kernel, if done by pixel not station.
        # integrate kernel could be GPUArray operation; also helps clean up code by using GPUArrays.
        # DIM should be replaced by python script, so as not to define twice. 
        src = open("unified_mds.cu").read()
        src = src.replace( 'N_NEARBY_STATIONS_PYTHON', str(N_NEARBY_STATIONS) )
        src = src.replace( 'N_STATIONS_PYTHON', str(n_stations) )
        src = src.replace( 'DIMENSIONS_PYTHON', str(DIMENSIONS) )
        mod = SourceModule(src, options=["--ptxas-options=-v"])
        stations_kernel  = mod.get_function("stations"  )
        forces_kernel    = mod.get_function("forces"  )
        integrate_kernel = mod.get_function("integrate")

        matrix_texref         = mod.get_texref('tex_matrix')
        station_coords_texref = mod.get_texref('tex_station_coords')
        near_stations_texref  = mod.get_texref('tex_near_stations')
        #ts_coords_texref  = mod.get_texref('tex_ts_coords') could be a 4-channel 2 dim texture, or 3 dim texture. or just 1D.

        cuda.matrix_to_texref(matrix, matrix_texref, order="F") # copy directly to device with texref - made for 2D x 1channel textures
        cuda.matrix_to_texref(station_coords_int, station_coords_texref, order="F") # fortran ordering, because we will be accessing with texND() instead of C-style indices
        near_stations_gpu.bind_to_texref_ext(near_stations_texref)

        # note, cuda.In and cuda.Out are from the perspective of the KERNEL not the host app!
        stations_kernel(near_stations_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)    
        autoinit.context.synchronize()

        #print "Near stations list:"
        #print near_stations_gpu
        print "\n----CALCULATION----"
        t_start = time.time()
        n_pass = 0
        while (n_pass < N_ITERATIONS) :
            n_pass += 1    
            # Pay attention to grid sizes when testing: if you don't run the integrator on the coordinates connected to stations, 
            # they don't move... so the whole thing stabilizes in a couple of cycles.    
            # Stations are worked on in blocks to avoid locking up the GPU with one giant kernel.
            for subset_low in range(0, n_stations, STATION_BLOCK_SIZE) :
                subset_high = subset_low + STATION_BLOCK_SIZE
                if subset_high > n_stations : subset_high = n_stations
                sys.stdout.write( "\rpass %03i / station %04i of %04i / total runtime %03.1f min " % (n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0) )
                sys.stdout.flush()
                STATUS_FUNCTION(n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0, (time.time() - t_start) / n_pass + (subset_low/n_stations))
                # adding texrefs in kernel call seems to change nothing, leaving them out.
                # max_x and max_y could be #defined in kernel source, along with STATION_BLOCK_SIZE 
                forces_kernel(np.int32(n_stations), np.int32(subset_low), np.int32(subset_high), max_x, max_y, coords_gpu, forces_gpu, weights_gpu, errors_gpu, debug_gpu, debug_img_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
                autoinit.context.synchronize()
                # print coords_gpu, forces_gpu
                time.sleep(0.5)  # let the user interface catch up.
                
            integrate_kernel(max_x, max_y, coords_gpu, forces_gpu, weights_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)    
            autoinit.context.synchronize()

            print IMAGES_EVERY
            if (IMAGES_EVERY > 0) and (n_pass % IMAGES_EVERY == 0) :
            
                #print 'Kernel debug output:'
                #print debug_gpu
                
                velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis = 2)) 
                pl.imshow( velocities.T, cmap=mymap, origin='bottom', vmin=0, vmax=100 )
                pl.title( 'Velocity ( sec / timestep) - step %03d' % n_pass )
                pl.colorbar()
                pl.savefig( 'img/vel%03d.png' % n_pass )
                pl.close()
                
                pl.imshow( (errors_gpu.get() / weights_gpu.get() / 60.0 ).T, cmap=mymap, origin='bottom', vmin=0, vmax=100 )
                pl.title( 'Average absolute error (min) - step %03d' %n_pass )
                pl.colorbar()
                pl.savefig( 'img/err%03d.png' % n_pass )
                pl.close()

                pl.imshow( (debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom', vmin=0, vmax=100 )
                pl.title( 'Debugging Output - step %03d' %n_pass )
                pl.colorbar()
                pl.savefig( 'img/debug%03d.png' % n_pass )
                pl.close()
                
                GRAPH_FUNCTION('img/err%03d.png' % n_pass)
                #INTERFACE.update
              
            sys.stdout.write( "/ avg pass time %02.1f sec" % ( (time.time() - t_start) / n_pass, ) )
            sys.stdout.flush()
Example #20
def setup_texture_nparr(tex_ref, arr):
    cuda.matrix_to_texref(load_map(arr).astype(np.float32), tex_ref, order="C")
    tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
    tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
    tex_ref.set_filter_mode(cuda.filter_mode.POINT)
    tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
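A hedged usage sketch for setup_texture_nparr(); `mod` and the texture name are assumptions, and load_map() (called inside the helper) is assumed to accept the array unchanged:

# Hypothetical usage: bind a 2D map to a texture declared in `mod`, e.g.
#     texture<float, 2> height_tex;
import numpy as np

height_map = np.random.rand(256, 256)
height_tex = mod.get_texref("height_tex")    # hypothetical texture name
setup_texture_nparr(height_tex, height_map)

# height_tex now uses normalized, wrapping coordinates with point filtering
# and can be passed to a kernel launch via texrefs=[height_tex].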
Ejemplo n.º 21
0
def kmeans_gpu(data, clusters, iterations, return_times = 0):
    # kmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    # kmeans using standard algorithm and cuda
    # input arguments are the data, initial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters) 
    #
    # The return values are the updated clusters and labels for the data (see the usage sketch after this function)
    
    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]


    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    usePageLockedMemory = 0
    if(nPts > 32768):
        useTextureForData = 0
    
    # block and grid sizes for the cluster_assign kernel
    threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
    blocksize_assign = min(256, threads_desired)
    gridsize_assign = 1 + (threads_desired - 1)/blocksize_assign
    
    """
    print "\nblocksize_assign =", blocksize_assign
    print "gridsize_assign  =", gridsize_assign
    """
    
    # block and grid sizes for the cluster_calc kernel
    blocksize_calc = 2
    while(blocksize_calc < min(512, nPts)):
        blocksize_calc *= 2
    maxblocks = 512
    seqcount_calc = 1 + (nPts-1)/(blocksize_calc * maxblocks)
    gridsize_calc = 1 + (nPts-1)/(seqcount_calc * blocksize_calc)
    
    """
    print "blocksize_calc =", blocksize_calc
    print "gridsize_calc  =", gridsize_calc
    print "seqcount_calc  =", seqcount_calc
    """
    
    blocksize_calc_part2 = 1
    while(blocksize_calc_part2 < gridsize_calc):
        blocksize_calc_part2 *= 2


    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    
    mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters,
                                        blocksize_calc, seqcount_calc, gridsize_calc, 
                                        blocksize_calc_part2, useTextureForData, BOUNDS)

    cuda_assign = mod_cuda.get_function("assign")
    cuda_calc = mod_cuda.get_function("calc")
    cuda_calc_part2 = mod_cuda.get_function("calc_part2")

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2-t1

    
    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)
    
    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_cuda.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:,:] = data;
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:,:] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)


    gpu_assignments = gpuarray.zeros((nPts,), np.int32)
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32);
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_calc), 
                                                                np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_calc,), 
                                                                np.int32)
    
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2-t1


    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    assign_time = 0.
    calc_time = 0.


    for i in range(iterations):
    
        # assign data to clusters
        t1 = time.time()
        if useTextureForData:
            cuda_assign(gpu_clusters, gpu_assignments,
                        block = (blocksize_assign, 1, 1),
                        grid = (gridsize_assign, 1),
                        texrefs=[texrefData])
        else:
            cuda_assign(gpu_data, gpu_clusters, gpu_assignments,
                         block = (blocksize_assign, 1, 1),
                         grid = (gridsize_assign, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        assign_time += t2-t1

        # calculate new cluster centers
        t1 = time.time()
        if useTextureForData:
            cuda_calc(gpu_reduction_out, gpu_reduction_counts, gpu_assignments,
                block = (blocksize_calc, 1, 1),
                grid = (gridsize_calc, nDim),
                texrefs=[texrefData])
        else:
            cuda_calc(gpu_data, gpu_reduction_out, gpu_reduction_counts, 
                gpu_assignments,
                block = (blocksize_calc, 1, 1),
                grid = (gridsize_calc, nDim))
        
        cuda_calc_part2(gpu_reduction_out, gpu_reduction_counts, 
                gpu_clusters2, gpu_clusters,
                block = (blocksize_calc_part2, 1, 1),
                grid = (1, nDim))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        calc_time += t2-t1
    
        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp
        
    if return_times:
        return gpu_assignments, gpu_clusters.get(), \
                data_time, module_time, assign_time/iterations, calc_time/iterations 
    else:
        return gpu_clusters.get(), gpu_assignments.get()
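A hypothetical call of kmeans_gpu (the usage sketch referenced in the comments above), assuming the surrounding module, with its kernels helper and BOUNDS constant, is importable; the data is laid out as (nDim, nPts) and the initial clusters as (nDim, nClusters):

import numpy as np

nDim, nPts, nClusters = 4, 10000, 8
data = np.random.rand(nDim, nPts).astype(np.float32)
# pick nClusters random data points as the initial cluster centres
init = data[:, np.random.choice(nPts, nClusters, replace=False)]
clusters, labels = kmeans_gpu(data, init, iterations=10)
print clusters.shape, labels.shape  # (4, 8) and (10000,)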
def main():

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":8,
                 "bit_length_edge_type":3, 
                 "curand_nr_threads_per_block":256,
                 "nr_tile_types":4,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":1,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":1 }

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("TestEdgeSortKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")

    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix

    #a =  numpy.random.randn(400).astype(numpy.uint8)
    #b = numpy.random.randn(400).astype(numpy.uint8)
    dest = numpy.arange(256).astype(numpy.uint8)
    for i in range(0, 256/8):
        dest[i*8 + 0] = 36
        dest[i*8 + 1] = 151
        dest[i*8 + 2] = 90
        dest[i*8 + 3] = 109
        dest[i*8 + 4] = 224
        dest[i*8 + 5] = 4
        dest[i*8 + 6] = 0
        dest[i*8 + 7] = 0
    
    dest_h = drv.mem_alloc(dest.nbytes)

    drv.memcpy_htod(dest_h, dest)
    print "before: "
    print dest
    curand = numpy.zeros(40*256).astype(numpy.uint8);
    curand_h = drv.mem_alloc(curand.nbytes)
    CurandKernel(curand_h, block=(32,1,1), grid=(1,1))
    Kernel(dest_h, curand_h, block=(32,1,1), grid=(1,1))
    drv.memcpy_dtoh(dest, dest_h)
    print "after: "
    print dest
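The main() above and the variants below all follow the same pattern: render a Jinja2-templated CUDA source, save it, and compile it with SourceModule. A minimal, self-contained sketch of that pattern, using a tiny inline template as a stand-in for ./alpha.cu:

from jinja2 import Template
import numpy
import pycuda.autoinit  # creates a context
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# Stand-in for env.get_template('./alpha.cu'); the placeholder is substituted as plain text before nvcc runs.
tmpl = Template("""
__global__ void scale(float *v)
{
    v[threadIdx.x] *= {{ scale_factor }}f;
}
""")
mod = SourceModule(tmpl.render(scale_factor=2.0))
scale = mod.get_function("scale")
v = numpy.arange(32, dtype=numpy.float32)
scale(drv.InOut(v), block=(32, 1, 1), grid=(1, 1))
print v  # every element doubled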
def main():

    #FourPermutations set-up
    FourPermutations = numpy.array([ [1,2,3,4],
                                  [1,2,4,3],
                                  [1,3,2,4],
                                  [1,3,4,2],
                                  [1,4,2,3],
                                  [1,4,3,2],
                                  [2,1,3,4],
                                  [2,1,4,3],
                                  [2,3,1,4],
                                  [2,3,4,1],
                                  [2,4,1,3],
                                  [2,4,3,1],
                                  [3,2,1,4],
                                  [3,2,4,1],
                                  [3,1,2,4],
                                  [3,1,4,2],
                                  [3,4,2,1],
                                  [3,4,1,2],
                                  [4,2,3,1],
                                  [4,2,1,3],
                                  [4,3,2,1],
                                  [4,3,1,2],
                                  [4,1,2,3],
                                  [4,1,3,2],]).astype(numpy.uint8)

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":8,
                 "bit_length_edge_type":3, 
                 "curand_nr_threads_per_block":256,
                 "nr_tile_types":4,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":1,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":1,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4
                }

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("TestAssemblyKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")

    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix
 
    #Set-up genomes
    dest = numpy.arange(256).astype(numpy.uint8)
    for i in range(0, 256/8):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
        dest[i*8 + 0] = 36
        dest[i*8 + 1] = 151
        dest[i*8 + 2] = 90
        dest[i*8 + 3] = 109
        dest[i*8 + 4] = 224
        dest[i*8 + 5] = 4
        dest[i*8 + 6] = 0
        dest[i*8 + 7] = 0

    dest[0] = 40
    dest[1] = 0
    dest[2] = 0
    dest[3] = 0
    dest[4] = 0
    dest[5] = 0
    dest[6] = 0
    dest[7] = 0

    dest_h = drv.mem_alloc(dest.nbytes)
    drv.memcpy_htod(dest_h, dest)
    print "Genomes before: "
    print dest

    #Set-up grids
    grids = numpy.zeros((32, 19, 19)).astype(numpy.uint8)
    grids_h = drv.mem_alloc(grids.nbytes)
    drv.memcpy_htod(grids_h, grids)
    print "Grids:"
    print grids    

    #Set-up fitness values
    fitness = numpy.zeros(256).astype(numpy.float32)
    fitness_h = drv.mem_alloc(fitness.nbytes)
    drv.memcpy_htod(fitness_h, fitness)
    print "Fitness values:"
    print fitness

    #Set-up curand
    curand = numpy.zeros(40*256).astype(numpy.uint8);
    curand_h = drv.mem_alloc(curand.nbytes)

    #Set-up four permutations
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)

    #Set-up timers
    #start = drv.Event()
    #stop = drv.Event()
    #start.record()        

    #Call kernels
    CurandKernel(curand_h, block=(32,1,1), grid=(1,1))
    Kernel(dest_h, fitness_h, grids_h, curand_h, block=(32,1,1), grid=(1,1))
 
    #drv.Context.synchronize()
    #Clean-up timer
    #stop.synchronize()
    #print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
    #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
    #pass    

    #Output
    drv.memcpy_dtoh(dest, dest_h)
    print "Genomes after: "
    print dest
    drv.memcpy_dtoh(fitness, fitness_h)
    print "Fitness after: "
    print fitness
    drv.memcpy_dtoh(grids, grids_h)
    print "Grids[0] after: "
    print grids[0]
    print "Grids[31] after:"
    print grids[31]
Ejemplo n.º 24
0
grid_size = (int(np.ceil(N / block_size[0])), int(np.ceil(M / block_size[1])))

start_cpu = time.time()
result = filt_cpu(image, sigma_r, sigma_d)
end_cpu = time.time()

result_gpu = np.zeros((N, M), dtype=np.uint32)

filt_gpu = mod.get_function("filt_gpu")

start_gpu = time.time()
tex = mod.get_texref("tex")
tex.set_filter_mode(drv.filter_mode.LINEAR)
tex.set_address_mode(0, drv.address_mode.MIRROR)
tex.set_address_mode(1, drv.address_mode.MIRROR)
drv.matrix_to_texref(image.astype(np.uint32), tex, order="C")
filt_gpu(drv.Out(result_gpu),
             np.int32(N),
             np.int32(M),
             np.float32(sigma_d),
             np.float32(sigma_r),
             block=block_size,
             grid=grid_size,
             texrefs=[tex])
drv.Context.synchronize()
end_gpu = time.time()

cv2.imwrite('res_gpu.bmp', result_gpu.astype(np.uint8))
cv2.imwrite('res_cpu.bmp', result)

print('CPU time: {}'.format(end_cpu - start_cpu))
Ejemplo n.º 25
0
x_out = np.array([i for i in range(M2)] * N2)
y_out = np.array([i for i in range(N2)] * M2)

start = driver.Event()
stop = driver.Event()

# prepare the texture
print("Computing on GPU...")
start.record()

prep_image = prepare_image(image)
tex = mod.get_texref("tex")
tex.set_filter_mode(driver.filter_mode.LINEAR)
tex.set_address_mode(0, driver.address_mode.CLAMP)
tex.set_address_mode(1, driver.address_mode.CLAMP)
driver.matrix_to_texref(prep_image, tex, order="C")

bilinear_interpolation_kernel(driver.Out(result),
                              driver.In(x_out),
                              driver.In(y_out),
                              np.int32(M1),
                              np.int32(N1),
                              np.int32(M2),
                              np.int32(N2),
                              block=block,
                              grid=grid,
                              texrefs=[tex])
big_image = normalize_image(result, image.shape[2])
stop.record()
stop.synchronize()
gpu_time = stop.time_since(start)
Ejemplo n.º 26
0
def alm2lenmap_onGPU(lib_alm, unlalm, dx_gu, dy_gu, do_not_prefilter=False):
    """
    Lens the input unlensed CMB map on the GPU using the pyCUDA interface.
    dx_gu, dy_gu are the displacements in grid units (e.g. f.get_dx_ingridunits());
    they can be paths to arrays, arrays, or memmaps.
    Will probably crash for maps that are too large; the job would then need to be split.
    Works for 4096 x 4096 at least on my laptop.

    Cost dominated by texture setup. # FIXME : try get rid of texture
    Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans
    for subsequent usage.
    :param unl_CMB:
    :param func: bicubic or bilinear
    :param normalized_tex: use a modified version of the GPU bicubic spline to account for periodicity of the map
    :return:
    """
    if timed:
        ti = time.time()
    shape = lib_alm.ell_mat.shape
    rshape = (shape[0], shape[1] / 2 + 1)
    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert load_map(dx_gu).shape == shape, (load_map(dx_gu).shape,
                                            lib_alm.ell_mat.shape)
    assert load_map(dy_gu).shape == shape, (load_map(dy_gu).shape,
                                            lib_alm.ell_mat.shape)

    assert np.all(np.array(shape) % GPU_block[0] == 0), shape
    if shape[0] > 4096:
        print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---"

    GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1)

    # Prefiltering forces the interpolant to pass through the samples and increase accuracy, but dominates the cost.
    rfft2_unlCMB_gpu = gpuarray.to_gpu(
        lib_alm.alm2rfft(unlalm / np.prod(shape)).astype(np.complex64))
    coeffs_gpu = gpuarray.empty(lib_alm.ell_mat.shape, dtype=np.float32)
    plan, plan_inv = get_rfft_plans(shape)

    if not do_not_prefilter:
        # The prefilter makes sure the spline is exact at the nodes (see the numpy check after this function).
        # Uncomment this to put coeffs_gpu on pitched memory to allow later for 2D texture binding :
        # alloc,pitch  = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32
        wx = (6. / (2. * np.cos(
            2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.))
        wx_gpu = gpuarray.to_gpu(wx.astype(np.float32))
        prefilter = CUDA_module.get_function("cf_outer_w")
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        del wx_gpu

    ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)

    # Binding arrays to textures and getting lensing func.
    if texture_count == 0:
        lens_func = CUDA_module.get_function("bicubiclensKernel_notex")
        tex_refs = []
        dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32))
        dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32))
    elif texture_count == 1:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        tex_refs = [unl_CMB_tex]
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32))
        dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32))
        lens_func = CUDA_module.get_function(
            "bicubiclensKernel_normtex_singletex")
    elif texture_count == 3:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        dx_tex = CUDA_module.get_texref("tex_dx")
        dy_tex = CUDA_module.get_texref("tex_dy")
        tex_refs = ([unl_CMB_tex, dx_tex, dy_tex])
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        cuda.matrix_to_texref(load_map(dx_gu).astype(np.float32),
                              dx_tex,
                              order="C")
        cuda.matrix_to_texref(load_map(dy_gu).astype(np.float32),
                              dy_tex,
                              order="C")
        lens_func = CUDA_module.get_function("bicubiclensKernel_normtex")
    else:
        tex_refs = []
        lens_func = 0
        assert 0
    # Wrapping, important for periodic boundary conditions.
    # Note that WRAP has no effect for unnormalized texture coordinates.

    for tex_ref in tex_refs:
        tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
        tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
        tex_ref.set_filter_mode(cuda.filter_mode.POINT)
        tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

    if timed: t0 = time.time()

    len_CMB = np.empty(shape, dtype=np.float32)

    if texture_count == 0:
        lens_func(cuda.Out(len_CMB),
                  coeffs_gpu,
                  dx_gu,
                  dy_gu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)
    elif texture_count == 1:
        lens_func(cuda.Out(len_CMB),
                  dx_gu,
                  dy_gu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)
    elif texture_count == 3:
        lens_func(cuda.Out(len_CMB),
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)

    if timed:
        dt = time.time() - t0
        t_tot = time.time() - ti
        print "     GPU bicubic spline and transfer at %s Mpixel / sec, time %s sec" % (
            np.prod(lib_alm.ell_mat.shape) / 1e6 / dt, dt)
        print " Total ex. time at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / t_tot, t_tot)
    return len_CMB.astype(np.float64)
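The wx weights built in the prefilter step above are, up to the ordering of frequencies, the inverse of the DFT of the cubic B-spline kernel sampled at the nodes, [1/6, 2/3, 1/6]. A small numpy check of that identity (an illustration only, not part of the original module; it assumes Freq(k, N)/N matches the usual FFT frequencies):

import numpy as np

N = 16
x = np.random.randn(N)
# circular convolution with the cubic B-spline kernel at the nodes: [1/6, 2/3, 1/6]
blurred = (np.roll(x, 1) + 4.0 * x + np.roll(x, -1)) / 6.0
omega = 2.0 * np.pi * np.fft.fftfreq(N)  # angular frequencies
wx = 6.0 / (2.0 * np.cos(omega) + 4.0)   # same per-axis weights as above
recovered = np.fft.ifft(np.fft.fft(blurred) * wx).real
assert np.allclose(recovered, x)  # the prefilter undoes the B-spline smoothing at the nodes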
Ejemplo n.º 27
0
IMG = 'input.bmp'
img = cv2.imread(IMG, cv2.IMREAD_GRAYSCALE)
M, N = img.shape

sigma_d = 100
sigma_r = 10
block = (16, 16, 1)
grid = (int(np.ceil(M / block[0])), int(np.ceil(N / block[1])))
start = driver.Event()
stop = driver.Event()

start.record()
tex = mod.get_texref("tex")
tex.set_filter_mode(driver.filter_mode.LINEAR)
tex.set_address_mode(1, driver.address_mode.MIRROR)
driver.matrix_to_texref(img.astype("int32"), tex, order="C")
gpu_result = np.zeros((M, N), dtype=np.uint32)
bilateral_interpolation(driver.Out(gpu_result),
                        np.int32(M),
                        np.int32(N),
                        np.float32(sigma_d),
                        np.float32(sigma_r),
                        block=block,
                        grid=grid,
                        texrefs=[tex])
stop.record()
stop.synchronize()
gpu_time = stop.time_since(start)
print(gpu_time)
cv2.imwrite("output.bmp", gpu_result.astype("int8"))
def interpolate_image_by_cuda(image: np.ndarray, scale: int = 2):
    import pycuda.autoinit

    cu_module = SourceModule(open("kernel.cu", "r").read())
    interpolate = cu_module.get_function("interpolate")

    start = time.time()

    print('Getting color channels..')
    uint32_image = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint32)
    for x in range(uint32_image.shape[0]):
        for y in range(uint32_image.shape[1]):
            for ch in range(image.shape[2]):
                uint32_image[x,
                             y] += image[x, y,
                                         ch] << (8 * (image.shape[2] - ch - 1))

    print('Copying texture..')
    cu_tex = cu_module.get_texref("tex")
    cu_tex.set_filter_mode(cuda.filter_mode.POINT)
    cu_tex.set_address_mode(0, cuda.address_mode.CLAMP)
    cu_tex.set_address_mode(1, cuda.address_mode.CLAMP)
    cuda.matrix_to_texref(uint32_image, cu_tex, order="C")

    print('Getting image enlarged shape..')
    enlarged_image_shape = get_image_enlarged_shape(image, scale)
    result = np.zeros((enlarged_image_shape[0], enlarged_image_shape[1]),
                      dtype=np.uint32)
    block = (16, 16, 1)
    grid = (int(np.ceil(enlarged_image_shape[0] / block[0])),
            int(np.ceil(enlarged_image_shape[1] / block[1])))

    print('Interpolating..')
    interpolate(cuda.Out(result),
                np.int32(image.shape[1]),
                np.int32(image.shape[0]),
                np.int32(enlarged_image_shape[1]),
                np.int32(enlarged_image_shape[0]),
                np.int32(image.shape[2]),
                block=block,
                grid=grid,
                texrefs=[cu_tex])

    print('Combining channels into color points..')
    rgba_image = np.zeros(
        (enlarged_image_shape[0], enlarged_image_shape[1], image.shape[2]),
        dtype=np.uint32)
    for x in range(rgba_image.shape[0]):
        for y in range(rgba_image.shape[1]):
            output_x_y = result[x, y]
            for ch in range(rgba_image.shape[2]):
                rgba_image[x, y,
                           rgba_image.shape[2] - ch - 1] = output_x_y % 256
                output_x_y >>= 8

    print('Clearing temporaries..')
    del result
    del uint32_image
    print(
        f'interpolate_image_by_cuda - calculation time: {time.time() - start:.5f} s'
    )
    return rgba_image
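The two per-pixel Python loops above (packing the colour channels into one uint32 per pixel before the texture upload, and unpacking the kernel output afterwards) can be written as vectorized numpy operations. A sketch of that equivalence; the helper names are hypothetical:

import numpy as np

def pack_channels(image: np.ndarray) -> np.ndarray:
    """Pack an (H, W, C) uint8 image into one uint32 per pixel, channel 0 in the highest used byte."""
    c = image.shape[2]
    shifts = (8 * (c - 1 - np.arange(c))).astype(np.uint32)  # e.g. [16, 8, 0] for C == 3
    return (image.astype(np.uint32) << shifts).sum(axis=2, dtype=np.uint32)

def unpack_channels(packed: np.ndarray, c: int) -> np.ndarray:
    """Inverse of pack_channels: recover the (H, W, C) channel values from the packed pixels."""
    shifts = (8 * (c - 1 - np.arange(c))).astype(np.uint32)
    return ((packed[..., None] >> shifts) & 0xFF).astype(np.uint32)

Under this reading, uint32_image = pack_channels(image) and rgba_image = unpack_channels(result, image.shape[2]) would replace the two nested loops.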
Ejemplo n.º 29
0
def trikmeans_gpu(data, clusters, iterations, return_times = 0):
    """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    K-means using triangle inequality algorithm and PyCuda
    Input arguments are the data, initial cluster values, and number of iterations to repeat.
    The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
        nPts = number of data points.
    The shape of clusters is (nDim, nClusters) 
    
    The return values are the updated clusters and labels for the data
    """

    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]


    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    
    usePageLockedMemory = 0

    if(nPts > 32768):
        useTextureForData = 0
    
    
    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16))
    gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist
    
    #block and grid sizes for the init module
    threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
    #blocksize_init = min(512, threads_desired)
    blocksize_init = min(128, threads_desired)
    gridsize_init = 1 + (threads_desired - 1)/blocksize_init
    
    #block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    if not useTextureForData:
        blocksize_step3 = min(256, blocksize_step3)
    gridsize_step3 = gridsize_init
    
    #block and grid sizes for the step4 module
    # Each block of threads will handle seqcount times the data
    # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements
    blocksize_step4 = 2
    while(blocksize_step4 < min(512, nPts)):
        blocksize_step4 *= 2
    maxblocks = 512
    seqcount_step4 = 1 + (nPts-1)/(blocksize_step4*maxblocks)
    gridsize_step4 = 1 + (nPts-1)/(seqcount_step4*blocksize_step4)
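    # Worked example (assuming nPts = 100000): blocksize_step4 doubles up to 512,
    # seqcount_step4 = 1 + 99999/(512*512) = 1, and gridsize_step4 = 1 + 99999/512 = 196,
    # which stays under the maxblocks cap of 512.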
    
    blocksize_step4part2 = 1
    while(blocksize_step4part2 < gridsize_step4):
        blocksize_step4part2 *= 2
    
    
    
    #block and grid sizes for the calc_movement module
    for blocksize_calcm in range(32, 512, 32):
        if blocksize_calcm >= nClusters:
            break;
    gridsize_calcm = 1 + (nClusters-1)/blocksize_calcm
    
    #block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init
    
    
    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    
    mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters,
                                        blocksize_step4, seqcount_step4, gridsize_step4, 
                                        blocksize_step4part2, useTextureForData)

    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    step4part2 = mod_ccdist.get_function("step4part2")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2-t1


    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)
    
    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:,:] = data;
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:,:] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)


    gpu_assignments = gpuarray.zeros((nPts,), np.int32)         # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32)   # lower bounds on distance between 
                                                                # point and each cluster
    gpu_upper = gpuarray.zeros((nPts,), np.float32)             # upper bounds on distance between
                                                                # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32)    # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32)    # half distance to closest
    gpu_hdClosest.fill(1.0e10)  # set to large value // **TODO**  get the actual float max
    gpu_badUpper = gpuarray.zeros((nPts,), np.int32)   # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32);
    gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32);
    
    gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32)
    gpu_cluster_changed.fill(1)
    
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_step4), np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_step4,), np.int32)
    
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2-t1
    
    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.

    t1 = time.time()
    ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
             block = (blocksize_ccdist, 1, 1),
             grid = (gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2-t1
    
    t1 = time.time()
    calc_hdclosest(gpu_ccdist, gpu_hdClosest,
            block = (blocksize_ccdist, 1, 1),
            grid = (gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2-t1
    
    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, 
                gpu_lower, gpu_upper,
                block = (blocksize_init, 1, 1),
                grid = (gridsize_init, 1),
                texrefs=[texrefData])
    else:
        init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, 
                gpu_lower, gpu_upper,
                block = (blocksize_init, 1, 1),
                grid = (gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2-t1

    for i in range(iterations):
    
        if i>0:
            t1 = time.time()
            ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
                     block = (blocksize_ccdist, 1, 1),
                     grid = (gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2-t1
            
            t1 = time.time()
            calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                    block = (blocksize_ccdist, 1, 1),
                    grid = (gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2-t1

            
        t1 = time.time()
        if i > 0:
            gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                    gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed,
                    block = (blocksize_step3, 1, 1),
                    grid = (gridsize_step3, 1),
                    texrefs=[texrefData])
        else:
            step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                    gpu_lower, gpu_upper, gpu_badUpper,  gpu_cluster_changed,
                    block = (blocksize_step3, 1, 1),
                    grid = (gridsize_step3, 1))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2-t1
        
        
        t1 = time.time()
        
        if useTextureForData:
            step4(gpu_cluster_changed, gpu_reduction_out,
                gpu_reduction_counts, gpu_assignments,
                block = (blocksize_step4, 1, 1),
                grid = (gridsize_step4, nDim),
                texrefs=[texrefData])
        else:
            step4(gpu_data, gpu_cluster_changed, gpu_reduction_out,
                gpu_reduction_counts, gpu_assignments,
                block = (blocksize_step4, 1, 1),
                grid = (gridsize_step4, nDim))
        
        step4part2(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, 
                gpu_clusters2, gpu_clusters,
                block = (blocksize_step4part2, 1, 1),
                grid = (1, nDim))
        
        calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement, gpu_cluster_changed,
                block = (blocksize_calcm, 1, 1),
                grid = (gridsize_calcm, 1))
        
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2-t1
    
        t1 = time.time()
        if useTextureForData:
            step56(gpu_assignments, gpu_lower, gpu_upper, 
                    gpu_cluster_movement, gpu_badUpper,
                    block = (blocksize_step56, 1, 1),
                    grid = (gridsize_step56, 1),
                    texrefs=[texrefData])
        else:
            step56(gpu_assignments, gpu_lower, gpu_upper, 
                    gpu_cluster_movement, gpu_badUpper,
                    block = (blocksize_step56, 1, 1),
                    grid = (gridsize_step56, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2-t1
        
        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp
        
    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
                gpu_clusters.get(), gpu_cluster_movement, \
                data_time, module_time, init_time, \
                ccdist_time/iterations, hdclosest_time/iterations, \
                step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
def main():

    #FourPermutations set-up
    FourPermutations = numpy.array([ [1,2,3,4],
                                  [1,2,4,3],
                                  [1,3,2,4],
                                  [1,3,4,2],
                                  [1,4,2,3],
                                  [1,4,3,2],
                                  [2,1,3,4],
                                  [2,1,4,3],
                                  [2,3,1,4],
                                  [2,3,4,1],
                                  [2,4,1,3],
                                  [2,4,3,1],
                                  [3,2,1,4],
                                  [3,2,4,1],
                                  [3,1,2,4],
                                  [3,1,4,2],
                                  [3,4,2,1],
                                  [3,4,1,2],
                                  [4,2,3,1],
                                  [4,2,1,3],
                                  [4,3,2,1],
                                  [4,3,1,2],
                                  [4,1,2,3],
                                  [4,1,3,2],]).astype(numpy.uint8)

    BankSize = 8 # Do not go beyond 8!

    #Define constants
    DimGridX = 19
    DimGridY = 19
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
    BlockDimX = 100
    BlockDimY = 100
    SearchSpaceSize = BlockDimX * BlockDimY  * 32
    #BlockDimX = 600
    #BlockDimY = 600
    FitnessValDim = SearchSpaceSize
    GenomeDim = SearchSpaceSize

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":4,
                 "bit_length_edge_type":3,
                 "curand_nr_threads_per_block":32,
                 "nr_tile_types":2,
                 "nr_edge_types":8,
                 "warpsize":32,
                 "fit_dim_thread_x":32*BankSize,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":BlockDimX,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4,
                 "banksize":BankSize,
                 "curand_dim_block_x":BlockDimX
                }
    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))

    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")


    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix



    #Set-up genomes
    dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*4) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    fitness_left = numpy.zeros(FitnessValDim).astype(numpy.float32)
    fitness_left_h = drv.mem_alloc(fitness_left.nbytes)
    fitness_bottom = numpy.zeros(FitnessValDim).astype(numpy.float32)
    fitness_bottom_h = drv.mem_alloc(fitness_bottom.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    grids = numpy.zeros((10000*32, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #Set-up four permutations
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*32)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*32)
    BlockCounter=0
    print "Will do that many kernels a ",BlockDimX,"x",BlockDimY,":", MaxBlockCycles

    for i in range(MaxBlockCycles):
        #Set-up timer
        start = drv.Event()
        stop = drv.Event()
        start.record()
        print "Start kernel:"

        #Call kernels
        CurandKernel(curand_h, block=(32,1,1), grid=(BlockDimX, BlockDimY))
        print "Finished Curand kernel, starting main kernel..."
        Kernel(dest_h, grids_h, fitness_left_h, fitness_bottom_h, curand_h, block=(32*BankSize,1,1), grid=(BlockDimX,BlockDimY))

        #End timer
        stop.record()
        stop.synchronize()
        print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
        #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
        pass

    #Output
    #drv.memcpy_dtoh(dest, dest_h)
    #print "Genomes after: "
    #print dest[0:4]
    
    drv.memcpy_dtoh(fitness_left, fitness_left_h)
    print "FitnessLeft after: "
    print fitness_left#[1000000:1000500]

    drv.memcpy_dtoh(fitness_bottom, fitness_bottom_h)
    print "FitnessBottom after: "
    print fitness_bottom#[1000000:1000500]

    drv.memcpy_dtoh(grids, grids_h)
    print "Grids[0] after: "
    for i in range(0,5):
        print "Grid ",i,": "
        print grids[i]
def main():

    #Initialise InteractionMatrix
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main_discoverytime', './templates'))

    # Load source code from file
    Source = env.get_template('./kernel.cu') #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs= {"params_size":GlobalParams.nbytes,\
                "fitnessparams_size":FitnessParams.nbytes,\
                "gaparams_size":GAParams.nbytes,\
                "genome_bytelength":int(ByteLengthGenome),\
                "genome_bitlength":int(BitLengthGenome),\
                "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\
                "textures":range( 0, NrFitnessFunctionGrids ),\
                "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\
                "with_mixed_crossover":WithMixedCrossover,
                "with_bank_conflict":WithBankConflict,
                "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection,
                "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues,
                "with_uniform_crossover":WithUniformCrossover,
                "with_single_point_crossover":WithSinglePointCrossover,
                "with_surefire_mutation":WithSurefireMutation,
                "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory,
                "ga_threaddimx":int(GA_ThreadDim),
                "glob_nr_tiletypes":int(NrTileTypes),
                "glob_nr_edgetypes":int(NrEdgeTypes),
                "glob_nr_tileorientations":int(NrTileOrientations),
                "fit_dimgridx":int(DimGridX),
                "fit_dimgridy":int(DimGridY),
                "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids),
                "fit_nr_fourpermutations":int(NrFourPermutations),
                "fit_assembly_redundancy":int(NrAssemblyRedundancy),
                "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock),
                "sort_threaddimx":int(Sort_ThreadDimX),
                "glob_nr_genomes":int(NrGenomes),
                "fit_dimthreadx":int(ThreadDimX),
                "fit_dimthready":int(ThreadDimY),
                "fit_dimsubgridx":int(SubgridDimX),
                "fit_dimsubgridy":int(SubgridDimY),
                "fit_nr_subgridsperbank":int(NrSubgridsPerBank),
                "glob_bitlength_edgetype":int(EdgeTypeBitLength),
                "fitness_break_value":int(BitLengthGenome),   # ADAPTED FOR DISCOVERY KERNEL
                "fitness_flag_index":int(NrGenomes)
                }

    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_20", code="sm_20", cache_dir=None)

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    #ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) )
        drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")

    GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    #fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h);
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h);

    #TEST ONLY
    #return #ADAPTED
    #TEST ONLY

    #START ADAPTED
    print "GENOMES NOW:\n"
    print Genomes
    print ":::STARTING KERNEL EXECUTION:::"
    #STOP ADAPTED

    #Discovery time parameters
    min_fitness_value = BitLengthGenome # Want all bits set
    mutation_rate = -2.0 #normally: -2
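    # Note: starting at -2.0 and stepping by +0.1 until the exponent reaches 1, the sweep below
    # covers mutation rates from roughly 10**-1.9 (~0.013) up to 10**1.0 (10), i.e. about 30
    # log-spaced values (the exact count depends on floating-point rounding).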


    #Define Numpy construct to sideways join arrays (glue columns together)
    #Taken from: http://stackoverflow.com/questions/5355744/numpy-joining-structured-arrays
    #def join_struct_arrays(arrays):
    #    sizes = np.array([a.itemsize for a in arrays])
    #    offsets = np.r_[0, sizes.cumsum()]
    #    n = len(arrays[0])
    #    joint = np.empty((n, offsets[-1]), dtype=np.int32)
    #    for a, size, offset in zip(arrays, sizes, offsets):
    #        joint[:,offset:offset+size] = a.view(np.int32).reshape(n,size)
    #    dtype = sum((a.dtype.descr for a in arrays), [])
    #    return joint.ravel().view(dtype)
    #Test join_struct_arrays:
    #a = np.array([[1, 2], [11, 22],  [111, 222]]).astype(np.int32);
    #b = np.array([[3, 4], [33, 44],  [333, 444]]).astype(np.int32);
    #c = np.array([[5, 6], [55, 66],  [555, 666]]).astype(np.int32);
    #print "Test join_struct_arrays:"
    #print join_struct_arrays([a, b, c]) #FAILED
    #Set up PYTABLES
    #class GAGenome(IsDescription):
        #gen_id = Int32Col()
        #fitness_val = Float32Col()
        #genome = StringCol(mByteLengthGenome)
        #last_nr_mutations = Int32Col() # Contains the Nr of mutations genome underwent during this generation
        #mother_id = Int32Col() # Contains the crossover "mother"
        #father_id = Int32Col()  # Contains the crossover "father" (empty if no crossing over)
        #assembledgrid      = StringCol(DimGridX*DimGridY)   # 16-character String
    #class GAGenerations(IsDescription):
    #    nr_generations = Int32Col()
    #    nr_genomes = Int32Col()
    #    mutation_rate = Float32Col() # Contains the Nr of mutations genome underwent during this generation

    #from datetime import datetime

    #filename = "fujiama_"+str(NrGenomes)+"_"+str(RateMutation)+"_"+".h5"
    #print filename
    #h5file = openFile(filename, mode = "w", title = "GA FILE")
    #group = h5file.createGroup("/", 'fujiama_ga', 'Fujiama Genetic Algorithm output')
    #table = h5file.createTable(group, 'GaGenerations', GAGenerations, "Raw data")
    #atom = Atom.from_dtype(np.float32)

    #Initialise File I/O
    FILE = open("fujiamakernel_nrgen-" + str(NrGenomes) + "_adaptation.plot", "w")

    #ADAPTED FOR TESTING HISTOGRAM
    #TestValues = [13,24,26,31,32,14]
    #print np.histogram(TestValues,  bins=[0, 24, 32])[0][1]
    #quit()

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()


    while mutation_rate < 1: # normally: 1

        #ds = h5file.createArray(f.root, 'ga_raw_'+str(mutation_rate), atom, x.shape)
        mutation_rate += 0.1
        GAParams[0]  = 10.0 ** mutation_rate
        drv.memcpy_htod(GAParams_h[0], GAParams)
        print "Mutation rate: ", GAParams[0]

        #ADAPTED: Initialise global memory (absolutely necessary!!)
        drv.memcpy_htod(Genomes_h, Genomes)
        drv.memcpy_htod(FitnessValues_h, FitnessValues)
        drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
        drv.memcpy_htod(Mutexe_h, Mutexe)

        #execute kernels for specified number of generations
        start.record()

        biggest_fit = 0
        reprange = 100
        average_breakup = np.zeros((reprange)).astype(np.float32)
        
        for rep in range(0, reprange):
            breakup_generation = GlobalParamsDict["NrGenerations"]
            dontcount = 0
            #ADAPTED: Initialise global memory (absolutely necessary!!)
            drv.memcpy_htod(Genomes_h, Genomes)
            drv.memcpy_htod(FitnessValues_h, FitnessValues)
            drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
            drv.memcpy_htod(Mutexe_h, Mutexe)

            #execute kernels for specified number of generations
            start.record()
            for gen in range(0, GlobalParamsDict["NrGenerations"]):
                #print "Processing Generation: %d"%(gen)

                #Launch CPU processing (should be asynchroneous calls)

                sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel
                drv.memcpy_dtoh(FitnessPartialSums, FitnessPartialSums_h) #Copy from Device to Host and finish sorting
                FitnessSumConst = FitnessPartialSums.sum()
                drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory
                #drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const #TEST

                ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) #TEST
                #Note: Fitness Function is here integrated into GA kernel!

                drv.memcpy_dtoh(Genomes_res, Genomes_h) #Copy data from GPU
                drv.memcpy_dtoh(FitnessValues_res, FitnessValues_h)
                #drv.memcpy_dtoh(AssembledGrids_res, AssembledGrids_h) #Takes about as much time as the whole simulation!

                #print FitnessValues_res

                #maxxie = FitnessValues_res.max()
                #if maxxie > biggest_fit:
                #    biggest_fit = maxxie

                #print "max fitness:", maxxie
                #if maxxie >= 25.0 and breakup_generation == -1:
                if np.histogram(FitnessValues_res, (0, 24, 32))[0][1] >= NrGenomes/2:
                    breakup_generation = gen
                    break
                # else:
                #    breakup_generation = -1
                #if FitnessValues[NrGenomes]  == float(1):
                #    breakup_generation = i
                #    break
                #else:
                #    breakup_generation = -1

                #maxxie = FitnessValues_res.max()
                #if maxxie >= 30:
                #    print "Max fitness value: ", FitnessValues_res.max()
                #ds[:]  = FitnessValues #join_struct_arrays(Genomes,  FitnessValues,  AssembledGrids);
                #trow = table.row
                #trow['nr_generations'] = NrGenerations
                #trow['nr_genomes'] = NrGenomes
                #trow['mutation_rate'] = mutation_rate
                #trow.append()
                #trow.flush()

            stop.record()
            stop.synchronize()
            print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
            print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / breakup_generation)
            print "Discovery time (generations) for mutation rate %f: %d"%(GAParams[0],  breakup_generation)
            #print "Max:", biggest_fit

            #TEST MODE
            #print "Printing all FitnessValues now:"
            #print FitnessValues_res
            #raw_input("Next redundancy with keystroke!...");

            #if breakup_generation==0:
            #    print FitnessValues_res
            #    print "Genomes: "
            #    print Genomes_res
            average_breakup[rep] = breakup_generation / reprange
            #if breakup_generation == -1:
            #    dontcount = 1
            #    break

        #if dontcount == 1:
        #    average_breakup.fill(20000)
        FILE.write( str(GAParams[0]) + " " + str(np.median(average_breakup)) + " " + str(np.std(average_breakup)) + "\n");
        FILE.flush()

    #Clean-up pytables
    #h5file.close()
    #Clean up File I/O
    FILE.close()


def main():
    #Set up global timer
    tot_time = time.time()

    #Define constants
    BankSize = 8 # Do not go beyond 8!
    WarpSize = 32 #Do not change...
    DimGridX = 19
    DimGridY = 19
    BlockDimX = 256
    BlockDimY = 256
    SearchSpaceSize = 2**24 #BlockDimX * BlockDimY  * 32
    FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
    GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize
    AlignedByteLengthGenome = 4

    #Create dictionary argument for rendering
    RenderArgs= {"safe_memory_mapping":1,
                 "aligned_byte_length_genome":AlignedByteLengthGenome,
                 "bit_length_edge_type":3,
                 "curand_nr_threads_per_block":32,
                 "nr_tile_types":2,
                 "nr_edge_types":8,
                 "warpsize":WarpSize,
                 "fit_dim_thread_x":32*BankSize,
                 "fit_dim_thread_y":1,
                 "fit_dim_block_x":BlockDimX,
                 "fit_dim_grid_x":19,
                 "fit_dim_grid_y":19,
                 "fit_nr_four_permutations":24,
                 "fit_length_movelist":244,
                 "fit_nr_redundancy_grid_depth":2,
                 "fit_nr_redundancy_assemblies":10,
                 "fit_tile_index_starting_tile":0,
                 "glob_nr_tile_orientations":4,
                 "banksize":BankSize,
                 "curand_dim_block_x":BlockDimX
                }
    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))
    # Load source code from file
    Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render( RenderArgs )

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("SearchSpaceKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")


    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32)
    def Delta(a,b):
        if a==b:
            return 1
        else:
            return 0
    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 )
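    # Added note (illustrative): the loop above pairs up consecutive edge types, so
    # InteractionMatrix[i][j] == 1 exactly when {i, j} is one of {1, 2}, {3, 4} or
    # {5, 6}; edge types 0 and 7 bind to nothing. The matrix is exposed to the
    # kernels through the t_ucInteractionMatrix texture below.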

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C")
    print InteractionMatrix

    #Set-up genomes
    #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_size_h = drv.mem_alloc(fitness_size.nbytes)
    #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0)
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize)
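    # Added note (illustrative): with the constants above, one launch covers
    # BlockDimX * BlockDimY * WarpSize = 256 * 256 * 32 = 2097152 genomes, so a
    # search space of 2**24 needs MaxBlockCycles = 8 launches and OffsetBlocks = 0.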
    BlockCounter = 0
    print "Will do that many kernels a ", BlockDimX,"x", BlockDimY,"x ", WarpSize, ":", MaxBlockCycles
    #quit()

    #SET UP PROCESSING
    histo = {}
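    # Added note (illustrative): histo maps a polyomino fitness hash to a tuple
    # (representative assembled grid, number of occurrences); it is filled while
    # the next kernel launch is in flight and printed at the end of main().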
     
    #INITIALISATION
    CurandKernel(curand_h, block=(WarpSize,1,1), grid=(BlockDimX, BlockDimY))
    print "Finished Curand kernel, starting main kernel..."

    #FIRST GENERATION
    proc_time = time.time()
    print "Starting first generation..."
    start = drv.Event()
    stop = drv.Event()
    start.record()
    Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64(0), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY))
    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)    
    print "Copying..."
    drv.memcpy_dtoh(fitness_size, fitness_size_h)
    drv.memcpy_dtoh(fitness_hash, fitness_hash_h)
    drv.memcpy_dtoh(grids, grids_h)

    #INTERMEDIATE GENERATION
    for i in range(MaxBlockCycles-1):
        print "Starting generation: ", i+1
        start = drv.Event()
        stop = drv.Event()
        start.record()
        Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64((i+1)*BlockDimX*BlockDimY*WarpSize), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY))
        "Processing..."
        for j in range(grids.shape[0]):
#            if (fitness_hash[j]!=33) and (fitness_hash[j]!=44) and (fitness_hash[j]!=22):
            if fitness_hash[j] in histo: 
                histo[fitness_hash[j]] = (histo[fitness_hash[j]][0], histo[fitness_hash[j]][1]+1)
            else:
                histo[fitness_hash[j]] = (grids[j], 1)

        stop.record()
        stop.synchronize()
        print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
        print "This corresponds to %f polyomino classification a second."%((BlockDimX*BlockDimY*WarpSize)/(start.time_till(stop)*1e-3))
        print "Copying..."
        drv.memcpy_dtoh(fitness_size, fitness_size_h)
        drv.memcpy_dtoh(fitness_hash, fitness_hash_h)
        drv.memcpy_dtoh(grids, grids_h)

    #FINAL PROCESSING
    "Processing..."
    for i in range(grids.shape[0]):
        if fitness_hash[i] in histo:
            histo[fitness_hash[i]] = (histo[fitness_hash[i]][0], histo[fitness_hash[i]][1]+1)
        else:
            histo[fitness_hash[i]] = (grids[i], 1)

    print "Done!"

    #TIMING RESULTS
    print "Total time including set-up: ", (time.time() - tot_time)
    print "Total Processing time: ", (time.time() - proc_time)

    #OUTPUT    
    print histo
Ejemplo n.º 33
0
Archivo: ds2.py Proyecto: haehn/sandbox
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

  if ((x >= width) || (y >= height)) {
    return;
  }

  data[x * width + y] = tex2D(tex, x, y);

}
""")

downsample_func = downsample.get_function("downsample")
texref = downsample.get_texref("tex")

cuda.matrix_to_texref(img, texref, order="C")
# texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
# texref.set_filter_mode(cuda.filter_mode.LINEAR)

gpu_output = np.zeros_like(img, dtype=np.int16)

blocksize = (16,16,1)
gridsize = (width / blocksize[0], height / blocksize[1])

print 'blocksize', blocksize
print 'gridsize', gridsize

downsample_func(cuda.Out(gpu_output), np.int32(width), np.int32(height), block=blocksize, grid = gridsize, texrefs=[texref])


gpu_output = gpu_output.transpose()
Ejemplo n.º 34
0
    def __init__(self, img_path):
        super(LFapplication, self).__init__()

        #
        # Load image data
        #
        base_path = os.path.splitext(img_path)[0]
        lenslet_path = base_path + '-lenslet.txt'
        optics_path = base_path + '-optics.txt'

        with open(lenslet_path, 'r') as f:
            tmp = eval(f.readline())
            x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \
                     np.array(tmp, dtype=np.float32)

        with open(optics_path, 'r') as f:
            for line in f:
                name, val = line.strip().split()
                try:
                    setattr(self, name, np.float32(val))
                except:
                    pass

        max_angle = math.atan(self.pitch / 2 / self.flen)

        #
        # Prepare image
        #
        im_pil = Image.open(img_path)
        if im_pil.mode == 'RGB':
            self.NCHANNELS = 3
            w, h = im_pil.size
            im = np.zeros((h, w, 4), dtype=np.float32)
            im[:, :, :3] = np.array(im_pil).astype(np.float32)
            self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx), 3)
        else:
            self.NCHANNELS = 1
            im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype(
                np.float32)
            h, w = im.shape
            self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx))

        x_start = x_offset - int(x_offset / right_dx) * right_dx
        y_start = y_offset - int(y_offset / down_dy) * down_dy
        x_ratio = self.flen * right_dx / self.pitch
        y_ratio = self.flen * down_dy / self.pitch

        #
        # Generate the cuda kernel
        #
        mod_LFview = pycuda.compiler.SourceModule(
            _kernel_tpl.render(newiw=self.LF_dim[1],
                               newih=self.LF_dim[0],
                               oldiw=w,
                               oldih=h,
                               x_start=x_start,
                               y_start=y_start,
                               x_ratio=x_ratio,
                               y_ratio=y_ratio,
                               x_step=right_dx,
                               y_step=down_dy,
                               NCHANNELS=self.NCHANNELS))

        self.LFview_func = mod_LFview.get_function("LFview_kernel")
        self.texref = mod_LFview.get_texref("tex")

        #
        # Now generate the cuda texture
        #
        if self.NCHANNELS == 3:
            cuda.bind_array_to_texref(
                cuda.make_multichannel_2d_array(im, order="C"), self.texref)
        else:
            cuda.matrix_to_texref(im, self.texref, order="C")

        #
        # We could set the next if we wanted to address the image
        # in normalized coordinates ( 0 <= coordinate < 1.)
        # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        #
        self.texref.set_filter_mode(cuda.filter_mode.LINEAR)

        #
        # Prepare the traits
        #
        self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0))
        self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0))

        self.plotdata = ArrayPlotData(LF_img=self.sampleLF())
        self.LF_img = Plot(self.plotdata)
        if self.NCHANNELS == 3:
            self.LF_img.img_plot("LF_img")
        else:
            self.LF_img.img_plot("LF_img", colormap=gray)
Ejemplo n.º 35
0
    def run (self) :
        cuda.init()
        self.cuda_dev = cuda.Device(0)
        self.cuda_context = self.cuda_dev.make_context()

#        print 'Loading matrix...'
#        npz = np.load(self.MATRIX_FILE)
#        station_coords = npz['station_coords']
#        grid_dim       = npz['grid_dim']
#        matrix         = npz['matrix']
        station_coords  = self.station_coords
        grid_dim        = self.grid_dim
        matrix          = self.matrix
        nearby_stations = self.nearby_stations    

        # EVERYTHING SHOULD BE IN FLOAT32 for ease of debugging. even times.
        # Matrix and others should be textures, arrays, or in constant memory, to do caching.
        
        # make matrix symmetric before converting to int32. this avoids halving the pseudo-infinity value.
        matrix = (matrix + matrix.T) / 2
       #print np.where(matrix == np.inf)
        matrix[matrix == np.inf] = 99999999
        # nan == nan is False because any comparison involving nan is False!
        # must use the specific isnan function. however, inf works like a normal number.
        matrix[np.isnan(matrix)] = 99999999
        #matrix[matrix >= 60 * 60 * 3] = 0
        matrix = matrix.astype(np.int32)
        #matrix += 60 * 5
        #matrix = np.nan_to_num(matrix)
        print matrix
        
        # force OD matrix symmetry for test
        # THIS was responsible for the coordinate drift!!!
        # need to symmetrize it before copy to device
#        matrix = (matrix + matrix.T) / 2
#        print matrix

        #print np.any(matrix == np.nan)
        #print np.any(matrix == np.inf)

        station_coords_int = station_coords.round().astype(np.int32)

        # to be removed when textures are working
        station_coords_gpu = gpuarray.to_gpu(station_coords_int)
        matrix_gpu = gpuarray.to_gpu(matrix)

        max_x, max_y = grid_dim
        n_gridpoints = int(max_x * max_y)
        n_stations   = len(station_coords)

        cuda_grid_shape = ( int( math.ceil( float(max_x)/CUDA_BLOCK_SHAPE[0] ) ), int( math.ceil( float(max_y)/CUDA_BLOCK_SHAPE[1] ) ) )

        print "\n----PARAMETERS----"
        print "Input file:            ", self.MATRIX_FILE
        print "Number of stations:    ", n_stations
        print "OD matrix shape:       ", matrix.shape    
        print "Station coords shape:  ", station_coords_int.shape 
        print "Station cache size:    ", self.N_NEARBY_STATIONS
        print "Map dimensions:        ", grid_dim
        print "Number of map points:  ", n_gridpoints
        print "Target space dimensionality: ", self.DIMENSIONS
        print "CUDA block dimensions: ", CUDA_BLOCK_SHAPE 
        print "CUDA grid dimensions:  ", cuda_grid_shape

        assert station_coords.shape == (n_stations, 2)
        assert self.N_NEARBY_STATIONS <= n_stations
        
        #sys.exit()

        # Make and register custom color map for pylab graphs

        cdict = {'red':   ((0.0,  0.0, 0.0),
                           (0.2,  0.0, 0.0),
                           (0.4,  0.9, 0.9),
                           (1.0,  0.0, 0.0)),

                 'green': ((0.0,  0.0, 0.1),
                           (0.05, 0.9, 0.9),
                           (0.1,  0.0, 0.0),
                           (0.4,  0.9, 0.9),
                           (0.6,  0.0, 0.0),
                           (1.0,  0.0, 0.0)),

                 'blue':  ((0.0,  0.0, 0.0),
                           (0.05, 0.0, 0.0),
                           (0.2,  0.9, 0.9),
                           (0.3,  0.0, 0.0),
                           (1.0,  0.0, 0.0))}

        mymap = LinearSegmentedColormap('mymap', cdict)
        mymap.set_over( (1.0, 0.0, 1.0) )
        mymap.set_bad ( (0.0, 0.0, 0.0) )
        #pl.plt.register_cmap(cmap=mymap)

        # set up arrays for calculations

        coords_gpu        = gpuarray.to_gpu(np.random.random( (max_x, max_y, self.DIMENSIONS) ).astype(np.float32))   # initialize coordinates to random values in range 0...1
        forces_gpu        = gpuarray.zeros( (int(max_x), int(max_y), self.DIMENSIONS), dtype=np.float32 )             # 3D float32 accumulate forces over one timestep
        weights_gpu       = gpuarray.zeros( (int(max_x), int(max_y)),                  dtype=np.float32 )             # 2D float32 cell error accumulation
        errors_gpu        = gpuarray.zeros( (int(max_x), int(max_y)),                  dtype=np.float32 )             # 2D float32 cell error accumulation
        #near_stations_gpu = gpuarray.zeros( (int(max_x), int(max_y), self.N_NEARBY_STATIONS, 2), dtype=np.int32)
        # instead of using synthetic distances, use the network distance near stations lists.         
        # rather than copying the array over to the GPU then binding to external texref, 
        # could just use matrix_to_texref, but this function only seems to understand 2d arrays
        near_stations_gpu = gpuarray.to_gpu( nearby_stations )

        debug_gpu     = gpuarray.zeros( n_gridpoints, dtype = np.int32 )
        debug_img_gpu = gpuarray.zeros_like( errors_gpu )

        print "\n----COMPILATION----"
        # times could be merged into forces kernel, if done by pixel not station.
        # integrate kernel could be GPUArray operation; also helps clean up code by using GPUArrays.
        # DIM should be replaced by python script, so as not to define twice. 

        replacements = [( 'N_NEAR_STATIONS', self.N_NEARBY_STATIONS ),
                        ( 'N_STATIONS',      n_stations ),
                        ( 'DIM',             self.DIMENSIONS)]
        src = preprocess_cu('unified_mds_stochastic.cu', replacements)
        #print src
        mod = SourceModule(src, options=["--ptxas-options=-v"])
        stations_kernel  = mod.get_function("stations"  )
        forces_kernel    = mod.get_function("forces"  )
        integrate_kernel = mod.get_function("integrate")

        matrix_texref         = mod.get_texref('tex_matrix')
        station_coords_texref = mod.get_texref('tex_station_coords')
        near_stations_texref  = mod.get_texref('tex_near_stations')
        #ts_coords_texref  = mod.get_texref('tex_ts_coords') could be a 4-channel 2 dim texture, or 3 dim texture. or just 1D.

        cuda.matrix_to_texref(matrix, matrix_texref, order="F") # copy directly to device with texref - made for 2D x 1channel textures
        cuda.matrix_to_texref(station_coords_int, station_coords_texref, order="F") # fortran ordering, because we will be accessing with texND() instead of C-style indices
        # again, matrix_to_texref is not used here because that function only understands 2D arrays
        near_stations_gpu.bind_to_texref_ext(near_stations_texref)

        # note, cuda.In and cuda.Out are from the perspective of the KERNEL not the host app!
        # stations_kernel disabled since true network distances are now being used        
        #stations_kernel(near_stations_gpu, max_x, max_y, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)    
        # autoinit.context.synchronize()
        #self.cuda_context.synchronize() 
        
        #print "Near stations list:"
        #print near_stations_gpu
        
        #sys.exit()
        
        print "\n----CALCULATION----"
        t_start = time.time()
        n_pass = 0
        active_cells = list(self.active_cells) # make a working list of map cells that are accessible
        print "N active cells:", len(active_cells)
        while (n_pass < self.N_ITERATIONS) :
            n_pass += 1    
            random.shuffle(active_cells)
            active_cells_gpu = gpuarray.to_gpu(np.array(active_cells).astype(np.int32))
            # Pay attention to grid sizes when testing: if you don't run the integrator on the coordinates connected to stations, 
            # they don't move... so the whole thing stabilizes in a couple of cycles.    
            # Stations are worked on in blocks to avoid locking up the GPU with one giant kernel.
#            for subset_low in range(0, n_stations, self.STATION_BLOCK_SIZE) :
            for subset_low in range(1) : # changed to try integrating more often
                subset_high = subset_low + self.STATION_BLOCK_SIZE
                if subset_high > n_stations : subset_high = n_stations
                sys.stdout.write( "\rpass %03i / station %04i of %04i / total runtime %03.1f min " % (n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0) )
                sys.stdout.flush()
                self.emit(QtCore.SIGNAL( 'outputProgress(int, int, int, float, float)' ),
                      n_pass, subset_high, n_stations, 
                      (time.time() - t_start) / 60.0, (time.time() - t_start) / n_pass + (subset_low/n_stations) )
                
                # adding texrefs in kernel call seems to change nothing, leaving them out.
                # max_x and max_y could be #defined in kernel source, along with STATION_BLOCK_SIZE 

                forces_kernel(np.int32(n_stations), np.int32(subset_low), np.int32(subset_high), max_x, max_y, active_cells_gpu, coords_gpu, forces_gpu, weights_gpu, errors_gpu, debug_gpu, debug_img_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
                #autoinit.context.synchronize()
                self.cuda_context.synchronize()
                
                # show a sample of the results
                #print coords_gpu.get() [00:10,00:10]
                #print forces_gpu.get() [00:10,00:10]
                #print weights_gpu.get()[00:10,00:10]
                time.sleep(0.05)  # let the OS GUI use the GPU for a bit.
                
                #pl.imshow( (debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom')#, vmin=0, vmax=100 )
                #pl.title( 'Debugging Output - step %03d' %n_pass )
                #pl.colorbar()
                #pl.savefig( 'img/debug%03d.png' % n_pass )
                #pl.close()

# why was this indented? shouldn't it be integrated only after all stations are taken into account?
            integrate_kernel(max_x, max_y, coords_gpu, forces_gpu, weights_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)    
            self.cuda_context.synchronize()

            if (self.IMAGES_EVERY > 0) and (n_pass % self.IMAGES_EVERY == 0) :
                #print 'Kernel debug output:'
                #print debug_gpu
                
#                velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis = 2)) 
#                png_f = open('img/vel%03d.png' % n_pass, 'wb')
#                png_w = png.Writer(max_x, max_y, greyscale = True, bitdepth=8)
#                png_w.write(png_f, velocities / 1200)
#                png_f.close()

#                np.set_printoptions(threshold=np.nan)
#                print velocities.astype(np.int32)

#                pl.imshow( velocities.T, origin='bottom') #, vmin=0, vmax=100 )
#                pl.title( 'Velocity ( sec / timestep) - step %03d' % n_pass )
#                pl.colorbar()
#                pl.savefig( 'img/vel%03d.png' % n_pass )
#                plt.close()
#                
#                pl.imshow( (errors_gpu.get() / weights_gpu.get() / 60.0 ).T, cmap=mymap, origin='bottom') #, vmin=0, vmax=100 )
#                pl.title( 'Average absolute error (min) - step %03d' %n_pass )
#                pl.colorbar()
#                pl.savefig( 'img/err%03d.png' % n_pass )
#                pl.close()

#                pl.imshow( (debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom') #, vmin=0, vmax=100 )
#                pl.title( 'Debugging Output - step %03d' %n_pass )
#                pl.colorbar()
#                pl.savefig( 'img/debug%03d.png' % n_pass )
#                pl.close()
                
                #self.emit( QtCore.SIGNAL( 'outputImage(QString)' ), QtCore.QString('img/err%03d.png' % n_pass) )
                #self.emit( QtCore.SIGNAL( 'outputImage(QImage)' ), numpy2qimage( (errors_gpu.get() / weights_gpu.get() / 60.0 / 30 * 255 ).astype(np.uint8) ) )
                velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis = 2))
                velocities /= 15. # out of 15 sec range
                velocities *= 255
                np.clip(velocities, 0, 255, velocities)  
                velImage = numpy2qimage(velocities.astype(np.uint8)).transformed(QtGui.QMatrix().rotate(-90))
                
#                errors = np.sqrt(errors_gpu.get() / weights_gpu.get())
                e = np.sum(np.nan_to_num(errors_gpu.get())) / np.sum(np.nan_to_num(weights_gpu.get()))
                print "average error (sec) over all active cells:", e
                errors = errors_gpu.get() / weights_gpu.get() # average instead of RMS error 
                errors /= 60.
                errors /= 15. # out of 15 min range
                errors *= 255
                np.clip(errors, 0, 255, errors)  
                errImage = numpy2qimage(errors.astype(np.uint8)).transformed(QtGui.QMatrix().rotate(-90))
                
                self.emit( QtCore.SIGNAL( 'outputImage(QImage, QImage)' ), errImage, velImage )
                velImage.save('img/vel%03d.png' % n_pass, 'png' )
                errImage.save('img/err%03d.png' % n_pass, 'png' )            
  
            sys.stdout.write( "/ avg pass time %02.1f sec" % ( (time.time() - t_start) / n_pass, ) )
            sys.stdout.flush()

        #end of main loop
        np.save('result.npy', coords_gpu.get())
Ejemplo n.º 36
0
    def __init__(self, img_path):
        super(LFapplication, self).__init__()

        #
        # Load image data
        #
        base_path = os.path.splitext(img_path)[0]
        lenslet_path = base_path + '-lenslet.txt'
        optics_path = base_path + '-optics.txt'

        with open(lenslet_path, 'r') as f:
            tmp = eval(f.readline())
            x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \
                np.array(tmp, dtype=np.float32)

        with open(optics_path, 'r') as f:
            for line in f:
                name, val = line.strip().split()
                try:
                    setattr(self, name, np.float32(val))
                except:
                    pass

        max_angle = math.atan(self.pitch/2/self.flen)

        #
        # Prepare image
        #
        im_pil = Image.open(img_path)
        if im_pil.mode == 'RGB':
            self.NCHANNELS = 3
            w, h = im_pil.size
            im = np.zeros((h, w, 4), dtype=np.float32)
            im[:, :, :3] = np.array(im_pil).astype(np.float32)
            self.LF_dim = (ceil(h/down_dy), ceil(w/right_dx), 3)
        else:
            self.NCHANNELS = 1
            im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype(np.float32)
            h, w = im.shape
            self.LF_dim = (ceil(h/down_dy), ceil(w/right_dx))

        x_start = x_offset - int(x_offset / right_dx) * right_dx
        y_start = y_offset - int(y_offset / down_dy) * down_dy
        x_ratio = self.flen * right_dx / self.pitch
        y_ratio = self.flen * down_dy / self.pitch

        #
        # Generate the cuda kernel
        #
        mod_LFview = pycuda.compiler.SourceModule(
            _kernel_tpl.render(
                newiw=self.LF_dim[1],
                newih=self.LF_dim[0],
                oldiw=w,
                oldih=h,
                x_start=x_start,
                y_start=y_start,
                x_ratio=x_ratio,
                y_ratio=y_ratio,
                x_step=right_dx,
                y_step=down_dy,
                NCHANNELS=self.NCHANNELS
                )
            )
        
        self.LFview_func = mod_LFview.get_function("LFview_kernel")
        self.texref = mod_LFview.get_texref("tex")
        
        #
        # Now generate the cuda texture
        #
        if self.NCHANNELS == 3:
            cuda.bind_array_to_texref(
                cuda.make_multichannel_2d_array(im, order="C"),
                self.texref
                )
        else:
            cuda.matrix_to_texref(im, self.texref, order="C")
            
        #
        # We could set the next if we wanted to address the image
        # in normalized coordinates ( 0 <= coordinate < 1.)
        # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        #
        self.texref.set_filter_mode(cuda.filter_mode.LINEAR)

        #
        # Prepare the traits
        #
        self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0))
        self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0))
	
        self.plotdata = ArrayPlotData(LF_img=self.sampleLF())
        self.LF_img = Plot(self.plotdata)
        if self.NCHANNELS == 3:
            self.LF_img.img_plot("LF_img")
        else:
            self.LF_img.img_plot("LF_img", colormap=gray)
Ejemplo n.º 37
0
  }
  i += blockDim.x * gridDim.x;
  }
}
""")

########
#get the kernel
########
copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")

#########
#Map the Kernel to texture object
#########
texref = mod_copy_texture.get_texref("tex")
cuda.matrix_to_texref(realrow, texref, order="C")

#texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
#texref.set_filter_mode()

A=5
gpu_output = np.zeros_like(realrow)
tic()
copy_texture_func(cuda.In(np.float32([M, N, A])), cuda.Out(gpu_output), block=(32, 32, 1), grid=(M/32, N/32, 1), texrefs=[texref])
print "time        ",toc()

print "Output"
print gpu_output

p.gray()
p.subplot(1,2,1)
Ejemplo n.º 38
0
def trikmeans_gpu(data, clusters, iterations, return_times=0):
    """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    
    K-means using triangle inequality algorithm and PyCuda
    Input arguments are the data, initial cluster values, and number of iterations to repeat.
    The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
        nPts = number of data points.
    The shape of clusters is (nDim, nClusters) 
    
    The return values are the updated clusters and labels for the data
    """

    #---------------------------------------------------------------
    #                   get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    #            set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0

    usePageLockedMemory = 0

    if (nPts > 32768):
        useTextureForData = 0

    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16 * (1 + (nClusters - 1) / 16))
    gridsize_ccdist = 1 + (nClusters - 1) / blocksize_ccdist
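    # Added note (illustrative): this rounds nClusters up to a multiple of 16 and
    # caps it at 512, e.g. nClusters = 20 gives blocksize_ccdist = 32 and
    # gridsize_ccdist = 1.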

    #block and grid sizes for the init module
    threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16)
    #blocksize_init = min(512, threads_desired)
    blocksize_init = min(128, threads_desired)
    gridsize_init = 1 + (threads_desired - 1) / blocksize_init

    #block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    if not useTextureForData:
        blocksize_step3 = min(256, blocksize_step3)
    gridsize_step3 = gridsize_init

    #block and grid sizes for the step4 module
    # Each block of threads will handle seqcount times the data
    # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements
    blocksize_step4 = 2
    while (blocksize_step4 < min(512, nPts)):
        blocksize_step4 *= 2
    maxblocks = 512
    seqcount_step4 = 1 + (nPts - 1) / (blocksize_step4 * maxblocks)
    gridsize_step4 = 1 + (nPts - 1) / (seqcount_step4 * blocksize_step4)

    blocksize_step4part2 = 1
    while (blocksize_step4part2 < gridsize_step4):
        blocksize_step4part2 *= 2
    """
    print "blocksize_step4 =", blocksize_step4
    print "gridsize_step4 =", gridsize_step4
    print "seqcount_step4 =", seqcount_step4
    """

    #block and grid sizes for the calc_movement module
    for blocksize_calcm in range(32, 512, 32):
        if blocksize_calcm >= nClusters:
            break
    gridsize_calcm = 1 + (nClusters - 1) / blocksize_calcm

    #block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init

    #---------------------------------------------------------------
    #                    prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()

    mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters, blocksize_step4,
                                        seqcount_step4, gridsize_step4,
                                        blocksize_step4part2,
                                        useTextureForData, BOUNDS)

    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    step4part2 = mod_ccdist.get_function("step4part2")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2 - t1

    #---------------------------------------------------------------
    #                    setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:, :] = data
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:, :] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)

    gpu_assignments = gpuarray.zeros((nPts, ), np.int32)  # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts),
                               np.float32)  # lower bounds on distance between
    # point and each cluster
    gpu_upper = gpuarray.zeros((nPts, ),
                               np.float32)  # upper bounds on distance between
    # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters),
                                np.float32)  # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters, ),
                                   np.float32)  # half distance to closest
    gpu_hdClosest.fill(
        1.0e10)  # set to large value // **TODO** get the actual float max
    gpu_badUpper = gpuarray.zeros(
        (nPts, ), np.int32)  # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_cluster_movement = gpuarray.zeros((nClusters, ), np.float32)

    gpu_cluster_changed = gpuarray.zeros((nClusters, ), np.int32)
    gpu_cluster_changed.fill(1)

    gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_step4),
                                       np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_step4, ),
                                          np.int32)

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2 - t1

    #---------------------------------------------------------------
    #                    do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.

    t1 = time.time()
    ccdist(gpu_clusters,
           gpu_ccdist,
           gpu_hdClosest,
           block=(blocksize_ccdist, 1, 1),
           grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2 - t1

    t1 = time.time()
    calc_hdclosest(gpu_ccdist,
                   gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1),
                   grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2 - t1

    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters,
             gpu_ccdist,
             gpu_hdClosest,
             gpu_assignments,
             gpu_lower,
             gpu_upper,
             block=(blocksize_init, 1, 1),
             grid=(gridsize_init, 1),
             texrefs=[texrefData])
    else:
        init(gpu_data,
             gpu_clusters,
             gpu_ccdist,
             gpu_hdClosest,
             gpu_assignments,
             gpu_lower,
             gpu_upper,
             block=(blocksize_init, 1, 1),
             grid=(gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2 - t1

    for i in range(iterations):

        if i > 0:
            t1 = time.time()
            ccdist(gpu_clusters,
                   gpu_ccdist,
                   gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1),
                   grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2 - t1

            t1 = time.time()
            calc_hdclosest(gpu_ccdist,
                           gpu_hdClosest,
                           block=(blocksize_ccdist, 1, 1),
                           grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2 - t1

        t1 = time.time()
        if i > 0:
            gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters,
                  gpu_ccdist,
                  gpu_hdClosest,
                  gpu_assignments,
                  gpu_lower,
                  gpu_upper,
                  gpu_badUpper,
                  gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1),
                  grid=(gridsize_step3, 1),
                  texrefs=[texrefData])
        else:
            step3(gpu_data,
                  gpu_clusters,
                  gpu_ccdist,
                  gpu_hdClosest,
                  gpu_assignments,
                  gpu_lower,
                  gpu_upper,
                  gpu_badUpper,
                  gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1),
                  grid=(gridsize_step3, 1))

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2 - t1

        t1 = time.time()

        if useTextureForData:
            step4(gpu_cluster_changed,
                  gpu_reduction_out,
                  gpu_reduction_counts,
                  gpu_assignments,
                  block=(blocksize_step4, 1, 1),
                  grid=(gridsize_step4, nDim),
                  texrefs=[texrefData])
        else:
            step4(gpu_data,
                  gpu_cluster_changed,
                  gpu_reduction_out,
                  gpu_reduction_counts,
                  gpu_assignments,
                  block=(blocksize_step4, 1, 1),
                  grid=(gridsize_step4, nDim))

        step4part2(gpu_cluster_changed,
                   gpu_reduction_out,
                   gpu_reduction_counts,
                   gpu_clusters2,
                   gpu_clusters,
                   block=(blocksize_step4part2, 1, 1),
                   grid=(1, nDim))

        calc_movement(gpu_clusters,
                      gpu_clusters2,
                      gpu_cluster_movement,
                      gpu_cluster_changed,
                      block=(blocksize_calcm, 1, 1),
                      grid=(gridsize_calcm, 1))

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2 - t1

        t1 = time.time()
        if useTextureForData:
            step56(gpu_assignments,
                   gpu_lower,
                   gpu_upper,
                   gpu_cluster_movement,
                   gpu_badUpper,
                   block=(blocksize_step56, 1, 1),
                   grid=(gridsize_step56, 1),
                   texrefs=[texrefData])
        else:
            step56(gpu_assignments,
                   gpu_lower,
                   gpu_upper,
                   gpu_cluster_movement,
                   gpu_badUpper,
                   block=(blocksize_step56, 1, 1),
                   grid=(gridsize_step56, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2 - t1

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
                gpu_clusters.get(), gpu_cluster_movement, \
                data_time, module_time, init_time, \
                ccdist_time/iterations, hdclosest_time/iterations, \
                step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
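
# Illustrative usage sketch (added, not part of the original project): drive
# trikmeans_gpu with synthetic data. Shapes follow the docstring above --
# data is (nDim, nPts), clusters is (nDim, nClusters); picking random data
# points as the initial clusters is an assumption made for this example.
if __name__ == "__main__":
    nDim, nPts, nClusters = 2, 10000, 8
    demo_data = np.random.rand(nDim, nPts).astype(np.float32)
    init_idx = np.random.choice(nPts, nClusters, replace=False)
    init_clusters = demo_data[:, init_idx].copy()
    new_clusters, labels = trikmeans_gpu(demo_data, init_clusters, iterations=10)
    print "final cluster centres:"
    print new_clusters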
Ejemplo n.º 39
0
import numpy

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

realrow = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0],
                      dtype=numpy.float32).reshape(1, 5)

mod_copy_texture = SourceModule("""
texture<float, 1> tex;
texture<float, 1> tex2;

__global__ void copy_texture_kernel(float * data) {
    int ty=threadIdx.y;
    //data[ty] = tex1D(tex, (float)(ty));
    data[ty] = tex1D(tex, (float)(ty)/2.0f);
}
""")

copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
texref = mod_copy_texture.get_texref("tex")
tex2ref = mod_copy_texture.get_texref("tex2")

cuda.matrix_to_texref(realrow, texref, order="C")

texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(cuda.filter_mode.LINEAR)
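# Added note (illustrative): TRSF_NORMALIZED_COORDINATES makes the texture
# addressable with coordinates in [0, 1) instead of integer texel indices, and
# LINEAR filtering makes tex1D return a value interpolated between neighbouring
# texels rather than the nearest one.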

gpu_output = numpy.zeros_like(realrow)
copy_texture_func(cuda.Out(gpu_output), block=(1, 1, 1), texrefs=[texref])

print "Output:"
print gpu_output