def reconstruct(opts_path):
    """reconstruct from channel data
    """
    opts = loadOptions(opts_path)
    # normalize paths according to the platform
    opts['extra']['src_dir'] =\
        os.path.expanduser(os.path.normpath(opts['extra']['src_dir']))
    opts['extra']['dest_dir'] =\
        os.path.expanduser(os.path.normpath(opts['extra']['dest_dir']))
    # load data from hdf5 files
    ind = opts['load']['EXP_START']
    if opts['load']['EXP_END'] != -1 and\
       opts['load']['EXP_END'] != ind:
        notifyCli('WARNING: multiple experiments selected. '
                  'Only the first dataset will be processed')
    chn_data, chn_data_3d = load_hdf5_data(
        opts['extra']['dest_dir'], ind)
    if opts['unpack']['Show_Image'] != 0:
        notifyCli('Currently only Show_Image = 0 is supported.')
    # initialize pyCuda environment
    cuda.init()
    dev = cuda.Device(0)
    ctx = dev.make_context()
    reImg = reconstruction_3d(chn_data_3d, opts['recon'])
    ctx.pop()
    del ctx
    save_reconstructed_image(reImg, opts['extra']['dest_dir'],
                             ind, 'tiff', '_3d')
	def gpuFunc(iterator):
	    # 1. Data preparation
            iterator = iter(iterator)
            cpu_data = list(iterator)
            cpu_dataset = " ".join(cpu_data)
            ascii_data = np.asarray([ord(x) for x in cpu_dataset], dtype=np.uint8)

	    # 2. Driver initialization and data transfer
	    cuda.init()
	    dev = cuda.Device(0)
	    contx = dev.make_context()
            gpu_dataset = gpuarray.to_gpu(ascii_data)

	    # 3. GPU kernel.
	    # The kernel's algorithm counts the words by keeping 
	    # track of the space between them
            countkrnl = reduction.ReductionKernel(long, neutral = "0",
            		map_expr = "(a[i] == 32)*(b[i] != 32)",
                        reduce_expr = "a + b", arguments = "char *a, char *b")

            results = countkrnl(gpu_dataset[:-1],gpu_dataset[1:]).get()
            yield results

	    # Release GPU context resources
	    contx.pop() 
	    del gpu_dataset
            del contx
	   
	    gc.collect()            
Esempio n. 3
0
def worker():
    comm = MPI.Comm.Get_parent()
    size = comm.Get_size()
    rank = comm.Get_rank()
    name = MPI.Get_processor_name()

    import pycuda.driver as drv
    drv.init()

    # Find maximum number of available GPUs:
    max_gpus = drv.Device.count()

    # Use modular arithmetic to avoid assigning a nonexistent GPU:
    n = rank % max_gpus
    dev = drv.Device(n)
    ctx = dev.make_context()
    atexit.register(ctx.pop)

    # Execute a kernel:
    import pycuda.gpuarray as gpuarray
    from pycuda.elementwise import ElementwiseKernel
    
    kernel = ElementwiseKernel('double *y, double *x, double a',
                               'y[i] = a*x[i]')
    x_gpu = gpuarray.to_gpu(np.random.rand(2))
    y_gpu = gpuarray.empty_like(x_gpu)
    kernel(y_gpu, x_gpu, np.double(2.0))

    print 'I am process %d of %d on CPU %s using GPU %s of %s [x_gpu=%s, y_gpu=%s]' % \
        (rank, size, name, n, max_gpus, str(x_gpu.get()), str(y_gpu.get()))
    comm.Disconnect()
Esempio n. 4
0
def init_device(device='gpu0'):
  
    if device.startswith('cuda'):
        
        import os
        if 'THEANO_FLAGS' in os.environ:
            raise ValueError('Use theanorc to set the theano config')
        
        os.environ['THEANO_FLAGS'] = 'device={0}'.format(device)
        import theano.gpuarray
        # This is a bit of black magic that may stop working in future
        # theano releases
        ctx = theano.gpuarray.type.get_context(None)
        drv = None
        
    elif device.startswith('gpu'):
        
        gpuid = int(device[-1])

        import pycuda.driver as drv
        drv.init()
        dev = drv.Device(gpuid)
        ctx = dev.make_context()
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
    else:
        drv=None
        ctx=None
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(device)
        import theano
        
    from theano import function, config, shared, sandbox, tensor

    vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
    iters = 1000

    rng = np.random.RandomState(22)
    arr = rng.rand(vlen)

    shared_x = theano.shared(np.asarray(arr, config.floatX))
    shared_xx = theano.shared(np.asarray(arr, config.floatX))
    
    x=tensor.fvector("x")
    # compile a function so that shared_x will be set to part of a computing graph on GPU (CUDAndarray)
    f = function([], tensor.exp(x), givens=[(x,shared_x)]) 
    
    
    if np.any([isinstance(x.op, tensor.Elemwise) and
                  ('Gpu' not in type(x.op).__name__)
                  for x in f.maker.fgraph.toposort()]):
        print('Used the cpu')
    else:
        print('Used the gpu')

    # if np.any([isinstance(x.op, tensor.Elemwise) for x in f.maker.fgraph.toposort()]) and device!='cpu':
    #     raise TypeError('graph not compiled on GPU') 

    return drv,ctx, arr, shared_x, shared_xx
Esempio n. 5
0
def fun_load(config, sock_data=5000):

    send_queue = config['queue_l2t']
    recv_queue = config['queue_t2l']
    # recv_queue and send_queue are multiprocessing.Queue
    # recv_queue is only for receiving
    # send_queue is only for sending

    # if need to do random crop and mirror
    flag_randproc = not config['use_data_layer']
    flag_batch = config['batch_crop_mirror']

    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print 'shared_x information received', shape, dtype
    shape = (3, 255, 255, 256) # TODO remove fix

    gpu_data_remote = gpuarray.GPUArray(shape, dtype,
                                        gpudata=drv.IPCMemoryHandle(h))
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = recv_queue.get()
    print 'img_mean received'

    # The first time, do the set ups and other stuff

    # receive information for loading

    while True:
        # getting the hkl file name to load
        hkl_name = recv_queue.get()

        # print hkl_name
        #data = pickle.load(open(hkl_name)) - img_mean
        data = hkl.load(hkl_name) - img_mean
        # print 'load ', time.time() - bgn_time
        if flag_randproc:
            param_rand = recv_queue.get()

            data = crop_and_mirror(data, param_rand, flag_batch=flag_batch)
        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = recv_queue.get()
        assert msg == 'calc_finished'

        drv.memcpy_peer(gpu_data_remote.ptr,
                        gpu_data.ptr,
                        gpu_data.dtype.itemsize *
                        gpu_data.size,
                        ctx, ctx)

        ctx.synchronize()

        send_queue.put('copy_finished')
Esempio n. 6
0
def _init_gpu(comm):
    """ Chooses a gpu and creates a context on it. """
    # Find out how many GPUs are available to us on this node.
    driver.init()
    num_gpus = driver.Device.count()

    # Figure out the names of the other hosts.
    rank = comm.Get_rank()  # Find out which process I am.
    name = MPI.Get_processor_name()  # The name of my node.
    hosts = comm.allgather(name)  # Get the names of all the other hosts

    # Find out which GPU to take (by precedence).
    gpu_id = hosts[0:rank].count(name)
    if gpu_id >= num_gpus:
        raise TypeError("No GPU available.")

    # Create a context on the appropriate device.
    for k in range(num_gpus):
        try:
            device = driver.Device((gpu_id + k) % num_gpus)
            context = device.make_context()
        except:
            continue
        else:
            #             print "On %s: process %d taking gpu %d of %d.\n" % \
            #                 (name, rank, gpu_id+k, num_gpus)
            break

    return device, context  # Return device and context.
Esempio n. 7
0
def choose_gpu():
    # Find out how many GPUs are available to us on this node.
    drv.init()
    num_gpus = drv.Device.count()

    # Figure out the names of the other hosts.
    rank = MPI.COMM_WORLD.Get_rank() # Find out which process I am.
    name = MPI.Get_processor_name() # The name of my node.
    hosts = MPI.COMM_WORLD.allgather(name) # Get the names of all the other hosts

    # Figure out our precendence on this node.

    # Make sure the number of hosts and processes are equal.
    num_processes = MPI.COMM_WORLD.Get_size()
    if (len(hosts) is not num_processes):
        raise TypeError('Number of hosts and number of processes do not match.')


    # Make sure the name of my node matches.
    if (name != hosts[rank]):
        # print name, hosts[rank]
        raise TypeError('Hostname does not match.')

    # Find out which GPU to take.
    gpu_id = hosts[0:rank].count(name)
    if gpu_id >= num_gpus:
        raise TypeError('No GPU available.')

#     sys.stdout.write("On %s: %d/%d taking gpu %d/%d.\n" % \
#                         (name, rank, num_processes, gpu_id, num_gpus))
    
    # Make and return a context on the device.
    return drv.Device(gpu_id).make_context() 
Esempio n. 8
0
    def __init__(self, options, gpu_id):
        """Initializes the CUDA backend.

        :param options: LBConfig object
        :param gpu_id: number of the GPU to use
        """
        cuda.init()
        self.buffers = {}
        self.arrays = {}
        self._kern_stats = set()
        self.options = options
        self._device = cuda.Device(gpu_id)
        self._ctx = self._device.make_context(
            flags=cuda.ctx_flags.SCHED_AUTO if not options.cuda_sched_yield else
            cuda.ctx_flags.SCHED_YIELD)

        if (options.precision == 'double' and
            self._device.compute_capability()[0] >= 3):
            if hasattr(self._ctx, 'set_shared_config'):
                self._ctx.set_shared_config(cuda.shared_config.EIGHT_BYTE_BANK_SIZE)

        # To keep track of allocated memory.
        self._total_memory_bytes = 0

        self._iteration_kernels = []
Esempio n. 9
0
 def n_blocks(self):
     n_blocks = self.opts.get('n_blocks')
     if n_blocks is None:
         default_threads_per_block = 32
         bytes_per_float = 4
         memory_per_thread = (self._len_species + 1) * bytes_per_float
         if cuda is None:
             threads_per_block = default_threads_per_block
         else:
             cuda.init()
             device = cuda.Device(self.gpu[0])
             attrs = device.get_attributes()
             shared_memory_per_block = attrs[
                 cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK]
             upper_limit_threads_per_block = attrs[
                 cuda.device_attribute.MAX_THREADS_PER_BLOCK]
             max_threads_per_block = min(
                 shared_memory_per_block / memory_per_thread,
                 upper_limit_threads_per_block)
             threads_per_block = min(max_threads_per_block,
                                     default_threads_per_block)
         n_blocks = int(
             np.ceil(1. * len(self.param_values) / threads_per_block))
         self._logger.debug('n_blocks set to {} (used pycuda: {})'.format(
             n_blocks, cuda is not None
         ))
     self.n_blocks = n_blocks
     return n_blocks
Esempio n. 10
0
    def _init_gpu(self):
        """
        Initialize GPU device.

        Notes
        -----
        Must be called from within the `run()` method, not from within
        `__init__()`.
        """

        if self.device == None:
            self.log_info('no GPU specified - not initializing ')
        else:

            # Import pycuda.driver here so as to facilitate the
            # subclassing of Module to create pure Python LPUs that don't use GPUs:
            import pycuda.driver as drv
            drv.init()
            try:
                self.gpu_ctx = drv.Device(self.device).make_context()
            except Exception as e:
                self.log_info('_init_gpu exception: ' + e.message)
            else:
                atexit.register(self.gpu_ctx.pop)
                self.log_info('GPU initialized')
Esempio n. 11
0
	def __init__(self, device_num=0, sync_calls=False):

		cuda.init()

		#self.context = pycuda.tools.make_default_context()
		#self.device = self.context.get_device()

		self.device = cuda.Device(device_num)
		self.context = self.device.make_context()

		self.stream = cuda.Stream()

		self.max_block_size = self.device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_X)

		self.max_grid_size_x = self.device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_X)
		self.max_grid_size_y = self.device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_Y)

		self.max_grid_size_x_pow2 = 2 ** log2(self.max_grid_size_x)

		self.max_registers = self.device.get_attribute(cuda.device_attribute.MAX_REGISTERS_PER_BLOCK)

		self.warp_size = self.device.get_attribute(cuda.device_attribute.WARP_SIZE)

		self.gpu = True
		self.cuda = True

		self._sync_calls = sync_calls

		self.allocated = 0
Esempio n. 12
0
    def run_kernel_on_gpus(self, vec_a, vec_b):

        drv.init()
        num = drv.Device.count()
        num = 1

        vector_len = vec_b.shape[0]

        sections = range(0, vector_len, vector_len / num)
        sections = sections[1:]
        print "section on gpus:"
        print sections

        sub_vec_bs = numpy.split(vec_b, sections)

        gpu_thread_list = []
        for i in range(num):
            gpu_thread = GPUThread(i, vec_a, sub_vec_bs[i], self.block, self.grid)
            gpu_thread.start()
            gpu_thread_list.append(gpu_thread)

        dest = numpy.array([])
        for gpu in gpu_thread_list:
            gpu.join()
            dest = numpy.concatenate((dest, gpu.vec_b))

        print dest

        return dest
Esempio n. 13
0
    def __init__(self, shape, dtype=numpy.float32, stream=None, allocator=drv.mem_alloc,cuda_device=0):
        try:
            drv.init()
            ctx = drv.Device(0).make_context()
        except RuntimeError:
            "device is already initialized! so we ignore this ugly, but works for now"
        
        #which device are we working on
        self.cuda_device = cuda_device
        
        #internal shape
        self.shape = shape
        
        #internal type
        self.dtype = numpy.dtype(dtype)

        from pytools import product
        
        #internal size
        self.size = product(shape)

        self.allocator = allocator
        if self.size:
            self.gpudata = self.allocator(self.size * self.dtype.itemsize)
        else:
            self.gpudata = None
        self.stream = stream

        self._update_kernel_kwargs()
Esempio n. 14
0
    def __init__(self, device_number=0, thread_per_block=512, **kwargs):
        self.device_number = device_number
        self.thread_per_block = thread_per_block
        self.device_type = 'nvidia_gpu'
        self.language    = 'cuda'
        self.code_type   = 'cu'

        try:
            import pycuda.driver as cuda
            cuda.init()

        except Exception as e:
            logger.error("Error: CUDA initialization error", exc_info=True)
            raise SystemExit

        max_devices = cuda.Device.count()
        if max_devices == 0:
            logger.error("Error: There is no CUDA device (NVIDIA GPU).")
            raise SystemExit

        elif device_number >= max_devices:
            logger.error("Error: The given device_number(%d) is bigger than physical GPU devices(%d)."%(device_number, max_devices))
            raise SystemExit

        else:
            device = cuda.Device(device_number)
            context = device.make_context()

            import atexit
            atexit.register(context.pop)

            self.cuda = cuda
            self.device = device
            self.context = context
Esempio n. 15
0
    def _init_gpu(self):
        """
        Initialize GPU device.

        Notes
        -----
        Must be called from within the `run()` method, not from within
        `__init__()`.
        """

        if self.device == None:
            self.log_info('no GPU specified - not initializing ')
        else:

            # Import pycuda.driver here so as to facilitate the
            # subclassing of Module to create pure Python LPUs that don't use GPUs:
            import pycuda.driver as drv
            drv.init()

            N_gpu = drv.Device.count()
            if not self.device < N_gpu:
                new_device = randint(0,N_gpu - 1)
                self.log_warning("GPU device device %d not in GPU devices %s" % (self.device, str(range(0,N_gpu))))
                self.log_warning("Setting device = %d" % new_device)
                self.device = new_device

            try:
                self.gpu_ctx = drv.Device(self.device).make_context()
            except Exception as e:
                self.log_info('_init_gpu exception: ' + e.message)
            else:
                atexit.register(self.gpu_ctx.pop)
                self.log_info('GPU %s initialized' % self.device)
Esempio n. 16
0
def get_device_count(verbose=False):
    """
    Query device count through PyCuda.

    Arguments:
        verbose (bool): prints verbose logging if True, default False.

    Returns:
        int: Number of GPUs available.
    """
    try:
        import pycuda
        import pycuda.driver as drv
    except ImportError:
        if verbose:
            print("PyCUDA module not found")
        return 0
    try:
        drv.init()
    except pycuda._driver.RuntimeError as e:
        print("PyCUDA Runtime error: {0}".format(str(e)))
        return 0

    count = drv.Device.count()

    if verbose:
        print "Found %d GPU(s)", count

    return count
def test_vector_add():
    #Check pycuda is installed and if a CUDA capable device is present, if not skip the test
    try:
        import pycuda.driver as drv
        drv.init()
    except (ImportError, Exception):
        pytest.skip("PyCuda not installed or no CUDA device detected")

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000
    problem_size = (size, 1)

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]
    params = {"block_size_x": 512}

    answer = run_kernel("vector_add", kernel_string, problem_size, args, params)

    assert numpy.allclose(answer[0], a+b, atol=1e-8)
Esempio n. 18
0
 def _init_gpu(self):
     """
     Initialize gpu context
     """
     self.logger.info("starting cuda")
     cuda.init()
     dev = cuda.Device( self.gpu_id )
     self.ctx = dev.make_context()
Esempio n. 19
0
def get_num_gpus():
    """Returns the number of GPUs available"""
    print ("Determining number of GPUs...")
    from pycuda import driver 
    driver.init()
    num_gpus = driver.Device.count()
    print ("Number of GPUs: {}".format(num_gpus))
    return num_gpus
Esempio n. 20
0
 def tensorrt_init(self, *args, **kwargs):
     from tensorrt.lite import Engine
     import pycuda.driver as cuda
     cuda.init()
     args[1].cuda_context = cuda.Device(0).make_context()
     args[0].logger.info('Loading TensorRT engine: %s' % self.engine_file)
     args[1].trt_engine = Engine(PLAN=self.engine_file)
     cuda.Context.pop()
Esempio n. 21
0
def init():
  # MAGIC MAGIC
  import pycuda.driver as cuda
  cuda.init()
  from pycuda.tools import make_default_context
  context = make_default_context()
  device = context.get_device()
  import atexit
  atexit.register(context.detach)
Esempio n. 22
0
    def __init__(s, ndim, ns, ds, dt, h5save=False, **kwargs):
        s.dt = dt
        s.ndim = ndim
        if s.ndim == 1:
            s.nx = s.ns = ns
            s.dx = s.ds = ds
        elif s.ndim == 2:
            s.nx, s.ny = s.ns = ns
            if ds == list:
                s.dx, s.dy = s.ds = ds
            else:
                s.dx = s.dy = s.ds = ds
        s.h5save = h5save

        # sub area
        s.snx = 1024
        s.sx0 = s.nx / 2 - s.snx / 2
        s.sx1 = s.nx / 2 + s.snx / 2
        if ndim == 2:
            s.sny = 1024
            s.sy0 = s.ny / 2 - s.sny / 2
            s.sy1 = s.ny / 2 + s.sny / 2

            # dk
        s.dkx = 1.0 / (s.nx * s.dx) * 2 * np.pi
        if ndim == 2:
            s.dky = 1.0 / (s.ny * s.dy) * 2 * np.pi

            # array allocations
        s.psi = np.zeros(s.ns, dtype=np.complex64)

        s.x = np.arange(s.nx, dtype=np.float32) * s.dx
        s.kx = np.fft.fftfreq(s.nx, s.dx) * 2 * np.pi
        s.lcx = np.zeros(s.nx, dtype=np.complex64)
        s.lcx_sqrt = np.zeros(s.nx, dtype=np.complex64)
        s.lcx[:] = np.exp(-0.5j * s.kx ** 2 * dt)
        s.lcx_sqrt[:] = np.sqrt(s.lcx)
        if ndim == 2:
            s.y = np.arange(s.ny, dtype=np.float32) * s.dy
            s.ky = np.fft.fftfreq(s.ny, s.dy) * 2 * np.pi
            s.lcy = np.zeros(s.ny, dtype=np.complex64)
            s.lcy_sqrt = np.zeros(s.ny, dtype=np.complex64)
            s.lcy[:] = np.exp(-0.5j * s.ky ** 2 * dt)
            s.lcy_sqrt[:] = np.sqrt(s.lcy)

            # cuda
        cuda.init()
        s.ctx = cuda.Device(0).make_context()
        s.strm = cuda.Stream()
        s.plan = Plan(s.ns, dtype=np.complex64, context=s.ctx, stream=s.strm)
        s.psi_gpu = gpuarray.to_gpu(s.psi)
        if s.ndim == 2:
            s.lcy_gpu = gpuarray.to_gpu(s.lcy)

        s.tpb = 256
        s.bpg = 30 * 4
        print "tpb = %d, bpg = %g" % (s.tpb, s.bpg)
Esempio n. 23
0
    def __init__(self, ne, p_degree, cfl=0.1, v=0.5, target_gpu=0):
        cuda.init()
        self.dev = cuda.Device(target_gpu)
        self.ctx = self.dev.make_context()

        import atexit
        atexit.register(self.ctx.pop)

        super(DGModalGpu, self).__init__(ne, p_degree, cfl, v)
Esempio n. 24
0
File: test.py Progetto: src-d/kmcuda
 def __init__(self):
     import pycuda.driver as cuda
     self.cuda = cuda
     cuda.init()
     self.ctxs = [cuda.Device(i).make_context()
                  for i in range(cuda.Device.count())]
     for ctx in self.ctxs:
         ctx.pop()
     self.arrays = {}
Esempio n. 25
0
def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return  DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    DEVICE_INFO = {}
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus==0:
        log.info("CUDA %s / PyCUDA %s, no devices found", ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT)
        return DEVICES
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log("   created context=%s", context)
                log("   api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log("   memory: free=%sMB, total=%sMB",  int(free/1024/1024), int(total/1024/1024))
                log("   multi-processors: %s, clock rate: %s", device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
                log("   max block sizes: (%s, %s, %s)", device.get_attribute(da.MAX_BLOCK_DIM_X), device.get_attribute(da.MAX_BLOCK_DIM_Y), device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log("   max grid sizes: (%s, %s, %s)", device.get_attribute(da.MAX_GRID_DIM_X), device.get_attribute(da.MAX_GRID_DIM_Y), device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log("   maximum texture size: %sx%s", max_width, max_height)
                log("   max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor<<4) + SMminor
                log("   compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
                if i==0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT, ngpus, engs(ngpus))
                DEVICES.append(i)
                log.info("  + %s (memory: %s%% free, compute: %s.%s)", device_info(device), 100*free/total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES
Esempio n. 26
0
    def __init__(self, device=0):
        drv.init()

        self.device = drv.Device(device)
        self.context = self.device.make_context()

        self.memory_pool = pycuda.tools.DeviceMemoryPool()

        #init fft object
        self.fft = FFT(self)
Esempio n. 27
0
def init(device=None):
    """Initializes CUDA global state.

    Chainer maintains CUDA context, CUBLAS context, random number generator and
    device memory pool for each GPU device and for each process (the main
    process or a process forked by :mod:`multiprocessing`) as global states.
    When called for the first time on the process, this function initializes
    these global states.

    .. warning::

       This function also initializes PyCUDA and scikits.cuda. Since these
       packages do not support forking after initialization, do not call this
       function before forking the process.

    This function also registers :func:`shutdown` to :mod:`atexit` slot.

    It also initializes random number generator. User can set fixed seed with
    ``CHAINER_SEED`` environment variable.

    Args:
        device (``int`` or :class:`~pycuda.driver.Device` or ``None``): Device
            ID to initialize on.

    """
    global _contexts, _cublas_handles, _generators, _pid, _pools

    if not available:
        global _import_error
        raise RuntimeError(
            'CUDA environment is not correctly set up. ' +
            'The original import error said: ' + str(_import_error))

    pid = os.getpid()
    if _pid == pid:  # already initialized
        return

    drv.init()

    if device is None:  # use default device
        context = cutools.make_default_context()
        device = Context.get_device()
    else:
        device = Device(device)
        context = device.make_context()
    _contexts = {device: context}
    _generators = {}
    _pools = {}
    _cublas_handles = {}
    cumisc.init(mem_alloc)

    seed(os.environ.get('CHAINER_SEED'))

    _pid = pid  # mark as initialized
    atexit.register(shutdown)
Esempio n. 28
0
def init():
  # MAGIC MAGIC
  from pycuda import driver
  driver.init()
  from pycuda.tools import make_default_context
  context = make_default_context()
  device = context.get_device()
  import atexit
  atexit.register(context.detach)

  return context
Esempio n. 29
0
def get_num_gpus_core():
    cuda.init()
    num = 0
    while True:
        try:
            cuda.Device(num)
        except:
            break
        else:
            num +=1
    return num
Esempio n. 30
0
def list_devices():
  import pycuda.driver as cuda
  cuda.init()
  for i in range(cuda.Device.count()):
    dev = cuda.Device(i)
    attrs = dev.get_attributes()
    print 'Device %d (%s): compute %d.%d, free mem %d, PCI %s' % (
        i, dev.name(),
        attrs[cuda.device_attribute.COMPUTE_CAPABILITY_MAJOR],
        attrs[cuda.device_attribute.COMPUTE_CAPABILITY_MINOR],
        dev.total_memory(), dev.pci_bus_id())
Esempio n. 31
0
def main():

    figsaved = True

    if not figsaved:

        #1. parsing sys arguments

        import sys
        import subnets.layers.someconfigs as someconfigs
        try:

            device = sys.argv[1]

        except:
            print 'USAGE: python tsne.py [device]'
            print 'example: device=cuda0'
            raise

        #2.initialize devices
        if device.startswith('gpu'):
            backend = 'cudandarray'
            someconfigs.backend = 'cudandarray'

        else:
            backend = 'gpuarray'
            someconfigs.backend = 'gpuarray'

        gpuid = int(device[-1])

        if backend == 'cudandarray':

            import pycuda.driver as drv
            drv.init()
            dev = drv.Device(gpuid)
            ctx = dev.make_context()

            import theano.sandbox.cuda
            theano.sandbox.cuda.use(device)

            # import pycuda.gpuarray as gpuarray
            # #import theano
            # import theano.misc.pycuda_init
            # import theano.misc.pycuda_utils
        else:
            import os
            if 'THEANO_FLAGS' in os.environ:
                raise ValueError('Use theanorc to set the theano config')
            os.environ['THEANO_FLAGS'] = 'device={0}'.format(device)
            import theano.gpuarray
            ctx = theano.gpuarray.type.get_context(None)
            # from pygpu import collectives
        #---

        # 3. create save_path and info_matrix for loading samples
        import time
        date = '%d-%d' % (time.gmtime()[1], time.gmtime()[2])

        import os
        pid = os.getpid()

        save_path = './fig-%s-%d/' % (date, pid)

        if not os.path.exists(save_path):
            os.makedirs(save_path)
            print 'create dir', save_path

        data = load_testset() / 255.

        base_path = '/scratch/mahe6562/gap/'
        #base_path = '/work/imj/gap/'

        #model_path = 'grans/lsun_reo/swp0.025-8-gpuarray-1397924_genclipped/'
        #model_path = 'gran-lsun-nccl/11-19-swp0.1-8-gpuarray-1526900/'
        model_path = 'dcgan-lsun-nccl/11-17-swp0.1-8-gpuarray-1255492/'
        model_path1 = 'combined-lsun-nccl/11-19-swp0.1-4-gpuarray-658562/'

        info_matrix = [[base_path + model_path, 8, 4, 0, 34],
                       [base_path + model_path, 8, 4, 4, 34],
                       [base_path + model_path1, 4, 2, 0, 34]]
        mname = 'DCGAN'

        # 4. load samples based on info_matrix

        samples_single = load_single(*info_matrix[0])
        samples_gap = load_single(*info_matrix[1])
        samples_gap_comb = load_single(*info_matrix[2])

        # 5. tsne based on loaded samples

        alldata, fig = tsne(mname,
                            data,
                            samples_single,
                            samples_gap,
                            samples_gap_comb,
                            verbose=True)

        np.save(save_path + '/all%s.npy' % mname, alldata)
        fig.savefig(save_path + '/t-SNE%s.pdf' % mname, format='pdf')

        alldata = np.load(save_path + '/all%s.npy' % mname)
        plot_separate(alldata, mname='DCGAN')

    else:

        import time
        date = '%d-%d' % (time.gmtime()[1], time.gmtime()[2])

        save_path = './fig-11-26-51116/'  #'./fig-%s/' % date
        mname = 'DCGAN'

        alldata = np.load(save_path + '/all%s.npy' % mname)
        plot_separate(alldata, mname='DCGAN')
Esempio n. 32
0
def gen_backend(model=None,
                gpu=None,
                nrv=False,
                datapar=False,
                modelpar=False,
                flexpoint=False,
                rng_seed=None,
                numerr_handling=None,
                half=False,
                stochastic_round=0,
                device_id=None):
    """
    Construct and return a backend instance of the appropriate type based on
    the arguments given.  With no parameters, a single CPU core, float32
    backend is returned.

    Arguments:
        model (neon.models.model.Model): The instantiated model upon which we
                                         will utilize this backend.
        gpu (string, optional): Attempt to utilize a CUDA capable GPU if
                                installed in the system. Defaults to None which
                                implies a CPU based backend.  If 'cudanet',
                                utilize a cuda-convnet2 based backed, which
                                supports Kepler and Maxwell GPUs with single
                                precision. If 'nervanagpu', attempt to utilize
                                the NervanaGPU Maxwell backend with float16 and
                                float32 support.
        nrv (bool, optional): If True, attempt to utilize the Nervana Engine
                              for computation (must be installed on the
                              system).  Defaults to False which implies a CPU
                              based backend.
        datapar (bool, optional): Set to True to ensure that data is
                                  partitioned and each chunk is processed in
                                  parallel on different compute cores. Requires
                                  mpi4py.  Defaults to False which implies that
                                  all data will be processed sequentially on a
                                  single compute core.
        modelpar (bool, optional): Set to True to ensure that the nodes in each
                                   model layer are partitioned and distributed
                                   across multiple compute cores.  Requires
                                   mpi4py.  Defaults to False which implies
                                   that all nodes in all model layers will be
                                   processed by the same single compute core.
        flexpoint (bool, optional): If True, attempt to use FlexPoint(TM)
                                    element typed data instead of the default
                                    float32 which is in place if set to False.
        rng_seed (numeric, optional): Set this to a numeric value which can be
                                      used to seed the random number generator
                                      of the instantiated backend.  Defaults to
                                      None, which doesn't explicitly seed (so
                                      each run will be different)
        stochastic_round (numeric, optional): Only affects the max backend. If
                                              1, perform stochastic rounding.
                                              If 0, round to nearest.
        numerr_handling (dict, optional): Dictate how numeric errors are
                                          displayed and handled.  The keys and
                                          values permissible for this dict
                                          match that seen in numpy.seterr.
                                          If set to None (the default),
                                          behavior is equivalent to
                                          {'all': 'warn'}
        device_id (numeric, optional): Set this to a numeric value which can be
                                       used to select which device to run the
                                       process on

    Returns:
        Backend: newly constructed backend instance of the specifed type.

    Notes:
        * Attempts to construct a GPU instance without a CUDA capable card or
          without cudanet or nervanagpu package installed will cause the
          program to display an error message and exit.
        * Attempts to construct a parallel instance without mpi4py installed
          will cause the program to display an error message and exit.
        * The returned backend will still need to call its par.init_model()
          at some point after the model has been linked, in order for parallel
          training to proceed.
    """
    logger = logging.getLogger(__name__)
    gpuflag = False

    if datapar and modelpar:
        raise NotImplementedError('Hybrid parallelization scheme not '
                                  'implemented yet.  Try with at most one of'
                                  'datapar or modelpar')
    if modelpar:
        par = ModelPar()
    elif datapar:
        par = DataPar()
    else:
        par = NoPar()

    if par.device_id is not None:
        if device_id is not None:
            logger.warn('Ignoring device id specified in command line.')
        device_id = par.device_id

    if gpu is not None:
        gpu = gpu.lower()
        if sys.platform.startswith("linux"):
            gpuflag = (os.system("nvcc --version > /dev/null 2>&1") == 0)
        elif sys.platform.startswith("darwin"):
            gpuflag = (
                os.system("kextstat | grep -i cuda > /dev/null 2>&1") == 0)
        if gpuflag and gpu == 'cudanet':
            try:
                import cudanet  # noqa
                from neon.backends.cc2 import GPU
                be_name = 'Cudanet'
                be = GPU(rng_seed=rng_seed, device_id=device_id)
            except ImportError:
                logger.warning("cudanet not found, can't run via GPU")
                gpuflag = False
        elif gpuflag and gpu == 'nervanagpu':
            try:
                import nervanagpu  # noqa
                try:
                    # import pycuda.autoinit
                    import pycuda.driver as drv
                    drv.init()
                    device_id = device_id if device_id is not None else 0
                    global ctx
                    ctx = drv.Device(device_id).make_context()
                    import atexit
                    atexit.register(ctx.pop)
                    from neon.backends.gpu import GPU
                    be_name = 'NervanaGPU'
                    be = GPU(rng_seed=rng_seed,
                             stochastic_round=stochastic_round,
                             device_id=device_id)
                except ImportError:
                    logger.warning("pycuda error, can't run via GPU")
                    gpuflag = False
            except ImportError:
                logger.warning("nervanagpu not found, can't run via GPU")
                gpuflag = False
        if gpuflag is False:
            raise RuntimeError("Can't find CUDA capable GPU")
    elif nrv:
        nrv = False
        try:
            from umd.nrv_backend import NRVBackend
            nrv = True
        except ImportError:
            logger.warning("Nervana Engine system software not found")

    if flexpoint:
        logger.warning("Flexpoint(TM) backend not currently available")

    if nrv:
        be_name = 'NRV'
        be = NRVBackend(rng_seed=rng_seed,
                        seterr_handling=numerr_handling,
                        device_id=device_id)
    elif not gpuflag:
        be_name = 'CPU'
        be = CPU(rng_seed=rng_seed, seterr_handling=numerr_handling)
    logger.info("{} backend, RNG seed: {}, numerr: {}".format(
        be_name, rng_seed, numerr_handling))

    par.associate(be)
    return be
Esempio n. 33
0
    def __init__(self):
        self.devices, self.device = [], None

        cu_driver.init()
        for gpu_i in range(cu_driver.Device.count()):
            self.devices.append(cu_driver.Device(gpu_i))
Esempio n. 34
0
def _diffusion_child(comm, bm=None):

    rank = comm.Get_rank()
    ngpus = comm.Get_size()

    nodename = socket.gethostname()
    name = '%s %s' %(nodename, rank)
    print(name)

    if rank == 0:

        # reduce blocksize
        bm.data = np.copy(bm.data[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x], order='C')
        bm.labelData = np.copy(bm.labelData[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x], order='C')

        # domain decomposition
        sizeofblocks = (bm.argmax_z - bm.argmin_z) // ngpus
        blocks = [0]
        for k in range(ngpus-1):
            block_temp = blocks[-1] + sizeofblocks
            blocks.append(block_temp)
        blocks.append(bm.argmax_z - bm.argmin_z)
        print('blocks =', blocks)

        # read labeled slices
        if bm.label.allaxis:
            tmp = np.swapaxes(bm.labelData, 0, 1)
            tmp = np.ascontiguousarray(tmp)
            indices_01, _ = read_labeled_slices_allx(tmp)
            tmp = np.swapaxes(tmp, 0, 2)
            tmp = np.ascontiguousarray(tmp)
            indices_02, _ = read_labeled_slices_allx(tmp)

        # send data to childs
        for destination in range(ngpus-1,-1,-1):

            # ghost blocks
            blockmin = blocks[destination]
            blockmax = blocks[destination+1]
            datablockmin = blockmin - 100
            datablockmax = blockmax + 100
            datablockmin = 0 if datablockmin < 0 else datablockmin
            datablockmax = (bm.argmax_z - bm.argmin_z) if datablockmax > (bm.argmax_z - bm.argmin_z) else datablockmax
            datablock = np.copy(bm.data[datablockmin:datablockmax], order='C')
            labelblock = np.copy(bm.labelData[datablockmin:datablockmax], order='C')

            # read labeled slices
            if bm.label.allaxis:
                labelblock = labelblock.astype(np.int32)
                labelblock[:blockmin - datablockmin] = -1
                labelblock[blockmax - datablockmin:] = -1
                indices_child, labels_child = [], []
                indices_00, labels_00 = read_labeled_slices_allx(labelblock)
                indices_child.append(indices_00)
                labels_child.append(labels_00)
                tmp = np.swapaxes(labelblock, 0, 1)
                tmp = np.ascontiguousarray(tmp)
                labels_01 = np.zeros((0, tmp.shape[1], tmp.shape[2]), dtype=np.int32)
                for slcIndex in indices_01:
                    labels_01 = np.append(labels_01, [tmp[slcIndex]], axis=0)
                indices_child.append(indices_01)
                labels_child.append(labels_01)
                tmp = np.swapaxes(tmp, 0, 2)
                tmp = np.ascontiguousarray(tmp)
                labels_02 = np.zeros((0, tmp.shape[1], tmp.shape[2]), dtype=np.int32)
                for slcIndex in indices_02:
                    labels_02 = np.append(labels_02, [tmp[slcIndex]], axis=0)
                indices_child.append(indices_02)
                labels_child.append(labels_02)
            else:
                labelblock[:blockmin - datablockmin] = 0
                labelblock[blockmax - datablockmin:] = 0
                indices_child, labels_child = read_labeled_slices(labelblock)

            # print indices of labels
            print('indices child %s:' %(destination), indices_child)

            if destination > 0:
                blocks_temp = blocks[:]
                blocks_temp[destination] = blockmin - datablockmin
                blocks_temp[destination+1] = blockmax - datablockmin
                dataListe = splitlargedata(datablock)
                sendToChild(comm, indices_child, destination, dataListe, labels_child,
                            bm.label.nbrw, bm.label.sorw, blocks_temp, bm.label.allaxis,
                            bm.allLabels, bm.label.smooth, bm.label.uncertainty, bm.platform)

            else:

                # select platform
                if bm.platform == 'cuda':
                    import pycuda.driver as cuda
                    cuda.init()
                    dev = cuda.Device(rank)
                    ctx, queue = dev.make_context(), None
                    if bm.label.allaxis:
                        from biomedisa_features.random_walk.pycuda_large_allx import walk
                    else:
                        from biomedisa_features.random_walk.pycuda_large import walk
                else:
                    ctx, queue = _get_device(bm.platform, rank)
                    from biomedisa_features.random_walk.pyopencl_large import walk

                # run random walks
                tic = time.time()
                memory_error, final, final_uncertainty, final_smooth = walk(comm, datablock,
                                    labels_child, indices_child, bm.label.nbrw, bm.label.sorw,
                                    blockmin-datablockmin, blockmax-datablockmin, name,
                                    bm.allLabels, bm.label.smooth, bm.label.uncertainty, ctx, queue)
                tac = time.time()
                print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds')

                # free device
                if bm.platform == 'cuda':
                    ctx.pop()
                    del ctx

        if memory_error:

            print('GPU out of memory. Image too large.')

        else:

            # gather data
            for source in range(1, ngpus):
                lendataListe = comm.recv(source=source, tag=0)
                for l in range(lendataListe):
                    data_z, data_y, data_x = comm.recv(source=source, tag=10+(2*l))
                    receivedata = np.empty((data_z, data_y, data_x), dtype=np.uint8)
                    comm.Recv([receivedata, MPI.BYTE], source=source, tag=10+(2*l+1))
                    final = np.append(final, receivedata, axis=0)

            # save finals
            final2 = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
            final2[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final
            final2 = final2[1:-1, 1:-1, 1:-1]
            save_data(bm.path_to_final, final2, bm.header, bm.final_image_type, bm.label.compression)

            # uncertainty
            if final_uncertainty is not None:
                final_uncertainty *= 255
                final_uncertainty = final_uncertainty.astype(np.uint8)
                for source in range(1, ngpus):
                    lendataListe = comm.recv(source=source, tag=0)
                    for l in range(lendataListe):
                        data_z, data_y, data_x = comm.recv(source=source, tag=10+(2*l))
                        receivedata = np.empty((data_z, data_y, data_x), dtype=np.uint8)
                        comm.Recv([receivedata, MPI.BYTE], source=source, tag=10+(2*l+1))
                        final_uncertainty = np.append(final_uncertainty, receivedata, axis=0)
                # save finals
                final2 = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final2[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_uncertainty
                final2 = final2[1:-1, 1:-1, 1:-1]
                save_data(bm.path_to_uq, final2, compress=bm.label.compression)

            # smooth
            if final_smooth is not None:
                for source in range(1, ngpus):
                    lendataListe = comm.recv(source=source, tag=0)
                    for l in range(lendataListe):
                        data_z, data_y, data_x = comm.recv(source=source, tag=10+(2*l))
                        receivedata = np.empty((data_z, data_y, data_x), dtype=np.uint8)
                        comm.Recv([receivedata, MPI.BYTE], source=source, tag=10+(2*l+1))
                        final_smooth = np.append(final_smooth, receivedata, axis=0)
                # save finals
                final2 = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final2[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_smooth
                final2 = final2[1:-1, 1:-1, 1:-1]
                save_data(bm.path_to_smooth, final2, bm.header, bm.final_image_type, bm.label.compression)

            # print computation time
            t = int(time.time() - bm.TIC)
            if t < 60:
                time_str = str(t) + ' sec'
            elif 60 <= t < 3600:
                time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
            elif 3600 < t:
                time_str = str(t // 3600) + ' h ' + str((t % 3600) // 60) + ' min ' + str(t % 60) + ' sec'
            print('Computation time:', time_str)

    else:

        lendataListe = comm.recv(source=0, tag=0)
        for k in range(lendataListe):
            data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=10+(2*k))
            if k==0: data = np.zeros((0, data_y, data_x), dtype=data_dtype)
            data_temp = np.empty((data_z, data_y, data_x), dtype=data_dtype)
            if data_dtype == 'uint8':
                comm.Recv([data_temp, MPI.BYTE], source=0, tag=10+(2*k+1))
            else:
                comm.Recv([data_temp, MPI.FLOAT], source=0, tag=10+(2*k+1))
            data = np.append(data, data_temp, axis=0)

        nbrw, sorw, allx, smooth, uncertainty, platform = comm.recv(source=0, tag=1)

        if allx:
            labels = []
            for k in range(3):
                lenlabelsListe = comm.recv(source=0, tag=2+k)
                for l in range(lenlabelsListe):
                    labels_z, labels_y, labels_x = comm.recv(source=0, tag=100+(2*k))
                    if l==0: labels_tmp = np.zeros((0, labels_y, labels_x), dtype=np.int32)
                    tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
                    comm.Recv([tmp, MPI.INT], source=0, tag=100+(2*k+1))
                    labels_tmp = np.append(labels_tmp, tmp, axis=0)
                labels.append(labels_tmp)
        else:
            lenlabelsListe = comm.recv(source=0, tag=2)
            for k in range(lenlabelsListe):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=100+(2*k))
                if k==0: labels = np.zeros((0, labels_y, labels_x), dtype=np.int32)
                tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
                comm.Recv([tmp, MPI.INT], source=0, tag=100+(2*k+1))
                labels = np.append(labels, tmp, axis=0)

        allLabels = comm.recv(source=0, tag=99)
        indices = comm.recv(source=0, tag=8)
        blocks = comm.recv(source=0, tag=9)

        blockmin = blocks[rank]
        blockmax = blocks[rank+1]

        # select platform
        if platform == 'cuda':
            import pycuda.driver as cuda
            cuda.init()
            dev = cuda.Device(rank)
            ctx, queue = dev.make_context(), None
            if allx:
                from biomedisa_features.random_walk.pycuda_large_allx import walk
            else:
                from biomedisa_features.random_walk.pycuda_large import walk
        else:
            ctx, queue = _get_device(platform, rank)
            from biomedisa_features.random_walk.pyopencl_large import walk

        # run random walks
        tic = time.time()
        memory_error, final, final_uncertainty, final_smooth = walk(comm, data,
                                    labels, indices, nbrw, sorw, blockmin, blockmax,
                                    name, allLabels, smooth, uncertainty, ctx, queue)
        tac = time.time()
        print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds')

        # free device
        if platform == 'cuda':
            ctx.pop()
            del ctx

        # send finals
        if not memory_error:
            dataListe = splitlargedata(final)
            comm.send(len(dataListe), dest=0, tag=0)
            for k, dataTemp in enumerate(dataListe):
                dataTemp = dataTemp.copy(order='C')
                comm.send([dataTemp.shape[0], dataTemp.shape[1], dataTemp.shape[2]], dest=0, tag=10+(2*k))
                comm.Send([dataTemp, MPI.BYTE], dest=0, tag=10+(2*k+1))

            if final_uncertainty is not None:
                final_uncertainty *= 255
                final_uncertainty = final_uncertainty.astype(np.uint8)
                dataListe = splitlargedata(final_uncertainty)
                comm.send(len(dataListe), dest=0, tag=0)
                for k, dataTemp in enumerate(dataListe):
                    dataTemp = dataTemp.copy(order='C')
                    comm.send([dataTemp.shape[0], dataTemp.shape[1], dataTemp.shape[2]], dest=0, tag=10+(2*k))
                    comm.Send([dataTemp, MPI.BYTE], dest=0, tag=10+(2*k+1))

            if final_smooth is not None:
                dataListe = splitlargedata(final_smooth)
                comm.send(len(dataListe), dest=0, tag=0)
                for k, dataTemp in enumerate(dataListe):
                    dataTemp = dataTemp.copy(order='C')
                    comm.send([dataTemp.shape[0], dataTemp.shape[1], dataTemp.shape[2]], dest=0, tag=10+(2*k))
                    comm.Send([dataTemp, MPI.BYTE], dest=0, tag=10+(2*k+1))
Esempio n. 35
0
def gpu_init(device_no, pick_up, pickup_dirs):
    cuda.init()
    dev = cuda.Device(device_no)  # the number of GPU
    ctx = dev.make_context()
    kwargs = {"h": h, "hsnham": hsnham, "VISCOINDX" : VISCOINDX, "H_xomoi": H_moi, "bienQ" : bienQ,\
              "moci" : moci ,"mocj" : mocj, "dauj": dauj, "daui" : daui, "cuoii" : cuoii, "cuoij" : cuoij,\
              "Tsxw" : Tsxw, "Tsyw" : Tsyw, "khouot" : khouot, "boundary_type" : boundary_type,\
              "u": u, "v": v, "z" : z, "t_u": t_u, "t_v": t_v, "t_z": t_z, "Htdu": Htdu, "Htdv" : Htdv, \
              "Kx1" : Kx1, "Ky1" : Ky1, "htaiz" : htaiz, "htaiz_bd" : htaiz,\
              "bc_up": bc_up, "bc_down": bc_down, "bc_left": bc_left, "bc_right" : bc_right,\
              "ubt" : ubt, "ubp" : ubp, "vbt" : vbt, "vbd" : vbd, "hi": hi,\
              "FS" : FS, 'tFS': tFS, 'CC_u' : CC_u, 'CC_d' : CC_d, 'CC_l' : CC_l, 'CC_r' : CC_r,\
              "VTH": VTH, "Kx" : Kx, "Ky" : Ky, "Fw" : Fw, "Qbx" : Qbx, "Qby" : Qby, "dH" : dH}

    # create a pointer object that store address of pointers on device
    pointers = Pointers(ctx, dtype=np.float64, **kwargs)

    # pointers = Pointers(ctx,**kwargs)
    # hmax is used to calculate boundary condition, this will be recalculated later on
    # in pre_processing kernel
    hmax = np.max(h[2])
    # print hmax

    # allocate memory on device
    pd = pointers.alloc_on_device_only(N, M)
    pc = pointers.alloc()

    # store pointers on a list to transfer it to gpu
    # hmax here are just dummie values, for address alignment
    # so that pointers of other arrays can be copied to the right place in memory
    global_attributes = [np.int32(M), np.int32(N), floattype(hmax), floattype(hmax), floattype(hmax), floattype(hmax),\
                    pc['bienQ'], pc['daui'], pc['dauj'], pc['cuoii'], pc['cuoij'], pc['moci'], pc['mocj'], pc['khouot'], pc['boundary_type'],\
                    pc['h'], pc['v'], pc['u'], pc['z'], pc['t_u'], pc['t_v'], pc['t_z'], pc['Htdu'], pc['Htdv'], pc['H_moi'], pc['htaiz'],\
                    pc['htaiz_bd'], pc['ubt'], pc['ubp'], pc['vbt'], pc['vbd'], \
                    pc['hsnham'], pc['VISCOINDX'], pc['Kx1'], pc['Ky1'], pc['Tsyw'], pc['Tsxw'],\
                    pc['bc_up'], pc['bc_down'], pc['bc_left'], pc['bc_right'], pc['hi'],\
                    pc['FS'], pc['tFS'], pc['CC_u'], pc['CC_d'], pc['CC_l'], pc['CC_r'],\
                    pc['VTH'], pc['Kx'], pc['Ky'], pc['Fw'], pc['Qbx'], pc['Qby'], pc['dH']]

    auxilary_arrays = [pd['a1'], pd['b1'], pd['c1'], pd['d1'], pd['a2'], pd['c2'], pd['d2'], \
                   pd['f1'], pd['f2'], pd['f3'], pd['f5'],\
                   pd['AA'], pd['BB'], pd['CC'], pd['DD'],\
                   pd['x'], pd['Ap'], pd['Bp'], pd['ep'], pd['SN'] ]

    # copy struct to gpu: struct that store attribute arrays
    arg_struct_ptr = cuda.mem_alloc(
        np.intp(0).nbytes * (len(global_attributes) - 6) + 8 +
        4 * np.dtype(floattype).itemsize)

    # copy struct to gpu: struct that store supporting arrays (i.e. arrays that only exist on device and don't have corresponding arrays on host)
    arr_struct_ptr = cuda.mem_alloc(np.intp(0).nbytes * len(auxilary_arrays))
    arg_struct = PointersStruct(global_attributes, arg_struct_ptr)
    arr_struct = PointersStruct(auxilary_arrays,
                                arr_struct_ptr,
                                structtype='ARR')
    pointers.toDevice(['h', 'hsnham', 'VISCOINDX', 'bienQ', 'Tsyw', 'Tsxw', 'boundary_type', 'bc_up', 'bc_down', 'bc_left', 'bc_right',\
                      'u', 'v', 'z', 'CC_u', 'CC_d', 'CC_l', 'CC_r', 'Fw'])
    ctx.synchronize()

    supplement = open("support_funcs.cu").read()
    supmod = SourceModule(supplement, include_dirs=[os.getcwd()])

    # get functions from cuda file
    init_Kernel = supmod.get_function("Onetime_init")
    Find_Calculation_limits_x = supmod.get_function(
        "Find_Calculation_limits_Horizontal")
    Find_Calculation_limits_y = supmod.get_function(
        "Find_Calculation_limits_Vertical")
    gpu_Htuongdoi = supmod.get_function("Htuongdoi")
    preprocess = supmod.get_function("preprocess_data")

    # declare block size and grid size
    block_2d = (min(32, M + 3), 1, 1)
    grid_2d = ((M + 3) // min(32, M + 3) + 1, N + 3, 1)

    # call intialize kernels
    init_Kernel(arg_struct_ptr, block=block_2d, grid=grid_2d)
    ctx.synchronize()

    if pick_up is True:
        load_intial_condition(dirs, pointers)
    Find_Calculation_limits_x(arg_struct_ptr, block=(32, 1, 1), grid=(1, N, 1))
    Find_Calculation_limits_y(arg_struct_ptr, block=(32, 1, 1), grid=(1, M, 1))
    gpu_Htuongdoi(arg_struct_ptr, block=block_2d, grid=grid_2d)
    ctx.synchronize()
    preprocess(arg_struct_ptr, block=(32, 1, 1), grid=(1, 1, 1))

    return pointers, ctx, arg_struct_ptr, arr_struct_ptr, supmod
Esempio n. 36
0
 def initialise_cuda():
     global MAXGPU
     if  not pycuda.isinitialised:
         drv.init()
         pycuda.isinitialised = True
         MAXGPU = drv.Device.count()
Esempio n. 37
0
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    shared_args 
    contains neural network parameters

    private_args
    contains parameters for process run on each gpu

    this_queue and that_queue are used for synchronization between processes.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    ####

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10)

    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    gparams = [T.grad(cost, param) for param in classifier.params]

    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    ####
    # setting pycuda and
    # pass handles, only done once

    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu

    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu

    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list

    h_list = []
    # a list of pycuda IPC handles

    shape_list = []
    # a list containing shapes of variables in param_ga_list

    dtype_list = []
    # a list containing dtypes of variables in param_ga_list

    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        average_fun = \
            theano.function([], updates=[(param,
                                          (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0

    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                this_queue.put('')
                that_queue.get()

                # exchanging weights
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list,
                            param_ga_remote_list):

                    drv.memcpy_peer(
                        param_ga_other.ptr, param_ga_remote.ptr,
                        param_ga_remote.dtype.itemsize * param_ga_remote.size,
                        ctx, ctx)

                ctx.synchronize()
                this_queue.put('')
                that_queue.get()

                for average_fun in average_fun_list:
                    average_fun()

        if private_args['verbose']:
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.1fs' %
                              ((end_time - start_time)))
Esempio n. 38
0
def beamsim_gpu(k: float = 1000.0,
                x0: float = 0.1e-3,
                y0: float = 1e-3,
                z0: float = 0.0,
                nx: int = 1,
                ny: int = 240,
                nz: int = 160,
                dx: float = 1.0,
                dy: float = 1.0e-3,
                dz: float = 1.0e-3,
                elements_vectorized=None):

    if elements_vectorized is None:
        elements_vectorized = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    # note, that on a remote worker, there is never a saying where this thread wakes up
    # therefore i must be extra carefull to always initialize all the resources needed

    # initialize manually
    drv.init()
    # local_device_count = drv.Device.count()
    # print('found {} GPUs'.format(local_device_count))
    # choose one of the GPUs at random
    gpu_to_take = random.choice(range(drv.Device.count()))
    # take device 0 for now
    gpu_context = drv.Device(gpu_to_take).make_context()
    gpu_context.push()  # make the context active
    code_text = SourceModule("""

#include <stdio.h>

#define pi2 2.0f*3.141592653589793f

__global__ void BeamSimKernel ( const float *tx, unsigned int tx_length, 
                                float *out,
                                float x0, float y0, float z0,
                                unsigned int nx, unsigned int ny, unsigned int nz, 
                                float dx, float dy, float dz,
                                float k )
    {
    unsigned int offset=0;
    unsigned int ix,iy,iz=0;
    unsigned int itx = 0; // used as a transducer iterator
    float amplitude = 0.0; 
    // float pressure; // no need for it anymore
    float distance,kd,pressure_re,pressure_im=0;
    // float directivity_cos; // directivity_cos not used in this version
    float pixel_x,pixel_y,pixel_z,dix,diy,diz =0;        
    // di* - delta distances as optimisation    
    // iz=0 for now in CUDA, use 2D calculation only
    
    // calculate iz,ix from thread built-in variables
    
    
    ix = blockIdx.x * blockDim.x + threadIdx.x; // use cuda.x-grid as world.x
    iy = blockIdx.y * blockDim.y + threadIdx.y; // use cuda.y-grid as world.y
    iz = blockIdx.z * blockDim.z + threadIdx.z; // use cuda.z-grid as world.z
    
    // make sure that this thread won't try to calculate non-existent receiver
    
    if (iy >= ny) return;        
    if (ix >= nx) return;    
    if (iz >= nz) return;
    
    // start actual calculation : zero the accumulators
    pressure_re = 0;
    pressure_im = 0;
    
    // where am I in space?
    pixel_x = (float)ix * dx + x0;
    pixel_y = (float)iy * dy + y0; 
    pixel_z = (float)iz * dz + z0;
    
    // debugging code only:
    // printf("block %d.%d: pixel  %d.%d.%d, at %0.3f|%0.3f|%0.3f\\n",blockIdx.y,blockIdx.z, ix, iy, iz, pixel_x,pixel_y,pixel_z); // note that enabling this makes this a long call, time outs the driver
    
    // for each transmitter-element, do this:
    for (itx=0; itx<tx_length*6; itx=itx+6) // this hopefully acesses the same memory location for each thread and therefore will be cached
     {            
        // calculate distance in cartesian space:
        dix = (pixel_x-tx[0+itx]);                            // tx.x
        diy = (pixel_y-tx[1+itx]);                            // tx.y
        diz = (pixel_z-tx[2+itx]);                            // tx.z
        distance = sqrtf( dix*dix + diy*diy + diz*diz );
        // amplitude decays with distance as the energy gets distributed on a ring around the transmit point
        // note that ring is for 2D space, and a sphere surface would be more appropriate for 3D space
        
        amplitude = tx[3+itx] / ( pi2 * distance );                   // amplitude is at itx+3  
        kd = -k * distance + tx[4 + itx];                           // phase is at itx+4            
        
        // accumulate the energy                           
        pressure_re = pressure_re + __cosf(kd) * amplitude;                    
        pressure_im = pressure_im + __sinf(kd) * amplitude; 
     }        
     
    // write the result:
        
    // in case if I want the absolute pressure only (discards the phase)
    // pressure=sqrtf(pressure_re*pressure_re+pressure_im*pressure_im); 
           
    // in CUDA, i need to calculate rx array memory offset manually for each thread:
    // offset=ix+iy*nx+iz*nx*ny; // that's a version for xyz, real-only numbers (e.g. pycuda.float32 version
    // out[offset]=(float)pressure;
               
    // this is a version for complex numbers: pycuda.complex64
    offset=2*(iz+iy*nz+ix*nz*ny); // that's a version for xyz version    
    out[offset]=(float)pressure_re;
    offset++; // go to the imaginary value pointer
    out[offset]=(float)pressure_im;    
}
""")

    # instantiate the code into the compiler
    beamsim_kernel = code_text.get_function("BeamSimKernel")

    # convert the values from pythonic to Cudific
    ck = numpy.float32(k)
    cx0 = numpy.float32(x0)
    cy0 = numpy.float32(y0)
    cz0 = numpy.float32(z0)
    cnx = numpy.int32(nx)
    cny = numpy.int32(ny)
    cnz = numpy.int32(nz)
    cdx = numpy.float32(dx)
    cdy = numpy.float32(dy)
    cdz = numpy.float32(dz)
    ctx = numpy.asarray(elements_vectorized).astype(numpy.float32)
    ctx_count = numpy.int32(len(ctx) / 6)
    # note: must reserve the output memory right here
    # note: for 2D values, the x must be == 1
    assert(nx == 1), 'this version supports nx=1 only'
    # prevent from the image size being too large
    assert(ny < 4096+1), ' ny too large: reduce calculated field pixel count'
    assert(nz < 4096+1), ' nz too large: reduce calculated field pixel count'
    # prevent from the transducer description to be too large - you can remove this limitation later on
    assert(ctx_count < 81920+1),'too many radiator points. Reduce simulation complexity'
    cuda_out = numpy.zeros((int(cny), int(cnz))).astype(numpy.complex64)

    # prepare the GPU call : thread wave shape:
    threads_x = 1
    threads_y = 16
    threads_z = 64
    blocks_x = 1
    blocks_y = int((int(cny) / threads_y) + 1)
    blocks_z = int((int(cnz) / threads_z) + 1)

    # start the timer!
    # time_1 = time.clock()

    beamsim_kernel(
        drv.In(ctx),
        ctx_count,
        drv.Out(cuda_out),
        cx0, cy0, cz0,
        cnx, cny, cnz,
        cdx, cdy, cdz,
        ck,
        block=(threads_x, threads_y, threads_z), grid=(blocks_x, blocks_y, blocks_z))

    # time_2 = time.clock()

    # release the GPU from this thread
    # release the context, otherwise memory leak might occur
    gpu_context.pop()
    gpu_context.detach()

    # performance = numpy.float64(numpy.int128(cnx)*numpy.int128(cny)*numpy.int128(cnz)*numpy.int128(ctx_count)) / (time_2 - time_1)

    return cuda_out
Esempio n. 39
0
def cuebeamlambert(
    elements_vectorized=None,
    k: float = 1000.0,
    lambert_radius: float = 100e-3,
    lambert_map_density: float = 100):
    # print("lambert_radius = {}, lambert_map_density={}".format(lambert_radius,lambert_map_density))
    # matlab:: [img_lambert lambert_x lambert_y lambert_z] = cueBeam.cueBeam_lambert(tx',enviroment.wavenumber,lambert_radius,lambert_map_density);
    t0 = time.time()

    if elements_vectorized is None:
        elements_vectorized = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    # note, that on a remote worker, there is never a saying where this thread wakes up
    # therefore i must be extra carefull to always initialize all the resources needed

    drv_init_time_before = time.time()-t0
    # initialize manually

    drv.init()

    # local_device_count = drv.Device.count()
    # print('found {} GPUs'.format(local_device_count))

    # choose one of the GPUs at random
    gpu_to_take = random.choice(range(drv.Device.count()))
    gpu_context = drv.Device(gpu_to_take).make_context()
    gpu_context.push()  # make the context active

    drv_init_time_after = time.time() - t0

    code_text_lambert=SourceModule("""
    
    #include <stdio.h>

#define pi2 2.0f*3.141592653589793f

    __global__ void BeamsimLambertKernel ( float *tx, unsigned int tx_length, float *out,
        unsigned int n, float d, float r,
        float k,float *xp, float *yp, float *zp)
{
    unsigned int offset=0;
    unsigned int ix,iy,itx=0;
    float pressure,distance,kd,pressure_re,pressure_im=0;
    float dist2=0;
    float dix,diy,diz,lambert_x,lambert_y,lambert_z=0;
    float xbase,ybase,rho2,rhoi,cosphi,sinphi,cosl,sinl=0;
    float xbase0=-sqrtf((float)2)+(float)1e-8;
    // calculate ix,iy from thread built-in variables
    ix = blockIdx.x * blockDim.x + threadIdx.x;
    iy = blockIdx.y * blockDim.y + threadIdx.y;
    //ix=0; // debug //
    //C// for (iy=0; iy<ny; iy++)
    //C//  for (ix=0; ix<nx; ix++)
    
    // make sure that this thread won't try to calculate non-existent receiver
    if (iy>n) return;
    
    if (ix>n) return;
    
    // start actual calculation
    pressure_re=0;
    pressure_im=0;
    
    xbase=(float)ix*d+xbase0;
    ybase=(float)iy*d+xbase0; // it would be an optimisation not to recalculate it for each pixel, this has to stay here be due to future port to CUDA where each pixel has it's own thread
    rho2=xbase*xbase+ybase*ybase;
    offset=ix+n*iy;
    if (rho2>(float)2)
    {
        out[2*offset]=0;
        out[2*offset+1]=0; // note, complex valued output
        
        xp[offset]=0;
        yp[offset]=0;
        zp[offset]=0;
        return;
    }
    rhoi=rsqrtf(rho2);
    cosl=-ybase*rhoi;
    cosphi=sqrtf(rho2-rho2*rho2/(float)4);
    lambert_x=r*cosl*cosphi;
    
    sinl=xbase*rhoi;
    lambert_y=r*sinl*cosphi;
    sinphi=(float)1-rho2/(float)2;
    lambert_z=r*sinphi;
    
    xp[offset]=lambert_x;
    yp[offset]=lambert_y;
    zp[offset]=lambert_z;
    
    for (itx=0; itx<tx_length*6; itx=itx+6) // this hopefully acesses the same memory location for each thread and therefore will be cached
    {
        //    distance=single(sqrt( (ix*dx+x0-tx(1,itx)).^2 + (iy*dy+y0-tx(2,itx)).^2 + (iz*dz+z0-tx(3,itx)).^2 ));
        dix=(lambert_x-tx[0+itx]);
        diy=(lambert_y-tx[1+itx]);
        diz=(lambert_z-tx[2+itx]);
        distance=sqrtf( dix*dix + diy*diy + diz*diz );
        kd=-k*distance+tx[5+itx];
        dist2=tx[4+itx]/(6.283185307179586f*distance); //equals 2*pi                
        pressure_re=pressure_re+__cosf(kd)*dist2;
        pressure_im=pressure_im+__sinf(kd)*dist2;
        
        // note: __sinf is an simlpified sin function that yields less accurate result. May need to switch to full sin for final product, ok for testing for now
        
        // note 2: function sincosf(...) may be faster in this case - calculates both sin and cos. but since it requires additional accumulators,
        // a detailed test will be required to find out what's faster.
        
    }
    // mem write    
    // pressure=sqrtf(pressure_re*pressure_re+pressure_im*pressure_im);
    // in CUDA, i need to calculate rx array memory offset manually for each thread:
    //offset=ix+nx*iy+(ny*nx)*iz;    
    // out[offset]=(float)pressure; //left for debug    
    // offset=2*(ix+n*iy);  // note, complex-valued version A 
    out[2*offset+0]=(float)pressure_re; // use real-to-complex offset conversion
    offset++; // go to the imaginary value pointer
    out[2*offset+1]=(float)pressure_im;
}

    """)

    # instantiate the code into the compiler
    beamsim_lambert_kernel = code_text_lambert.get_function("BeamsimLambertKernel")
    kernel_init_time = time.time() - t0

    # calc basic space properties
    npts = float(math.ceil(6.283185307179586 * lambert_radius / lambert_map_density))
    d = 2.0 * math.sqrt(2)/npts
    n = 1 + math.ceil(2*math.sqrt(2)/d)

    # convert the values from pythonic to Cudific
    ctx = numpy.asarray(elements_vectorized).astype(numpy.float32)
    ctx_count = numpy.int32(len(ctx) / 6)
    cn = numpy.int32(n)
    cd = numpy.float32(d)
    cr = numpy.float32(lambert_radius)
    ck = numpy.float32(k)
    cuda_out_xp = numpy.zeros((int(n), int(n))).astype(numpy.float32)
    cuda_out_yp = numpy.zeros((int(n), int(n))).astype(numpy.float32)
    cuda_out_zp = numpy.zeros((int(n), int(n))).astype(numpy.float32)
    cuda_out = numpy.zeros((int(n), int(n))).astype(numpy.complex64) # note: must reserve the output memory right here

    # prevent from the transducer description to be too large - you can remove this limitation later on
    assert (ctx_count < 300001), "transducer definition too large"

    # prepare the GPU call : thread wave shape:
    threads_x = 16
    threads_y = 64
    threads_z = 1
    blocks_x = int((int(n) / threads_x) + 1)
    blocks_y = int((int(n) / threads_y) + 1)
    blocks_z = 1

    kernel_prepare_time = time.time() - t0
    beamsim_lambert_kernel(
        drv.In(ctx),
        ctx_count,
        drv.Out(cuda_out),
        cn, cd, cr, ck,
        drv.Out(cuda_out_xp),
        drv.Out(cuda_out_yp),
        drv.Out(cuda_out_zp),
        block=(threads_x, threads_y, threads_z),
        grid=(blocks_x, blocks_y, blocks_z))

    kernel_run_time=time.time() - t0


    # release the GPU from this thread
    # release the context, otherwise memory leak might occur
    gpu_context.pop()
    gpu_context.detach()
    detach_time = time.time() - t0

    # calculate performance metrics. This is usefull for debugging the distributed computation system
    dinittime = drv_init_time_after - drv_init_time_before
    dkernel_init_time = kernel_init_time - drv_init_time_after
    dkernel_prepare_time = kernel_prepare_time - kernel_init_time
    dkernel_run_time = kernel_run_time - kernel_prepare_time
    dkernel_detach_time = detach_time - kernel_run_time
    print("lambert: init:{:06.4f}, kernel_init:{:06.4f}, kernel_prepare:{:06.4f}, kernel_run:{:06.4f}, detach:{:06.4f}".format(dinittime,dkernel_init_time,dkernel_prepare_time,dkernel_run_time,dkernel_detach_time))

    # finally...
    return cuda_out, cuda_out_xp, cuda_out_yp, cuda_out_zp
Esempio n. 40
0
def image_iterator_gpu(image_volume,
                       roi=None,
                       radius=2,
                       gray_levels=None,
                       binwidth=None,
                       dx=1,
                       dy=0,
                       dz=0,
                       ndev=2,
                       cadd=(0, 0, 0),
                       sadd=3,
                       csub=(0, 0, 0),
                       ssub=3,
                       i=0,
                       fixed_start=-250,
                       fixed_end=350,
                       feature_kernel='kernel_glcm',
                       stat_name='stat_glcm_contrast'):
    """Uses PyCuda to parallelize the computation of the voxel-wise image entropy using a variable \
            neighborhood radius

    Args:
	radius -- neighborhood radius; where neighborhood size is isotropic and calculated as 2*radius+1
    """
    # initialize cuda context
    cuda.init()
    cudacontext = cuda.Device(NVDEVICE).make_context()

    parent_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(parent_dir, 'local_features.cuh'), mode='r') as f:
        cuda_template = Template(f.read())

    roimask = None
    if isinstance(image_volume, np.ndarray):
        toBaseVolume = False
        logger.debug('recognized as an np.ndarray')
        if image_volume.ndim == 3:
            d, r, c = image_volume.shape
        elif image_volume.ndim == 2:
            d, r, c = (1, *image_volume.shape)
        image = image_volume.flatten()
        # # use stat based GLCM quantization
        # quantize_mode=QMODE_STAT
    else:
        toBaseVolume = True
        logger.debug('recognized as a BaseVolume')
        image = image_volume
        if roi:
            image = image.conformTo(roi.frameofreference)
        d, r, c = image.frameofreference.size[::-1]
        image = image.vectorize()
        # if not image_volume.modality.lower() == 'ct':
        #     # use stat based GLCM quantization
        #     quantize_mode=QMODE_STAT

        # mask to roi
        if (roi):
            roimask = roi.makeDenseMask().vectorize()

    logger.debug('d:{:d}, r:{:d}, c:{:d}'.format(d, r, c))
    if d == 1:
        z_radius = 0
    elif d > 1:
        z_radius = radius

    # enforce quantization mode selection
    if gray_levels and binwidth:
        logger.exception(
            'must exclusively specify "binwidth" or "gray_levels" to select glcm quantization mode'
        )
    elif binwidth:
        quantize_mode = QMODE_FIXEDHU
        nbins = int(math.floor((fixed_end - fixed_start) / binwidth)) + 2
        logger.debug(
            'quantization using {} fixed bins from {} to {} with spacing {}'.
            format(nbins, fixed_start, fixed_end, binwidth))
        gray_levels = -1
    elif gray_levels:
        warnings.warn(
            'QMODE_STAT quantization mode will be deprecated soon in favor of other quantization methods.',
            DeprecationWarning)
        quantize_mode = QMODE_STAT
        nbins = gray_levels
        binwidth = -1
    else:
        # kernel doesn't use glcm
        quantize_mode = -1
        nbins = 1
        gray_levels = -1
        binwidth = -1

    maxrunlength = math.ceil(
        math.sqrt(2 * (radius * 2 + 1) * (radius * 2 + 1) +
                  (z_radius * 2 + 1)))

    cuda_source = cuda_template.substitute({
        'RADIUS': radius,
        'Z_RADIUS': z_radius,
        'IMAGE_DEPTH': d,
        'IMAGE_HEIGHT': r,
        'IMAGE_WIDTH': c,
        'QUANTIZE_MODE': quantize_mode,
        'GRAY_LEVELS': gray_levels,
        'FIXED_BINWIDTH': binwidth,
        'FIXED_START': fixed_start,
        'NBINS': nbins,
        'MAXRUNLENGTH': maxrunlength,
        'DX': dx,
        'DY': dy,
        'DZ': dz,
        'NDEV': ndev,
        'CADD_X': cadd[0],
        'CADD_Y': cadd[1],
        'CADD_Z': cadd[2],
        'SADD': sadd,
        'CSUB_X': csub[0],
        'CSUB_Y': csub[1],
        'CSUB_Z': csub[2],
        'SSUB': ssub,
        'KERNEL': feature_kernel,
        'STAT': stat_name
    })
    mod2 = SourceModule(
        cuda_source,
        options=[
            '-I {!s}'.format(parent_dir),
            #  '-g', '-G', '-lineinfo'
        ])
    func = mod2.get_function('image_iterator_gpu')

    # allocate image on device in global memory
    image = image.astype(np.float32)
    image_gpu = cuda.mem_alloc(image.nbytes)
    result = np.zeros_like(image)
    result_gpu = cuda.mem_alloc(result.nbytes)
    # transfer image to device
    cuda.memcpy_htod(image_gpu, image)
    cuda.memcpy_htod(result_gpu, result)
    # call device kernel
    blocksize = 256
    gridsize = math.ceil(r * c * d / blocksize)
    func(image_gpu, result_gpu, block=(blocksize, 1, 1), grid=(gridsize, 1, 1))
    # get result from device
    cuda.memcpy_dtoh(result, result_gpu)

    # detach from cuda context
    # cudacontext.synchronize()
    # cudacontext.detach()
    cudacontext.pop()
    # required to successfully free device memory for created context
    del cudacontext
    gc.collect()
    pycuda.tools.clear_context_caches()

    logger.debug('feature result shape: {!s}'.format(result.shape))
    logger.debug('GPU done')

    # clean invalid values from result
    result = np.nan_to_num(result)

    if (roimask is not None):
        result = np.multiply(result, roimask)

    if d == 1:
        result = result.reshape(r, c)
    elif d > 1:
        result = result.reshape(d, r, c)

    if toBaseVolume:
        if roi:
            FOR = roi.frameofreference
        else:
            FOR = image_volume.frameofreference
        outvolume = MaskableVolume().fromArray(result, FOR)
        outvolume.modality = image_volume.modality
        return outvolume
    else:
        return result
Esempio n. 41
0
def elementwise_composition_gpu(feature_volume_list,
                                comp_type='elementwiseMean'):
    """computes the elementwise mean of the like-shaped volumes in feature_volume_list"""
    # initialize cuda context
    cuda.init()
    cudacontext = cuda.Device(NVDEVICE).make_context()

    parent_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(parent_dir, 'feature_compositions.cuh'),
              mode='r') as f:
        mod = SourceModule(
            f.read(),
            options=[
                '-I {!s}'.format(parent_dir),
                # '-g', '-G', '-lineinfo'
            ])
    func = mod.get_function(comp_type)

    # combine volumes into linearized array
    FOR = feature_volume_list[0].frameofreference
    vols = []
    for vol in feature_volume_list:
        vols.append(vol.vectorize())
    array_length = np.product(FOR.size).item()
    while len(vols) > 1:
        num_arrays = 2
        cat = np.concatenate([vols.pop() for x in range(num_arrays)], axis=0)

        # allocate image on device in global memory
        cat = cat.astype(np.float32)
        cat_gpu = cuda.mem_alloc(cat.nbytes)
        result = np.zeros((array_length)).astype(np.float32)
        result_gpu = cuda.mem_alloc(result.nbytes)
        # transfer cat to device
        cuda.memcpy_htod(cat_gpu, cat)
        cuda.memcpy_htod(result_gpu, result)
        # call device kernel
        blocksize = 512
        gridsize = math.ceil(array_length / blocksize)
        func(cat_gpu,
             result_gpu,
             np.int32(array_length),
             np.int32(num_arrays),
             block=(blocksize, 1, 1),
             grid=(gridsize, 1, 1))
        # get result from device
        cuda.memcpy_dtoh(result, result_gpu)
        vols.append(result.reshape((-1, 1)))
    result = vols[0]

    # detach from cuda context
    # cudacontext.synchronize()
    # cudacontext.detach()
    cudacontext.pop()
    # required to successfully free device memory for created context
    del cudacontext
    gc.collect()
    pycuda.tools.clear_context_caches()

    x = MaskableVolume().fromArray(result, FOR)
    x.modality = feature_volume_list[0].modality
    return x
Esempio n. 42
0
    def init_cuda(self, X, Y, cls_start, max_kernels=1):

        #assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels

        self.X = X
        self.Y = Y

        self.cls_start = cls_start.astype(np.int32)

        #handle to gpu memory for y for each concurrent classifier
        self.g_y = []
        #handle to gpu memory for results for each concurrent classifier
        self.g_out = []  #gpu kernel out
        self.kernel_out = []  #cpu kernel out
        #blocks per grid for each concurrent classifier
        self.bpg = []

        #function reference
        self.func = []

        #texture references for each concurrent kernel
        self.tex_ref = []

        #main vectors
        #gpu
        self.g_vecI = []
        self.g_vecJ = []
        #cpu
        self.main_vecI = []
        self.main_vecJ = []

        #cpu class
        self.cls_count = []
        self.cls = []
        #gpu class
        self.g_cls_count = []
        self.g_cls = []

        self.sum_cls = []

        for i in range(max_kernels):
            self.bpg.append(0)
            self.g_y.append(0)
            self.g_out.append(0)
            self.kernel_out.append(0)
            self.cls_count.append(0)
            self.cls.append(0)
            self.g_cls_count.append(0)
            self.g_cls.append(0)
            #            self.func.append(0)
            #            self.tex_ref.append(0)
            self.g_vecI.append(0)
            self.g_vecJ.append(0)
            #            self.main_vecI.append(0)
            #            self.main_vecJ.append(0)
            self.sum_cls.append(0)

        self.N, self.Dim = X.shape
        column_size = self.N * 4
        cacheMB = self.cache_size * 1024 * 1024  #100MB for cache size

        #how many kernel colums will be stored in cache
        cache_items = np.floor(cacheMB / column_size).astype(int)

        cache_items = min(self.N, cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)

        self.compute_diag()

        #cuda initialization
        cuda.init()

        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        #reade cuda .cu file with module code
        with open(self.module_file, "r") as CudaFile:
            module_code = CudaFile.read()

        #compile module
        self.module = SourceModule(module_code, keep=True, no_extern_c=True)

        (g_gamma, gsize) = self.module.get_global('GAMMA')
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma))

        #get functions reference

        Dim = self.Dim
        vecBytes = Dim * 4
        for f in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            #init texture for vector I
            vecI_tex = self.module.get_texref('VecI_TexRef')
            self.g_vecI[f] = cuda.mem_alloc(vecBytes)
            vecI_tex.set_address(self.g_vecI[f], vecBytes)

            #init texture for vector J
            vecJ_tex = self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[f] = cuda.mem_alloc(vecBytes)
            vecJ_tex.set_address(self.g_vecJ[f], vecBytes)

            self.tex_ref.append((vecI_tex, vecJ_tex))

            self.main_vecI.append(np.zeros((1, Dim), dtype=np.float32))
            self.main_vecJ.append(np.zeros((1, Dim), dtype=np.float32))

            texReflist = list(self.tex_ref[f])

            #function definition P-pointer i-int
            gfun.prepare("PPPPPPiiiiiiPPP", texrefs=texReflist)

        #transform X to particular format
        v, c, r = spf.csr2ellpack(self.X, align=self.prefetch)
        #copy format data structure to gpu memory

        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)

        self.g_cls_start = cuda.to_device(self.cls_start)
Esempio n. 43
0
from __future__ import print_function, division, absolute_import
"""
Module to handle the cuda runtime environment.
"""
#system level imports
import ctypes
import os
import math

# pycuda imports
import pycuda
import pycuda.driver as cudadrv

# Init cuda
cudadrv.init()

#package level imports
from ppmd.cuda import cuda_config
from ppmd import runtime, pio, mpi, abort

CUDA_ENABLED = cuda_config.CUDA_CFG['enable-cuda'][1]
OPT = cuda_config.CUDA_CFG['opt-level'][1]
DEBUG = cuda_config.CUDA_CFG['debug-level'][1]
VERBOSE = cuda_config.CUDA_CFG['verbose-level'][1]
TIMER = cuda_config.CUDA_CFG['timer-level'][1]
BUILD_TIMER = cuda_config.CUDA_CFG['build-timer-level'][1]
ERROR_LEVEL = cuda_config.CUDA_CFG['error-level'][1]
BUILD_DIR = runtime.BUILD_DIR

from . import cuda_build
 def __init__(self):
     cuda.init()
     if cuda.Device.count() < 0:
         raise ValueError("No GPU found on this device")
Esempio n. 45
0
    drv.memcpy_htod(gpu_args[0], output_image)

    #launch the kernel
    context.synchronize()
    start.record()
    convolution(*gpu_args, block=threads, grid=grid, stream=None, shared=0)
    end.record()
    context.synchronize()
    print("convolution_kernel took", end.time_since(start), "ms.")

    #copy output data back from GPU
    drv.memcpy_dtoh(output_image, gpu_args[0])

    #compare output with reference
    correct = numpy.allclose(output_image, reference, atol=1e-6)
    if not correct:
        print("TEST FAILED!")
    else:
        print("TEST PASSED!")


if __name__ == "__main__":

    #init pycuda
    drv.init()
    context = drv.Device(0).make_context()
    try:
        convolution_example(context)
    finally:
        context.pop()
Esempio n. 46
0
#!/usr/bin/env python3

# A simple class to know about your cuda devices
import pycuda.driver as cuda
import pycuda.autoinit # Necessary for using its functions
cuda.init() # Necesarry for using its functions

class aboutCudaDevices():
    def __init__(self):
        pass
    
    def num_devices(self):
        """Return number of devices connected."""
        return cuda.Device.count()
    
    def devices(self):
        """Get info on all devices connected."""
        num = cuda.Device.count()
        print("%d device(s) found:"%num)
        for i in range(num):
            print(cuda.Device(i).name(), "(Id: %d)"%i)
            
    def mem_info(self):
        """Get available and total memory of all devices."""
        available, total = cuda.mem_get_info()
        print("Available: %.2f GB\nTotal:     %.2f GB"%(available/1e9, total/1e9))
        
    def attributes(self, device_id=0):
        """Get attributes of device with device Id = device_id"""
        return cuda.Device(device_id).get_attributes()
    
Esempio n. 47
0
# Authors: Paul Kienzle, Christopher Metting
#03/23/2010

import time
import Queue
import threading

import numpy
import numpy.linalg as linalg
from . import approximations
from ..model.sample_prep import *
try:
    from pycuda import gpuarray
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    cuda.init()
    cudaFind = True
except:
    print 'Pycuda or Cuda not installed Reverting to CPU calculation'
    cudaFind = False


def readfile(name):
    file = open(name)
    txt = file.read()
    file.close()
    return txt


def loadkernelsrc(name, precision='float32', defines={}):
    import os
Esempio n. 48
0
def main(inputSize):
    """Create a TensorRT engine for ONNX-based YOLOv3-608 and run inference."""
    global vs, outputFrame, lock, t0, t1, fps
    cuda.init()
    device = cuda.Device(0)
    onnx_file_path = 'yolov3-tiny-416.onnx'
    engine_file_path = 'yolov3-tiny-{}.trt'.format(inputSize)
    h, w = (inputSize, inputSize)
    # Two-dimensional tuple with the target network's (spatial) input resolution in HW ordered
    input_resolution_yolov3_HW = (inputSize, inputSize)
    # Create a pre-processor object by specifying the required input resolution for YOLOv3
    preprocessor = PreprocessYOLO(input_resolution_yolov3_HW)

    # Output shapes expected by the post-processor
    output_shapes = [(1, 255, 13, 13), (1, 255, 26, 26)]

    # Do inference with TensorRT
    cuda.init()  # Initialize CUDA
    ctx = make_default_context()  # Create CUDA context

    postprocessor_args = {
        "yolo_masks": [(3, 4, 5), (0, 1, 2)],
        "yolo_anchors": [(10, 14), (23, 27), (37, 58), (81, 82), (135, 169),
                         (344, 319)],
        "obj_threshold":
        0.4,
        "nms_threshold":
        0.5,
        "yolo_input_resolution":
        input_resolution_yolov3_HW
    }
    postprocessor = PostprocessYOLO(**postprocessor_args)

    with get_engine(onnx_file_path, engine_file_path
                    ) as engine, engine.create_execution_context() as context:
        print("performing inference")
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        while True:

            trt_outputs = []
            #image_raw=vs.read()
            ret, image_raw = cap.read()
            if image_raw is not None:
                image_raw, image = preprocessor.process(image_raw)
                shape_orig_WH = image_raw.size
                inputs[0].host = image
                t0 = time.time()
                trt_outputs = common.do_inference(context,
                                                  bindings=bindings,
                                                  inputs=inputs,
                                                  outputs=outputs,
                                                  stream=stream)

                trt_outputs = [
                    output.reshape(shape)
                    for output, shape in zip(trt_outputs, output_shapes)
                ]

                boxes, classes, scores = postprocessor.process(
                    trt_outputs, (shape_orig_WH), 0)
                t1 = time.time()
                t_inf = t1 - t0
                fps = 1 / t_inf
                draw = True
                if (boxes is None):
                    print("no bboxes")
                    draw = False
                if (classes is None):
                    print("no classes")
                    draw = False
                if (scores is None):
                    print("no scores")
                    draw = False
                if draw:
                    obj_detected_img = draw_bboxes(
                        image_raw,
                        bboxes=boxes,
                        confidences=scores,
                        categories=classes,
                        all_categories=ALL_CATEGORIES)
                else:
                    obj_detected_img = image_raw
    #now stream this image

                with lock:
                    outputFrame = np.array(obj_detected_img)

    ctx.pop()
Esempio n. 49
0
def train_net(config, private_config):

    # UNPACK CONFIGS
    (flag_para_load, flag_datalayer, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])

    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                    print 'training cost:', cost_ij

                if config['print_train_error']:
                    error_ij = train_error()

                    gpu_send_queue.put(error_ij)
                    that_error = gpu_recv_queue.get()
                    error_ij = (error_ij + that_error) / 2.

                    if private_config['flag_verbose']:
                        print 'training error rate:', error_ij

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr,
            shared_x,
            shared_y,
            val_filenames,
            val_labels,
            flag_datalayer,
            flag_para_load,
            batch_size,
            validate_model,
            send_queue=load_send_queue,
            recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' % (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))
        val_record.append([this_val_error, this_val_loss])

        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx, val_record,
                                        learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                        learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
Esempio n. 50
0
    def _test_pycuda(self):
        import pycuda.driver as drv
        import pycuda.tools
        import pycuda.autoinit
        import numpy
        import numpy.linalg as la
        from pycuda.compiler import SourceModule
        from jinja2 import Template
        import pycuda.gpuarray as gpuarray

        tpl2 = Template("""
        
        extern __shared__ int smem[];
        
        __global__ void square_them(int* a,int l)
        {
            
            const int i = blockDim.x*blockIdx.x +l*threadIdx.x;
            int* tmp=&smem[l*threadIdx.x];
            for(int j=0;j<l;++j)
                tmp[j]=a[i+j];
            __syncthreads();
            for(int j=0;j<l;++j)
                tmp[j]*=2;
            __syncthreads();
            for(int j=0;j<l;++j)
                a[i+j]=tmp[j];
            
        }
        """)

        drv.init()
        dev = drv.Device(0)

        t = 9
        nthr = 256
        size_arrays = 1024 * 512
        num_blocks = 64
        l = int(size_arrays / num_blocks / nthr)
        print l
        a = range(0, l)
        a = a * (nthr * num_blocks)
        a = numpy.array(a, dtype=numpy.int32)
        a_gpu = gpuarray.to_gpu(a.astype(numpy.int32))

        code = tpl2.render()
        #print code
        mod = SourceModule(code)
        square_them = mod.get_function("square_them")
        shmem = 4 * l * nthr
        print "sh mem usage: ", shmem
        if shmem > dev.MAX_SHARED_MEMORY_PER_BLOCK:
            print "too much shared memory used"
            #return
        print a[0:2 * l]
        square_them(a_gpu,
                    numpy.int32(l),
                    grid=(num_blocks, 1, 1),
                    block=(nthr, 1, 1),
                    shared=shmem)
        print a[0:2 * l]
Esempio n. 51
0
def calculation(in_queue, out_queue):

    device_num, params = in_queue.get()

    chunk_size = params['chunk_size']
    chunks_num = params['chunks_num']
    particles = params['particles']
    state = params['state']
    representation = params['representation']
    quantities = params['quantities']

    decoherence = params['decoherence']
    if decoherence is not None:
        decoherence_steps = decoherence['steps']
        decoherence_coeff = decoherence['coeff']
    else:
        decoherence_steps = 0
        decoherence_coeff = 1

    binning = params['binning']
    if binning is not None:
        s = set()
        for names, _, _ in binning:
            s.update(names)
        quantities = sorted(list(s))

    c_dtype = numpy.complex128
    c_ctype = 'double2'
    s_dtype = numpy.float64
    s_ctype = 'double'
    Fs = []

    cuda.init()

    device = cuda.Device(device_num)
    ctx = device.make_context()
    free, total = cuda.mem_get_info()
    max_chunk_size = float(total) / len(quantities) / numpy.dtype(
        c_dtype).itemsize / 1.1
    max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10))
    #print free, total, max_chunk_size

    if max_chunk_size > chunk_size:
        subchunk_size = chunk_size
        subchunks_num = 1
    else:
        assert chunk_size % max_chunk_size == 0
        subchunk_size = max_chunk_size
        subchunks_num = chunk_size / subchunk_size

    buffers = []
    for quantity in sorted(quantities):
        buffers.append(GPUArray(subchunk_size, c_dtype))

    stream = cuda.Stream()

    # compile code
    try:
        source = TEMPLATE.render(c_ctype=c_ctype,
                                 s_ctype=s_ctype,
                                 particles=particles,
                                 state=state,
                                 representation=representation,
                                 quantities=quantities,
                                 decoherence_coeff=decoherence_coeff)
    except:
        print exceptions.text_error_template().render()
        raise

    try:
        module = SourceModule(source, no_extern_c=True)
    except:
        for i, l in enumerate(source.split("\n")):
            print i + 1, ":", l
        raise

    kernel_initialize = module.get_function("initialize")
    kernel_calculate = module.get_function("calculate")
    kernel_decoherence = module.get_function("decoherence")

    # prepare call parameters

    gen_block_size = min(kernel_initialize.max_threads_per_block,
                         kernel_calculate.max_threads_per_block)
    gen_grid_size = device.get_attribute(
        cuda.device_attribute.MULTIPROCESSOR_COUNT)
    gen_block = (gen_block_size, 1, 1)
    gen_grid = (gen_grid_size, 1, 1)

    num_gen = gen_block_size * gen_grid_size
    assert num_gen <= 20000

    # prepare RNG states

    #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32))
    seeds = to_gpu(
        numpy.random.randint(0, 2**32 - 1, size=num_gen).astype(numpy.uint32))
    state_type_size = sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    states = cuda.mem_alloc(num_gen * state_type_size)

    #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k
    kernel_initialize(states,
                      seeds.gpudata,
                      block=gen_block,
                      grid=gen_grid,
                      stream=stream)
    #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size)

    # run calculation
    args = [states] + [buf.gpudata
                       for buf in buffers] + [numpy.int32(subchunk_size)]

    if binning is None:

        results = {
            quantity: numpy.zeros(
                (decoherence_steps + 1, chunks_num * subchunks_num), c_dtype)
            for quantity in quantities
        }
        for i in xrange(chunks_num * subchunks_num):
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)

            for k in xrange(decoherence_steps + 1):
                if k > 0:
                    kernel_decoherence(*args,
                                       block=gen_block,
                                       grid=gen_grid,
                                       stream=stream)

                for j, quantity in enumerate(sorted(quantities)):
                    F = (gpuarray.sum(buffers[j], stream=stream) /
                         buffers[j].size).get()
                    results[quantity][k, i] = F

        for quantity in sorted(quantities):
            results[quantity] = results[quantity].reshape(
                decoherence_steps + 1, chunks_num,
                subchunks_num).mean(2).real.tolist()

        out_queue.put(results)

    else:

        bin_accums = [
            numpy.zeros(tuple([binnum] * len(vals)), numpy.int64)
            for vals, binnum, _ in binning
        ]
        bin_edges = [None] * len(binning)

        for i in xrange(chunks_num * subchunks_num):
            bin_edges = []
            kernel_calculate(*args,
                             block=gen_block,
                             grid=gen_grid,
                             stream=stream)
            results = {
                quantity: buffers[j].get().real
                for j, quantity in enumerate(sorted(quantities))
            }

            for binparam, bin_accum in zip(binning, bin_accums):
                qnames, binnum, ranges = binparam
                sample_lines = [results[quantity] for quantity in qnames]
                sample = numpy.concatenate(
                    [arr.reshape(subchunk_size, 1) for arr in sample_lines],
                    axis=1)

                hist, edges = numpy.histogramdd(sample, binnum, ranges)
                bin_accum += hist
                bin_edges.append(numpy.array(edges))

        results = [[acc.tolist(), edges.tolist()]
                   for acc, edges in zip(bin_accums, bin_edges)]

        out_queue.put(results)

    #ctx.pop()
    ctx.detach()
Esempio n. 52
0
from xpra.codecs.image_wrapper import ImageWrapper
from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
from xpra.log import Logger, debug_if_env
log = Logger()
debug = debug_if_env(log, "XPRA_CUDA_DEBUG")
error = log.error

import threading
import os
import numpy
import time
assert bytearray
import pycuda                               #@UnresolvedImport
from pycuda import driver                   #@UnresolvedImport
from pycuda.compiler import compile         #@UnresolvedImport
driver.init()


DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))
COLORSPACES_MAP = {
                   "BGRA" : ("YUV420P", "YUV422P", "YUV444P"),
                   "BGRX" : ("YUV420P", "YUV422P", "YUV444P"),
                   "RGBA" : ("YUV420P", "YUV422P", "YUV444P"),
                   "RGBX" : ("YUV420P", "YUV422P", "YUV444P"),
                   }
KERNELS_MAP = {}


def log_sys_info():
    log.info("PyCUDA version=%s", ".".join([str(x) for x in driver.get_version()]))
    log.info("PyCUDA driver version=%s", driver.get_driver_version())
Esempio n. 53
0
    def __init__(self, img_size, **kwargs):
        cuda.init()
        from pycuda.tools import make_default_context
        global context
        context = make_default_context()
        unknown = []
        for k in kwargs.keys():
            if k not in [
                    'verbose', 'levels', 'resampling_factor', 'kernel_file',
                    'iterations', 'show_diff', 'Nfields', 'img', 'fields',
                    'mask', 'mul'
            ]:
                unknown.append(k)
        if len(unknown) != 0:
            warnings.warn(
                "Unrecognized parameter" +
                ('s: ' + str(unknown) if len(unknown) > 1 else ': ' +
                 unknown[0]), SyntaxWarning)
        self.verbose = kwargs.get("verbose", 0)
        self.debug(
            3, "You set the verbose level to the maximum.\n\
It may help finding bugs or tracking errors but it may also \
impact the program performance as it will print A LOT of \
output and add GPU->CPU copies only to print information.\n\
If it is not desired, consider lowering the verbosity: \
1 or 2 is a reasonable choice, \
0 won't show anything except for errors.")
        self.levels = kwargs.get("levels", 5)
        self.loop = 0
        self.resamplingFactor = kwargs.get("resampling_factor", 2)
        h, w = img_size
        self.nbIter = kwargs.get("iterations", 4)
        self.debug(1, "Initializing... Master resolution:", img_size,
                   "levels:", self.levels, "verbosity:", self.verbose)

        # Computing dimensions of the different levels #
        self.h, self.w = [], []
        for i in range(self.levels):
            self.h.append(int(round(h / (self.resamplingFactor**i))))
            self.w.append(int(round(w / (self.resamplingFactor**i))))

        if kwargs.get("Nfields") is not None:
            self.Nfields = kwargs.get("Nfields")
        else:
            try:
                self.Nfields = len(kwargs["fields"])
            except KeyError:
                self.debug(
                    0, "Error! You must provide the number of fields at init. \
Add Nfields=x or directly set fields with fields=list/tuple")
                raise ValueError

        kernelFile = kwargs.get("kernel_file")
        if kernelFile is None:
            self.debug(
                3, "Kernel file not specified, using the one in crappy dir")
            from crappy import __path__ as crappyPath
            kernelFile = crappyPath[0] + "/data/kernels.cu"
        self.debug(3, "Kernel file:", kernelFile)

        # Creating a new instance of CorrelStage for each stage #
        self.correl = []
        for i in range(self.levels):
            self.correl.append(
                CorrelStage((self.h[i], self.w[i]),
                            verbose=self.verbose,
                            Nfields=self.Nfields,
                            iterations=self.nbIter,
                            show_diff=(i == 0
                                       and kwargs.get("show_diff", False)),
                            mul=kwargs.get("mul", 3),
                            kernel_file=kernelFile))

        # Set original image if provided #
        if kwargs.get("img") is not None:
            self.setOrig(kwargs.get("img"))

        s = """
    texture<float, cudaTextureType2D, cudaReadModeElementType> texFx{0};
    texture<float, cudaTextureType2D, cudaReadModeElementType> texFy{0};
    __global__ void resample{0}(float* outX, float* outY, int x, int y)
    {{
      int idx = blockIdx.x*blockDim.x+threadIdx.x;
      int idy = blockIdx.y*blockDim.y+threadIdx.y;
      if(idx < x && idy < y)
      {{
        outX[idy*x+idx] = tex2D(texFx{0},(float)idx/x, (float)idy/y);
        outY[idy*x+idx] = tex2D(texFy{0},(float)idx/x, (float)idy/y);
      }}
    }}
    """
        self.src = ""
        for i in range(self.Nfields):
            self.src += s.format(
                i)  # Adding textures for the quick fields resampling

        self.mod = SourceModule(self.src)

        self.texFx = []
        self.texFy = []
        self.resampleF = []
        for i in range(self.Nfields):
            self.texFx.append(self.mod.get_texref("texFx%d" % i))
            self.texFy.append(self.mod.get_texref("texFy%d" % i))
            self.resampleF.append(self.mod.get_function("resample%d" % i))
            self.resampleF[i].prepare("PPii",
                                      texrefs=[self.texFx[i], self.texFy[i]])

        for t in self.texFx + self.texFy:
            t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            t.set_filter_mode(cuda.filter_mode.LINEAR)
            t.set_address_mode(0, cuda.address_mode.BORDER)
            t.set_address_mode(1, cuda.address_mode.BORDER)

        # Set fields if provided #
        if kwargs.get("fields") is not None:
            self.setFields(kwargs.get("fields"))

        if kwargs.get("mask") is not None:
            self.setMask(kwargs.get("mask"))
Esempio n. 54
0
def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    DEVICES = []
    DEVICE_INFO = {}
    try:
        driver.init()
    except Exception as e:
        log.error("Error: cannot initialize CUDA")
        log.error(" %s", e)
        return DEVICES
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus == 0:
        log.info("CUDA %s / PyCUDA %s, no devices found",
                 ".".join([str(x) for x in driver.get_version()]),
                 pycuda.VERSION_TEXT)
        return DEVICES
    cuda_device_blacklist = get_pref("blacklist")
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            if cuda_device_blacklist:
                blacklisted = [
                    x for x in cuda_device_blacklist
                    if x and devinfo.find(x) >= 0
                ]
                log("blacklisted(%s / %s)=%s", devinfo, cuda_device_blacklist,
                    blacklisted)
                if blacklisted:
                    log.warn(
                        "Warning: device '%s' is blacklisted and will not be used",
                        devinfo)
                    continue
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)",
                         devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log("   created context=%s", context)
                log("   api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log("   memory: free=%sMB, total=%sMB",
                    int(free / 1024 / 1024), int(total / 1024 / 1024))
                log("   multi-processors: %s, clock rate: %s",
                    device.get_attribute(da.MULTIPROCESSOR_COUNT),
                    device.get_attribute(da.CLOCK_RATE))
                log("   max block sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_BLOCK_DIM_X),
                    device.get_attribute(da.MAX_BLOCK_DIM_Y),
                    device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log("   max grid sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_GRID_DIM_X),
                    device.get_attribute(da.MAX_GRID_DIM_Y),
                    device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log("   maximum texture size: %sx%s", max_width, max_height)
                log("   max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                log("   compute capability: %#x (%s.%s)", compute, SMmajor,
                    SMminor)
                if i == 0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]),
                             pycuda.VERSION_TEXT, ngpus, engs(ngpus))
                if SMmajor >= 2:
                    DEVICES.append(i)
                else:
                    log.info("  this device is too old!")
                log.info("  + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100 * free / total, SMmajor,
                         SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES
Esempio n. 55
0
def solve_gpu(currentmodelrun, modelend, G):
    """Solving using FDTD method on GPU. Implemented using Nvidia CUDA.

    Args:
        currentmodelrun (int): Current model run number.
        modelend (int): Number of last model to run.
        G (class): Grid class instance - holds essential parameters describing the model.

    Returns:
        tsolve (float): Time taken to execute solving
    """

    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    drv.init()

    # Suppress nvcc warnings on Windows
    if sys.platform == 'win32':
        compiler_opts = ['-w']
    else:
        compiler_opts = None

    # Create device handle and context on specifc GPU device (and make it current context)
    dev = drv.Device(G.gpu.deviceID)
    ctx = dev.make_context()

    # Electric and magnetic field updates - prepare kernels, and get kernel functions
    if Material.maxpoles > 0:
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]), options=compiler_opts)
    else:   # Set to one any substitutions for dispersive materials
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1), options=compiler_opts)
    update_e_gpu = kernels_fields.get_function("update_e")
    update_h_gpu = kernels_fields.get_function("update_h")

    # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels
    updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0]
    updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0]
    if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem:
        raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name))
    else:
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)

    # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU
    if Material.maxpoles > 0:  # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values).
        update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A")
        update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B")
        G.gpu_initialise_dispersive_arrays()

    # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU
    G.gpu_set_blocks_per_grid()
    G.gpu_initialise_arrays()

    # PML updates
    if G.pmls:
        # Prepare kernels
        kernels_pml = SourceModule(kernels_template_pml.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts)
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels
        updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        # Set block per grid, initialise arrays on GPU, and get kernel functions
        for pml in G.pmls:
            pml.gpu_set_blocks_per_grid(G)
            pml.gpu_initialise_arrays()
            pml.gpu_get_update_funcs(kernels_pml)

    # Receivers
    if G.rxs:
        # Initialise arrays on GPU
        rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G)
        # Prepare kernel and get kernel function
        kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]), options=compiler_opts)
        store_outputs_gpu = kernel_store_outputs.get_function("store_outputs")

    # Sources - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.voltagesources + G.hertziandipoles + G.magneticdipoles:
        kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]), options=compiler_opts)
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels
        updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        if G.hertziandipoles:
            srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G)
            update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole")
        if G.magneticdipoles:
            srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G)
            update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole")
        if G.voltagesources:
            srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G)
            update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source")

    # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.snapshots:
        # Initialise arrays on GPU
        snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G)
        # Prepare kernel and get kernel function
        kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]), options=compiler_opts)
        store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot")

    # Iteration loop timer
    iterstart = drv.Event()
    iterend = drv.Event()
    iterstart.record()

    for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=G.tqdmdisable):

        # Get GPU memory usage on final iteration
        if iteration == G.iterations - 1:
            memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0]

        # Store field component values for every receiver
        if G.rxs:
            store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration),
                              rxcoords_gpu.gpudata, rxs_gpu.gpudata,
                              G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                              G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                              block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1))

        # Store any snapshots
        for i, snap in enumerate(G.snapshots):
            if snap.time == iteration + 1:
                store_snapshot_gpu(np.int32(i), np.int32(snap.xs),
                                   np.int32(snap.xf), np.int32(snap.ys),
                                   np.int32(snap.yf), np.int32(snap.zs),
                                   np.int32(snap.zf), np.int32(snap.dx),
                                   np.int32(snap.dy), np.int32(snap.dz),
                                   G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                   G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                   snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata,
                                   snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata,
                                   block=Snapshot.tpb, grid=Snapshot.bpg)
                if G.snapsgpu2cpu:
                    gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                           snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

        # Update magnetic field components
        update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                     G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata,
                     G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata,
                     G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg)

        # Update magnetic field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_magnetic(G)

        # Update magnetic field components for magetic dipole sources
        if G.magneticdipoles:
            update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata,
                                       srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1))

        # Update electric field components
        # If all materials are non-dispersive do standard update
        if Material.maxpoles == 0:
            update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata,
                         G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                         G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                         block=G.tpb, grid=G.bpg)
        # If there are any dispersive materials do 1st part of dispersive update
        # (it is split into two parts as it requires present and updated electric field values).
        else:
            update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

        # Update electric field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_electric(G)

        # Update electric field components for voltage sources
        if G.voltagesources:
            update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration),
                                      floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                      srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata,
                                      srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1))

        # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last)
        if G.hertziandipoles:
            update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata,
                                       srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1))

        # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates.
        if Material.maxpoles > 0:
            update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

    # Copy output from receivers array back to correct receiver objects
    if G.rxs:
        gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G)

    # Copy data from any snapshots back to correct snapshot objects
    if G.snapshots and not G.snapsgpu2cpu:
        for i, snap in enumerate(G.snapshots):
            gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                   snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

    iterend.record()
    iterend.synchronize()
    tsolve = iterstart.time_till(iterend) * 1e-3

    # Remove context from top of stack and delete
    ctx.pop()
    del ctx

    return tsolve, memsolve
Esempio n. 56
0
def MetropolisCuda(InputCU):

    print("Inside ",InputCU)
    
    iterations=InputCU['Iterations']
    steps=InputCU['Steps']
    blocks=InputCU['Blocks']
    threads=InputCU['Threads']
    Device=InputCU['Device']
    RNG=InputCU['RNG']
    ValueType=InputCU['ValueType']
    TestType=InputCU['IfThen']
    
    Marsaglia,Computing,Test=DictionariesAPI()
    
    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule
        
        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print

    except ImportError:
        print("Platform does not seem to support CUDA")

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
    circleCU = cuda.InOut(circle)
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    #cuda.memcpy_htod(circleCU, circle)

    Context=XPU.make_context()

    try:
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
    except:
        print("Compilation seems to break")
 
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    iterationsCU=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCU+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()

        try:
            MetropolisHybridCU(circleCU,
                               numpy.uint64(iterationsCU),
                               numpy.uint32(110271),
                               numpy.uint32(101008),
                               # numpy.uint32(nprnd(2**32)),
                               # numpy.uint32(nprnd(2**32)),
                               grid=(blocks,1),block=(threads,1,1))
        except:
            print("Crash during CUDA call")

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        MyDuration[i]=elapsed

    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
    print(OutputCU)
    Context.pop()
    
    Context.detach()

    return(OutputCU)
Esempio n. 57
0
def _diffusion_child(comm, bm=None):

    rank = comm.Get_rank()
    ngpus = comm.Get_size()

    nodename = socket.gethostname()
    name = '%s %s' %(nodename, rank)
    print(name)

    if rank == 0:

        # split indices on GPUs
        indices_split = _split_indices(bm.indices, ngpus)
        print('Indices:', indices_split)

        # send data to GPUs
        for k in range(1, ngpus):
            sendToChild(comm, bm.indices, indices_split[k], k, bm.data, bm.labels, bm.label.nbrw, bm.label.sorw, bm.label.allaxis)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # select the desired script
        if bm.label.allaxis:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0], bm.label.nbrw, bm.label.sorw, name)
        tac = time.time()
        print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds')

        # gather data
        zsh_tmp = bm.argmax_z - bm.argmin_z
        ysh_tmp = bm.argmax_y - bm.argmin_y
        xsh_tmp = bm.argmax_x - bm.argmin_x
        if ngpus > 1:
            final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
            for k in range(bm.nol):
                sendbuf = np.copy(walkmap[k])
                recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                comm.Barrier()
                comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT], root=0, op=MPI.SUM)
                final_zero[k] = recvbuf
        else:
            final_zero = walkmap

        # block and grid size
        block = (32, 32, 1)
        x_grid = (xsh_tmp // 32) + 1
        y_grid = (ysh_tmp // 32) + 1
        grid = (int(x_grid), int(y_grid), int(zsh_tmp))
        xsh_gpu = np.int32(xsh_tmp)
        ysh_gpu = np.int32(ysh_tmp)

        # smooth
        if bm.label.smooth:
            try:
                update_gpu = _build_update_gpu()
                curvature_gpu = _build_curvature_gpu()
                a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
            except Exception as e:
                print('Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.')
                bm.label.smooth = 0

        if bm.label.smooth:
            final_smooth = np.copy(final_zero)
            for k in range(bm.nol):
                a_gpu = gpuarray.to_gpu(final_smooth[k])
                for l in range(bm.label.smooth):
                    curvature_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                    update_gpu(a_gpu, b_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                final_smooth[k] = a_gpu.get()
            final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8)
            final_smooth = get_labels(final_smooth, bm.allLabels)
            final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
            final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_smooth
            final = final[1:-1, 1:-1, 1:-1]
            bm.path_to_smooth = unique_file_path(bm.path_to_smooth, bm.image.user.username)
            save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type, bm.label.compression)

        # uncertainty
        if bm.label.uncertainty:
            try:
                max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp), dtype=np.float32)
                kernel_uncertainty = _build_kernel_uncertainty()
                kernel_max = _build_kernel_max()
                for k in range(bm.nol):
                    a_gpu = gpuarray.to_gpu(final_zero[k])
                    kernel_max(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                kernel_uncertainty(max_gpu, a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid)
                uq = a_gpu.get()
                uq *= 255
                uq = uq.astype(np.uint8)
                final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = uq
                final = final[1:-1, 1:-1, 1:-1]
                bm.path_to_uq = unique_file_path(bm.path_to_uq, bm.image.user.username)
                save_data(bm.path_to_uq, final, compress=bm.label.compression)
            except Exception as e:
                print('Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.')
                bm.label.uncertainty = False

        # free device
        ctx.pop()
        del ctx

        # argmax
        final_zero = np.argmax(final_zero, axis=0).astype(np.uint8)

        # save finals
        final_zero = get_labels(final_zero, bm.allLabels)
        final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
        final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y, bm.argmin_x:bm.argmax_x] = final_zero
        final = final[1:-1, 1:-1, 1:-1]
        bm.path_to_final = unique_file_path(bm.path_to_final, bm.image.user.username)
        save_data(bm.path_to_final, final, bm.header, bm.final_image_type, bm.label.compression)

        # create final objects
        shortfilename = os.path.basename(bm.path_to_final)
        filename = 'images/' + bm.image.user.username + '/' + shortfilename
        tmp = Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project, final=1, active=1, imageType=3, shortfilename=shortfilename)
        tmp.friend = tmp.id
        tmp.save()
        if bm.label.uncertainty:
            shortfilename = os.path.basename(bm.path_to_uq)
            filename = 'images/' + bm.image.user.username + '/' + shortfilename
            Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project, final=4, imageType=3, shortfilename=shortfilename, friend=tmp.id)
        if bm.label.smooth:
            shortfilename = os.path.basename(bm.path_to_smooth)
            filename = 'images/' + bm.image.user.username + '/' + shortfilename
            smooth = Upload.objects.create(pic=filename, user=bm.image.user, project=bm.image.project, final=5, imageType=3, shortfilename=shortfilename, friend=tmp.id)

        # write in logs
        t = int(time.time() - bm.TIC)
        if t < 60:
            time_str = str(t) + ' sec'
        elif 60 <= t < 3600:
            time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
        elif 3600 < t:
            time_str = str(t // 3600) + ' h ' + str((t % 3600) // 60) + ' min ' + str(t % 60) + ' sec'
        with open(bm.path_to_time, 'a') as timefile:
            print('%s %s %s %s MB %s on %s' %(time.ctime(), bm.image.user.username, bm.image.shortfilename, bm.imageSize, time_str, config['SERVER_ALIAS']), file=timefile)
        print('Total calculation time:', time_str)

        # send notification
        send_notification(bm.image.user.username, bm.image.shortfilename, time_str, config['SERVER_ALIAS'])

        # start subprocesses
        if config['OS'] == 'linux':
            # acwe
            q = Queue('acwe', connection=Redis())
            job = q.enqueue_call(active_contour, args=(bm.image.id, tmp.id, bm.label.id,), timeout=-1)

            # cleanup
            q = Queue('cleanup', connection=Redis())
            job = q.enqueue_call(remove_outlier, args=(bm.image.id, tmp.id, tmp.id, bm.label.id,), timeout=-1)
            if bm.label.smooth:
                job = q.enqueue_call(remove_outlier, args=(bm.image.id, smooth.id, tmp.id, bm.label.id, False,), timeout=-1)

            # create slices
            q = Queue('slices', connection=Redis())
            job = q.enqueue_call(create_slices, args=(bm.path_to_data, bm.path_to_final,), timeout=-1)
            if bm.label.smooth:
                job = q.enqueue_call(create_slices, args=(bm.path_to_data, bm.path_to_smooth,), timeout=-1)
            if bm.label.uncertainty:
                job = q.enqueue_call(create_slices, args=(bm.path_to_uq, None,), timeout=-1)

        elif config['OS'] == 'windows':

            # acwe
            Process(target=active_contour, args=(bm.image.id, tmp.id, bm.label.id)).start()

            # cleanup
            Process(target=remove_outlier, args=(bm.image.id, tmp.id, tmp.id, bm.label.id)).start()
            if bm.label.smooth:
                Process(target=remove_outlier, args=(bm.image.id, smooth.id, tmp.id, bm.label.id, False)).start()

            # create slices
            Process(target=create_slices, args=(bm.path_to_data, bm.path_to_final)).start()
            if bm.label.smooth:
                Process(target=create_slices, args=(bm.path_to_data, bm.path_to_smooth)).start()
            if bm.label.uncertainty:
                Process(target=create_slices, args=(bm.path_to_uq, None)).start()

    else:

        data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0)
        data = np.empty((data_z, data_y, data_x), dtype=data_dtype)
        if data_dtype == 'uint8':
            comm.Recv([data, MPI.BYTE], source=0, tag=1)
        else:
            comm.Recv([data, MPI.FLOAT], source=0, tag=1)
        allx, nbrw, sorw = comm.recv(source=0, tag=2)
        if allx:
            labels = []
            for k in range(3):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=k+3)
                labels_tmp = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
                comm.Recv([labels_tmp, MPI.INT], source=0, tag=k+6)
                labels.append(labels_tmp)
        else:
            labels_z, labels_y, labels_x = comm.recv(source=0, tag=3)
            labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
            comm.Recv([labels, MPI.INT], source=0, tag=6)
        indices = comm.recv(source=0, tag=9)
        indices_child = comm.recv(source=0, tag=10)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # select the desired script
        if allx:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name)
        tac = time.time()
        print('Walktime_%s: ' %(name) + str(int(tac - tic)) + ' ' + 'seconds')

        # free device
        ctx.pop()
        del ctx

        # send data
        for k in range(walkmap.shape[0]):
            datatemporaer = np.copy(walkmap[k])
            comm.Barrier()
            comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)
Esempio n. 58
0
"""
CUDA module; all cuda subclasses go here
that is, context, kernel, and codegen
"""

import numpy as np

from pycuda.driver import init, Device
init()

from pycuda import gpuarray
from pycuda.compiler import SourceModule


from ..context import AbstractContext
import threadpy.backend.CUDA

class Context(AbstractContext, threadpy.backend.CUDA.Context):
    """CUDA context wrapper"""

    def __init__(self, device = 0, context = None):
        #init backend
        threadpy.backend.CUDA.Context.__init__(self, device, context)
        #init threadweave specific features
        AbstractContext.__init__(self)

    #device property accessors. only used by threadweave at the moment
    #but they are probably better abstracted away in threadpy
    #note that these are far from complete
    def supported_axes(self):
Esempio n. 59
0
def train_net(config):

    # UNPACK CONFIGS
    (flag_para_load, flag_datalayer, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = unpack_configs(config)

    if flag_para_load:
        # pycuda and zmq set up
        drv.init()
        dev = drv.Device(int(config['gpu'][-1]))
        ctx = dev.make_context()
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://*****:*****@ iter = ', num_iter
                print 'training cost:', cost_ij
                if config['print_train_error']:
                    print 'training error rate:', train_error()

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_validation_error, this_validation_loss = get_val_error_loss(
            rand_arr,
            shared_x,
            shared_y,
            val_filenames,
            val_labels,
            flag_datalayer,
            flag_para_load,
            batch_size,
            validate_model,
            send_queue=load_send_queue,
            recv_queue=load_recv_queue)

        print('epoch %i: validation loss %f ' % (epoch, this_validation_loss))
        print('epoch %i: validation error %f %%' %
              (epoch, this_validation_error * 100.))
        val_record.append([this_validation_error, this_validation_loss])
        np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx, val_record,
                                        learning_rate)

        # Save weights
        if epoch % config['snapshot_freq'] == 0:
            save_weights(layers, config['weights_dir'], epoch)
            np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                    learning_rate.get_value())
            save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
Esempio n. 60
0
def cal_field(pnts, gt_pnts, gpu=0):
    # print("gpu",gpu)
    # print("CUDA_VISIBLE_DEVICES",os.environ["CUDA_VISIBLE_DEVICES"])
    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # os.environ["CUDA_VISIBLE_DEVICES"]=str(gpu)
    if gpu < 0:
        import pycuda.autoinit
    else:
        drv.init()
        dev1 = drv.Device(gpu)
        ctx1 = dev1.make_context()

    mod = SourceModule("""
    __device__ float compute_force_scalar(float dist)
    {
        // float dist_expand = dist*100;
	    // return 1/(dist_expand*dist_expand*dist_expand*dist_expand*dist_expand*dist_expand*dist_expand*dist_expand+1E-6);
	    // float dist_expand = dist*1000;
	    // return 1/(dist_expand*dist_expand*dist_expand*dist_expand+1E-12);
	    float dist_expand = dist*1000;
	    return 1/(dist_expand*dist_expand*dist_expand*dist_expand+1E-14);
    }

    __global__ void p2g(float *gvfs, float *pnts, float *gt_pnts, int pnt_num, int gt_num)
    {
        int p_id = blockIdx.x * blockDim.x + threadIdx.x;
        float px = pnts[p_id*3];
        float py = pnts[p_id*3+1];
        float pz = pnts[p_id*3+2];
        float force, force_sum=0, x_sum=0, y_sum=0, z_sum=0;
        float dist, x_dist, y_dist, z_dist;
        for (int gt_id=0; gt_id<gt_num; gt_id++){
            x_dist = gt_pnts[gt_id*3] - px;
            y_dist = gt_pnts[gt_id*3+1] - py;
            z_dist = gt_pnts[gt_id*3+2] - pz;
            dist = sqrt(x_dist*x_dist + y_dist*y_dist + z_dist*z_dist);
            force = compute_force_scalar(dist);
            force_sum = force_sum + force;
            x_sum = x_sum + x_dist * force;
            y_sum = y_sum + y_dist * force;
            z_sum = z_sum + z_dist * force;
        }
        //printf("%f  ",y_sum);
        gvfs[p_id*3] = x_sum / force_sum;
        gvfs[p_id*3+1] = y_sum / force_sum;
        gvfs[p_id*3+2] = z_sum / force_sum;
    }
    """)

    kMaxThreadsPerBlock = 1024
    pnt_num = pnts.shape[0]
    gt_num = gt_pnts.shape[0]

    # print("start to cal gvf gt field pnt num: ", gt_num)
    gvfs = np.zeros((pnt_num, 3)).astype(np.float32)
    gridSize = int((pnt_num + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock)
    pnts_tries_ivt = mod.get_function("p2g")
    pnts_tries_ivt(drv.Out(gvfs), drv.In(np.float32(pnts)), drv.In(np.float32(gt_pnts)), np.int32(pnt_num), np.int32(gt_num), block=(kMaxThreadsPerBlock,1,1), grid=(gridSize,1))
    # print("ivt[0,0,:]", ivt[0,0,:])
    if gpu >= 0: 
        ctx1.pop()
    return gvfs