Ejemplo n.º 1
0
    def __init__(self, profile=False, device=None, manual=False):
        """
        Initialize a device, a context and a queue.

        The device is chosen in this order: interactively when ``manual`` is
        true, else the explicitly requested ``device``, else the NVIDIA device
        with the highest compute capability. If no NVIDIA device is found a
        warning is printed and ``cl.create_some_context()`` picks the device.

        @param profile : (optional) if True, enable profiling of the OpenCL events
        @param device : (optional) device in the format (0, 0), i.e.
                        (platform index, device index within that platform)
        @param manual : (optional) if True, choose manually a device from the PyOpenCL prompt.
        """
        platforms = cl.get_platforms()

        if manual:
            self.ctx = cl.create_some_context()
            # The context lives on self; a bare ``ctx`` name does not exist here.
            self.device = self.ctx.devices[0]

        elif device:
            self.device = platforms[device[0]].get_devices()[device[1]]
            self.ctx = cl.Context([self.device])

        else:
            # Try to choose a NVIDIA card with best compute capability
            cc_max = -1
            cc_argmax = (0, 0)
            for i_p, p in enumerate(platforms):
                for i_dev, dev in enumerate(p.get_devices()):
                    try:
                        cc = dev.compute_capability_major_nv + 0.1 * dev.compute_capability_minor_nv
                    except Exception:
                        # Best effort: devices that do not expose the NVIDIA
                        # attributes are skipped (KeyboardInterrupt still propagates).
                        continue
                    if cc > cc_max:
                        cc_max = cc
                        cc_argmax = (i_p, i_dev)
            if cc_max == -1:
                print("Warning: could not find a NVIDIA card. Please pick up manually the target device")
                self.ctx = cl.create_some_context()
                self.device = self.ctx.devices[0]
            else:
                self.device = platforms[cc_argmax[0]].get_devices()[cc_argmax[1]]
                self.ctx = cl.Context([self.device])

        self.devicename = self.device.name
        if profile:
            self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        else:
            self.queue = cl.CommandQueue(self.ctx)
        self.mf = cl.mem_flags
        self.path = []
        self.book = {}
Ejemplo n.º 2
0
  def __init__(self):
    """Create the OpenCL context/queue, load both programs and upload constant tables.

    Four read-only device buffers are created from host arrays: the eight
    neighbour directions as float32 and as int32, four unit vectors at odd
    multiples of pi/8, and a three-entry speed table.
    """
    self.context = cl.create_some_context()
    self.queue = cl.CommandQueue( self.context )
    self.costs = loadProgram( self.context, "costs.cl" )
    self.dijkstra = loadProgram( self.context, "dijkstra.cl" )

    def read_only_buffer( host_array ):
      # Device buffer initialised with a copy of host_array.
      flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
      return cl.Buffer( self.context, flags, hostbuf = host_array )

    # Eight neighbour offsets, starting at +x and stepping through the
    # diagonals in the order listed.
    float_steps = np.array( [
      [ 1.,  0.], [ 1.,  1.], [ 0.,  1.], [-1.,  1.],
      [-1.,  0.], [-1., -1.], [ 0., -1.], [ 1., -1.],
    ], dtype = np.float32 )
    int_steps = float_steps.astype( np.int32 )

    # (cos, sin) pairs at pi/8, 3pi/8, 5pi/8 and 7pi/8.
    unit_vectors = np.array(
      [ [cos( k * pi / 8 ), sin( k * pi / 8 )] for k in ( 1, 3, 5, 7 ) ],
      dtype = np.float32 )
    speed_table = np.array( [0., 0.02, 0.08], dtype = np.float32 )

    self.fdirection_buffer = read_only_buffer( float_steps )
    self.idirection_buffer = read_only_buffer( int_steps )
    self.angle_buffer = read_only_buffer( unit_vectors )
    self.speed_buffer = read_only_buffer( speed_table )
def calc_force(Gal,dt):
    """Calculate forces between bodies and store them on ``Gal``.

    F = ((G m_a m_b)/r^2)/((x_b-x_a)/r)

    The work is delegated to ``GPU_functions.CalcF``; its three results are
    written to ``Gal.dvx``, ``Gal.dvy`` and ``Gal.dvz``.

    Gal -- object providing ``x``, ``y``, ``z`` and ``m``
    dt  -- not used by this function

    Returns nothing. Prints timing when the module flag ``Timing`` is set,
    and diagnostic output (see the note at the end of the function).
    """
    # The first positional argument of create_some_context() is ``interactive``,
    # not a device index: this asks for a non-interactive context and does
    # NOT by itself select "device 0".
    ctx = cl.create_some_context(interactive=False)
    queue = cl.CommandQueue(ctx)

    if Timing:
        start = time.time()

    #Convention: dx[i,j] = x[i] - x[j]

    Gal.dvx,Gal.dvy,Gal.dvz = GPU_functions.CalcF(ctx,queue,Gal.x,Gal.y,Gal.z,Gal.m,1.0,1.0)
    if Timing:
        stop = time.time()
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print('Time for F_ij computation %s' % (stop-start,))

    if DebugMode==True:
        print('Check that the force is attracting')
        print('%s %s %s' % (Gal.x, Gal.y, Gal.z))
        print('%s %s %s' % (Gal.dvx, Gal.dvy, Gal.dvz))
        print('----End check')

    # NOTE(review): this check runs unconditionally and indexes element 120,
    # so it raises for inputs with fewer than 121 bodies -- confirm whether it
    # should sit behind DebugMode.
    print('Check that the force is attracting')
    print(Gal.x[120])
    print(Gal.dvx[120])
    print('----End check')
Ejemplo n.º 4
0
    def __init__(self, lmb, prompt=False, user_dev_selection=None, bindings=None):
        """Store the callable and bindings and create an OpenCL context and queue.

        lmb                -- stored as ``self.lmb``
        prompt             -- if true, the device selection comes from
                              ``self.init()`` unless
                              ``Py2OpenCL.only_one_device()`` is true
        user_dev_selection -- passed as ``answers`` to
                              ``cl.create_some_context``; mutually exclusive
                              with ``prompt``
        bindings           -- stored as ``self.bindings``

        Raises AssertionError when both ``prompt`` and ``user_dev_selection``
        are given.
        """
        # Indentation normalised to spaces: the original mixed tabs and spaces,
        # which Python 3 rejects with TabError.
        assert not (prompt and user_dev_selection), "Can't ask for @prompt and provide @user_dev_selection at the same time"
        self.user_dev_selection = user_dev_selection

        if prompt:
            if Py2OpenCL.only_one_device():
                self.user_dev_selection = None
            else:
                self.user_dev_selection = self.init()

        if self.user_dev_selection:
            self.ctx = cl.create_some_context(interactive=False, answers=self.user_dev_selection)
        else:
            self.ctx = cl.create_some_context()

        self.queue = cl.CommandQueue(self.ctx)
        self.bindings = bindings
        self.lmb = lmb
Ejemplo n.º 5
0
def test_cl():
    """Build ``part1.cl``, run its ``part1`` kernel on a 1000x1000 grid and return the result.

    A packed block of 36 floats (camera position, transposed inverse view
    matrix, transposed inverse projection matrix) is uploaded as a read-only
    buffer. The kernel writes into a (1000, 1000, 4) float32 array, which is
    read back and returned. The kernel time from the event profile is printed
    in milliseconds.
    """
    ctx = cl.create_some_context()  # (interactive=False)

    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Context manager so the source file is closed even if the build fails.
    with open("part1.cl", "r") as f:
        fstr = f.read()
    program = cl.Program(ctx, fstr).build()
    mf = cl.mem_flags

    cameraPos = np.array([0, 6, -1, 0])
    invView = la.inv(look_at((0, 6, -1), (0, 1, 1), (0, 1, 0)))
    invProj = la.inv(perspective(60, 1, 1, 1000))
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("view %s" % (invView,))
    print("proj %s" % (invProj,))
    viewParamsData = (
        cameraPos.flatten().tolist()
        + np.transpose(invView).flatten().tolist()
        + np.transpose(invProj).flatten().tolist()
    )
    viewParams = struct.pack("4f16f16f", *viewParamsData)
    viewParams_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=viewParams)

    # setup opencl
    dest = np.ndarray((1000, 1000, 4), dtype=np.float32)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, dest.nbytes)

    # run kernel
    evt = program.part1(queue, (dest.shape[0], dest.shape[1]), None, viewParams_buf, dest_buf)
    # enqueue_copy replaces the deprecated enqueue_read_buffer; the transfer
    # is blocking by default, so the kernel event is complete afterwards.
    cl.enqueue_copy(queue, dest, dest_buf)
    # Profiling timestamps are in nanoseconds; 1e-6 converts to milliseconds.
    print("time %s ms" % ((evt.profile.end - evt.profile.start) * 0.000001,))
    return dest
Ejemplo n.º 6
0
def main():
    """Run an interpolative decomposition on a test matrix and print the 2-norm error.

    A 500x500 float64 matrix from ``setup_lowrank`` is decomposed with
    ``pyclid.interp_decomp`` at rank 30; the matrix is then rebuilt from the
    selected columns and the projection, and the norm of the difference is
    printed.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context, properties=0)

    n, k = 500, 30
    dtype = 'float64'

    A = setup_lowrank(n, dtype=dtype)

    print('finished setup')
    operator = pyclid.util.setup_op(queue, A)
    idx, proj = pyclid.interp_decomp(queue, operator, k)

    # begin debug
    import scipy.linalg as la
    import scipy.linalg.interpolative as sli
    from scipy.sparse.linalg import aslinearoperator

    # Selected columns, and the interpolation matrix [I | proj] with its
    # columns put back into the original order.
    skeleton = A[:, idx[:k]]
    interp = np.hstack([np.eye(k), proj])[:, np.argsort(idx)]
    reconstruction = np.dot(skeleton, interp)
    print(la.norm(A - reconstruction, 2))
Ejemplo n.º 7
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check that BitonicSort sorts values and permutes the index array to match.

    ctx_factory -- callable returning the OpenCL context to test against
                   (assumed to be the PyOpenCL pytest fixture -- TODO confirm)
    size        -- number of elements to sort
    dtype       -- element dtype of the random input
    """
    # Use the context supplied by the fixture so the test runs on the device
    # it was parametrised for, instead of an unrelated (possibly interactive)
    # cl.create_some_context().
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # Comparing index against np.argsort directly may fail because identical
    # values can be ordered differently, so compare the gathered values.
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
Ejemplo n.º 8
0
Archivo: idt.py Proyecto: Fuhji/TopoMC
    def __init__(self, coords, values, wantCL=True, platform_num=None):
        """
        Take the coordinates and values and build a KD tree.

        Keyword arguments:
        coords -- input coordinates (x, y)
        values -- input values
        wantCL -- if true (and the module flag ``hasCL`` is set), try to set
                  up OpenCL; on ``cl.RuntimeError`` a warning is printed and
                  the cKDTree path is used
        platform_num -- index into ``cl.get_platforms()``; when it is not a
                        valid index type (e.g. None) the context comes from
                        ``cl.create_some_context()``

        Raises AssertionError when coords and values differ in length, and
        IndexError when platform_num is out of range.
        """

        self.coords = np.asarray(coords, dtype=np.float32)
        self.values = np.asarray(values, dtype=np.int32)

        if self.coords.shape[0] != self.values.shape[0]:
            raise AssertionError('lencoords does not equal lenvalues')

        self.wantCL = wantCL
        self.canCL = False

        if hasCL and self.wantCL:
            try:
                platforms = cl.get_platforms()
                try:
                    # Store the platform on self: the original assigned a
                    # local and then read the never-set self.platform.
                    self.platform = platforms[platform_num]
                except TypeError:
                    # The user may be asked to select a platform.
                    self.context = cl.create_some_context()
                    self.devices = self.context.devices
                else:
                    self.devices = self.platform.get_devices()
                    self.context = cl.Context(self.devices)
                self.queue = cl.CommandQueue(self.context)
                with open('idt.cl', 'r') as source_file:
                    filestr = source_file.read()
                self.program = cl.Program(self.context, filestr).build(devices=self.devices)
                for device in self.devices:
                    buildlog = self.program.get_build_info(device, cl.program_build_info.LOG)
                    if (len(buildlog) > 1):
                        # Same text as the former Python 2 print statement.
                        print('Build log for device %s :\n%s' % (device, buildlog))
                # Only the first kernel is used.
                self.kernel = self.program.all_kernels()[0]

                # Local and global sizes are device-dependent.
                self.local_size = {}
                self.global_size = {}
                # Groups should be overcommitted.
                # For now, use 3 (48 cores / 16 cores per halfwarp) * 2
                for device in self.devices:
                    work_group_size = self.kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
                    num_groups_for_1d = device.max_compute_units * 3 * 2
                    self.local_size[device] = (work_group_size,)
                    self.global_size[device] = (num_groups_for_1d * work_group_size,)
                self.canCL = True
            except cl.RuntimeError:
                print('warning: unable to use pyopencl, defaulting to cKDTree')

        if self.canCL:
            self.tree = build_tree(coords)
        else:
            self.tree = KDTree(coords)
Ejemplo n.º 9
0
    def __init__(self, cl_mode = True, cl_device = None):
        """Initialize the class.

        cl_mode   -- if true, import pyopencl and create a context, a queue
                     and the device attributes below
        cl_device -- 'gpu': context over every GPU found on any platform;
                     'cpu': context over the first CPU found;
                     anything else: ``cl.create_some_context()``

        With cl_mode set, ``self.device`` is the first device of the context
        and ``device_type`` / ``device_compute_units`` are read from it.
        """
        if cl_mode:
            import pyopencl as cl
            import pyopencl.array

            wanted_types = {'gpu': cl.device_type.GPU, 'cpu': cl.device_type.CPU}
            if cl_device in wanted_types:
                found = []
                for platform in cl.get_platforms():
                    try:
                        found += platform.get_devices(device_type=wanted_types[cl_device])
                    except cl.Error:
                        # Platform cannot provide this device type; try the next.
                        continue
                if cl_device == 'gpu':
                    self.ctx = cl.Context(found)
                else:
                    # Only the first CPU is used (IndexError if none was found).
                    self.ctx = cl.Context([found[0]])
            else:
                self.ctx = cl.create_some_context()

            self.queue = cl.CommandQueue(self.ctx)
            self.mf = cl.mem_flags
            self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
            self.device_type = self.device.type
            self.device_compute_units = self.device.max_compute_units

        self.cl_mode = cl_mode
        self.obs = []
        self.samples = {}
Ejemplo n.º 10
0
    def __init__(self, cl_file_location='../../../c/kernels', interactive=True):
        """
        Initialising the Kernel loads the C code from kernel_file and sets up
        the necessary CommandQueue and context.

        cl_file_location -- directory of the kernel file, relative to the
                            directory containing this module
        interactive      -- forwarded to ``cl.create_some_context``

        When ``self.kernel_file`` is set, its contents become
        ``self.kernel_string``, prefixed by ``self.function_string`` when that
        is non-empty (otherwise ``function_string`` is reset to ""). Without a
        kernel file, ``kernel_string`` is "".
        """
        #TODO: replace print with logger
        #TODO: investigate precompilation of kernels
        #TODO: set dType uniformly
        # Indentation normalised to spaces: the original mixed tabs and spaces,
        # which Python 3 rejects with TabError.
        if self.kernel_file:
            # dirname() also handles a bare filename and non-'/' separators,
            # which ``__file__.rsplit('/', 1)[0]`` did not.
            base = os.path.dirname(__file__)
            filename = os.path.join(base, cl_file_location, self.kernel_file)
            with open(filename) as kernel_source:
                self.kernel_string = kernel_source.read()
            if self.function_string:
                self.kernel_string = self.function_string+'\n'+self.kernel_string
            else:
                self.function_string = ""
        else:
            self.kernel_string = ""
        self.ctx = cl.create_some_context(interactive=interactive)
        # This should be a logging statement...
        print('Using device: %s' % self.ctx.get_info(cl.context_info.DEVICES))
        self.queue = cl.CommandQueue(self.ctx)
        self.buffers = []
        # This is a tuple describing the dimensions of the output
        self.global_size = (0,)
Ejemplo n.º 11
0
def calc_range(start, num, perexec):
    """Run the otp-md5 kernel over the uint64 values start .. start+num-1.

    The range is processed in batches of ``perexec`` values; after each batch
    the inputs and results are handed to ``send_output``.

    start   -- first value
    num     -- how many values to process; must be a multiple of perexec
    perexec -- number of values per kernel launch

    Raises AssertionError when num is not a multiple of perexec.
    """

    assert(num % perexec == 0)

    # Boilerplate OpenCL stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Read the program source and compile
    with open("otpmd5.cl") as source_file:
        sourcecode = source_file.read()
    prg = cl.Program(ctx, sourcecode).build()

    # Counter loop instead of xrange(num / perexec): works on Python 2 and 3
    # and never materialises the batch indices.
    n_batches = num // perexec
    batch = 0
    while batch < n_batches:
        offset = start + (perexec * batch)

        host_input = numpy.arange(offset, offset+perexec, dtype=numpy.uint64)
        result = numpy.empty_like(host_input)
        dev_input = cl.Buffer(ctx, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=host_input)
        dev_output = cl.Buffer(ctx, mf.READ_WRITE, size=result.size * result.itemsize)
        prg.get_otpmd5_64k_rounds(queue, host_input.shape, None, dev_input, dev_output).wait()
        cl.enqueue_copy(queue, result, dev_output).wait()
        send_output(host_input, result)
        batch += 1
Ejemplo n.º 12
0
Archivo: lab1.py Proyecto: spetz911/CL
def cl_init(type = 'GPU'):
    """Create an OpenCL context, print a summary of its first device, and return it.

    type -- 'GPU' or 'CPU': try the devices of that kind on the first
            platform. For any other value, or when that attempt fails, the
            context comes from ``cl.create_some_context(interactive=True)``.

    Returns the created context.
    """
    wanted = {'GPU': cl.device_type.GPU, 'CPU': cl.device_type.CPU}.get(type)

    ctx = None
    if wanted is not None:
        try:
            first_platform = cl.get_platforms()[0]
            ctx = cl.Context(devices=first_platform.get_devices(device_type=wanted))
        except Exception:
            # Best effort: any failure falls through to the interactive
            # fallback (KeyboardInterrupt is no longer swallowed).
            ctx = None
    if ctx is None:
        ctx = cl.create_some_context(interactive=True)

    # Read device and platform from the context actually in use. The original
    # used locals that were undefined or stale after the fallback path.
    device = ctx.devices[0]
    platform = device.platform
    print("===============================================================")
    print("Platform name: " + platform.name)
    print("Platform vendor: " + platform.vendor)
    print("Platform version: " + platform.version)
    print("---------------------------------------------------------------")
    print("Device name: " + device.name)
    print("Device type: " + cl.device_type.to_string(device.type))
    print("Local memory: " + str(device.local_mem_size//1024) + ' KB')
    print("Device memory: " + str(device.global_mem_size//1024//1024) + ' MB')
    print("Device max clock speed:" + str(device.max_clock_frequency) + ' MHz')
    print("Device compute units:" + str(device.max_compute_units))

    return ctx
Ejemplo n.º 13
0
    def __init__(self, coords, values, base, wantCL=True, split=None, nnear=None, majority=True):
        """Store the inputs as int32 arrays and, when possible, set up OpenCL.

        coords, values -- must have the same length along axis 0
        base           -- converted to an int32 array like the other inputs
        wantCL         -- if true (and the module flag ``hasCL`` is set), build
                          ``idt.cl``; on any error a warning is printed and
                          ``self.canCL`` is False
        split          -- defaults to ``CLIDT.OpenCLmaxsize`` (only set on the
                          OpenCL path)
        nnear          -- defaults to ``CLIDT.nnear``; stored as np.int32
        majority       -- stored as np.int32 1/0 in ``self.usemajority``

        Raises AssertionError when coords and values differ in length.
        """
        self.coords = np.asarray(coords, dtype=np.int32)
        self.values = np.asarray(values, dtype=np.int32)
        self.base = np.asarray(base, dtype=np.int32)
        lencoords = self.coords.shape[0]
        lenvalues = self.values.shape[0]
        assert lencoords == lenvalues, "lencoords does not equal lenvalues"

        self.wantCL = wantCL
        self.canCL = False
        if hasCL and self.wantCL:
            self.split = CLIDT.OpenCLmaxsize if split is None else split
            try:
                self.ctx = cl.create_some_context()
                self.queue = cl.CommandQueue(self.ctx)
                with open("idt.cl", "r") as source_file:
                    filestr = source_file.read()
                self.program = cl.Program(self.ctx, filestr).build()
                self.coordindices = self.genindices(self.coords)
                self.baseindices = self.genindices(self.base)
                self.canCL = True
            # FIXME: narrow this further once the failure modes are known
            except Exception:
                print("warning: unable to use pyopencl, defaulting to Invdisttree")
                self.canCL = False

        self.nnear = np.int32(CLIDT.nnear if nnear is None else nnear)

        self.usemajority = np.int32(1 if majority else 0)
Ejemplo n.º 14
0
    def create_context(self, devicetype="ALL", useFp64=False, platformid=None, deviceid=None):
        """
        Choose a device and initiate a context.

        Devicetypes can be GPU,gpu,CPU,cpu,DEF,ACC,ALL.
        Suggested are GPU,CPU.
        For each setting to work there must be such an OpenCL device and properly installed.
        E.g.: If Nvidia driver is installed, GPU will succeed but CPU will fail. The AMD SDK kit is required for CPU via OpenCL.

        platformid and deviceid are only honoured when BOTH are given;
        otherwise the device is looked up with ``ocl.select_device``. If that
        finds nothing, a warning is logged and
        ``pyopencl.create_some_context(interactive=False)`` is used.

        :param devicetype: string in ["cpu","gpu", "all", "acc"]
        :param useFp64: boolean; when true the device lookup requires the
                        "cl_khr_int64_base_atomics" extension
        :param platformid: integer
        :param deviceid: integer
        :return: OpenCL context on the selected device
        """
        if (platformid is not None) and (deviceid is not None):
            platformid = int(platformid)
            deviceid = int(deviceid)
        else:
            if useFp64:
                ids = ocl.select_device(type=devicetype, extensions=["cl_khr_int64_base_atomics"])
            else:
                ids = ocl.select_device(type=devicetype)
            if ids:
                platformid = ids[0]
                deviceid = ids[1]
        if (platformid is not None) and  (deviceid is not None):
            ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
        else:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning("Last chance to get an OpenCL device ... probably not the one requested")
            ctx = pyopencl.create_some_context(interactive=False)
        return ctx
Ejemplo n.º 15
0
    def __init__(self, network, dt=0.001, seed=None, model=None, context=None,
                 n_prealloc_probes=32, profiling=None, ocl_only=False):
        """Set up the OpenCL queue, then run the base simulator initialisation.

        When no ``context`` is passed, one is created with
        ``cl.create_some_context()`` after printing a notice. When
        ``profiling`` is None it is read from the ``NENGO_OCL_PROFILING``
        environment variable (default 0); a truthy value enables profiling on
        the command queue.
        """
        if context is None:
            print('No context argument was provided to sim_ocl.Simulator')
            print("Calling pyopencl.create_some_context() for you now:")
            context = cl.create_some_context()

        if profiling is None:
            profiling = int(os.getenv("NENGO_OCL_PROFILING", 0))

        self.context = context
        self.profiling = profiling

        # Only pass ``properties`` when profiling is requested.
        queue_options = {"properties": PROFILING_ENABLE} if self.profiling else {}
        self.queue = cl.CommandQueue(context, **queue_options)

        self.cl_rng_state = None
        self.ocl_only = ocl_only
        self.n_prealloc_probes = n_prealloc_probes

        # -- allocate data
        sim_npy.Simulator.__init__(
            self, network=network, dt=dt, seed=seed, model=model)

        # -- create object to execute list of plans
        self._plans = Plans(self._plan, self.profiling)
Ejemplo n.º 16
0
def main():
    """Build an icosahedron mesh and write two VTK files.

    ``geometry.vtu`` receives the first node coordinate of the discretization
    as field "f"; ``connectivity.vtu`` receives the mesh connectivity. The
    element order comes from the module-level name ``order``.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from meshmode.mesh.generation import (  # noqa
            generate_icosphere, generate_icosahedron,
            generate_torus)
    # Alternative geometries: generate_icosphere(1, order=order) or
    # generate_torus(3, 1, order=order).
    mesh = generate_icosahedron(1, order=order)

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            PolynomialWarpAndBlendGroupFactory

    group_factory = PolynomialWarpAndBlendGroupFactory(order)
    discr = Discretization(cl_ctx, mesh, group_factory)

    from meshmode.discretization.visualization import make_visualizer
    visualizer = make_visualizer(queue, discr, order)

    fields = [("f", discr.nodes()[0])]
    visualizer.write_vtk_file("geometry.vtu", fields)

    from meshmode.discretization.visualization import \
            write_mesh_connectivity_vtk_file

    write_mesh_connectivity_vtk_file("connectivity.vtu", mesh)
Ejemplo n.º 17
0
    def __init__(self, model, dt=0.001, seed=None, builder=None, context=None,
                 n_prealloc_probes=1000, profiling=None):
        """Set up the OpenCL queue, run the base initialisation and build the kernel DAG.

        When no ``context`` is passed, one is created with
        ``cl.create_some_context()`` after printing a notice. When
        ``profiling`` is None it is read from the ``NENGO_OCL_PROFILING``
        environment variable (default 0); a truthy value enables profiling on
        the command queue.
        """
        if context is None:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print('No context argument was provided to sim_ocl.Simulator')
            print("Calling pyopencl.create_some_context() for you now:")
            context = cl.create_some_context()
        if profiling is None:
            profiling = int(os.getenv("NENGO_OCL_PROFILING", 0))
        self.context = context
        self.profiling = profiling
        if self.profiling:
            self.queue = cl.CommandQueue(context,
                                         properties=PROFILING_ENABLE)
        else:
            self.queue = cl.CommandQueue(context)

        self.n_prealloc_probes = n_prealloc_probes
        # -- allocate data
        sim_npy.Simulator.__init__(
            self, model=model, dt=dt, seed=seed, builder=builder)

        # -- set up the DAG for executing OCL kernels
        self._plandict = OrderedDict()
        self.step_marker = Marker(self.queue)
        # -- marker is used to do the op_groups in order
        deps = []
        for op_type, op_list in self.op_groups:
            # Each group's result is passed on as ``deps`` for the next group.
            deps = self.plandict_op_group(op_type, op_list, deps)
        probe_plans = self.plan_probes()
        for p in probe_plans:
            self._plandict[p] = deps
        self._dag = DAG(context, self.step_marker,
                           self._plandict,
                           self.profiling)
Ejemplo n.º 18
0
def main():
    """Print a set of OpenCL performance figures for an interactively chosen context.

    Reports command latency and profiling overhead, empty-kernel time, float32
    add rate, and for each transfer direction the latency plus the bandwidth
    at block sizes 2**6, 2**8, ..., 2**26 bytes.
    """
    ctx = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(ctx)
    print("command latency: %g s" % latency)
    overhead_percent = 100*prof_overhead/latency
    print("profiling overhead: %g s -> %.1f %%" % (prof_overhead, overhead_percent))

    profiling_flag = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(ctx, properties=profiling_flag)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9))

    separator = "----------------------------------------"
    transfer_kinds = (
        perf.HostToDeviceTransfer,
        perf.DeviceToHostTransfer,
        perf.DeviceToDeviceTransfer,
    )
    for kind in transfer_kinds:
        print(separator)
        print(kind.__name__)
        print(separator)

        print("latency: %g s" % perf.transfer_latency(queue, kind))
        for exponent in range(6, 28, 2):
            block_bytes = 2 ** exponent
            rate = perf.transfer_bandwidth(queue, kind, block_bytes)
            print("bandwidth @ %d bytes: %g GB/s" % (block_bytes, rate/1e9))
Ejemplo n.º 19
0
def main2():
    """Integrate the wave kernel from a composite initial state and fetch the results.

    The state vector has ``2 * len_x`` float32 entries built by concatenating
    two halves. It is the sum of three parts: a mean of five sine harmonics
    (first half), a scaled ramp on 0.15 < i/len_x < 0.25 (second half), and a
    unit pulse on 0.75 < i/len_x < 0.85 (first half). ``solve_ode`` is run
    with the module-level ``t0``, ``t1`` and ``h``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    dev = queue.device
    knl = _get_wave_kernel(ctx)

    gs, ls = get_group_sizes(len_x * 2, dev, knl)

    def rhs(t, y_in, y_out, wait_for=None):
        # Right-hand side handed to the ODE solver; ``t`` is not used.
        return run_elwise_kernel(knl, queue, gs, ls, len_x * 2, wait_for,
                                 y_out, y_in, h_x, len_x)

    xs = np.arange(len_x) * np.pi / (len_x - 1)
    positions = [i / len_x for i in range(len_x)]
    blank_half = np.zeros(len_x)

    # Mean of the first five sine harmonics in the first half.
    harmonics = (np.sin(xs) + np.sin(xs * 2) + np.sin(xs * 3)
                 + np.sin(xs * 4) + np.sin(xs * 5)) / 5
    y0 = np.r_[harmonics, blank_half].astype(np.float32)

    # Other initial conditions that were tried:
    #   second half: min(p - 0.4, 0.5 - p) * 20 on 0.4 < p < 0.5
    #   first half (replacing y0): 1 on 0.4 < p < 0.5
    ramp = [(p - 0.2 if 0.15 < p < 0.25 else 0) * 20 for p in positions]
    y0 += np.r_[blank_half, ramp].astype(np.float32)

    pulse = [(1 if 0.75 < p < 0.85 else 0) for p in positions]
    y0 += np.r_[pulse, blank_half].astype(np.float32)

    res, evt = solve_ode(t0, t1, h, y0, rhs, queue)
    print('queued')
    evt.wait()
    print('finished')
    res_np = [a.get() for a in res]
def matrix_deg_centrality(h_a,threshold,a_height):
    """Count, per segment of ``a_height`` entries, how many values reach ``threshold``.

    h_a       -- the input matrix in flattened array form; the connectivity
                 matrix is assumed to be symmetric
    threshold -- entries below it count as 0, all others as 1
    a_height  -- number of rows (= columns) of the matrix; used as the
                 segment length of the scan

    Returns a float32 host array of length ``a_height`` holding the value the
    scan writes at the last element of each segment.
    NOTE(review): the scan also overwrites the device copy of ``h_a`` with the
    running sums; the caller's host array is presumably untouched -- confirm.
    """
    # Make sure the pyopencl.array submodule is loaded before cl.array is used.
    import pyopencl.array  # noqa: F401
    from pyopencl.scan import GenericScanKernel

    segment_length = a_height
    h_result = np.empty(a_height, dtype=np.float32)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    scan_kernel = GenericScanKernel(
            ctx, np.float32,
            arguments="__global float *ary,__global float *out, __global int segflag,__global float threshold",
            input_expr="(ary[i] < threshold) ? 0 : 1",
            scan_expr="across_seg_boundary ? b: (a+b)", neutral="0",is_segment_start_expr="(i)%segflag==0",
            output_statement="(i+1)%segflag==0 ? (out[i/segflag] = item,ary[i] = item) : (ary[i] = item);")

    a_gpu = cl.array.to_device(queue, h_a)
    result_gpu = cl.array.to_device(queue, h_result)
    scan_kernel(a_gpu, result_gpu, segment_length, threshold, queue=queue)
    return result_gpu.get()
Ejemplo n.º 21
0
def lomb_scargle32(x, y, f):
    """Single-precision version of Lomb-Scargle, computed with OpenCL.

    x, y -- sample data, converted to float32
    f    -- frequencies, converted to float32

    Returns a float32 array shaped like ``f``, filled by the ``lombscargle``
    kernel built from the module-level source ``lomb_txt32``. On a build
    failure the build log is printed and the exception is re-raised.
    """
    x = np.float32(x)
    y = np.float32(y)
    f = np.float32(f)
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    # number of samples, passed to the kernel as a 32-bit int
    Nx = np.int32(x.shape[0])
    # send data to card
    x_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    y_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=y)
    f_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=f)
    # make output
    pgram = np.empty_like(f)
    pgram_g = cl.Buffer(ctx, mf.WRITE_ONLY, pgram.nbytes)
    prg = cl.Program(ctx, lomb_txt32)

    try:
        prg.build()
    except Exception:
        print("Error:")
        print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
        raise

    prg.lombscargle(queue, pgram.shape, None, x_g, y_g, f_g, pgram_g, Nx)
    # enqueue_copy replaces the deprecated enqueue_read_buffer; the
    # device-to-host copy is blocking by default.
    cl.enqueue_copy(queue, pgram, pgram_g)

    return pgram
Ejemplo n.º 22
0
def benchmark_overlapfiltfilt():
    """Run ``compare`` over a grid of chunk sizes, section counts and channel counts.

    The OpenCL engines (and the context print-out) are only used when the
    module flag ``HAVE_PYOPENCL`` is set; otherwise only 'scipy' is benchmarked.
    """
    if HAVE_PYOPENCL:
        # Touch pyopencl only behind the guard: the original created the
        # context unconditionally and failed when pyopencl was unavailable.
        ctx = pyopencl.create_some_context()
        print(ctx)
        engines = ['scipy', 'opencl', 'opencl3']
    else:
        engines = ['scipy']

    # Benchmark grid; edit these lists to explore other configurations
    # (e.g. chunksizes [256, 1024, 2048], n_sections [2, 8, 16, 24],
    # nb_channels [1, 10, 50, 100, 200]).
    chunksizes = [2048]
    n_sections = [8, 24]
    nb_channels = [10, 50, 100]

    extra_kargs = {'overlapsize' : 64}

    for chunksize in chunksizes:
        for n_section in n_sections:
            for nb_channel in nb_channels:
                print('*'*20)
                print('chunksize', chunksize, 'n_section', n_section, 'nb_channel', nb_channel)
                compare(chunksize,n_section, nb_channel, sosfiltfilt_engines, engines, **extra_kargs)
Ejemplo n.º 23
0
def gpu_gradient():
    """Run the ``gradient`` kernel from ``gradient.cl`` on an image given on the command line.

    Expects exactly two command-line arguments: the input image file and the
    output image file. The result is saved as ``"GPU_" + <outputImageFile>``.

    Returns 1 when the argument count is wrong, otherwise None.
    The printed "GPU Time" spans the kernel launch, the read-back and the
    save to disk.
    """
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # create context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # load image
    im = Image.open(sys.argv[1])
    if im.mode != "RGBA":
        im = im.convert("RGBA")
    imgSize = im.size
    # tobytes() replaces the removed Image.tostring();
    # len(pixels) = imgSize[0] * imgSize[1] * 4
    pixels = im.tobytes()

    # Create input and output image objects
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNSIGNED_INT8)
    input_image = cl.Image(ctx,
                           cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                           clImageFormat,
                           imgSize,
                           None,
                           pixels)
    output_image = cl.Image(ctx,
                            cl.mem_flags.WRITE_ONLY,
                            clImageFormat,
                            imgSize)

    # load the kernel source code
    with open("gradient.cl", "r") as kernelFile:
        kernelSrc = kernelFile.read()

    # Create OpenCL program
    program = cl.Program(ctx, kernelSrc).build()
    # Call the kernel directly
    globalWorkSize = (imgSize[0], imgSize[1])
    gpu_start_time = time()
    program.gradient(queue,
                     globalWorkSize,
                     None,
                     input_image,
                     output_image)

    # Read the output image back to the host
    result = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)

    # enqueue_copy replaces the deprecated enqueue_read_image.
    cl.enqueue_copy(queue, result, output_image,
                    origin=origin, region=region).wait()

    # Save the image to disk (frombytes/tobytes replace fromstring/tostring)
    gsim = Image.frombytes("RGBA", imgSize, result.tobytes())
    gsim.save("GPU_"+sys.argv[2])
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
Ejemplo n.º 24
0
    def compile_source(self):
        """Create the OpenCL context/queue and build ``geneticvehicle.cl``.

        The kernel source is a ``%``-format template; it is filled from this
        object's simulation settings and from ``self.island`` before building.
        A build failure is not re-raised: the exception and the build log of
        the first device are printed and the method returns normally.
        """
        self.context = pyopencl.create_some_context()
        self.queue = pyopencl.CommandQueue(self.context)
        self.mf = pyopencl.mem_flags

        # Values substituted into the kernel template.
        substitutions = {
            "vertices_per_car" : self.number_of_vertices_per_car,
            "number_of_cars" : self.number_of_cars,
            "density" : self.density,
            "number_of_wheels" : self.number_of_wheels_per_car,
            "number_of_contact_points" : self.number_of_contact_points,
            "island_start" : self.island.island_start,
            "island_step" : self.island.island_step,
            "island_end" : self.island.island_end,
            "island_acceleration"  : int(self.island.island_acceleration),
            "island_range" : self.island.range(),
            "crossover_points" : self.crossover_points,
            "point_mutations" : self.point_mutations}
        opencl_source = load_file("geneticvehicle.cl") % substitutions

        self.program = pyopencl.Program(self.context, opencl_source)

        try:
            self.program.build()
        except Exception as why:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print(why)
            print(self.program.get_build_info(self.context.devices[0], pyopencl.program_build_info.LOG))
Ejemplo n.º 25
0
  def __init__( self, im, fil, fil_1d=None, fil_2d=None, larger_buffer=True, sep=True, buffer_flip=False, type=numpy.float32 ):
    """Create the OpenCL context/queue and store the convolution settings.

    im, fil       -- image and filter; when both are not None they are passed
                     to ``self.set_params``
    fil_1d/fil_2d -- optional filters for special convolutions
    larger_buffer -- stored as ``self.larger_buffer``
    sep           -- whether the convolution is separated into 1D chunks
    buffer_flip   -- optimization for separable convolutions where only the
                     x direction is required
    type          -- numpy.float32 or numpy.float64; selects the C type name
                     'float' or 'double'

    Raises TypeError for any other ``type``.
    """
    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue( self.ctx )

    self.larger_buffer = larger_buffer
    self.sep = sep
    self.type = type #TODO: type should just come from the input image, do a check to see if it matches the filter
    self.buffer_flip = buffer_flip
    if self.type == numpy.float32:
      self.ctype = 'float'
    elif self.type == numpy.float64:
      self.ctype = 'double'
    else:
      # Call syntax works on Python 2 and 3; ``raise E, msg`` is Python-2-only.
      raise TypeError( "Data type specified is not currently supported: " + str( self.type ) )

    # For special convolutions, if required
    self.fil_1d = fil_1d
    self.fil_1d_origin = 0
    self.fil_2d = fil_2d
    self.fil_2d_origin = ( 0, 0 ) # offset of the center of the filter
    self.max_2d_buffer = False # just set this to false for now, it might be used in the future

    if im is not None and fil is not None:
      self.set_params( im, fil )
Ejemplo n.º 26
0
 def init_context_queue(self):
     """Ensure ``self.ctx`` exists, then create ``self.queue`` on it.

     An existing context is reused. Otherwise ``ocl.create_context()`` is used
     when ``self.choose_best_device`` is true, else ``cl.create_some_context()``.
     """
     if self.ctx is None:
         # Conditional expression: only the chosen factory is looked up.
         make_context = ocl.create_context if self.choose_best_device else cl.create_some_context
         self.ctx = make_context()
     self.queue = cl.CommandQueue(self.ctx)
Ejemplo n.º 27
0
def test_opencl():
    """Add two random float32 vectors with an OpenCL kernel and print the
    norm of the difference to the NumPy result."""
    import numpy as np
    import pyopencl as cl

    lhs = np.random.rand(50000).astype(np.float32)
    rhs = np.random.rand(50000).astype(np.float32)

    ctx = cl.create_some_context()
    cmd_queue = cl.CommandQueue(ctx)

    flags = cl.mem_flags
    input_flags = flags.READ_ONLY | flags.COPY_HOST_PTR

    # Device-side copies of both inputs.
    lhs_dev = cl.Buffer(ctx, input_flags, hostbuf=lhs)
    rhs_dev = cl.Buffer(ctx, input_flags, hostbuf=rhs)

    kernel_src = r'''
        __kernel void sum(__global const float * a, __global const float * b, __global float * out) {
            int gid = get_global_id(0);
            out[gid] = a[gid] + b[gid];
        }
    '''
    prg = cl.Program(ctx, kernel_src).build()

    result_dev = cl.Buffer(ctx, flags.WRITE_ONLY, lhs.nbytes)

    # One work item per element.
    prg.sum(cmd_queue, lhs.shape, None, lhs_dev, rhs_dev, result_dev)

    result = np.empty_like(lhs)
    cl.enqueue_copy(cmd_queue, result, result_dev)

    print(np.linalg.norm(result - (lhs + rhs)))
Ejemplo n.º 28
0
def gpu_array_sum(a, b):
    """
    Add two arrays on an OpenCL device and print timing information.

    The kernel is declared with `float` pointers, so `a` and `b` are
    presumably float32 arrays of equal shape -- TODO confirm with callers.

    Prints the kernel execution time (from the OpenCL profiling events)
    and the wall time measured from just before the kernel launch until
    the result has been read back to the host.

    Returns a new array shaped like `a` holding the element-wise sum.
    """
    context = cl.create_some_context()
    # Profiling must be enabled on the queue for event.profile to work.
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    mf = cl.mem_flags
    a_buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    b_buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    c_buffer = cl.Buffer(context, mf.WRITE_ONLY, b.nbytes)

    # The inner loop repeats the same addition 1000 times per work item.
    program = cl.Program(context, """
    __kernel void sum(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);
        int j;
        for(j = 0; j < 1000; j++)
        {
            c[i] = a[i] + b[i];
        }
    }""").build()

    gpu_start_time = time()
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    # Profiling timestamps are in nanoseconds.
    elapsed = 1e-9*(event.profile.end - event.profile.start)
    print("GPU Kernel Time: {0} s".format(elapsed))

    c_gpu = np.empty_like(a)
    # enqueue_copy replaces the removed enqueue_read_buffer; the
    # device-to-host copy is blocking by default, so no wait() is needed.
    cl.enqueue_copy(queue, c_gpu, c_buffer)
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
Ejemplo n.º 29
0
    def __init__(self, seed=None):
        """Set up an OpenCL context and queue, then seed the generators.

        `seed` is stored on the instance and passed to numpy.random.seed
        before self._compute_seed() is called.
        """
        context = cl.create_some_context()
        self.ctx = context
        self.queue = cl.CommandQueue(context)

        self.seed = seed
        numpy.random.seed(seed)
        self._compute_seed()
Ejemplo n.º 30
0
    def __init__(self):
        """Run the `proc` kernel from ex.cl over a large float32 ramp.

        Builds the program from the file "ex.cl" in the current working
        directory, launches `proc` with one work item per element, copies
        the result back to the host, and prints it with its maximum.
        """
        t_np = np.arange(0, 100000000, dtype=np.float32)

        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)

        self.mf = cl.mem_flags
        self.t_g = cl.Buffer(
            self.ctx,
            self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
            hostbuf=t_np)

        # Context manager closes the file even if reading fails.
        with open("ex.cl", "r") as f:
            fstr = f.read()
        self.prg = cl.Program(self.ctx, fstr).build()

        self.res_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, t_np.nbytes)
        self.prg.proc(self.queue, t_np.shape, None, self.t_g, self.res_g)

        res_np = np.empty_like(t_np)
        cl.enqueue_copy(self.queue, res_np, self.res_g)

        # Check on CPU with Numpy:
        print(res_np)
        print(np.amax(res_np))
Ejemplo n.º 31
0
# Use OpenCL To Add Two Random Arrays (Using PyOpenCL Arrays and Elementwise)

import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as cl_array  # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object)
import pyopencl.elementwise as cl_elementwise  # Import explicitly rather than relying on pyopencl.array pulling it in
import numpy  # Import Numpy number tools

context = cl.create_some_context()  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue

a = cl_array.to_device(queue,
                       numpy.random.randn(50000).astype(
                           numpy.float32))  # Create a random pyopencl array
b = cl_array.to_device(queue,
                       numpy.random.randn(50000).astype(
                           numpy.float32))  # Create a random pyopencl array
c = cl_array.empty_like(a)  # Create an empty pyopencl destination array

# Create an elementwise kernel object
#  - Arguments: a string formatted as a C argument list
#  - Operation: a snippet of C that carries out the desired map operation
#  - Name: the function name as which the kernel is compiled
# Bound to `sum_kernel` so the builtin sum() is not shadowed.
sum_kernel = cl_elementwise.ElementwiseKernel(
    context, "float *a, float *b, float *c",
    "c[i] = a[i] + b[i]", "sum")

sum_kernel(a, b, c)  # Call the elementwise kernel

# Print all three arrays, to show the kernel worked
print("a: {}".format(a))
print("b: {}".format(b))
print("c: {}".format(c))
Ejemplo n.º 32
0
                                                3,
                                                *params,
                                                test_case='exact')


@pytest.mark.parametrize(
    "params",
    [
        [2, 5, 4, 4],
        [3, 7, 5, 3],
        [4, 7, 3, 5],
    ],
)
def test_to_meshmode_interpolation_3d_nonexact(ctx_factory, params):
    """Run the 3D 'non-exact' interpolation driver for each parameter set
    and require its returned value to stay below 1e-3."""
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)
    resid = drive_test_to_meshmode_interpolation(
        cl_ctx, queue, 3, *params, test_case='non-exact')
    assert resid < 1e-3


# }}} End 3d tests

if __name__ == '__main__':
    # Direct invocation: run a single 3D "exact" case outside of pytest.
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    resid = drive_test_to_meshmode_interpolation(
        cl_ctx, queue,
        dim=3, degree=9, nel_1d=7, n_levels=2, q_order=10,
        test_case="exact")
Ejemplo n.º 33
0
def main(snapshot_pattern="wave-mpi-{step:04d}-{rank:04d}.pkl", restart_step=None,
         use_profiling=False, use_logmgr=False, actx_class=PyOpenCLArrayContext):
    """Drive the example.

    Time-steps `wave_operator` with `rk4_step` on an MPI-distributed mesh
    until t reaches t_final, writing restart files and VTK output along
    the way, and finally asserts a reference value for the solution norm.

    :arg snapshot_pattern: format string for restart file names; formatted
        with ``step`` and ``rank``.
    :arg restart_step: if not None, the step whose restart file is read to
        initialize the mesh, time and fields instead of starting fresh.
    :arg use_profiling: if True, create the queue with PROFILING_ENABLE and
        pass the log manager to the array context.
    :arg use_logmgr: forwarded to ``initialize_logmgr``.
    :arg actx_class: array context class instantiated on the queue.
    """
    cl_ctx = cl.create_some_context()
    # NOTE(review): this queue is replaced in both branches of the
    # use_profiling check below.
    queue = cl.CommandQueue(cl_ctx)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    num_parts = comm.Get_size()

    logmgr = initialize_logmgr(use_logmgr,
        filename="wave-mpi.sqlite", mode="wu", mpi_comm=comm)
    if use_profiling:
        queue = cl.CommandQueue(cl_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
            logmgr=logmgr)
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    if restart_step is None:
        # Fresh start: one rank generates and partitions the mesh, the
        # others receive their part.

        from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
        mesh_dist = MPIMeshDistributor(comm)

        dim = 2
        nel_1d = 16

        if mesh_dist.is_mananger_rank():
            from meshmode.mesh.generation import generate_regular_rect_mesh
            mesh = generate_regular_rect_mesh(
                a=(-0.5,)*dim, b=(0.5,)*dim,
                nelements_per_axis=(nel_1d,)*dim)

            print("%d elements" % mesh.nelements)
            part_per_element = get_partition_by_pymetis(mesh, num_parts)
            local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

            del mesh

        else:
            local_mesh = mesh_dist.receive_mesh_part()

        fields = None

    else:
        # Restart: take the local mesh from this rank's restart file.
        from mirgecom.restart import read_restart_data
        restart_data = read_restart_data(
            actx, snapshot_pattern.format(step=restart_step, rank=rank)
        )
        local_mesh = restart_data["local_mesh"]
        nel_1d = restart_data["nel_1d"]
        # The restart data must come from a run with the same number of ranks.
        assert comm.Get_size() == restart_data["num_parts"]

    order = 3

    discr = EagerDGDiscretization(actx, local_mesh, order=order,
                                  mpi_communicator=comm)

    # Time step from the CFL number and the characteristic length scales,
    # reduced with nodal_min over "vol".
    current_cfl = 0.485
    wave_speed = 1.0
    from grudge.dt_utils import characteristic_lengthscales
    dt = current_cfl * characteristic_lengthscales(actx, discr) / wave_speed

    from grudge.op import nodal_min
    dt = nodal_min(discr, "vol", dt)

    t_final = 1

    if restart_step is None:
        t = 0
        istep = 0

        fields = flat_obj_array(
            bump(actx, discr),
            [discr.zeros(actx) for i in range(discr.dim)]
            )

    else:
        t = restart_data["t"]
        istep = restart_step
        assert istep == restart_step
        restart_fields = restart_data["fields"]
        old_order = restart_data["order"]
        if old_order != order:
            # The restart file was written at a different order: map the
            # stored fields onto the current discretization.
            old_discr = EagerDGDiscretization(actx, local_mesh, order=old_order,
                                              mpi_communicator=comm)
            from meshmode.discretization.connection import make_same_mesh_connection
            connection = make_same_mesh_connection(actx, discr.discr_from_dd("vol"),
                                                   old_discr.discr_from_dd("vol"))
            fields = connection(restart_fields)
        else:
            fields = restart_fields

    if logmgr:
        logmgr_add_cl_device_info(logmgr, queue)
        logmgr_add_device_memory_usage(logmgr, queue)

        logmgr.add_watches(["step.max", "t_step.max", "t_log.max"])

        # A KeyError from adding the memory usage watches is ignored.
        try:
            logmgr.add_watches(["memory_usage_python.max", "memory_usage_gpu.max"])
        except KeyError:
            pass

        if use_profiling:
            logmgr.add_watches(["multiply_time.max"])

        vis_timer = IntervalTimer("t_vis", "Time spent visualizing")
        logmgr.add_quantity(vis_timer)

    vis = make_visualizer(discr)

    def rhs(t, w):
        # Right-hand side handed to the RK4 stepper; t is not used.
        return wave_operator(discr, c=wave_speed, w=w)

    compiled_rhs = actx.compile(rhs)

    while t < t_final:
        if logmgr:
            logmgr.tick_before()

        # restart must happen at beginning of step
        if istep % 100 == 0 and (
                # Do not overwrite the restart file that we just read.
                istep != restart_step):
            from mirgecom.restart import write_restart_file
            write_restart_file(
                actx, restart_data={
                    "local_mesh": local_mesh,
                    "order": order,
                    "fields": fields,
                    "t": t,
                    "step": istep,
                    "nel_1d": nel_1d,
                    "num_parts": num_parts},
                filename=snapshot_pattern.format(step=istep, rank=rank),
                comm=comm
            )

        # Every 10th step: print progress and write a VTK file.
        if istep % 10 == 0:
            print(istep, t, discr.norm(fields[0]))
            vis.write_parallel_vtk_file(
                comm,
                "fld-wave-mpi-%03d-%04d.vtu" % (rank, istep),
                [
                    ("u", fields[0]),
                    ("v", fields[1:]),
                ], overwrite=True
            )

        fields = thaw(freeze(fields, actx), actx)
        fields = rk4_step(fields, t, dt, compiled_rhs)

        t += dt
        istep += 1

        if logmgr:
            set_dt(logmgr, dt)
            logmgr.tick_after()

    # Regression check against a hard-coded reference norm.
    final_soln = discr.norm(fields[0])
    assert np.abs(final_soln - 0.04409852463947439) < 1e-14
Ejemplo n.º 34
0
def kMerCount(file, nK):
    """
    Count nucleotide and k-mer frequencies of a sequence on an OpenCL device.

    The sequence returned by genSeq(file) is prefixed with a header of
    2 + 4 + 4**nK integer slots and padded with 'N' (78) so that its length
    is a whole multiple of the number of work items. The device result
    buffer uses the same layout:

        [0]          number of unknown characters seen by freqTab
        [1]          number of 'N' characters seen by freqTab
        [2:6]        counts of A, C, G, T (written by mapToNumb)
        [6:6+4**nK]  k-mer counts, indexed by the base-4 value of the k-mer

    @param file : passed to genSeq() to obtain the sequence as character codes
    @param nK   : k-mer length
    @return the slice [2 : 6 + 4**nK] of the result buffer, i.e. the four
            nucleotide counts followed by the k-mer counts
    @raise AssertionError if any unknown nucleotide character was counted
    """
    K = nK
    h_seq = genSeq(file)
    h_seq = np.concatenate(
        (np.zeros(2 + 4 + 4**K).astype(CPU_SIDE_INT), h_seq))

    kernelsource = '''
	__kernel void mapToNumb(
		const int N,
		const int M,
		const int numbKmer,
		__global int* seq,
		__global int* numb_seq
	)
	{
		int gid = get_global_id(0);
		int idx = gid * M + numbKmer + 2 + 4;
		int i, letter;

		if(idx < N*M + numbKmer + 2 + 4) {
			for(i=0; i < M; i++) {
				letter = seq[idx+i];
				if(letter == 65) {
					numb_seq[idx+i] = 0;
					atomic_inc(&numb_seq[2]);
				} else {
				if(letter == 67) {
					numb_seq[idx+i] = 1;
					atomic_inc(&numb_seq[3]);
				} else {
				if(letter == 71) {
					numb_seq[idx+i] = 2;
					atomic_inc(&numb_seq[4]);
				} else {
				if(letter == 84) {
					numb_seq[idx+i] = 3;
					atomic_inc(&numb_seq[5]);
				} else {
				if(letter == 78) {
					numb_seq[idx+i] = -1;
				} else {
					numb_seq[idx+i] = -2;
				}
				}
				}
				}
				}
			}
		}
	}
	__kernel void freqTab(
		const int N,
		const int M,
		const int nK,
		const int numbKmer,
		__global int* numb_seq
	) {
		int gid = get_global_id(0);
		int idx = gid * M + numbKmer + 2 + 4;
		int i, numb;
		int k, p, loc_idx, ptn_idx;
		int dgt;
		int kmin;
		for(i=0; i < M; i++) {
			ptn_idx = 0;
			loc_idx = idx + i;
			kmin = 0;
			if(loc_idx <= (N*M + numbKmer + 2 + 4 - nK)) {
				for(k=0; k < nK; k++) {
					numb = numb_seq[loc_idx + k];
					switch(numb) {
						case (-1):
							atomic_inc(&numb_seq[1]);
							break;
						case (-2):
							atomic_inc(&numb_seq[0]);
							break;
						default:
							dgt = (int)(pow(4, (float)(nK-1-k)));
							ptn_idx += dgt * numb;
							break;
					}
					if(numb < kmin) {
						kmin = numb;
					}
				}
				if(kmin >= 0) {
					atomic_inc(&numb_seq[ptn_idx+2+4]);
				}
			}
		}
	}
	'''

    context = cl.create_some_context()
    device = context.devices[0]

    work_group_size = device.max_work_group_size
    work_item_size = device.max_work_item_sizes[0]
    print(work_group_size)
    print(work_item_size)

    numbGroups = work_group_size
    numbItems = work_item_size

    # Pad the sequence with 'N' (78) so that it splits evenly into
    # numbGroups * numbItems chunks of q characters each.
    seqLen = np.size(h_seq) - 4**K - 2 - 4
    q, r = divmod(seqLen, numbGroups * numbItems)
    q = q + 1
    h_seq = np.concatenate(
        (h_seq, np.repeat(78,
                          numbGroups * numbItems - r).astype(CPU_SIDE_INT)))
    h_numb_seq = np.zeros(np.size(h_seq)).astype(CPU_SIDE_INT)
    print(q)
    print(r)

    queue = cl.CommandQueue(context)
    program = cl.Program(context, kernelsource).build()
    mapToNumb = program.mapToNumb
    mapToNumb.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None])
    freqTab = program.freqTab
    freqTab.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, np.int32, None])

    d_seq = cl.Buffer(context,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=h_seq)
    d_numb_seq = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_numb_seq.nbytes)
    # Zero the device buffer. `np.int` was removed in NumPy 1.24; the
    # builtin `int` is what it aliased, so the fill pattern is unchanged.
    cl.enqueue_fill_buffer(queue, d_numb_seq,
                           np.zeros(1).astype(int), 0, h_numb_seq.nbytes)

    N = numbGroups * numbItems  # number of work items
    M = q                       # characters handled per work item
    numbKmer = 4**K
    globalsize = (N, )

    # Pass 1: translate character codes to 0..3 / -1 / -2 and count A, C, G, T.
    mapToNumb(queue, globalsize, None, N, M, numbKmer, d_seq, d_numb_seq)

    queue.finish()

    # Pass 2: count every k-mer that contains only valid nucleotides.
    freqTab(queue, globalsize, None, N, M, K, numbKmer, d_numb_seq)

    queue.finish()

    cl.enqueue_copy(queue, h_numb_seq, d_numb_seq)

    print("Counting Done")

    print(h_numb_seq[:numbKmer + 2 + 4])
    assert (h_numb_seq[0] == 0
            ), "File contains unknown nucleotide characters"  #Sanity check

    return h_numb_seq[2:numbKmer + 2 + 4]
Ejemplo n.º 35
0
    def __init__(self, coefficients, nb_channel, dtype, chunksize,
                 overlapsize):
        """Prepare host arrays, device buffers and the OpenCL program.

        Arguments are forwarded unchanged to SosFiltfilt_Base.__init__.
        Only float32 data and a fixed chunksize are accepted (asserted).
        Coefficients given as (nb_section, 6) are repeated for every
        channel to become (nb_channel, nb_section, 6).
        """
        SosFiltfilt_Base.__init__(self, coefficients, nb_channel, dtype,
                                  chunksize, overlapsize)

        assert self.dtype == np.dtype('float32')
        assert self.chunksize is not None, 'chunksize for opencl must be fixed'

        # Normalize the coefficients: dtype, per-channel shape, C layout.
        coeffs = self.coefficients.astype(self.dtype)
        if coeffs.ndim == 2:
            coeffs = np.tile(coeffs[None, :, :], (nb_channel, 1, 1))
        if not coeffs.flags['C_CONTIGUOUS']:
            coeffs = coeffs.copy()
        self.coefficients = coeffs
        assert self.coefficients.shape[
            0] == self.nb_channel, 'wrong coefficients.shape'
        assert self.coefficients.shape[2] == 6, 'wrong coefficients.shape'

        self.nb_section = self.coefficients.shape[1]

        #TODO : add arguments gpu_platform_index/gpu_device_index
        self.ctx = pyopencl.create_some_context()
        self.queue = pyopencl.CommandQueue(self.ctx)

        # Host-side state and output arrays.
        state_shape = (nb_channel, self.nb_section, 2)
        self.zi1 = np.zeros(state_shape, dtype=self.dtype)
        self.zi2 = np.zeros(state_shape, dtype=self.dtype)
        self.output1 = np.zeros((self.chunksize, self.nb_channel),
                                dtype=self.dtype)
        self.output2 = np.zeros((self.backward_chunksize, self.nb_channel),
                                dtype=self.dtype)

        # Device buffers initialized from host arrays.
        self.coefficients_cl = pyopencl.Buffer(
            self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
            hostbuf=self.coefficients)
        rw_from_host = mf.READ_WRITE | mf.COPY_HOST_PTR
        self.zi1_cl = pyopencl.Buffer(self.ctx, rw_from_host,
                                      hostbuf=self.zi1)
        self.zi2_cl = pyopencl.Buffer(self.ctx, rw_from_host,
                                      hostbuf=self.zi2)

        # Uninitialized device buffers sized like the host output arrays.
        nbytes1 = self.output1.nbytes
        nbytes2 = self.output2.nbytes
        self.input1_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                                         size=nbytes1)
        self.output1_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                                          size=nbytes1)
        self.input2_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                                         size=nbytes2)
        self.output2_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                                          size=nbytes2)

        # Fill the sizes into the kernel template, then build it.
        substitutions = {
            'forward_chunksize': self.chunksize,
            'backward_chunksize': self.backward_chunksize,
            'nb_section': self.nb_section,
            'nb_channel': self.nb_channel,
        }
        kernel = self.kernel % substitutions
        self.opencl_prg = pyopencl.Program(self.ctx, kernel).build(
            options='-cl-mad-enable')
Ejemplo n.º 36
0
def refine_and_generate_chart_function(mesh, filename, function):
    """
    Refine `mesh` until no element is flagged, and chart the refinement time.

    Each round calls get_function_flags(mesh, function), refines the flagged
    elements and records how many were flagged and how long the refinement
    took. The (count, time) pairs are plotted to `filename` as a PDF.
    Afterwards the final mesh is written to "geometry2.vtu" and its nodal
    adjacency to "connectivity2.vtu".

    :arg mesh: the mesh to refine
    :arg filename: output path of the timing chart (PDF)
    :arg function: forwarded to get_function_flags to decide which elements
        are refined
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed time.
    from time import perf_counter
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    print("NELEMENTS: ", mesh.nelements)
    for i in range(len(mesh.groups[0].vertex_indices[0])):
        for k in range(len(mesh.vertices)):
            print(mesh.vertices[k, i])

    r = Refiner(mesh)
    num_elements = []
    time_t = []
    while True:
        print("NELS:", mesh.nelements)
        flags = get_function_flags(mesh, function)
        # Number of elements flagged for refinement in this round.
        nels = len([flag for flag in flags if flag])
        if nels == 0:
            break
        print("LKJASLFKJALKASF:", nels)
        num_elements.append(nels)
        beg = perf_counter()
        mesh = r.refine(flags)
        end = perf_counter()
        time_t.append(end - beg)

    import matplotlib.pyplot as pt
    pt.xlabel('Number of elements being refined')
    pt.ylabel('Time taken')
    pt.plot(num_elements, time_t, "o")
    pt.savefig(filename, format='pdf')
    pt.clf()
    print('DONE REFINING')

    # NOTE(review): `order` is not defined in this function; presumably a
    # module-level constant -- confirm.
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            PolynomialWarpAndBlendGroupFactory
    discr = Discretization(cl_ctx, mesh,
                           PolynomialWarpAndBlendGroupFactory(order))
    from meshmode.discretization.visualization import make_visualizer
    vis = make_visualizer(queue, discr, order)
    remove_if_exists("connectivity2.vtu")
    remove_if_exists("geometry2.vtu")
    vis.write_vtk_file("geometry2.vtu", [
        ("f", discr.nodes()[0]),
    ])

    from meshmode.discretization.visualization import \
            write_nodal_adjacency_vtk_file

    write_nodal_adjacency_vtk_file("connectivity2.vtu", mesh)
Ejemplo n.º 37
0
def main(mesh_name="ellipsoid"):
    """Evaluate a double-layer potential around a CAD surface and write VTK.

    `mesh_name` is "ellipsoid" or "two-cylinders"; anything else raises
    ValueError. Writes "layerpot-3d-potential.vts" (potential on a plot
    grid) and "layerpot-3d-density.vtu" (density and surface normals).
    Relies on the module-level names target_order, qbx_order, k and mode_nr.
    """
    import logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.WARNING)  # INFO for more progress info

    import pyopencl as cl
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    # Pick the CAD file and characteristic mesh length for the geometry.
    if mesh_name == "ellipsoid":
        step_path, max_len = "geometries/ellipsoid.step", 0.6
    elif mesh_name == "two-cylinders":
        step_path, max_len = "geometries/two-cylinders-smooth.step", 0.4
    else:
        raise ValueError("unknown mesh name: %s" % mesh_name)

    from meshmode.mesh.io import generate_gmsh, FileSource
    gmsh_options = ["-string", "Mesh.CharacteristicLengthMax = %g;" % max_len]
    mesh = generate_gmsh(
        FileSource(step_path),
        2,
        order=2,
        other_options=gmsh_options,
        target_unit="MM")

    from meshmode.mesh.processing import perform_flips
    # Flip elements--gmsh generates inside-out geometry.
    mesh = perform_flips(mesh, np.ones(mesh.nelements))

    from meshmode.mesh.processing import find_bounding_box
    lower, upper = find_bounding_box(mesh)
    center = 0.5 * (lower + upper)
    half_extent = max(upper - lower) / 2

    logger.info("%d elements" % mesh.nelements)

    from pytential.qbx import QBXLayerPotentialSource
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory

    group_factory = InterpolatoryQuadratureSimplexGroupFactory(target_order)
    density_discr = Discretization(actx, mesh, group_factory)

    qbx = QBXLayerPotentialSource(density_discr,
                                  4 * target_order,
                                  qbx_order,
                                  fmm_order=qbx_order + 3,
                                  target_association_tolerance=0.15)

    from pytential.target import PointsTarget
    fplot = FieldPlotter(center, extent=3.5 * half_extent, npoints=150)

    from pytential import GeometryCollection
    geometries = {
        "qbx": qbx,
        "targets": PointsTarget(actx.from_numpy(fplot.points)),
    }
    places = GeometryCollection(geometries, auto_where="qbx")
    density_discr = places.get_discretization("qbx")

    nodes = thaw(density_discr.nodes(), actx)
    angle = actx.np.arctan2(nodes[1], nodes[0])

    # A nonzero k selects the Helmholtz kernel, otherwise Laplace.
    kernel = HelmholtzKernel(3) if k else LaplaceKernel(3)

    op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None)

    if 0:
        # Disabled alternative: density with a few randomly placed ones.
        from random import randrange
        sigma = actx.zeros(density_discr.ndofs, angle.entry_dtype)
        for _ in range(5):
            sigma[randrange(len(sigma))] = 1

        from arraycontext import unflatten
        sigma = unflatten(angle, sigma, actx)
    else:
        sigma = actx.np.cos(mode_nr * angle)

    if isinstance(kernel, HelmholtzKernel):
        # Convert each entry of the density to complex128.
        for idx, entry in np.ndenumerate(sigma):
            sigma[idx] = entry.astype(np.complex128)

    bound_op = bind(places, op, auto_where=("qbx", "targets"))
    potential = actx.to_numpy(bound_op(actx, sigma=sigma, k=k))

    fplot.write_vtk_file("layerpot-3d-potential.vts",
                         [("potential", potential)])

    normals = bind(places, sym.normal(
        density_discr.ambient_dim))(actx).as_vector(dtype=object)

    from meshmode.discretization.visualization import make_visualizer
    surface_vis = make_visualizer(actx, density_discr, target_order)
    surface_vis.write_vtk_file("layerpot-3d-density.vtu", [
        ("sigma", sigma),
        ("bdry_normals", normals),
    ])
import cv2
import numpy
import pyopencl

from proc_tex.OpenCLCellNoise3D import OpenCLCellNoise3D
from proc_tex.texture_transforms import tex_scale_to_region, tex_to_dtype
from proc_tex.texture_transforms_opencl import tex_3d_to_sphere_map

if __name__ == '__main__':
    cl_context = pyopencl.create_some_context()
    texture = tex_3d_to_sphere_map(OpenCLCellNoise3D(cl_context, 4, 1),
                                   cl_context)
    texture = tex_to_dtype(tex_scale_to_region(texture),
                           numpy.uint16,
                           scale=65535)
    eval_pts = texture.gen_eval_pts((1024, 1024), numpy.array([[0, 1], [0,
                                                                        1]]))
    image = texture.to_image(None, None, eval_pts=eval_pts)
    # cv2.imshow('image', image)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()

    cv2.imwrite('./example.png', image)

    texture.to_video(None,
                     None,
                     120,
                     30,
                     './example.webm',
                     pix_fmt='gray16le',
                     codec_params=['-lossless', '0'],
Ejemplo n.º 39
0
def find_mode():
    """Search for a value of beta with Muller's method.

    Builds a dielectric boundary operator on a scaled ellipse mesh,
    assembles its matrix for each trial beta, and hands a scalar function
    of that matrix to pytential's muller(). Prints "beta" at the end.
    """
    import warnings
    # Turn complex-to-real casts into errors.
    warnings.simplefilter("error", np.ComplexWarning)

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    k_low = 1.4447
    k_high = k_low * 1.02
    beta_sym = sym.var("beta")

    from pytential.symbolic.pde.scalar import (  # noqa
        DielectricSRep2DBoundaryOperator as SRep,
        DielectricSDRep2DBoundaryOperator as SDRep)
    pde_op = SDRep(mode="te",
                   k_vacuum=1,
                   interfaces=((0, 1, sym.DEFAULT_SOURCE), ),
                   domain_k_exprs=(k_low, k_high),
                   beta=beta_sym,
                   use_l2_weighting=False)

    unknown = pde_op.make_unknown("u")
    op = pde_op.operator(unknown)

    # {{{ discretization setup

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    curve_f = partial(ellipse, 1)

    target_order = 7
    qbx_order = 4
    nelements = 30

    from meshmode.mesh.processing import affine_map
    element_bounds = np.linspace(0, 1, nelements + 1)
    mesh = make_curve_mesh(curve_f, element_bounds, target_order)
    lambda_ = 1.55
    circle_radius = 3.4 * 2 * np.pi / lambda_
    mesh = affine_map(mesh, A=circle_radius * np.eye(2))

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory
    from pytential.qbx import QBXLayerPotentialSource
    group_factory = InterpolatoryQuadratureSimplexGroupFactory(target_order)
    density_discr = Discretization(cl_ctx, mesh, group_factory)

    qbx = QBXLayerPotentialSource(
        density_discr,
        4 * target_order,
        qbx_order,
        # Don't use FMM for now
        fmm_order=False)

    # }}}

    # Two random vectors, drawn in this order.
    ndofs = len(unknown) * density_discr.nnodes
    x_vec = np.random.randn(ndofs)
    y_vec = np.random.randn(len(unknown) * density_discr.nnodes)

    def muller_solve_func(beta):
        from pytential.symbolic.execution import build_matrix
        mat = build_matrix(
            queue, qbx, op, unknown, context={"beta": beta}).get()

        solved = la.solve(mat, y_vec)
        return 1 / x_vec.dot(solved)

    # Three complex starting guesses between k_low and k_high.
    starting_guesses = (1 + 0j) * (k_low + (k_high - k_low) * np.random.rand(3))

    from pytential.muller import muller
    beta, niter = muller(muller_solve_func, z_start=starting_guesses)
    print("beta")
Ejemplo n.º 40
0
def main(use_profiling=False):
    """Drive the example.

    Integrates the wave operator on a regular 2D rectangular mesh with RK4
    until t reaches 3. Every 10th step it prints the max-norm of the first
    field and writes a VTK file; with `use_profiling` it also prints the
    array context's profiling table.
    """
    cl_ctx = cl.create_some_context()

    # Profiling needs both a profiling-enabled queue and the profiling
    # array context.
    if use_profiling:
        queue = cl.CommandQueue(
            cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx_type = PyOpenCLProfilingArrayContext
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx_type = PyOpenCLArrayContext
    pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
    actx = actx_type(queue, allocator=pool)

    dim = 2
    nel_1d = 16
    from meshmode.mesh.generation import generate_regular_rect_mesh

    mesh = generate_regular_rect_mesh(
        a=(-0.5, ) * dim,
        b=(0.5, ) * dim,
        nelements_per_axis=(nel_1d, ) * dim)

    order = 3

    # no deep meaning here, just a fudge factor per dimension
    fudge_factors = {2: 0.7, 3: 0.4}
    if dim not in fudge_factors:
        raise ValueError("don't have a stable time step guesstimate")
    dt = fudge_factors[dim] / (nel_1d * order**2)

    print("%d elements" % mesh.nelements)

    discr = EagerDGDiscretization(actx, mesh, order=order)

    velocity = [discr.zeros(actx) for i in range(discr.dim)]
    fields = flat_obj_array(bump(actx, discr), velocity)

    vis = make_visualizer(discr)

    def rhs(t, w):
        return wave_operator(discr, c=1, w=w)

    t = 0
    t_final = 3
    istep = 0
    while t < t_final:
        fields = rk4_step(fields, t, dt, rhs)

        # Periodic progress report and visualization dump.
        if istep % 10 == 0:
            if use_profiling:
                print(actx.tabulate_profiling_data())
            print(istep, t, discr.norm(fields[0], np.inf))
            vtk_name = "fld-wave-eager-%04d.vtu" % istep
            vis.write_vtk_file(vtk_name, [
                ("u", fields[0]),
                ("v", fields[1:]),
            ])

        t += dt
        istep += 1
Ejemplo n.º 41
0
 def setUp(self):
     """Create the OpenCL context (prompt answers 0, 0) and a queue on it."""
     context = create_some_context(answers=[0, 0])
     self.ctx = context
     self.queue = CommandQueue(context)
Ejemplo n.º 42
0
def demo_cost_model():
    """Calibrate the FMM cost model on small problems and check it on a larger one.

    For each problem size a tree and a traversal are built, the FMM is run
    with ``FMMLibExpansionWrangler`` and its per-stage timing data is
    recorded.  All cases but the last are used to estimate the calibration
    parameters of ``FMMCostModel``; the model is then evaluated on the last
    case and the relative error of each per-stage prediction is logged.

    Raises:
        NotImplementedError: if ``SUPPORTS_PROCESS_TIME`` is false.
    """
    if not SUPPORTS_PROCESS_TIME:
        raise NotImplementedError(
            "Currently this script uses process time which only works on Python>=3.3"
        )

    # Function-local imports (the convention in this script), gathered here so
    # the import statements are not re-executed on every loop iteration.
    from boxtree import TreeBuilder
    from boxtree.cost import FMMCostModel
    from boxtree.cost import make_pde_aware_translation_cost_model
    from boxtree.fmm import drive_fmm
    from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
    from boxtree.tools import make_normal_particle_array as p_normal
    from boxtree.traversal import FMMTraversalBuilder
    from pyopencl.clrandom import PhiloxGenerator

    nsources_list = [1000, 2000, 3000, 4000, 5000]
    ntargets_list = [1000, 2000, 3000, 4000, 5000]
    dims = 3
    dtype = np.float64

    # One name for the timing field, used both for calibration and for the
    # final measured-vs-predicted comparison so the two cannot drift apart.
    time_field_name = "process_elapsed"

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    traversals_dev = []
    level_to_orders = []
    timing_results = []

    def fmm_level_to_nterms(tree, ilevel):
        # Same number of expansion terms on every level.
        return 10

    for nsources, ntargets in zip(nsources_list, ntargets_list):
        # {{{ Generate sources, targets and target_radii

        sources = p_normal(queue, nsources, dims, dtype, seed=15)
        targets = p_normal(queue, ntargets, dims, dtype, seed=18)

        # Re-created with the same seed for every case, as in the original.
        rng = PhiloxGenerator(queue.context, seed=22)
        target_radii = rng.uniform(
            queue, ntargets, a=0, b=0.05, dtype=dtype
        ).get()

        # }}}

        # {{{ Generate tree and traversal

        tb = TreeBuilder(ctx)
        tree, _ = tb(
            queue, sources, targets=targets, target_radii=target_radii,
            stick_out_factor=0.15, max_particles_in_box=30, debug=True
        )

        tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
        trav_dev, _ = tg(queue, tree, debug=True)
        # The FMM driver below gets the host copy; the cost model gets the
        # device traversal.
        trav = trav_dev.get(queue=queue)
        traversals_dev.append(trav_dev)

        # }}}

        wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms)
        level_to_orders.append(wrangler.level_nterms)

        timing_data = {}
        src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
        drive_fmm(trav, wrangler, (src_weights,), timing_data=timing_data)

        timing_results.append(timing_data)

    cost_model = FMMCostModel(make_pde_aware_translation_cost_model)

    # Model every case except the last with unit calibration parameters; the
    # last case is held out to test the calibrated model.
    model_results = []
    for traversal, level_to_order in zip(traversals_dev[:-1],
                                         level_to_orders[:-1]):
        model_results.append(
            cost_model.cost_per_stage(
                queue, traversal, level_to_order,
                FMMCostModel.get_unit_calibration_params(),
            )
        )
    queue.finish()

    params = cost_model.estimate_calibration_params(
        model_results, timing_results[:-1], time_field_name=time_field_name
    )

    predicted_time = cost_model.cost_per_stage(
        queue, traversals_dev[-1], level_to_orders[-1], params,
    )
    queue.finish()

    for field in ["form_multipoles", "eval_direct", "multipole_to_local",
                  "eval_multipoles", "form_locals", "eval_locals",
                  "coarsen_multipoles", "refine_locals"]:
        measured = timing_results[-1][field][time_field_name]
        pred_err = (
                (measured - predicted_time[field])
                / measured)
        logger.info("actual/predicted time for %s: %.3g/%.3g -> %g %% error",
                field,
                measured,
                predicted_time[field],
                abs(100*pred_err))
Ejemplo n.º 43
0
 def __init__(self):
     """Set up the OpenCL context and command queue, and clear ``tick``.

     ``self.ctx`` holds the context returned by ``cl.create_some_context()``,
     ``self.queue`` a command queue on that context, and ``self.tick``
     starts out as ``False``.
     """
     context = cl.create_some_context()
     self.ctx = context
     self.queue = cl.CommandQueue(context)
     self.tick = False
Ejemplo n.º 44
0
def main(write_output=True, order=4):
    """Run the minimal wave-equation example on a 2D regular rectangular mesh.

    A ``WeakWaveOperator`` with a source term ``sin(omega*t)`` times a
    Gaussian bump in space, and radiation conditions on the whole boundary,
    is integrated with RK4 up to ``t = 10``.  The step number, time, 2-norm
    of ``u`` and wall time per step are printed after every step.

    :arg write_output: if true, write a VTK file every 10 steps.
    :arg order: polynomial order passed to the discretization.
    :raises ValueError: if the mesh dimension has no hard-coded time step.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue)

    dims = 2
    from meshmode.mesh.generation import generate_regular_rect_mesh
    mesh = generate_regular_rect_mesh(a=(-0.5, ) * dims,
                                      b=(0.5, ) * dims,
                                      nelements_per_axis=(16, ) * dims)

    # Hard-coded time steps per dimension.  Fail loudly for anything else
    # instead of hitting an unbound ``dt`` further down.
    if mesh.dim == 2:
        dt = 0.04
    elif mesh.dim == 3:
        dt = 0.02
    else:
        raise ValueError("don't have a stable time step guesstimate")

    print("%d elements" % mesh.nelements)

    discr = DiscretizationCollection(actx, mesh, order=order)

    # The 3-entry center is truncated to the mesh dimension.
    source_center = np.array([0.1, 0.22, 0.33])[:mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import WeakWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE
    op = WeakWaveOperator(
        0.1,
        discr.dim,
        source_f=(
            sym.sin(source_omega * sym_t) *
            sym.exp(-np.dot(sym_source_center_dist, sym_source_center_dist) /
                    source_width**2)),
        dirichlet_tag=BTAG_NONE,
        neumann_tag=BTAG_NONE,
        radiation_tag=BTAG_ALL,
        flux_type="upwind")

    # State vector: one scalar field followed by ``discr.dim`` components,
    # all initially zero.
    from pytools.obj_array import flat_obj_array
    fields = flat_obj_array(discr.zeros(actx),
                            [discr.zeros(actx) for i in range(discr.dim)])

    # FIXME
    #dt = op.estimate_rk4_timestep(discr, fields=fields)

    op.check_bc_coverage(mesh)

    # print(sym.pretty(op.sym_operator()))
    bound_op = bind(discr, op.sym_operator())

    def rhs(t, w):
        return bound_op(t=t, w=w)

    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    final_t = 10
    nsteps = int(final_t / dt)
    print("dt=%g nsteps=%d" % (dt, nsteps))

    from grudge.shortcuts import make_visualizer
    vis = make_visualizer(discr)

    step = 0

    norm = bind(discr, sym.norm(2, sym.var("u")))

    from time import time
    t_last_step = time()

    for event in dt_stepper.run(t_end=final_t):
        if isinstance(event, dt_stepper.StateComputed):
            assert event.component_id == "w"

            step += 1

            print(step, event.t, norm(u=event.state_component[0]),
                  time() - t_last_step)
            # Honor ``write_output``: callers passing False get no files.
            if write_output and step % 10 == 0:
                vis.write_vtk_file("fld-wave-min-%04d.vtu" % step, [
                    ("u", event.state_component[0]),
                    ("v", event.state_component[1:]),
                ])
            t_last_step = time()
Ejemplo n.º 45
0
    def __call__(self, q, w, scale=1.0, bkg=0.0, threads=0):
        """
        Abeles matrix formalism for calculating reflectivity from a
        stratified medium.

        Uses pyopencl on a GPU to calculate reflectivity. The accuracy of
        this function may not be as good as the C and Python based versions.
        Furthermore, it can be tricky to use when using multiprocessing
        based parallelism.

        The OpenCL context and the program built from ``abeles_pyopencl.cl``
        (read from the directory of this module) are created on first use
        and cached on ``self.ctx`` / ``self.prg``.

        Parameters
        ----------
        q: array_like
            the q values required for the calculation.
            Q = 4 * Pi / lambda * sin(omega).
            Units = Angstrom**-1
        w: array_like
            coefficients required for the calculation, has shape
            (2 + N, 4), where N is the number of layers
            w[0, 1] - SLD of fronting (/1e-6 Angstrom**-2)
            w[0, 2] - iSLD of fronting (/1e-6 Angstrom**-2)
            w[N, 0] - thickness of layer N
            w[N, 1] - SLD of layer N (/1e-6 Angstrom**-2)
            w[N, 2] - iSLD of layer N (/1e-6 Angstrom**-2)
            w[N, 3] - roughness between layer N-1/N
            w[-1, 1] - SLD of backing (/1e-6 Angstrom**-2)
            w[-1, 2] - iSLD of backing (/1e-6 Angstrom**-2)
            w[-1, 3] - roughness between backing and last layer
        scale: float
            Multiply all reflectivities by this value.
        bkg: float
            Linear background to be added to all reflectivities
        threads: int, optional
            <THIS OPTION IS CURRENTLY IGNORED>

        Returns
        -------
        Reflectivity: np.ndarray
            Calculated reflectivity values for each q value.
        """
        import pyopencl as cl

        # np.asfarray was removed in NumPy 2.0; with its default dtype it was
        # exactly np.asarray(..., dtype=np.float64).
        qvals = np.asarray(q, dtype=np.float64)
        flatq = qvals.ravel()
        if flatq.size == 0:
            # Nothing to evaluate; also avoids requesting zero-sized buffers.
            return np.empty(qvals.shape, dtype=np.float64)

        # Allow nested sequences as well as arrays for the 2D indexing below.
        w = np.asarray(w)

        if self.ctx is None or self.prg is None:
            self.ctx = cl.create_some_context(interactive=False)
            pth = os.path.dirname(os.path.abspath(__file__))
            with open(os.path.join(pth, "abeles_pyopencl.cl"), "r") as f:
                src = f.read()
            self.prg = cl.Program(self.ctx, src).build()

        # Flat coefficient vector handed to the kernel: an 8-entry header
        # followed by 4 values (thickness, SLD, iSLD, roughness) per layer.
        nlayers = len(w) - 2
        coefs = np.empty(nlayers * 4 + 8, dtype=np.float64)
        coefs[0] = nlayers
        coefs[1] = scale
        coefs[2:4] = w[0, 1:3]
        coefs[4:6] = w[-1, 1:3]
        coefs[6] = bkg
        coefs[7] = w[-1, 3]
        if nlayers:
            coefs[8::4] = w[1:-1, 0]
            coefs[9::4] = w[1:-1, 1]
            coefs[10::4] = w[1:-1, 2]
            coefs[11::4] = w[1:-1, 3]

        mf = cl.mem_flags
        with cl.CommandQueue(self.ctx) as queue:
            q_g = cl.Buffer(
                self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=flatq
            )
            coefs_g = cl.Buffer(
                self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=coefs
            )
            ref_g = cl.Buffer(self.ctx, mf.WRITE_ONLY, flatq.nbytes)

            # One work item per q value.
            self.prg.abeles(queue, flatq.shape, None, q_g, coefs_g, ref_g)

            reflectivity = np.empty_like(flatq)
            cl.enqueue_copy(queue, reflectivity, ref_g)
        return np.reshape(reflectivity, qvals.shape)
Ejemplo n.º 46
0
    large_node = np.array(extendedMatData['large_nodes'])

    nn0 = np.array(extendedMatData['rimg_NN0'])
    nn1 = np.array(extendedMatData['rimg_NN1'])
    ss = len(nn0)
    #nn0 = np.transpose(nn0);
    #nn1 = np.transpose(nn1);
    nn0 = np.ravel(nn0)
    nn1 = np.ravel(nn1)

    load_time = time.time()
    print("load time", load_time - start_time)

    extendedData = np.zeros(ss, dtype=np.float32)

    context = cl.create_some_context()
    queue = cl.CommandQueue(context)
    program_extension = cl.Program(context, kernel_extension).build()
    program_reducing = cl.Program(context, kernel_reducing).build()

    points = np.zeros((len(large_elem), 3))

    get_tpoints(points, large_elem, large_node)

    subfigure = []
    c = 0
    while (True):
        try:
            plt.cla()
            ax.set_xlim3d(-150, 150)
            ax.set_ylim3d(-150, 150)
Ejemplo n.º 47
0
def main():
    """Search for a high-fitness portfolio with the OpenCL kernels in ``kernel.c``.

    Random per-security inputs (signal, prices, bid/ask, ADV) are generated
    on the host.  The ``init`` and ``get_fitness`` kernels are run once, then
    ``mutate`` and ``get_fitness`` are run ``steps - 1`` times with fresh
    random numbers.  Finally ``get_max`` copies the top portfolio of each
    work group back to the host, the best of those is selected there, and
    summary statistics are printed.
    """
    # Config
    steps = 40000
    num_gbl = 2048
    num_lcl = 128
    # Integer division: the group count is used as an array dimension below
    # and NumPy rejects float shapes.
    num_grp = num_gbl // num_lcl
    num_sec = 25
    num_keep = 2

    # Generate some input
    alpha = rnd.uniform(-1, 1, size=num_sec).astype(np.float32)
    prc = rnd.uniform(1, 10, size=num_sec).astype(np.float32) * 10
    bid = prc - np.multiply(rnd.uniform(size=prc.size), prc / 100).astype(
        np.float32)
    ask = prc + np.multiply(rnd.uniform(size=prc.size), prc / 100).astype(
        np.float32)
    # NOTE(review): only ``low`` is given here, so ``high`` keeps its default
    # of 1.0 -- confirm that this range is what was intended.
    adv = rnd.uniform(10000000, size=num_sec).astype(np.float32)
    port_out = np.zeros((num_grp, num_sec), dtype=np.int32)

    def get_fit(p):
        # Signal-weighted position sum, accumulated in float64.
        s = np.float64(0)
        for a, pos in zip(alpha, p):
            s += a * pos
        return s

    def get_max(res):
        # Pick the portfolio maximizing fitness times gross market value.
        best_score = 0
        best_port = None
        for port in res:
            gmv = sum(abs(pos * price) for pos, price in zip(port, prc))
            score = get_fit(port) * gmv
            if best_port is None or score > best_score:
                best_score = score
                best_port = port
        return best_score, best_port

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Create OpenCL buffers
    mf = cl.mem_flags
    sec_rnd = rnd.uniform(size=num_sec).astype(np.float32)
    thd_rnd = rnd.uniform(size=num_gbl).astype(np.float32)
    alpha_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=alpha)
    bid_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=bid)
    ask_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=ask)
    prc_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=prc)
    adv_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=adv)
    thd_rnd_buf = cl.Buffer(ctx,
                            mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=thd_rnd)
    sec_rnd_buf = cl.Buffer(ctx,
                            mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=sec_rnd)
    # Sizes in bytes; presumably 4-byte ints per position and 8-byte floats
    # per fitness value -- TODO confirm against kernel.c.
    port_scratch_buf = cl.Buffer(ctx, mf.READ_WRITE, num_gbl * num_sec * 4)
    fit_buf = cl.Buffer(ctx, mf.READ_WRITE, num_gbl * 8)
    keep_buf = cl.LocalMemory(num_keep * 4)

    # Close the kernel source file deterministically.
    with open('kernel.c') as kernel_file:
        prg = cl.Program(ctx, kernel_file.read()).build()

    # Init buffers
    e = prg.init(queue, (num_gbl, ), (num_lcl, ), port_scratch_buf,
                 thd_rnd_buf, fit_buf, np.int32(num_sec), np.int32(num_gbl),
                 np.int32(rnd.randint(0, max(num_sec, num_gbl))))
    e = cl.enqueue_barrier(queue, wait_for=[e])

    # Init fitness
    e = prg.get_fitness(queue, (num_gbl, ), (num_lcl, ),
                        alpha_buf,
                        bid_buf,
                        ask_buf,
                        prc_buf,
                        adv_buf,
                        port_scratch_buf,
                        fit_buf,
                        thd_rnd_buf,
                        np.int32(num_sec),
                        np.int32(num_gbl),
                        wait_for=[e])
    e = cl.enqueue_barrier(queue, wait_for=[e])

    for _ in range(steps - 1):
        # Fresh random numbers for every step.
        sec_rnd = rnd.uniform(size=num_sec).astype(np.float32)
        thd_rnd = rnd.uniform(size=num_gbl).astype(np.float32)
        thd_rnd_buf = cl.Buffer(ctx,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=thd_rnd)
        sec_rnd_buf = cl.Buffer(ctx,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=sec_rnd)

        # Reap, mutate, and recombinate
        e = prg.mutate(queue, (num_gbl, ), (num_lcl, ),
                       port_scratch_buf,
                       fit_buf,
                       sec_rnd_buf,
                       thd_rnd_buf,
                       keep_buf,
                       np.int32(num_sec),
                       np.int32(num_gbl),
                       np.int32(num_keep),
                       np.int32(rnd.randint(0, max(num_sec, num_gbl))),
                       wait_for=[e])
        e = cl.enqueue_barrier(queue, wait_for=[e])

        # Recompute fitness
        e = prg.get_fitness(queue, (num_gbl, ), (num_lcl, ),
                            alpha_buf,
                            bid_buf,
                            ask_buf,
                            prc_buf,
                            adv_buf,
                            port_scratch_buf,
                            fit_buf,
                            thd_rnd_buf,
                            np.int32(num_sec),
                            np.int32(num_gbl),
                            wait_for=[e])
        e = cl.enqueue_barrier(queue, wait_for=[e])

    # Get the top portfolios from each work group
    port_buf = cl.Buffer(ctx, mf.WRITE_ONLY, port_out.nbytes)
    e = prg.get_max(queue, (num_gbl, ), (num_lcl, ),
                    port_scratch_buf,
                    port_buf,
                    fit_buf,
                    np.int32(num_sec),
                    wait_for=[e])
    e = cl.enqueue_barrier(queue, wait_for=[e])
    e = cl.enqueue_copy(queue, port_out, port_buf, wait_for=[e])
    e = cl.enqueue_barrier(queue, wait_for=[e])

    # (Meta-)Select the one we want
    f, port_out = get_max(port_out)

    print('Signal:')
    print(alpha)
    print('Prices:')
    print(prc)
    print('ADV:')
    print(adv)
    print('Spreads:')
    print(ask - bid)
    print('Portfolio:')
    print(port_out)
    print('Fitness:', f)
    print(
        'Max Participation:',
        max(abs(pos * price / volume)
            for pos, price, volume in zip(port_out, prc, adv)))
    print('GMV:', sum(abs(pos * price) for pos, price in zip(port_out, prc)))
    print('NMV:', sum(pos * price for pos, price in zip(port_out, prc)))
    print('Peason R bt alpha and port:', pearsonr(port_out, alpha))
import time, math
import numpy as np
import pyopencl as cl
import pygame

# Default values for pyopencl
#platform = cl.get_platforms()[0]
#device = platform.get_devices()[0]
#ctx = cl.Context([device])

# Module-level OpenCL context, read by mandelbrot() to create its queue.
# interactive=True: settings are entered manually each time the script runs.
ctx = cl.create_some_context(interactive=True)


# Returns array of pixel rgb values for Mandelbrot set
# xMin, xMax, yMin and yMax are values for the actual frame of the set, width and height is the size of the image in pixels
# Higher maxIterations will result in better quality, but will take longer
def mandelbrot(xMin, xMax, yMin, yMax, width, height, maxIterations):

    # Set up pixel values as array
    r1 = np.linspace(xMin, xMax, width, dtype=np.float64)
    r2 = np.linspace(yMin, yMax, height, dtype=np.float64)
    c = r1 + r2[:, None] * 1j
    c = np.ravel(c)

    # Set up context
    global ctx
    queue = cl.CommandQueue(ctx)
    output = np.empty(c.shape, dtype=np.uint32)

    # Mandelbrot program
Ejemplo n.º 49
0
def main(write_output=True, order=4):
    """Run the minimal wave-equation example distributed over MPI ranks.

    The manager rank generates a 2D regular rectangular mesh, partitions it
    with pymetis and sends the parts out; every other rank receives its
    part.  A ``WeakWaveOperator`` with a source term ``sin(omega*t)`` times a
    Gaussian bump in space, and radiation conditions on the whole boundary,
    is then integrated with RK4 up to ``t = 10``.

    :arg write_output: if true, write a parallel VTK file every 10 steps.
    :arg order: polynomial order for the discretization and the visualizer.
    :raises ValueError: if the mesh dimension has no hard-coded time step.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue)

    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()

    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
    mesh_dist = MPIMeshDistributor(comm)

    # NOTE(review): "mananger" is the method name as called in the original;
    # presumably it matches the library's spelling -- do not "fix" blindly.
    if mesh_dist.is_mananger_rank():
        dims = 2
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(
                a=(-0.5,)*dims,
                b=(0.5,)*dims,
                n=(16,)*dims)

        print("%d elements" % mesh.nelements)

        part_per_element = get_partition_by_pymetis(mesh, num_parts)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

        # The global mesh is not needed once the parts are distributed.
        del mesh

    else:
        local_mesh = mesh_dist.receive_mesh_part()

    discr = DGDiscretizationWithBoundaries(actx, local_mesh, order=order,
            mpi_communicator=comm)

    # Hard-coded time steps per dimension.  Fail loudly for anything else
    # instead of hitting an unbound ``dt`` further down.
    if local_mesh.dim == 2:
        dt = 0.04
    elif local_mesh.dim == 3:
        dt = 0.02
    else:
        raise ValueError("don't have a stable time step guesstimate")

    # The 3-entry center is truncated to the mesh dimension.
    source_center = np.array([0.1, 0.22, 0.33])[:local_mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(local_mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import WeakWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE
    op = WeakWaveOperator(0.1, discr.dim,
            source_f=(
                sym.sin(source_omega*sym_t)
                * sym.exp(
                    -np.dot(sym_source_center_dist, sym_source_center_dist)
                    / source_width**2)),
            dirichlet_tag=BTAG_NONE,
            neumann_tag=BTAG_NONE,
            radiation_tag=BTAG_ALL,
            flux_type="upwind")

    # State vector: one scalar field followed by ``discr.dim`` components,
    # all initially zero.
    from pytools.obj_array import flat_obj_array
    fields = flat_obj_array(
            discr.zeros(actx),
            [discr.zeros(actx) for i in range(discr.dim)])

    # FIXME
    #dt = op.estimate_rk4_timestep(discr, fields=fields)

    op.check_bc_coverage(local_mesh)

    # print(sym.pretty(op.sym_operator()))
    bound_op = bind(discr, op.sym_operator())

    def rhs(t, w):
        return bound_op(t=t, w=w)

    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    final_t = 10
    nsteps = int(final_t/dt)
    print("dt=%g nsteps=%d" % (dt, nsteps))

    from grudge.shortcuts import make_visualizer
    vis = make_visualizer(discr, vis_order=order)

    step = 0

    norm = bind(discr, sym.norm(2, sym.var("u")))

    from time import time
    t_last_step = time()

    for event in dt_stepper.run(t_end=final_t):
        if isinstance(event, dt_stepper.StateComputed):
            assert event.component_id == "w"

            step += 1

            print(step, event.t, norm(u=event.state_component[0]),
                    time()-t_last_step)
            # Honor ``write_output``: callers passing False get no files.
            if write_output and step % 10 == 0:
                # The doubled braces leave a literal "{rank:03d}" placeholder
                # in the file name passed to the visualizer.
                vis.write_parallel_vtk_file(
                        comm,
                        f"fld-wave-min-mpi-{{rank:03d}}-{step:04d}.vtu",
                        [
                            ("u", event.state_component[0]),
                            ("v", event.state_component[1:]),
                            ])
            t_last_step = time()
Ejemplo n.º 50
0
def get_context():
    """Return the module-wide OpenCL context, creating it on first use.

    The context is cached in the module global ``_ctx``; later calls return
    the cached object without creating a new one.
    """
    global _ctx
    if _ctx is not None:
        return _ctx
    _ctx = cl.create_some_context()
    return _ctx
Ejemplo n.º 51
0
def main():
    """Solve a Cahn-Hilliard boundary value problem on an ellipse and plot it.

    Builds a refined QBX layer potential source on an ellipse curve mesh,
    solves the boundary operator equation with GMRES for the density
    ``sigma``, evaluates the representation on a grid of target points and
    writes the result, together with a double-layer "indicator" field, to
    ``potential.vts``.  If target association fails, the failed-target flags
    are written to ``failed-targets.vts`` and the exception is re-raised.

    Problem parameters (``nelements``, ``mesh_order``, ``bdry_quad_order``,
    ``bdry_ovsmp_quad_order``, ``qbx_order``, ``fmm_order``) are read from
    module level.
    """
    import logging
    logging.basicConfig(level=logging.INFO)

    from functools import partial

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    from pytential.qbx import (
            QBXLayerPotentialSource, QBXTargetAssociationFailedException)
    from pytential.solve import gmres
    from pytential.symbolic.pde.cahn_hilliard import CahnHilliardOperator
    from sumpy.kernel import LaplaceKernel
    from sumpy.visualization import FieldPlotter

    context = cl.create_some_context()
    cmd_queue = cl.CommandQueue(context)

    # {{{ geometry and layer potential source

    curve_mesh = make_curve_mesh(
                partial(ellipse, 2),
                np.linspace(0, 1, nelements+1),
                mesh_order)

    group_factory = InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order)
    pre_density_discr = Discretization(context, curve_mesh, group_factory)

    unrefined_qbx = QBXLayerPotentialSource(
            pre_density_discr,
            fine_order=bdry_ovsmp_quad_order,
            qbx_order=qbx_order,
            fmm_order=fmm_order,
            expansion_disks_in_tree_have_extent=True,
            )
    qbx, _ = unrefined_qbx.with_refinement()
    density_discr = qbx.density_discr

    # }}}

    # FIXME: Constants?
    chop = CahnHilliardOperator(lambda1=1.5, lambda2=1.25, c=1)

    unk = chop.make_unknown("sigma")
    bound_op = bind(qbx, chop.operator(unk))

    # {{{ fix rhs and solve

    nodes = density_discr.nodes().with_queue(cmd_queue)

    def boundary_angle(xvec):
        x, y = xvec
        return cl.clmath.atan2(y, x)

    # FIXME: Realistic BC
    bc = sym.make_obj_array([
        boundary_angle(nodes),
        -boundary_angle(nodes),
        ])

    linear_op = bound_op.scipy_op(cmd_queue, "sigma", dtype=np.complex128)
    gmres_result = gmres(
            linear_op, bc,
            tol=1e-8,
            progress=True,
            stall_iterations=0,
            hard_failure=True)

    # }}}

    # {{{ postprocess/visualize

    sigma = gmres_result.solution

    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)
    targets = cl.array.to_device(cmd_queue, fplot.points)

    qbx_stick_out = qbx.copy(target_association_tolerance=0.05)
    indicator_qbx = qbx_stick_out.copy(qbx_order=2)

    # Double layer of a unit density, evaluated at the plot targets.
    ones_density = density_discr.zeros(cmd_queue)
    ones_density.fill(1)
    indicator_op = bind(
            (indicator_qbx, PointsTarget(targets)),
            sym.D(LaplaceKernel(2), sym.var("sigma")))
    indicator = indicator_op(cmd_queue, sigma=ones_density).get()

    try:
        fld_in_vol = bind(
                (qbx_stick_out, PointsTarget(targets)),
                chop.representation(unk))(cmd_queue, sigma=sigma).get()
    except QBXTargetAssociationFailedException as exc:
        # Dump which targets could not be associated before giving up.
        failed_flags = exc.failed_target_flags.get(cmd_queue)
        fplot.write_vtk_file("failed-targets.vts", [("failed", failed_flags)])
        raise

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    output_fields = [
        ("potential", fld_in_vol),
        ("indicator", indicator),
        ]
    fplot.write_vtk_file("potential.vts", output_fields)

    # }}}
Ejemplo n.º 52
0
 def setup(proc_id):
     """Create an OpenCL context for ``proc_id`` and return it in a dict.

     ``answers=[0, proc_id]`` is passed to ``pyopencl.create_some_context``;
     presumably this selects platform 0 and device ``proc_id`` -- TODO
     confirm.  The context is returned under the key ``'cl_context'``.
     """
     return {'cl_context': pyopencl.create_some_context(answers=[0, proc_id])}
Ejemplo n.º 53
0
def require_init_gpu():
    """Create the global OpenCL context and queue unless a queue already exists.

    Only ``cl_queue`` is checked; when it is ``None`` both module globals
    ``cl_ctx`` and ``cl_queue`` are (re)assigned.
    """
    global cl_ctx, cl_queue
    if cl_queue is not None:
        return
    # change if you don't have mac
    cl_ctx = cl.create_some_context(answers=[0, 2])
    cl_queue = cl.CommandQueue(cl_ctx)
Ejemplo n.º 54
0
def main():
    """Solve a 2D dielectric interface problem on an ellipse and plot the fields.

    Builds a QBX layer potential source on an ellipse curve mesh, sets up
    ``DielectricSDRep2DBoundaryOperator`` for a single interface between
    domains 0 and 1, uses the field of one point source as incoming field,
    solves the boundary equation with GMRES, and writes the resulting
    fields on a grid of points to ``potential.vts``.
    """
    logging.basicConfig(level=logging.INFO)

    nelements = 60
    qbx_order = 3
    k_fac = 4
    k0 = 3*k_fac
    k1 = 2.9*k_fac
    mesh_order = 10
    bdry_quad_order = mesh_order
    bdry_ovsmp_quad_order = bdry_quad_order * 4
    fmm_order = qbx_order * 2

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    from functools import partial
    mesh = make_curve_mesh(
            partial(ellipse, 3),
            np.linspace(0, 1, nelements+1),
            mesh_order)

    density_discr = Discretization(
            cl_ctx, mesh,
            InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order))

    # Let logging do the formatting (only when the record is emitted).
    logger.info("%d elements", mesh.nelements)

    # from meshmode.discretization.visualization import make_visualizer
    # bdry_vis = make_visualizer(queue, density_discr, 20)

    # {{{ solve bvp

    from sumpy.kernel import HelmholtzKernel
    kernel = HelmholtzKernel(2)

    beta = 2.5*k_fac
    K0 = np.sqrt(k0**2-beta**2)
    K1 = np.sqrt(k1**2-beta**2)

    from pytential.symbolic.pde.scalar import DielectricSDRep2DBoundaryOperator
    pde_op = DielectricSDRep2DBoundaryOperator(
            mode='tm',
            k_vacuum=1,
            interfaces=((0, 1, sym.DEFAULT_SOURCE),),
            domain_k_exprs=(k0, k1),
            beta=beta)

    op_unknown_sym = pde_op.make_unknown("unknown")

    representation0_sym = pde_op.representation(op_unknown_sym, 0)
    representation1_sym = pde_op.representation(op_unknown_sym, 1)

    from pytential.qbx import QBXLayerPotentialSource
    qbx = QBXLayerPotentialSource(
            density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
            fmm_order=fmm_order
            )

    bound_pde_op = bind(qbx, pde_op.operator(op_unknown_sym))

    # in inner domain
    sources_1 = make_obj_array(list(np.array([
        [-1.5, 0.5]
        ]).T.copy()))
    strengths_1 = np.array([1])

    from sumpy.p2p import P2P
    pot_p2p = P2P(cl_ctx, [kernel], exclude_self=False)

    # Incoming field of the point source, evaluated at the boundary nodes.
    _, (Einc,) = pot_p2p(queue, density_discr.nodes(), sources_1, [strengths_1],
                    out_host=False, k=K0)

    sqrt_w = bind(density_discr, sym.sqrt_jac_q_weight())(queue)

    # Object array with one right-hand-side entry per boundary condition.
    # ``np.object`` was removed in NumPy 1.24; the builtin ``object`` is the
    # same dtype.
    bvp_rhs = np.zeros(len(pde_op.bcs), dtype=object)
    for i_bc, terms in enumerate(pde_op.bcs):
        for term in terms:
            assert term.i_interface == 0
            assert term.field_kind == pde_op.field_kind_e

            if term.direction == pde_op.dir_none:
                bvp_rhs[i_bc] += (
                        term.coeff_outer * (-Einc)
                        )
            elif term.direction == pde_op.dir_normal:
                # no jump in normal derivative
                bvp_rhs[i_bc] += 0*Einc
            else:
                raise NotImplementedError("direction spec in RHS")

        bvp_rhs[i_bc] *= sqrt_w

    from pytential.solve import gmres
    gmres_result = gmres(
            bound_pde_op.scipy_op(queue, "unknown", dtype=np.complex128,
                domains=[sym.DEFAULT_TARGET]*2, K0=K0, K1=K1),
            bvp_rhs, tol=1e-6, progress=True,
            hard_failure=True, stall_iterations=0)

    # }}}

    unknown = gmres_result.solution

    # {{{ visualize

    # Second source with a lower FMM order, used only for the indicator.
    lap_qbx = QBXLayerPotentialSource(
            density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
            fmm_order=qbx_order
            )

    from sumpy.visualization import FieldPlotter
    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=300)
    from pytential.target import PointsTarget
    fld0 = bind(
            (qbx, PointsTarget(fplot.points)),
            representation0_sym)(queue, unknown=unknown, K0=K0).get()
    fld1 = bind(
            (qbx, PointsTarget(fplot.points)),
            representation1_sym)(queue, unknown=unknown, K1=K1).get()
    # NOTE(review): relies on ``fill`` returning the array itself -- confirm.
    ones = cl.array.empty(queue, density_discr.nnodes, np.float64)
    dom1_indicator = -bind(
            (lap_qbx, PointsTarget(fplot.points)),
            sym.D(0, sym.var("sigma")))(
                    queue, sigma=ones.fill(1)).get()
    _, (fld_inc_vol,) = pot_p2p(queue, fplot.points, sources_1, [strengths_1],
                    out_host=True, k=K0)

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    fplot.write_vtk_file(
            "potential.vts",
            [
                ("fld0", fld0),
                ("fld1", fld1),
                ("fld_inc_vol", fld_inc_vol),
                # Incoming plus domain-0 field where the indicator is 0,
                # domain-1 field where it is 1.
                ("fld_total", (
                    (fld_inc_vol + fld0)*(1-dom1_indicator)
                    +
                    fld1*dom1_indicator
                    )),
                ("dom1_indicator", dom1_indicator),
                ]
            )

    # }}}
Ejemplo n.º 55
0
harmless, albeit annoying.
"""
from __future__ import print_function

import os
import warnings
import logging
import time

import numpy as np  # type: ignore

# Import-time availability check: importing this module fails with
# RuntimeError when pyopencl cannot be imported or no context can be created.
try:
    #raise NotImplementedError("OpenCL not yet implemented for new kernel template")
    import pyopencl as cl  # type: ignore
    # Ask OpenCL for the default context so that we know that one exists
    # (non-interactively; the created context itself is discarded).
    cl.create_some_context(interactive=False)
except Exception as exc:
    # Any failure is reported as a warning carrying the original message and
    # then converted to RuntimeError.
    warnings.warn("OpenCL startup failed with ***" + str(exc) +
                  "***; using C compiler instead")
    raise RuntimeError("OpenCL not available")

from pyopencl import mem_flags as mf
from pyopencl.characterize import get_fast_inaccurate_build_options

from . import generate
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any
    from .modelinfo import ModelInfo
Ejemplo n.º 56
0
def main():
    """Plot the difference between FMM-accelerated and unaccelerated QBX.

    A single-layer potential with density ``cos(mode_nr * angle)`` on a
    starfish curve is evaluated on a grid of target points twice: once with
    ``fmm_order=False`` and once with ``fmm_order=10``.  The base-10 log of
    the absolute difference is saved to ``fmm-error-order-<qbx_order>.pdf``.
    Returns early, without plotting, if matplotlib cannot be imported.
    """
    import logging
    logging.basicConfig(level=logging.WARNING)  # INFO for more progress info

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory
    from pytential import GeometryCollection
    from pytential import bind, sym
    from pytential.qbx import QBXLayerPotentialSource
    from pytential.target import PointsTarget

    context = cl.create_some_context()
    array_context = PyOpenCLArrayContext(cl.CommandQueue(context))

    target_order = 16
    qbx_order = 3
    nelements = 60
    mode_nr = 0

    # k == 0 selects the Laplace kernel, anything else Helmholtz.
    k = 0
    kernel = HelmholtzKernel(2) if k else LaplaceKernel(2)

    # An ellipse (lambda t: ellipse(1, t)) can be used instead of starfish.
    element_breaks = np.linspace(0, 1, nelements + 1)
    curve_mesh = make_curve_mesh(starfish, element_breaks, target_order)

    group_factory = InterpolatoryQuadratureSimplexGroupFactory(target_order)
    pre_density_discr = Discretization(array_context, curve_mesh, group_factory)

    direct_qbx = QBXLayerPotentialSource(
        pre_density_discr,
        fine_order=2 * target_order,
        qbx_order=qbx_order,
        fmm_order=False,
        target_association_tolerance=.05,
    )

    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=600)

    places = GeometryCollection({
        "unaccel_qbx": direct_qbx,
        "qbx": direct_qbx.copy(fmm_order=10),
        "targets": PointsTarget(fplot.points)
    })
    density_discr = places.get_discretization("unaccel_qbx")

    nodes = thaw(array_context, density_discr.nodes())
    angle = array_context.np.arctan2(nodes[1], nodes[0])

    # Alternatives to the single layer used here:
    #   sym.d_dx(2, sym.S(kernel, sym.var("sigma"), **op_kwargs))
    #   sym.D(kernel, sym.var("sigma"), **op_kwargs)
    op_kwargs = {"qbx_forced_limit": None}
    if k:
        op_kwargs["k"] = sym.var("k")
    op = sym.S(kernel, sym.var("sigma"), **op_kwargs)

    sigma = array_context.np.cos(mode_nr * angle)

    if isinstance(kernel, HelmholtzKernel):
        for idx, entry in np.ndenumerate(sigma):
            sigma[idx] = entry.astype(np.complex128)

    def evaluate(source_name):
        # Evaluate ``op`` from the named source geometry at the plot targets.
        bound_op = bind(places, op, auto_where=(source_name, "targets"))
        return bound_op(array_context, sigma=sigma, k=k).get()

    fld_in_vol = evaluate("unaccel_qbx")
    fmm_fld_in_vol = evaluate("qbx")

    err = fmm_fld_in_vol - fld_in_vol

    try:
        import matplotlib
    except ImportError:
        return

    matplotlib.use("Agg")

    import matplotlib.pyplot as pt
    from matplotlib.colors import Normalize
    from matplotlib.ticker import NullFormatter

    # The 1e-17 offset keeps log10 finite where the error is exactly zero.
    im = fplot.show_scalar_in_matplotlib(np.log10(np.abs(err) + 1e-17))
    im.set_norm(Normalize(vmin=-12, vmax=0))

    axes = pt.gca()
    axes.xaxis.set_major_formatter(NullFormatter())
    pt.gca().yaxis.set_major_formatter(NullFormatter())

    cb = pt.colorbar(shrink=0.9)
    cb.set_label(r"$\log_{10}(\mathrm{Error})$")

    pt.savefig("fmm-error-order-%d.pdf" % qbx_order)
def generateAN(wavelen, modes):
    """Run the ``pi`` kernel from ``generateAN.cl`` once per mode number.

    For every ``n`` in ``range(modes)`` the kernel is launched, the per
    work-group partial sums are copied back to the host, summed, and scaled
    by the integration step size.

    Parameters:
     - wavelen - wavelength; passed to the kernel and used to set the
        integration interval ``[-6 * wavelen, 6 * wavelen]``
     - modes - number of modes; mode numbers ``0 .. modes - 1`` are evaluated

    Returns a list with one integral value per mode.

    Requires ``generateAN.cl`` in the current working directory and an
    OpenCL device selectable through ``pycl.create_some_context()``.
    """
    with open('generateAN.cl', 'r') as myfile:
        integratePI = myfile.read()

    # Default number of integration steps and of iterations per work-item.
    # Integer division keeps these counts as ints (true division would turn
    # every derived size into a float).
    INSTEPS = 512 * 512
    ITERS = 262144 // 2048

    in_nsteps = INSTEPS
    niters = ITERS

    # Create context, queue and build program
    context = pycl.create_some_context()
    queue = pycl.CommandQueue(context)

    program = pycl.Program(context, integratePI).build()
    pi = program.pi
    # Scalar arguments: mode number, iteration count, step size, start,
    # wavelength. The two None entries are the local-memory and buffer args.
    pi.set_scalar_arg_dtypes([
        numpy.int32, numpy.int32, numpy.float32, numpy.float32, numpy.float32,
        None, None
    ])

    # Get the max work group size for the kernel pi on our device
    device = context.devices[0]
    work_group_size = pi.get_work_group_info(
        pycl.kernel_work_group_info.WORK_GROUP_SIZE, device)

    # Now that we know the size of the work groups, derive the number of work
    # groups and the actual number of steps. If the device's work-group size
    # is so large that the quotient would be zero, fall back to a single
    # work-group; nsteps and step_size below are derived from the values
    # actually used, so the integration interval is still covered.
    nwork_groups = max(1, in_nsteps // (work_group_size * niters))
    print(nwork_groups)

    nsteps = work_group_size * niters * nwork_groups

    d = 3 * wavelen
    t = 6 * wavelen

    # Integration bounds
    start = -d - t / 2
    end = d + t / 2
    step_size = (end - start) / float(nsteps)
    print(step_size)

    # Host vector that receives one partial sum per work-group
    h_psum = numpy.empty(nwork_groups, dtype=numpy.float32)

    print("%s work groups of size %s" % (nwork_groups, work_group_size))
    print("Integration steps %s" % nsteps)

    d_partial_sums = pycl.Buffer(context, pycl.mem_flags.WRITE_ONLY,
                                 h_psum.nbytes)

    # Start the timer
    rtime = time()

    # Global and local sizes as tuples; one float32 of local memory per
    # work-item in a group.
    global_size = (nwork_groups * work_group_size, )
    local_size = (work_group_size, )
    localmem = pycl.LocalMemory(
        numpy.dtype(numpy.float32).itemsize * work_group_size)
    print(niters)

    AN = []
    for n in range(modes):
        pi(queue, global_size, local_size, n, niters, step_size,
           start, wavelen, localmem, d_partial_sums)
        pycl.enqueue_copy(queue, h_psum, d_partial_sums)

        # Complete the sum on the host and compute the final integral value
        AN.append(h_psum.sum() * step_size)

    # Stop the timer
    rtime = time() - rtime
    print(rtime)
    return AN
Ejemplo n.º 58
0
def run(double_precision=False):
    """Time and check batched 2D FFTs on an OpenCL device.

    A (4, 1024, 1024) complex array is transformed along the axis pairs
    (1, 2) and (2, 1). Out-of-place transforms are run for every pairing of
    C- and Fortran-ordered input and output arrays, and each result is
    compared with ``npfftn`` applied to the host data. In-place transforms
    are then timed (but not compared) for both input layouts.

    Parameters:
     - double_precision - if True use complex128 data and a tolerance of
        1e-8, otherwise complex64 data and a tolerance of 1e-3
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    dtype = np.complex128 if double_precision else np.complex64
    tolerance = 1e-8 if double_precision else 1e-3

    n_run = 100  # set to 1 for proper testing

    def layout(array):
        # One-letter memory-order tag used in the report lines.
        return 'C' if array.flags.c_contiguous else 'F'

    def benchmark(transform):
        # Enqueue the transform n_run times, wait on the events of the last
        # enqueue, and return (milliseconds per run, GFLOPS estimate).
        started = timeit.default_timer()
        for _ in range(n_run):
            events = transform.enqueue()
        for event in events:
            event.wait()
        elapsed_ms = 1e3 * (timeit.default_timer() - started) / n_run
        n_points = np.prod(transform.t_shape)
        rate = 5e-9 * np.log2(n_points) * n_points * transform.batchsize / (
            1e-3 * elapsed_ms)
        return elapsed_ms, rate

    shape = (4, 1024, 1024)
    if n_run > 1:
        # random input (original remark: faster than 1024x1024?)
        nd_dataC = np.random.normal(size=shape).astype(dtype)
    else:
        nd_dataC = np.ones(shape, dtype=dtype)  # set n_run to 1
    nd_dataF = np.asfortranarray(nd_dataC)

    dataC = cla.to_device(queue, nd_dataC)
    dataF = cla.to_device(queue, nd_dataF)

    nd_result = np.zeros_like(nd_dataC, dtype=dtype)
    resultC = cla.to_device(queue, nd_result)
    resultF = cla.to_device(queue, np.asfortranarray(nd_result))

    axes_list = [(1, 2), (2, 1)]  # batched 2d transforms

    print('out of place transforms', dataC.shape, dataC.dtype)
    print('axes         in out')
    for axes in axes_list:
        for data in (dataC, dataF):
            for result in (resultC, resultF):
                try:
                    transform = FFT(context, queue, data, result, axes=axes)
                    # NOTE: transform.plan.transpose_result = True is not
                    # implemented for some transforms (works e.g. for out of
                    # place, (2,1) C C).
                    print('%-10s %3s %3s' %
                          (axes, layout(data), layout(result)),
                          end=' ')

                    t_ms, gflops = benchmark(transform)

                    expected = npfftn(nd_dataC, axes=axes)
                    if transform.plan.transpose_result:
                        expected = np.swapaxes(expected, axes[0], axes[1])
                    max_error = np.max(abs(result.get() - expected))
                    print('%8.1e' % max_error, end=' ')
                    assert_allclose(result.get(),
                                    expected,
                                    atol=tolerance,
                                    rtol=tolerance)
                except GpyFFT_Error as e:
                    print(e)
                    t_ms, gflops = 0, 0
                except AssertionError as e:
                    print(e)
                finally:
                    # Always terminate the report line with the timing.
                    print('%5.2fms %6.2f Gflops' % (t_ms, gflops))

    print('in place transforms', nd_dataC.shape, nd_dataC.dtype)

    for axes in axes_list:
        for nd_data in (nd_dataC, nd_dataF):
            data = cla.to_device(queue, nd_data)
            transform = FFT(context, queue, data, axes=axes)
            # NOTE (original remark): inplace transform fails for n_run > 1;
            # transpose_result is not implemented here.
            t_ms, gflops = benchmark(transform)
            print('%-10s %3s %5.2fms %6.2f Gflops' %
                  (axes, layout(data), t_ms, gflops))
Ejemplo n.º 59
0
def simple_wave_entrypoint(dim=2,
                           num_elems=256,
                           order=4,
                           num_steps=30,
                           log_filename="grudge.dat"):
    """Run a distributed wave-equation test case and print profile data.

    The manager rank builds a regular rectangular mesh, partitions it with
    ``pymetis`` and sends the parts out; the other ranks receive theirs. A
    ``StrongWaveOperator`` with a time-dependent source is then advanced
    with the stepper returned by ``set_up_rk4`` while a ``LogManager``
    records timing quantities.

    Parameters:
     - dim - spatial dimension of the mesh
     - num_elems - target element count; ``int(num_elems**(1. / dim))`` is
        passed as ``n`` per axis to the mesh generator
     - order - order passed to the discretization
     - num_steps - number of time steps of size 0.04
     - log_filename - file the LogManager writes to
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()
    per_axis = int(num_elems**(1. / dim))

    from meshmode.distributed import MPIMeshDistributor
    distributor = MPIMeshDistributor(comm)

    # (method name spelled as the distributor API is called in this file)
    if not distributor.is_mananger_rank():
        local_mesh = distributor.receive_mesh_part()
    else:
        from meshmode.mesh.generation import generate_regular_rect_mesh
        lower = (-0.5, ) * dim
        upper = (0.5, ) * dim
        mesh = generate_regular_rect_mesh(a=lower,
                                          b=upper,
                                          n=(per_axis, ) * dim)

        from pymetis import part_graph
        adjacency = mesh.nodal_adjacency
        _, membership = part_graph(num_parts,
                                   xadj=adjacency.neighbors_starts.tolist(),
                                   adjncy=adjacency.neighbors.tolist())
        part_per_element = np.array(membership)

        local_mesh = distributor.send_mesh_parts(mesh, part_per_element,
                                                 num_parts)

    vol_discr = DGDiscretizationWithBoundaries(cl_ctx,
                                               local_mesh,
                                               order=order,
                                               mpi_communicator=comm)

    # Source term parameters
    source_center = np.array([0.1, 0.22, 0.33])[:local_mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(local_mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import StrongWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE
    dist_squared = np.dot(sym_source_center_dist, sym_source_center_dist)
    source = (sym.sin(source_omega * sym_t) *
              sym.exp(-dist_squared / source_width**2))
    op = StrongWaveOperator(-0.1,
                            vol_discr.dim,
                            source_f=source,
                            dirichlet_tag=BTAG_NONE,
                            neumann_tag=BTAG_NONE,
                            radiation_tag=BTAG_ALL,
                            flux_type="upwind")

    from pytools.obj_array import join_fields
    fields = join_fields(
        vol_discr.zeros(queue),
        [vol_discr.zeros(queue) for _ in range(vol_discr.dim)])

    from pytools.log import LogManager, \
            add_general_quantities, \
            add_run_info, \
            IntervalTimer, EventCounter
    # NOTE: LogManager hangs when using a file on a shared directory.
    logmgr = LogManager(log_filename, "w", comm)
    add_run_info(logmgr)
    add_general_quantities(logmgr)

    # (name, quantity class, description) for every logged quantity
    quantity_specs = (
        ("rank_data_swap_timer", IntervalTimer,
         "Time spent evaluating RankDataSwapAssign"),
        ("rank_data_swap_counter", EventCounter,
         "Number of RankDataSwapAssign instructions evaluated"),
        ("exec_timer", IntervalTimer,
         "Total time spent executing instructions"),
        ("insn_eval_timer", IntervalTimer,
         "Time spend evaluating instructions"),
        ("future_eval_timer", IntervalTimer,
         "Time spent evaluating futures"),
        ("busy_wait_timer", IntervalTimer,
         "Time wasted doing busy wait"),
    )
    log_quantities = {
        name: factory(name, description)
        for name, factory, description in quantity_specs
    }
    for quantity in log_quantities.values():
        logmgr.add_quantity(quantity)

    bound_op = bind(vol_discr, op.sym_operator())

    def rhs(t, w):
        # The profile data returned by one evaluation is fed into the next;
        # it is kept as an attribute on this function.
        val, rhs.profile_data = bound_op(queue,
                                         profile_data=rhs.profile_data,
                                         log_quantities=log_quantities,
                                         t=t,
                                         w=w)
        return val

    rhs.profile_data = {}

    dt = 0.04
    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    # Close one log interval and open the next after every computed state.
    logmgr.tick_before()
    for event in dt_stepper.run(t_end=dt * num_steps):
        if not isinstance(event, dt_stepper.StateComputed):
            continue
        logmgr.tick_after()
        logmgr.tick_before()
    logmgr.tick_after()

    data = rhs.profile_data

    def percent_of_total(key):
        # Share of the total execution time, in percent.
        return data[key] / data['total_time'] * 100

    print("""execute() for rank %d:
            \tInstruction Evaluation: %f%%
            \tFuture Evaluation: %f%%
            \tBusy Wait: %f%%
            \tTotal: %f seconds""" %
          (comm.Get_rank(), percent_of_total('insn_eval_time'),
           percent_of_total('future_eval_time'),
           percent_of_total('busy_wait_time'), data['total_time']))

    logmgr.close()
Ejemplo n.º 60
0
    def __init__(self,
                 gene_mat,
                 pop,
                 adj_mat,
                 bound=None,
                 secretion=None,
                 reception=None,
                 receptors=None,
                 init_env=None,
                 secr_amount=1.0,
                 leak=1.0,
                 max_con=1000.0,
                 max_dist=None,
                 opencl=False):
        """
        Init of Stops
        Parameters:
         - gene_mat - matrix of gene interactions [GENE_NUM, GENE_NUM]
         - pop - array with initial population [POP_SIZE, GENE_NUM]
         - adj_mat - matrix with distances between each cell in population[POP_SIZE, POP_SIZE]
         - bound - vector of max value of each gene [GENE_NUM];
            defaults to all ones
         - secretion - vector of length LIG_NUM where secretion[i] contains index
            of a gene which must be on to secrete ligand i; defaults to empty
         - reception - vector of length LIG_NUM where reception[i] contains index
            of a gene which will be set to on when ligand i is accepted;
            defaults to empty
         - receptors - vector of length LIG_NUM where receptors[i] contains index
            of a gene which has to be on to accept ligand i; special value -1 means that there is no
            need for specific gene expression for the ligand (the default)
         - init_env - initial ligand concentrations, replicated for every
            cell; defaults to zeros of length LIG_NUM
         - secr_amount - amount of ligand secreted to the environment each time
         - leak - amount of ligand leaking from the environment each time
         - max_con - maximal ligand concentration
         - max_dist - maximal distance between a cell and an environment needed for
            the cell to accept ligands from the environment; defaults to the
            largest entry of adj_mat
         - opencl - if set to True opencl is used to boost the speed
        """
        self.gene_mat = numpy.array(gene_mat).astype(numpy.float32)
        self.pop = numpy.array(pop).astype(numpy.float32)
        self.adj_mat = numpy.array(adj_mat).astype(numpy.float32)
        self.secr_amount = secr_amount
        self.leak = leak
        self.max_con = max_con
        self.row_size = self.gene_mat.shape[0]
        self.pop_size = self.pop.shape[0]

        self.max_dist = numpy.max(adj_mat) if max_dist is None else max_dist

        # Optional arguments are tested with "is not None": comparing a numpy
        # array argument with "!= None" is elementwise and cannot be used as
        # a condition.
        if bound is not None:
            self.bound = numpy.array(bound).astype(numpy.float32)
        else:
            # bound default - all ones
            self.bound = numpy.ones(self.row_size).astype(numpy.float32)

        if secretion is not None:
            self.secretion = numpy.array(secretion).astype(numpy.int32)
        else:
            self.secretion = numpy.array([]).astype(numpy.int32)

        if reception is not None:
            self.reception = numpy.array(reception).astype(numpy.int32)
        else:
            self.reception = numpy.array([]).astype(numpy.int32)

        # Use the normalized array so the default secretion=None yields zero
        # ligands instead of failing on len(None).
        self.max_lig = len(self.secretion)

        if init_env is None:
            self.init_env = numpy.zeros(self.max_lig)
        else:
            self.init_env = init_env

        # One copy of the initial environment per cell
        self.env = numpy.array([self.init_env] * self.pop.shape[0]).astype(
            numpy.float32)

        if receptors is not None:
            self.receptors = numpy.array(receptors).astype(numpy.int32)
        else:
            # receptors - default value "-1" - no receptor for ligand is necessary
            self.receptors = numpy.array([-1] * self.max_lig).astype(
                numpy.int32)

        self._random = numpy.random.random

        self.opencl = opencl
        self.pop_hit = numpy.zeros(
            (self.pop_size, self.max_lig)).astype(numpy.int32)

        if opencl:
            self.ctx = cl.create_some_context()
            self.queue = cl.CommandQueue(self.ctx)
            self.mf = cl.mem_flags
            #init kernel
            self.program = self.__prepare_kernel()
            # 112 bytes per cell - presumably the size of the generator state
            # used by the kernel; TODO confirm against the kernel source.
            self.rand_state_buf = cl.Buffer(self.ctx,
                                            self.mf.READ_WRITE,
                                            size=self.pop.shape[0] * 112)
            # Draw the seed directly in the uint32 range; a larger value
            # cannot be represented by numpy.uint32.
            seed = numpy.uint32(
                numpy.random.randint(0, 2**32, dtype=numpy.uint32))
            self.program.init_ranlux(self.queue, (self.pop.shape[0], 1), None,
                                     seed, self.rand_state_buf)
            # prepare multiplication matrix
            adj_mat_buf = cl.Buffer(self.ctx,
                                    self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
                                    hostbuf=self.adj_mat)
            self.mul_mat_buf = cl.Buffer(self.ctx,
                                         self.mf.READ_WRITE,
                                         size=self.adj_mat.nbytes)
            self.program.init_mul_mat(self.queue, (self.pop.shape[0], 1), None,
                                      self.mul_mat_buf, adj_mat_buf,
                                      numpy.float32(self.max_dist))
        else:
            # Use the resolved limit (self.max_dist), as the OpenCL branch
            # does: the raw max_dist argument may be None.
            limit = self.max_dist
            self.mul_mat = mmap(
                lambda x: 1. / x if x != 0 and x <= limit else 0., adj_mat)
            n_density = numpy.sum(self.mul_mat, axis=0)
            # A column with zero density holds only zeros; divide it by 1 so
            # it stays zero instead of becoming NaN.
            n_density = numpy.where(n_density == 0, 1.0, n_density)
            self.mul_mat = self.mul_mat / n_density
            self.mul_mat = self.mul_mat.astype(numpy.float32)