def __init__(self, profile=False, device=None, manual=False):
    """
    Initialize a device, a context and a queue.

    The preferred device is a NVIDIA GPU with maximum compute capability.

    @param profile : (optional) if True, enable profiling of the OpenCL events
    @param device : (optional) device in the format (0, 0), used as
        (index into cl.get_platforms(), index into that platform's devices)
    @param manual : (optional) if True, choose manually a device from the
        PyOpenCL prompt.
    """
    platforms = cl.get_platforms()
    if manual:
        self.ctx = cl.create_some_context()
        # The context lives on ``self``; there is no local ``ctx`` here.
        self.device = self.ctx.devices[0]
    elif device:
        self.device = platforms[device[0]].get_devices()[device[1]]
        self.ctx = cl.Context([self.device])
    else:
        # Try to choose a NVIDIA card with best compute capability
        cc_max = -1
        cc_argmax = (0, 0)
        for i_p, p in enumerate(platforms):
            for i_dev, dev in enumerate(p.get_devices()):
                try:
                    cc = (dev.compute_capability_major_nv
                          + 0.1 * dev.compute_capability_minor_nv)
                except Exception:
                    # Best effort: devices that cannot report the NVIDIA
                    # compute-capability attributes are simply skipped.
                    continue
                if cc > cc_max:
                    cc_max = cc
                    cc_argmax = (i_p, i_dev)
        if cc_max == -1:
            print("Warning: could not find a NVIDIA card. Please pick up manually the target device")
            self.ctx = cl.create_some_context()
            self.device = self.ctx.devices[0]
        else:
            self.device = platforms[cc_argmax[0]].get_devices()[cc_argmax[1]]
            self.ctx = cl.Context([self.device])
    # ------------
    self.devicename = self.device.name
    if profile:
        self.queue = cl.CommandQueue(
            self.ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
    else:
        self.queue = cl.CommandQueue(self.ctx)
    self.mf = cl.mem_flags
    self.path = []
    self.book = {}
def __init__(self):
    """Set up OpenCL state and upload the constant lookup tables.

    Creates a context and command queue, loads the ``costs.cl`` and
    ``dijkstra.cl`` programs through ``loadProgram`` and copies four
    read-only tables to the device: the eight (dx, dy) neighbour steps as
    float32 and as int32, the (cos, sin) pairs of the odd multiples of
    pi/8, and three speed values.
    """
    self.context = cl.create_some_context()
    self.queue = cl.CommandQueue(self.context)
    self.costs = loadProgram(self.context, "costs.cl")
    self.dijkstra = loadProgram(self.context, "dijkstra.cl")

    read_only_copy = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR

    def upload(table):
        # Device buffer initialised from (and sized like) the host array.
        return cl.Buffer(self.context, read_only_copy, hostbuf=table)

    steps_f32 = np.array(
        [
            [1., 0.],
            [1., 1.],
            [0., 1.],
            [-1., 1.],
            [-1., 0.],
            [-1., -1.],
            [0., -1.],
            [1., -1.],
        ],
        dtype=np.float32,
    )
    steps_i32 = steps_f32.astype(np.int32)

    # (cos, sin) of pi/8, 3*pi/8, 5*pi/8 and 7*pi/8.
    unit_vectors = np.array(
        [[cos(k * pi / 8), sin(k * pi / 8)] for k in (1, 3, 5, 7)],
        dtype=np.float32,
    )
    speed_table = np.array([0., 0.02, 0.08], dtype=np.float32)

    self.fdirection_buffer = upload(steps_f32)
    self.idirection_buffer = upload(steps_i32)
    self.angle_buffer = upload(unit_vectors)
    self.speed_buffer = upload(speed_table)
def calc_force(Gal, dt):
    """Calculate forces between bodies.

    F = ((G m_a m_b)/r^2)/((x_b-x_a)/r)

    Calls ``GPU_functions.CalcF`` with the positions and masses stored on
    ``Gal`` and writes the three returned arrays to ``Gal.dvx``,
    ``Gal.dvy`` and ``Gal.dvz``. Diagnostic values are then printed.

    :param Gal: object providing ``x``, ``y``, ``z`` and ``m``; receives
        ``dvx``, ``dvy`` and ``dvz``.
    :param dt: accepted for interface compatibility; not used here.
    """
    # The positional argument is ``interactive``: 0 only suppresses the
    # interactive prompt.
    # NOTE(review): it does not by itself pin "device 0, the GPU" - confirm
    # which device is actually selected on the target machine.
    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)

    if Timing:
        start = time.time()
    # Convention: dx[i,j] = x[i] - x[j]
    Gal.dvx, Gal.dvy, Gal.dvz = GPU_functions.CalcF(
        ctx, queue, Gal.x, Gal.y, Gal.z, Gal.m, 1.0, 1.0)
    if Timing:
        stop = time.time()
        print('Time for F_ij computation %s' % (stop - start,))

    # Diagnostics. %-formatting keeps the output identical to the former
    # Python 2 print statements while also running on Python 3.
    print('%s %s %s' % (Gal.x, Gal.y, Gal.z))
    print('%s %s %s' % (Gal.dvx, Gal.dvy, Gal.dvz))
    print('----End check')
    print('Check that the force is attracting')
    print('%s' % (Gal.x[120],))
    print('%s' % (Gal.dvx[120],))
    print('----End check')
def __init__(self, lmb, prompt=False, user_dev_selection=None, bindings=None):
    """Store the callable and bindings and create an OpenCL context/queue.

    ``prompt`` and ``user_dev_selection`` are mutually exclusive. With
    ``prompt`` set, the selection is taken from ``self.init()`` unless
    ``Py2OpenCL.only_one_device()`` is true, in which case it is cleared.
    A truthy selection is forwarded to pyopencl as ``answers``; otherwise
    pyopencl's default context creation is used.
    """
    assert not (prompt and user_dev_selection), \
        "Can't ask for @prompt and provide @user_dev_selection at the same time"

    self.lmb = lmb
    self.bindings = bindings

    # Assigned before ``self.init()`` may run, as in the original ordering.
    self.user_dev_selection = user_dev_selection
    if prompt:
        if Py2OpenCL.only_one_device():
            self.user_dev_selection = None
        else:
            self.user_dev_selection = self.init()

    if self.user_dev_selection:
        self.ctx = cl.create_some_context(
            interactive=False, answers=self.user_dev_selection)
    else:
        self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)
def test_cl():
    """Run the ``part1`` kernel from ``part1.cl`` and return its output.

    The camera position and the transposed inverse view and projection
    matrices are packed as 36 floats into one read-only buffer. The kernel
    is launched over a 1000 x 1000 global range and writes into a
    (1000, 1000, 4) float32 buffer. The profiled kernel time is printed.

    :return: ``numpy.ndarray`` of shape (1000, 1000, 4) and dtype float32
        holding the data read back from the device.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Context manager so the source file is closed even if reading fails.
    with open("part1.cl", "r") as f:
        fstr = f.read()
    program = cl.Program(ctx, fstr).build()
    mf = cl.mem_flags

    cameraPos = np.array([0, 6, -1, 0])
    invView = la.inv(look_at((0, 6, -1), (0, 1, 1), (0, 1, 0)))
    invProj = la.inv(perspective(60, 1, 1, 1000))
    # %-formatting matches the former Python 2 print output exactly.
    print("view %s" % (invView,))
    print("proj %s" % (invProj,))

    viewParamsData = (
        cameraPos.flatten().tolist()
        + np.transpose(invView).flatten().tolist()
        + np.transpose(invProj).flatten().tolist()
    )
    viewParams = struct.pack("4f16f16f", *viewParamsData)
    viewParams_buf = cl.Buffer(
        ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=viewParams)

    # setup opencl
    dest = np.ndarray((1000, 1000, 4), dtype=np.float32)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, dest.nbytes)

    # run kernel
    evt = program.part1(
        queue, (dest.shape[0], dest.shape[1]), None, viewParams_buf, dest_buf)

    # enqueue_copy replaces the legacy enqueue_read_buffer call.
    cl.enqueue_copy(queue, dest, dest_buf).wait()

    # Profiling timestamps are in nanoseconds; 1e-6 converts to ms.
    print("time %s ms" % ((evt.profile.end - evt.profile.start) * 0.000001,))
    return dest
def main():
    """Build a low-rank test matrix, run the interpolative decomposition
    through ``pyclid`` and print the spectral-norm error of the rank-``k``
    reconstruction.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx, properties=0)

    dtype = 'float64'
    n = 500
    k = 30

    A = setup_lowrank(n, dtype=dtype)
    print('finished setup')

    L = pyclid.util.setup_op(queue, A)
    idx, proj = pyclid.interp_decomp(queue, L, k)

    # begin debug
    import scipy.linalg as la
    import scipy.linalg.interpolative as sli
    from scipy.sparse.linalg import aslinearoperator

    # Skeleton columns picked by the decomposition.
    skeleton = A[:, idx[:k]]
    # [I | proj] with its columns put back into the original order.
    undo_permutation = np.argsort(idx)
    interp = np.hstack([np.eye(k), proj])[:, undo_permutation]
    approx = np.dot(skeleton, interp)
    print(la.norm(A - approx, 2))
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` when an index array is sorted alongside the keys.

    Marked as an expected failure on Apple CPU devices and for float64 on
    the "Portable Computing Language" platform.

    :param ctx_factory: callable returning the OpenCL context under test.
    :param size: number of elements to sort.
    :param dtype: dtype of the random key array.
    """
    # Use the context supplied by the fixture so the test runs on the
    # device the test run selected, not on an arbitrary one.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # Comparing against np.argsort directly may fail when the array holds
    # identical values, so compare the values gathered through the indices.
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
def __init__(self, coords, values, wantCL=True, platform_num=None):
    """
    Take the coordinates and values and build a KD tree.

    Keyword arguments:
    coords -- input coordinates (x, y)
    values -- input values
    wantCL -- if true and ``hasCL`` is set, try to initialise OpenCL
    platform_num -- index into ``cl.get_platforms()``; when it cannot be
                    used as an index (e.g. None), pyopencl chooses the
                    context itself

    Raises AssertionError when coords and values differ in length, and
    IndexError when platform_num is out of range.
    """
    self.coords = np.asarray(coords, dtype=np.float32)
    self.values = np.asarray(values, dtype=np.int32)

    if self.coords.shape[0] != self.values.shape[0]:
        raise AssertionError('lencoords does not equal lenvalues')

    self.wantCL = wantCL
    self.canCL = False

    if hasCL and self.wantCL:
        try:
            platforms = cl.get_platforms()
            try:
                # Store the platform on self: the next line reads
                # self.platform, which was previously never assigned.
                self.platform = platforms[platform_num]
                self.devices = self.platform.get_devices()
                self.context = cl.Context(self.devices)
            except TypeError:
                # platform_num is not usable as an index.
                # The user may be asked to select a platform.
                self.context = cl.create_some_context()
                self.devices = self.context.devices
            # An out-of-range platform_num (IndexError) propagates.

            self.queue = cl.CommandQueue(self.context)
            with open('idt.cl', 'r') as kernel_file:
                filestr = kernel_file.read()
            self.program = cl.Program(
                self.context, filestr).build(devices=self.devices)
            for device in self.devices:
                buildlog = self.program.get_build_info(
                    device, cl.program_build_info.LOG)
                if (len(buildlog) > 1):
                    print('Build log for device %s :\n%s'
                          % (device, buildlog))
            # Only the first kernel is used.
            self.kernel = self.program.all_kernels()[0]

            # Local and global sizes are device-dependent.
            self.local_size = {}
            self.global_size = {}
            # Groups should be overcommitted.
            # For now, use 3 (48 cores / 16 cores per halfwarp) * 2
            for device in self.devices:
                work_group_size = self.kernel.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
                num_groups_for_1d = device.max_compute_units * 3 * 2
                self.local_size[device] = (work_group_size,)
                self.global_size[device] = (
                    num_groups_for_1d * work_group_size,)
            self.canCL = True
        except cl.RuntimeError:
            print('warning: unable to use pyopencl, defaulting to cKDTree')

    if self.canCL:
        self.tree = build_tree(coords)
    else:
        self.tree = KDTree(coords)
def __init__(self, cl_mode=True, cl_device=None):
    """Initialize the class.

    When ``cl_mode`` is true an OpenCL context and queue are created:

    * ``cl_device == 'gpu'``: context over every GPU device found on any
      platform;
    * ``cl_device == 'cpu'``: context over the first CPU device found;
    * anything else: ``cl.create_some_context()``.

    The first device of the context and its type and compute-unit count
    are stored as well. ``cl_mode``, ``obs`` and ``samples`` are always set.
    """
    if cl_mode:
        import pyopencl as cl
        import pyopencl.array

        def devices_of_type(device_type):
            # Collect matching devices across all platforms. A platform
            # that raises an OpenCL error for the request is skipped.
            found = []
            for platform in cl.get_platforms():
                try:
                    found += platform.get_devices(device_type=device_type)
                except cl.Error:
                    continue
            return found

        if cl_device == 'gpu':
            self.ctx = cl.Context(devices_of_type(cl.device_type.GPU))
        elif cl_device == 'cpu':
            # Raises IndexError when no CPU device exists, as before.
            self.ctx = cl.Context([devices_of_type(cl.device_type.CPU)[0]])
        else:
            self.ctx = cl.create_some_context()

        self.queue = cl.CommandQueue(self.ctx)
        self.mf = cl.mem_flags
        self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
        self.device_type = self.device.type
        self.device_compute_units = self.device.max_compute_units

    self.cl_mode = cl_mode
    self.obs = []
    self.samples = {}
def __init__(self, cl_file_location='../../../c/kernels', interactive=True):
    """
    Initialising the Kernel loads the C code from kernel_file and sets up
    the necessary CommandQueue and context.

    :param cl_file_location: directory holding the kernel sources, joined
        onto the directory of this module.
    :param interactive: forwarded to ``cl.create_some_context``.

    Reads ``self.kernel_file`` and ``self.function_string``.
    NOTE(review): both are presumably class attributes set by subclasses;
    they are not assigned before use in this method - confirm.
    """
    # TODO: replace print with logger
    # TODO: investigate precompilation of kernels
    # TODO: set dType uniformly

    # log here
    print(cl.version.VERSION_TEXT)

    if self.kernel_file:
        # os.path.dirname handles any path separator and a bare file name,
        # unlike splitting on '/'.
        base = os.path.dirname(__file__)
        filename = os.path.join(base, cl_file_location, self.kernel_file)
        # log here
        print(os.path.abspath(filename))
        with open(filename) as kernel_source:
            self.kernel_string = kernel_source.read()
        if self.function_string:
            self.kernel_string = (
                self.function_string + '\n' + self.kernel_string)
        else:
            self.function_string = ""
    else:
        self.kernel_string = ""

    self.ctx = cl.create_some_context(interactive=interactive)
    # This should be a logging statement...
    print('Using device: %s' % self.ctx.get_info(cl.context_info.DEVICES))
    self.queue = cl.CommandQueue(self.ctx)

    self.buffers = []
    # This is a tuple describing the dimensions of the output
    self.global_size = (0,)
def calc_range(start, num, perexec):
    """Calculate the otp-md5 of the 64-bit numbers range(start, num), with
    otp sequence of rounds.

    The work is split into ``num // perexec`` batches of ``perexec``
    consecutive uint64 values starting at ``start``. Each batch is run
    through the ``get_otpmd5_64k_rounds`` kernel from ``otpmd5.cl`` and
    handed to ``send_output`` together with its input.

    :param start: first input value.
    :param num: total number of values; must be a multiple of ``perexec``.
    :param perexec: number of values per kernel launch.
    """
    assert(num % perexec == 0)

    # Boilerplate OpenCL stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Read the program source and compile
    with open("otpmd5.cl") as source_file:
        sourcecode = source_file.read()
    prg = cl.Program(ctx, sourcecode).build()

    # Floor division keeps the batch count an integer on Python 2 and 3.
    for i in range(num // perexec):
        offset = start + (perexec * i)

        host_input = numpy.arange(
            offset, offset + perexec, dtype=numpy.uint64)
        result = numpy.empty_like(host_input)

        dev_input = cl.Buffer(
            ctx, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=host_input)
        dev_output = cl.Buffer(ctx, mf.READ_WRITE, size=result.nbytes)

        prg.get_otpmd5_64k_rounds(
            queue, host_input.shape, None, dev_input, dev_output).wait()
        cl.enqueue_copy(queue, result, dev_output).wait()

        send_output(host_input, result)
def cl_init(type='GPU'):
    """Create an OpenCL context and print a summary of its first device.

    First tries the devices of the requested kind on the first platform.
    If that fails for any reason (unknown ``type``, no such device, ...)
    it falls back to ``cl.create_some_context(interactive=True)``.

    :param type: ``'GPU'`` or ``'CPU'``.
    :return: the created ``cl.Context``.
    """
    device_kinds = {
        'GPU': cl.device_type.GPU,
        'CPU': cl.device_type.CPU,
    }
    try:
        # An unknown ``type`` raises KeyError here and takes the same
        # fallback path as any OpenCL failure below.
        my_type = device_kinds[type]
        platform = cl.get_platforms()[0]
        devices = platform.get_devices(device_type=my_type)
        ctx = cl.Context(devices=devices)
    except Exception:
        ctx = cl.create_some_context(interactive=True)
        # Report on what was actually selected; ``devices``/``platform``
        # may be unbound or stale after the failed attempt above.
        devices = ctx.devices
        platform = devices[0].platform

    device = devices[0]
    print("===============================================================")
    print("Platform name: " + platform.name)
    print("Platform vendor: " + platform.vendor)
    print("Platform version: " + platform.version)
    print("---------------------------------------------------------------")
    print("Device name: " + device.name)
    print("Device type: " + cl.device_type.to_string(device.type))
    print("Local memory: " + str(device.local_mem_size//1024) + ' KB')
    print("Device memory: " + str(device.global_mem_size//1024//1024) + ' MB')
    print("Device max clock speed:" + str(device.max_clock_frequency) + ' MHz')
    print("Device compute units:" + str(device.max_compute_units))

    return ctx
def __init__(self, coords, values, base, wantCL=True, split=None,
             nnear=None, majority=True):
    """Store the inputs as int32 arrays and try to initialise OpenCL.

    :param coords: input coordinates; first dimension must match ``values``.
    :param values: input values.
    :param base: base coordinates.
    :param wantCL: if true and ``hasCL`` is set, try to set up OpenCL.
    :param split: stored as ``self.split`` (default
        ``CLIDT.OpenCLmaxsize``); only set on the OpenCL path.
    :param nnear: stored as int32 in ``self.nnear`` (default
        ``CLIDT.nnear``).
    :param majority: stored as int32 1/0 in ``self.usemajority``.
    :raises AssertionError: when coords and values differ in length.

    ``self.canCL`` records whether OpenCL initialisation succeeded.
    """
    self.coords = np.asarray(coords, dtype=np.int32)
    self.values = np.asarray(values, dtype=np.int32)
    self.base = np.asarray(base, dtype=np.int32)
    lencoords = self.coords.shape[0]
    lenvalues = self.values.shape[0]
    assert lencoords == lenvalues, "lencoords does not equal lenvalues"

    self.wantCL = wantCL
    if hasCL and self.wantCL:
        self.split = CLIDT.OpenCLmaxsize if split is None else split
        try:
            self.ctx = cl.create_some_context()
            self.queue = cl.CommandQueue(self.ctx)
            with open("idt.cl", "r") as kernel_file:
                filestr = kernel_file.read()
            self.program = cl.Program(self.ctx, filestr).build()
            self.coordindices = self.genindices(self.coords)
            self.baseindices = self.genindices(self.base)
            self.canCL = True
        # FIXME: specify an exception type
        except Exception:
            # Deliberately broad: any setup failure falls back to the
            # non-OpenCL path. KeyboardInterrupt/SystemExit still propagate.
            print("warning: unable to use pyopencl, defaulting to Invdisttree")
            self.canCL = False
    else:
        self.canCL = False

    # ``is None`` stays well-defined even when an array is passed.
    if nnear is None:
        self.nnear = np.int32(CLIDT.nnear)
    else:
        self.nnear = np.int32(nnear)

    self.usemajority = np.int32(1 if majority else 0)
def create_context(self, devicetype="ALL", useFp64=False, platformid=None,
                   deviceid=None):
    """
    Choose a device and initiate a context.

    Devicetypes can be GPU,gpu,CPU,cpu,DEF,ACC,ALL. Suggested are GPU,CPU.
    For each setting to work there must be such an OpenCL device and
    properly installed. E.g.: If Nvidia driver is installed, GPU will
    succeed but CPU will fail. The AMD SDK kit is required for CPU via
    OpenCL.

    When both ``platformid`` and ``deviceid`` are given they are used
    directly; otherwise ``ocl.select_device`` is asked for a device of
    ``devicetype``. If no device can be determined, pyopencl picks one
    non-interactively and a warning is logged.

    :param devicetype: string in ["cpu","gpu", "all", "acc"]
    :param useFp64: boolean specifying if double precision will be used;
        when true, device selection requires the
        "cl_khr_int64_base_atomics" extension
    :param platformid: integer
    :param deviceid: integer
    :return: OpenCL context on the selected device
    """
    if (platformid is not None) and (deviceid is not None):
        platformid = int(platformid)
        deviceid = int(deviceid)
    else:
        if useFp64:
            ids = ocl.select_device(
                type=devicetype, extensions=["cl_khr_int64_base_atomics"])
        else:
            ids = ocl.select_device(type=devicetype)
        if ids:
            platformid = ids[0]
            deviceid = ids[1]

    if (platformid is not None) and (deviceid is not None):
        device = pyopencl.get_platforms()[platformid].get_devices()[deviceid]
        ctx = pyopencl.Context(devices=[device])
    else:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Last chance to get an OpenCL device ... probably not the one requested")
        ctx = pyopencl.create_some_context(interactive=False)
    return ctx
def __init__(self, network, dt=0.001, seed=None, model=None, context=None,
             n_prealloc_probes=32, profiling=None, ocl_only=False):
    """Create the OpenCL queue, then run the base simulator initialisation.

    A context is created through pyopencl when none is passed. When
    ``profiling`` is None it is read from the ``NENGO_OCL_PROFILING``
    environment variable (default 0); a truthy value enables profiling on
    the command queue.
    """
    if context is None:
        print('No context argument was provided to sim_ocl.Simulator')
        print("Calling pyopencl.create_some_context() for you now:")
        context = cl.create_some_context()

    if profiling is None:
        profiling = int(os.getenv("NENGO_OCL_PROFILING", 0))

    self.context = context
    self.profiling = profiling

    # Profiling is requested through the queue properties only when enabled.
    queue_options = {"properties": PROFILING_ENABLE} if self.profiling else {}
    self.queue = cl.CommandQueue(context, **queue_options)

    self.n_prealloc_probes = n_prealloc_probes
    self.ocl_only = ocl_only
    self.cl_rng_state = None

    # -- allocate data
    sim_npy.Simulator.__init__(
        self, network=network, dt=dt, seed=seed, model=model)

    # -- create object to execute list of plans
    self._plans = Plans(self._plan, self.profiling)
def main():
    """Generate an icosahedron mesh of the module-level ``order``, build a
    discretization on it and write ``geometry.vtu`` (first node coordinate
    as field "f") and ``connectivity.vtu``.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from meshmode.mesh.generation import (  # noqa
        generate_icosphere, generate_icosahedron, generate_torus)

    # Alternative geometries from the same module:
    #   generate_icosphere(1, order=order)
    #   generate_torus(3, 1, order=order)
    mesh = generate_icosahedron(1, order=order)

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
        PolynomialWarpAndBlendGroupFactory

    group_factory = PolynomialWarpAndBlendGroupFactory(order)
    discr = Discretization(cl_ctx, mesh, group_factory)

    from meshmode.discretization.visualization import make_visualizer
    vis = make_visualizer(queue, discr, order)

    fields = [("f", discr.nodes()[0])]
    vis.write_vtk_file("geometry.vtu", fields)

    from meshmode.discretization.visualization import \
        write_mesh_connectivity_vtk_file

    write_mesh_connectivity_vtk_file("connectivity.vtu", mesh)
def __init__(self, model, dt=0.001, seed=None, builder=None, context=None,
             n_prealloc_probes=1000, profiling=None):
    """Create the OpenCL queue, initialise the base simulator and build the
    DAG of plans.

    A context is created through pyopencl when none is passed. When
    ``profiling`` is None it is read from the ``NENGO_OCL_PROFILING``
    environment variable (default 0); a truthy value enables profiling on
    the command queue.
    """
    if context is None:
        # print() with a single string behaves the same on Python 2 and 3.
        print('No context argument was provided to sim_ocl.Simulator')
        print("Calling pyopencl.create_some_context() for you now:")
        context = cl.create_some_context()
    if profiling is None:
        profiling = int(os.getenv("NENGO_OCL_PROFILING", 0))
    self.context = context
    self.profiling = profiling
    if self.profiling:
        self.queue = cl.CommandQueue(context, properties=PROFILING_ENABLE)
    else:
        self.queue = cl.CommandQueue(context)
    self.n_prealloc_probes = n_prealloc_probes

    # -- allocate data
    sim_npy.Simulator.__init__(
        self, model=model, dt=dt, seed=seed, builder=builder)

    # -- set up the DAG for executing OCL kernels
    self._plandict = OrderedDict()
    self.step_marker = Marker(self.queue)

    # -- marker is used to do the op_groups in order
    # Each group receives the previous call's result as ``deps``.
    deps = []
    for op_type, op_list in self.op_groups:
        deps = self.plandict_op_group(op_type, op_list, deps)
    # Probe plans are registered with the deps of the last op group.
    probe_plans = self.plan_probes()
    for p in probe_plans:
        self._plandict[p] = deps

    self._dag = DAG(
        context, self.step_marker, self._plandict, self.profiling)
def main():
    """Print the characterisation figures provided by ``perf``: command
    latency, profiling overhead, empty-kernel time, float32 add rate, and
    latency/bandwidth for each transfer direction.
    """
    ctx = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(ctx)
    overhead_percent = 100*prof_overhead/latency
    print("command latency: %g s" % latency)
    print("profiling overhead: %g s -> %.1f %%" % (
        prof_overhead, overhead_percent))

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9))

    transfer_kinds = (
        perf.HostToDeviceTransfer,
        perf.DeviceToHostTransfer,
        perf.DeviceToDeviceTransfer,
    )
    for tx_type in transfer_kinds:
        print("----------------------------------------")
        print(tx_type.__name__)
        print("----------------------------------------")

        print("latency: %g s" % perf.transfer_latency(queue, tx_type))
        # Block sizes 2**6, 2**8, ..., 2**26 bytes.
        for exponent in range(6, 28, 2):
            nbytes = 1 << exponent
            rate = perf.transfer_bandwidth(queue, tx_type, nbytes)
            print("bandwidth @ %d bytes: %g GB/s" % (nbytes, rate/1e9))
def main2():
    """Integrate the wave kernel with ``solve_ode`` and fetch the results.

    The state vector has ``2 * len_x`` float32 entries. Its first half
    starts as the mean of five sine harmonics plus a unit plateau where
    0.75 < i/len_x < 0.85. Its second half starts as a ramp where
    0.15 < i/len_x < 0.25. Uses the module-level ``len_x``, ``h_x``,
    ``t0``, ``t1`` and ``h``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    dev = queue.device
    knl = _get_wave_kernel(ctx)

    state_len = len_x * 2
    gs, ls = get_group_sizes(state_len, dev, knl)

    def f(t, y_in, y_out, wait_for=None):
        return run_elwise_kernel(knl, queue, gs, ls, state_len, wait_for,
                                 y_out, y_in, h_x, len_x)

    xs = np.arange(len_x) * np.pi / (len_x - 1)

    # sin(x) + sin(2x) + ... + sin(5x), accumulated left to right.
    harmonics = np.sin(xs)
    for k in range(2, 6):
        harmonics = harmonics + np.sin(xs * k)
    y0 = np.r_[harmonics / 5, np.zeros(len_x)].astype(np.float32)

    # Relative position i / len_x of every grid point.
    rel_pos = [i / len_x for i in range(len_x)]

    # Second half: ramp (p - 0.2) * 20 on 0.15 < p < 0.25, zero elsewhere.
    ramp = [(p - 0.2 if 0.15 < p < 0.25 else 0) * 20 for p in rel_pos]
    y0 += np.r_[np.zeros(len_x), ramp].astype(np.float32)

    # First half: unit plateau on 0.75 < p < 0.85.
    plateau = [(1 if 0.75 < p < 0.85 else 0) for p in rel_pos]
    y0 += np.r_[plateau, np.zeros(len_x)].astype(np.float32)

    res, evt = solve_ode(t0, t1, h, y0, f, queue)
    print('queued')
    evt.wait()
    print('finished')
    res_np = [a.get() for a in res]
def matrix_deg_centrality(h_a, threshold, a_height):
    """Run a segmented scan that counts, per segment of ``a_height``
    entries, how many values of ``h_a`` are not below ``threshold``.

    :param h_a: the input matrix in array form, so shape=(rows x columns, 1);
        the connectivity matrix is assumed to be symmetric.
    :param threshold: the threshold applied to the connectivity matrix.
    :param a_height: the number of columns or rows of the input matrix;
        also used as the segment length of the scan.
    :return: float32 array of length ``a_height`` read back from the device.
    """
    # Allocate the result directly as float32 (no allocate-then-cast).
    h_result = np.empty(a_height, dtype=np.float32)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    from pyopencl.scan import GenericScanKernel

    # A new segment starts every ``segflag`` items; the running total at
    # the last item of each segment is written to ``out``.
    scan_kernel = GenericScanKernel(
        ctx, np.float32,
        arguments="__global float *ary,__global float *out, __global int segflag,__global float threshold",
        input_expr="(ary[i] < threshold) ? 0 : 1",
        scan_expr="across_seg_boundary ? b: (a+b)",
        neutral="0",
        is_segment_start_expr="(i)%segflag==0",
        output_statement="(i+1)%segflag==0 ? (out[i/segflag] = item,ary[i] = item) : (ary[i] = item);")

    a_gpu = cl.array.to_device(queue, h_a)
    result_gpu = cl.array.to_device(queue, h_result)

    scan_kernel(a_gpu, result_gpu, a_height, threshold, queue=queue)

    gpu_centrality = result_gpu.get()
    return gpu_centrality
def lomb_scargle32(x, y, f):
    '''Single-precision version of Lomb-Scargle.

    Converts the inputs to float32, runs the ``lombscargle`` kernel built
    from ``lomb_txt32`` with one work-item per entry of ``f`` and returns
    the result.

    :param x: first kernel input; its length is passed to the kernel.
    :param y: second kernel input.
    :param f: third kernel input; the output has the same shape.
    :return: float32 array shaped like ``f``.
    '''
    x = np.float32(x)
    y = np.float32(y)
    f = np.float32(f)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    host_to_device = mf.READ_ONLY | mf.COPY_HOST_PTR

    Nx = np.int32(x.shape[0])

    # send data to card
    x_g = cl.Buffer(ctx, host_to_device, hostbuf=x)
    y_g = cl.Buffer(ctx, host_to_device, hostbuf=y)
    f_g = cl.Buffer(ctx, host_to_device, hostbuf=f)

    # make output
    pgram = np.empty_like(f)
    pgram_g = cl.Buffer(ctx, mf.WRITE_ONLY, pgram.nbytes)

    prg = cl.Program(ctx, lomb_txt32)
    try:
        prg.build()
    except Exception:
        # Show the compiler log, then let the build error propagate.
        print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG))
        raise

    prg.lombscargle(queue, pgram.shape, None, x_g, y_g, f_g, pgram_g, Nx)
    # enqueue_copy blocks by default for host transfers and replaces the
    # legacy enqueue_read_buffer call.
    cl.enqueue_copy(queue, pgram, pgram_g)

    return pgram
def benchmark_overlapfiltfilt():
    """Benchmark the overlap filtfilt engines over a grid of chunk sizes,
    section counts and channel counts by calling ``compare`` for every
    combination.

    The OpenCL engines are included only when ``HAVE_PYOPENCL`` is true.
    """
    if HAVE_PYOPENCL:
        # Only touch pyopencl when it is available, so the scipy-only
        # benchmark still runs without it. The context is created just to
        # print which device was selected.
        ctx = pyopencl.create_some_context()
        print(ctx)
        engines = ['scipy', 'opencl', 'opencl3']
    else:
        engines = ['scipy']

    # Benchmark grid; edit these lists to explore other configurations.
    chunksizes = [2048]
    n_sections = [8, 24]
    nb_channels = [10, 50, 100]

    extra_kargs = {'overlapsize': 64}

    for chunksize in chunksizes:
        for n_section in n_sections:
            for nb_channel in nb_channels:
                print('*'*20)
                print('chunksize', chunksize, 'n_section', n_section,
                      'nb_channel', nb_channel)
                compare(chunksize, n_section, nb_channel,
                        sosfiltfilt_engines, engines, **extra_kargs)
def gpu_gradient():
    """Apply the ``gradient`` kernel from ``gradient.cl`` to an image.

    Reads the input path from ``sys.argv[1]``, converts the image to RGBA,
    runs the kernel with one work-item per pixel and saves the result as
    ``"GPU_" + sys.argv[2]``. The elapsed time is printed.

    :return: 1 when the command line does not have exactly two arguments,
        otherwise None.
    """
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # create context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # load image
    im = Image.open(sys.argv[1])
    if im.mode != "RGBA":
        im = im.convert("RGBA")
    imgSize = im.size
    # tobytes() replaces tostring(), which current Pillow no longer has.
    # len(pixels) = imgSize[0] * imgSize[1] * 4
    pixels = im.tobytes()

    # Create input and output image objects
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNSIGNED_INT8)
    input_image = cl.Image(ctx,
                           cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                           clImageFormat,
                           imgSize,
                           None,
                           pixels)
    output_image = cl.Image(ctx,
                            cl.mem_flags.WRITE_ONLY,
                            clImageFormat,
                            imgSize)

    # load the kernel source code
    with open("gradient.cl", "r") as kernelFile:
        kernelSrc = kernelFile.read()

    # Create OpenCL program
    program = cl.Program(ctx, kernelSrc).build()

    # Call the kernel directly
    globalWorkSize = (imgSize[0], imgSize[1])
    gpu_start_time = time()
    program.gradient(queue, globalWorkSize, None, input_image, output_image)

    # Read the output image back to the Host
    result = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)
    # enqueue_copy replaces the legacy enqueue_read_image call.
    cl.enqueue_copy(queue, result, output_image,
                    origin=origin, region=region).wait()

    # Save the image to disk
    gsim = Image.frombytes("RGBA", imgSize, result.tobytes())
    gsim.save("GPU_" + sys.argv[2])
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
def compile_source(self):
    """Create the OpenCL context/queue and build ``geneticvehicle.cl``.

    The source is loaded with ``load_file`` and its ``%(name)s``
    placeholders are filled from the simulation parameters held on
    ``self`` and ``self.island``. A build failure is reported by printing
    the exception and the build log of the first device; it is not
    re-raised here.
    """
    self.context = pyopencl.create_some_context()
    self.queue = pyopencl.CommandQueue(self.context)
    self.mf = pyopencl.mem_flags

    opencl_source = load_file("geneticvehicle.cl") % {
        "vertices_per_car": self.number_of_vertices_per_car,
        "number_of_cars": self.number_of_cars,
        "density": self.density,
        "number_of_wheels": self.number_of_wheels_per_car,
        "number_of_contact_points": self.number_of_contact_points,
        "island_start": self.island.island_start,
        "island_step": self.island.island_step,
        "island_end": self.island.island_end,
        "island_acceleration": int(self.island.island_acceleration),
        "island_range": self.island.range(),
        "crossover_points": self.crossover_points,
        "point_mutations": self.point_mutations}

    self.program = pyopencl.Program(self.context, opencl_source)

    try:
        self.program.build()
    except Exception as why:
        # print() with one argument behaves the same on Python 2 and 3.
        print(why)
        print(self.program.get_build_info(
            self.context.devices[0], pyopencl.program_build_info.LOG))
def __init__(self, im, fil, fil_1d=None, fil_2d=None, larger_buffer=True,
             sep=True, buffer_flip=False, type=numpy.float32):
    """Create the OpenCL context/queue and record the convolution options.

    :param im: input image; with ``fil`` it is passed to ``set_params``
        when both are not None.
    :param fil: filter, see ``im``.
    :param fil_1d: optional 1D filter for special convolutions.
    :param fil_2d: optional 2D filter for special convolutions.
    :param larger_buffer: stored as ``self.larger_buffer``.
    :param sep: whether or not the convolution is separated into 1D chunks.
    :param buffer_flip: optimization for separable convolutions where only
        the x direction is required.
    :param type: ``numpy.float32`` or ``numpy.float64``; selects the C type
        name ('float' / 'double') stored in ``self.ctype``.
    :raises TypeError: for any other ``type``.
    """
    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)

    self.larger_buffer = larger_buffer
    self.sep = sep
    # TODO: type should just come from the input image, do a check to see
    # if it matches the filter
    self.type = type
    self.buffer_flip = buffer_flip

    if self.type == numpy.float32:
        self.ctype = 'float'
    elif self.type == numpy.float64:
        self.ctype = 'double'
    else:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise TypeError(
            "Data type specified is not currently supported: "
            + str(self.type))

    # For special convolutions, if required
    self.fil_1d = fil_1d
    self.fil_1d_origin = 0
    self.fil_2d = fil_2d
    self.fil_2d_origin = (0, 0)  # offset of the center of the filter

    # just set this to false for now, it might be used in the future
    self.max_2d_buffer = False

    if im is not None and fil is not None:
        self.set_params(im, fil)
def init_context_queue(self):
    """Ensure ``self.ctx`` exists and create ``self.queue`` on it.

    When no context is set, ``ocl.create_context`` is used if
    ``self.choose_best_device`` is true, otherwise
    ``cl.create_some_context``.
    """
    if self.ctx is None:
        make_context = (ocl.create_context if self.choose_best_device
                        else cl.create_some_context)
        self.ctx = make_context()
    self.queue = cl.CommandQueue(self.ctx)
def test_opencl():
    """Add two random float32 vectors of length 50000 on the device and
    print the norm of the difference to the NumPy result.
    """
    import numpy as np
    import pyopencl as cl

    a = np.random.rand(50000).astype(np.float32)
    b = np.random.rand(50000).astype(np.float32)

    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    flags = cl.mem_flags
    input_flags = flags.READ_ONLY | flags.COPY_HOST_PTR
    a_cl = cl.Buffer(context, input_flags, hostbuf=a)
    b_cl = cl.Buffer(context, input_flags, hostbuf=b)

    kernel_source = r''' __kernel void sum(__global const float * a, __global const float * b, __global float * out) { int gid = get_global_id(0); out[gid] = a[gid] + b[gid]; } '''
    program = cl.Program(context, kernel_source).build()

    out_cl = cl.Buffer(context, flags.WRITE_ONLY, a.nbytes)
    program.sum(queue, a.shape, None, a_cl, b_cl, out_cl)

    out = np.empty_like(a)
    cl.enqueue_copy(queue, out, out_cl)

    expected = a + b
    print(np.linalg.norm(out - expected))
def gpu_array_sum(a, b):
    """Add two arrays on the OpenCL device and return the result.

    Prints the profiled kernel time and the wall-clock time from the
    kernel launch to the end of the read-back.

    :param a: first operand; the kernel reads it as ``float`` and is
        launched over ``a.shape``.
    :param b: second operand; expected to hold at least as many elements
        as ``a``.
    :return: array shaped like ``a`` holding ``a + b``.
    """
    context = cl.create_some_context()
    # Profiling (timing) is enabled so the kernel time can be read below.
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    input_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    a_buffer = cl.Buffer(context, input_flags, hostbuf=a)
    b_buffer = cl.Buffer(context, input_flags, hostbuf=b)
    # The kernel writes one element per element of ``a`` and the result is
    # read into an array shaped like ``a``, so size the output from ``a``.
    c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, a.nbytes)

    # Compile the device program
    program = cl.Program(context, """ __kernel void sum(__global const float *a, __global const float *b, __global float *c) { int i = get_global_id(0); int j; for(j = 0; j < 1000; j++) { c[i] = a[i] + b[i]; } }""").build()

    gpu_start_time = time()
    event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer)
    event.wait()
    # Profiling timestamps are in nanoseconds.
    elapsed = 1e-9*(event.profile.end - event.profile.start)
    print("GPU Kernel Time: {0} s".format(elapsed))

    c_gpu = np.empty_like(a)
    # enqueue_copy replaces the legacy enqueue_read_buffer call.
    cl.enqueue_copy(queue, c_gpu, c_buffer).wait()
    gpu_end_time = time()
    print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time))
    return c_gpu
def __init__(self, seed=None):
    """Create the OpenCL context and queue, seed NumPy's global random
    generator with ``seed`` and call ``self._compute_seed()``.
    """
    context = cl.create_some_context()
    self.ctx = context
    self.queue = cl.CommandQueue(context)

    self.seed = seed
    numpy.random.seed(self.seed)
    self._compute_seed()
def __init__(self):
    """Run the ``proc`` kernel from ``ex.cl`` over 0 .. 99999999 (float32)
    and print the result and its maximum.

    The context, queue, both device buffers and the built program are
    kept on ``self``.
    """
    t_np = np.arange(0, 100000000, dtype=np.float32)

    self.ctx = cl.create_some_context()
    self.queue = cl.CommandQueue(self.ctx)

    self.mf = cl.mem_flags
    self.t_g = cl.Buffer(
        self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=t_np)

    # Context manager so the file is closed even if reading fails.
    with open("ex.cl", "r") as f:
        fstr = f.read()

    self.prg = cl.Program(self.ctx, fstr).build()

    self.res_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, t_np.nbytes)
    self.prg.proc(self.queue, t_np.shape, None, self.t_g, self.res_g)

    res_np = np.empty_like(t_np)
    cl.enqueue_copy(self.queue, res_np, self.res_g)

    # Check on CPU with Numpy:
    print(res_np)
    print(np.amax(res_np))
# Use OpenCL To Add Two Random Arrays (Using PyOpenCL Arrays and Elementwise) import pyopencl as cl # Import the OpenCL GPU computing API import pyopencl.array as cl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object) import numpy # Import Numpy number tools context = cl.create_some_context() # Initialize the Context queue = cl.CommandQueue(context) # Instantiate a Queue a = cl_array.to_device(queue, numpy.random.randn(50000).astype( numpy.float32)) # Create a random pyopencl array b = cl_array.to_device(queue, numpy.random.randn(50000).astype( numpy.float32)) # Create a random pyopencl array c = cl_array.empty_like(a) # Create an empty pyopencl destination array sum = cl.elementwise.ElementwiseKernel(context, "float *a, float *b, float *c", "c[i] = a[i] + b[i]", "sum") # Create an elementwise kernel object # - Arguments: a string formatted as a C argument list # - Operation: a snippet of C that carries out the desired map operation # - Name: the fuction name as which the kernel is compiled sum(a, b, c) # Call the elementwise kernel print("a: {}".format(a)) print("b: {}".format(b)) print("c: {}".format(c)) # Print all three arrays, to show sum() worked
3, *params, test_case='exact') @pytest.mark.parametrize("params", [ [2, 5, 4, 4], [3, 7, 5, 3], [4, 7, 3, 5], ]) def test_to_meshmode_interpolation_3d_nonexact(ctx_factory, params): cl_ctx = ctx_factory() queue = cl.CommandQueue(cl_ctx) assert drive_test_to_meshmode_interpolation( cl_ctx, queue, 3, *params, test_case='non-exact') < 1e-3 # }}} End 3d tests if __name__ == '__main__': cl_ctx = cl.create_some_context() queue = cl.CommandQueue(cl_ctx) resid = drive_test_to_meshmode_interpolation(cl_ctx, queue, dim=3, degree=9, nel_1d=7, n_levels=2, q_order=10, test_case="exact")
def main(snapshot_pattern="wave-mpi-{step:04d}-{rank:04d}.pkl", restart_step=None,
         use_profiling=False, use_logmgr=False,
         actx_class=PyOpenCLArrayContext):
    """Drive the example.

    Sets up an OpenCL queue and array context, obtains this rank's mesh
    (either by partitioning a freshly generated mesh or from a restart
    file), then advances ``fields`` with ``rk4_step`` until ``t_final``,
    writing restart files and VTK output along the way.

    :arg snapshot_pattern: ``str.format`` pattern with ``step`` and ``rank``
        fields, used for both reading and writing restart files.
    :arg restart_step: if not *None*, the step whose restart file is read to
        initialise the run.
    :arg use_profiling: if true, create the queue with ``PROFILING_ENABLE``,
        pass ``logmgr`` to the array context and watch ``multiply_time.max``.
    :arg use_logmgr: forwarded to ``initialize_logmgr``.
    :arg actx_class: callable used to build the array context from the queue.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    num_parts = comm.Get_size()

    logmgr = initialize_logmgr(use_logmgr,
        filename="wave-mpi.sqlite", mode="wu", mpi_comm=comm)

    # The queue created above is replaced here by one matching the
    # profiling choice.
    if use_profiling:
        queue = cl.CommandQueue(cl_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
            logmgr=logmgr)
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    if restart_step is None:
        # Fresh start: one rank generates and partitions the mesh, the
        # others receive their part.
        from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
        mesh_dist = MPIMeshDistributor(comm)

        dim = 2
        nel_1d = 16

        if mesh_dist.is_mananger_rank():
            from meshmode.mesh.generation import generate_regular_rect_mesh
            mesh = generate_regular_rect_mesh(
                a=(-0.5,)*dim, b=(0.5,)*dim,
                nelements_per_axis=(nel_1d,)*dim)

            print("%d elements" % mesh.nelements)
            part_per_element = get_partition_by_pymetis(mesh, num_parts)
            local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)
            del mesh

        else:
            local_mesh = mesh_dist.receive_mesh_part()

        fields = None

    else:
        # Restart: the local mesh and element count come from the snapshot.
        from mirgecom.restart import read_restart_data
        restart_data = read_restart_data(
            actx, snapshot_pattern.format(step=restart_step, rank=rank)
        )
        local_mesh = restart_data["local_mesh"]
        nel_1d = restart_data["nel_1d"]
        # The snapshot must have been written with the same number of ranks.
        assert comm.Get_size() == restart_data["num_parts"]

    order = 3

    discr = EagerDGDiscretization(actx, local_mesh, order=order,
                                  mpi_communicator=comm)

    current_cfl = 0.485
    wave_speed = 1.0
    from grudge.dt_utils import characteristic_lengthscales
    dt = current_cfl * characteristic_lengthscales(actx, discr) / wave_speed

    # Reduce the per-node estimate to a single step size via nodal_min.
    from grudge.op import nodal_min
    dt = nodal_min(discr, "vol", dt)

    t_final = 1

    if restart_step is None:
        t = 0
        istep = 0

        fields = flat_obj_array(
            bump(actx, discr),
            [discr.zeros(actx) for i in range(discr.dim)]
        )

    else:
        t = restart_data["t"]
        istep = restart_step
        assert istep == restart_step
        restart_fields = restart_data["fields"]
        old_order = restart_data["order"]
        if old_order != order:
            # The snapshot used a different polynomial order: map its fields
            # onto the current discretization through a same-mesh connection.
            old_discr = EagerDGDiscretization(actx, local_mesh, order=old_order,
                                              mpi_communicator=comm)
            from meshmode.discretization.connection import make_same_mesh_connection
            connection = make_same_mesh_connection(actx, discr.discr_from_dd("vol"),
                                                   old_discr.discr_from_dd("vol"))
            fields = connection(restart_fields)
        else:
            fields = restart_fields

    if logmgr:
        logmgr_add_cl_device_info(logmgr, queue)
        logmgr_add_device_memory_usage(logmgr, queue)

        logmgr.add_watches(["step.max", "t_step.max", "t_log.max"])

        # These watches are optional: a KeyError from add_watches is ignored.
        try:
            logmgr.add_watches(["memory_usage_python.max", "memory_usage_gpu.max"])
        except KeyError:
            pass

        if use_profiling:
            logmgr.add_watches(["multiply_time.max"])

        vis_timer = IntervalTimer("t_vis", "Time spent visualizing")
        logmgr.add_quantity(vis_timer)

    vis = make_visualizer(discr)

    def rhs(t, w):
        # Right-hand side handed to the time stepper; ``t`` is unused.
        return wave_operator(discr, c=wave_speed, w=w)

    compiled_rhs = actx.compile(rhs)

    while t < t_final:
        if logmgr:
            logmgr.tick_before()

        # restart must happen at beginning of step
        if istep % 100 == 0 and (
                # Do not overwrite the restart file that we just read.
                istep != restart_step):
            from mirgecom.restart import write_restart_file
            write_restart_file(
                actx,
                restart_data={
                    "local_mesh": local_mesh,
                    "order": order,
                    "fields": fields,
                    "t": t,
                    "step": istep,
                    "nel_1d": nel_1d,
                    "num_parts": num_parts},
                filename=snapshot_pattern.format(step=istep, rank=rank),
                comm=comm
            )

        # Every 10th step: print the norm of the first field and write VTK.
        if istep % 10 == 0:
            print(istep, t, discr.norm(fields[0]))
            vis.write_parallel_vtk_file(
                comm,
                "fld-wave-mpi-%03d-%04d.vtu" % (rank, istep),
                [
                    ("u", fields[0]),
                    ("v", fields[1:]),
                ], overwrite=True
            )

        fields = thaw(freeze(fields, actx), actx)
        fields = rk4_step(fields, t, dt, compiled_rhs)

        t += dt
        istep += 1

        if logmgr:
            set_dt(logmgr, dt)
            logmgr.tick_after()

    # Regression check of the final norm against a stored reference value.
    final_soln = discr.norm(fields[0])
    assert np.abs(final_soln - 0.04409852463947439) < 1e-14
def kMerCount(file, nK):
    """Count nucleotide and k-mer occurrences of a sequence with OpenCL.

    The sequence returned by ``genSeq(file)`` (presumably ASCII character
    codes -- confirm against ``genSeq``) is prefixed with ``2 + 4 + 4**nK``
    zeroed counter slots and padded at the end with code 78 (``N``) so that
    its length is a whole multiple of the launch size.

    Counter layout written by the kernels:

    * index 0 -- incremented by ``freqTab`` for codes other than A/C/G/T/N
    * index 1 -- incremented by ``freqTab`` for ``N`` codes
    * index 2..5 -- number of A, C, G and T characters
    * index 6.. -- one counter per k-mer, indexed by the base-4 number formed
      from its digits (A=0, C=1, G=2, T=3)

    @param file : passed unchanged to ``genSeq``
    @param nK   : k-mer length
    @return     : the counters from index 2 onwards, i.e. the four base
                  counts followed by the ``4**nK`` k-mer counts
    @raise AssertionError : if counter 0 is non-zero after counting
    """
    K = nK
    h_seq = genSeq(file)
    h_seq = np.concatenate(
        (np.zeros(2 + 4 + 4**K).astype(CPU_SIDE_INT), h_seq))

    kernelsource = '''
    __kernel void mapToNumb(
        const int N,
        const int M,
        const int numbKmer,
        __global int* seq,
        __global int* numb_seq
    )
    {
        int gid = get_global_id(0);
        int idx = gid * M + numbKmer + 2 + 4;
        int i, letter;
        if(idx < N*M + numbKmer + 2 + 4)
        {
            for(i=0; i < M; i++)
            {
                letter = seq[idx+i];
                if(letter == 65)
                {
                    numb_seq[idx+i] = 0;
                    atomic_inc(&numb_seq[2]);
                }
                else
                {
                    if(letter == 67)
                    {
                        numb_seq[idx+i] = 1;
                        atomic_inc(&numb_seq[3]);
                    }
                    else
                    {
                        if(letter == 71)
                        {
                            numb_seq[idx+i] = 2;
                            atomic_inc(&numb_seq[4]);
                        }
                        else
                        {
                            if(letter == 84)
                            {
                                numb_seq[idx+i] = 3;
                                atomic_inc(&numb_seq[5]);
                            }
                            else
                            {
                                if(letter == 78)
                                {
                                    numb_seq[idx+i] = -1;
                                }
                                else
                                {
                                    numb_seq[idx+i] = -2;
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    __kernel void freqTab(
        const int N,
        const int M,
        const int nK,
        const int numbKmer,
        __global int* numb_seq
    )
    {
        int gid = get_global_id(0);
        int idx = gid * M + numbKmer + 2 + 4;
        int i, numb;
        int k, p, loc_idx, ptn_idx;
        int dgt;
        int kmin;
        for(i=0; i < M; i++)
        {
            ptn_idx = 0;
            loc_idx = idx + i;
            kmin = 0;
            if(loc_idx <= (N*M + numbKmer + 2 + 4 - nK))
            {
                for(k=0; k < nK; k++)
                {
                    numb = numb_seq[loc_idx + k];
                    switch(numb)
                    {
                        case (-1):
                            atomic_inc(&numb_seq[1]);
                            break;
                        case (-2):
                            atomic_inc(&numb_seq[0]);
                            break;
                        default:
                            dgt = (int)(pow(4, (float)(nK-1-k)));
                            ptn_idx += dgt * numb;
                            break;
                    }
                    if(numb < kmin)
                    {
                        kmin = numb;
                    }
                }
                if(kmin >= 0)
                {
                    atomic_inc(&numb_seq[ptn_idx+2+4]);
                }
            }
        }
    }
    '''

    context = cl.create_some_context()
    device = context.devices[0]

    work_group_size = device.max_work_group_size
    work_item_size = device.max_work_item_sizes[0]
    print(work_group_size)
    print(work_item_size)

    numbGroups = work_group_size
    numbItems = work_item_size

    # Each of the N = numbGroups * numbItems work-items handles M = q + 1
    # consecutive characters, so pad the sequence up to N * M with 'N' (78).
    seqLen = np.size(h_seq) - 4**K - 2 - 4
    q, r = divmod(seqLen, numbGroups * numbItems)
    q = q + 1
    h_seq = np.concatenate(
        (h_seq,
         np.repeat(78, numbGroups * numbItems - r).astype(CPU_SIDE_INT)))
    h_numb_seq = np.zeros(np.size(h_seq)).astype(CPU_SIDE_INT)
    print(q)
    print(r)

    queue = cl.CommandQueue(context)
    program = cl.Program(context, kernelsource).build()

    mapToNumb = program.mapToNumb
    mapToNumb.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None])
    freqTab = program.freqTab
    freqTab.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, np.int32, None])

    d_seq = cl.Buffer(context,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=h_seq)
    d_numb_seq = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                           h_numb_seq.nbytes)
    # Zero the device buffer. The pattern uses the buffer's own element type:
    # ``np.int`` no longer exists in current NumPy, and a pattern of the same
    # item size always divides ``nbytes`` evenly.
    cl.enqueue_fill_buffer(queue, d_numb_seq,
                           np.zeros(1, dtype=CPU_SIDE_INT), 0,
                           h_numb_seq.nbytes)

    N = numbGroups * numbItems
    M = q
    numbKmer = 4**K
    globalsize = (N, )

    # Pass 1: translate character codes to digits and count single bases.
    mapToNumb(queue, globalsize, None, N, M, numbKmer, d_seq, d_numb_seq)
    queue.finish()

    # Pass 2: accumulate k-mer frequencies from the digit sequence.
    freqTab(queue, globalsize, None, N, M, K, numbKmer, d_numb_seq)
    queue.finish()

    cl.enqueue_copy(queue, h_numb_seq, d_numb_seq)
    print("Counting Done")
    print(h_numb_seq[:numbKmer + 2 + 4])

    assert (h_numb_seq[0] == 0
            ), "File contains unknown nucleotide characters"  # Sanity check
    return h_numb_seq[2:numbKmer + 2 + 4]
def __init__(self, coefficients, nb_channel, dtype, chunksize, overlapsize):
    """Set up host arrays, OpenCL buffers and the compiled filter program.

    After the base-class initialisation the coefficients are cast to
    ``self.dtype``, broadcast to ``(nb_channel, nb_section, 6)`` when given
    as ``(nb_section, 6)``, and made C-contiguous. An OpenCL context and
    queue are created, then the filter-state and output arrays are
    allocated on the host with matching device buffers, and the kernel
    template ``self.kernel`` is filled in and built.

    Only float32 with a fixed ``chunksize`` is accepted (enforced with
    assertions).
    """
    SosFiltfilt_Base.__init__(self, coefficients, nb_channel, dtype,
                              chunksize, overlapsize)

    assert self.dtype == np.dtype('float32')
    assert self.chunksize is not None, 'chunksize for opencl must be fixed'

    # Normalise coefficients to a C-contiguous (nb_channel, nb_section, 6).
    self.coefficients = self.coefficients.astype(self.dtype)
    if self.coefficients.ndim == 2:
        per_channel = self.coefficients[None, :, :]
        self.coefficients = np.tile(per_channel, (nb_channel, 1, 1))
    if not self.coefficients.flags['C_CONTIGUOUS']:
        self.coefficients = self.coefficients.copy()
    coef_shape = self.coefficients.shape
    assert coef_shape[0] == self.nb_channel, 'wrong coefficients.shape'
    assert coef_shape[2] == 6, 'wrong coefficients.shape'
    self.nb_section = coef_shape[1]

    # TODO : add arguments gpu_platform_index/gpu_device_index
    self.ctx = pyopencl.create_some_context()
    self.queue = pyopencl.CommandQueue(self.ctx)

    # Host arrays: filter state for each pass, and the two outputs.
    state_shape = (nb_channel, self.nb_section, 2)
    self.zi1 = np.zeros(state_shape, dtype=self.dtype)
    self.zi2 = np.zeros(state_shape, dtype=self.dtype)
    self.output1 = np.zeros((self.chunksize, self.nb_channel),
                            dtype=self.dtype)
    self.output2 = np.zeros((self.backward_chunksize, self.nb_channel),
                            dtype=self.dtype)

    # Device buffers.
    self.coefficients_cl = pyopencl.Buffer(
        self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.coefficients)
    self.zi1_cl = pyopencl.Buffer(
        self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=self.zi1)
    self.zi2_cl = pyopencl.Buffer(
        self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=self.zi2)
    self.input1_cl = pyopencl.Buffer(
        self.ctx, mf.READ_WRITE, size=self.output1.nbytes)
    self.output1_cl = pyopencl.Buffer(
        self.ctx, mf.READ_WRITE, size=self.output1.nbytes)
    self.input2_cl = pyopencl.Buffer(
        self.ctx, mf.READ_WRITE, size=self.output2.nbytes)
    self.output2_cl = pyopencl.Buffer(
        self.ctx, mf.READ_WRITE, size=self.output2.nbytes)

    # Fill the kernel template with the problem sizes and compile it.
    substitutions = {
        'forward_chunksize': self.chunksize,
        'backward_chunksize': self.backward_chunksize,
        'nb_section': self.nb_section,
        'nb_channel': self.nb_channel,
    }
    kernel = self.kernel % substitutions
    self.opencl_prg = pyopencl.Program(self.ctx, kernel).build(
        options='-cl-mad-enable')
def refine_and_generate_chart_function(mesh, filename, function):
    """Refine *mesh* until no element is flagged and chart the timings.

    In each round ``get_function_flags(mesh, function)`` is evaluated; if at
    least one flag is set the mesh is refined with those flags and the time
    spent in ``Refiner.refine`` is recorded against the number of flagged
    elements. When no flag is set the loop stops, the timings are plotted
    and saved to *filename* as PDF, and the final mesh is written to
    ``geometry2.vtu`` and ``connectivity2.vtu``.

    The discretization order is read from the module-level name ``order``.

    :arg mesh: the mesh to refine.
    :arg filename: output path of the timing chart.
    :arg function: passed unchanged to ``get_function_flags``.
    """
    # time.clock() was removed in Python 3.8; fall back to it only on
    # interpreters that predate perf_counter.
    try:
        from time import perf_counter as clock
    except ImportError:
        from time import clock

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    print("NELEMENTS: ", mesh.nelements)
    for i in range(len(mesh.groups[0].vertex_indices[0])):
        for k in range(len(mesh.vertices)):
            print(mesh.vertices[k, i])

    r = Refiner(mesh)
    num_elements = []
    time_t = []
    while True:
        print("NELS:", mesh.nelements)
        flags = get_function_flags(mesh, function)
        # Number of elements flagged for refinement in this round.
        nels = sum(1 for flag in flags if flag)
        if nels == 0:
            break
        print("LKJASLFKJALKASF:", nels)
        num_elements.append(nels)

        beg = clock()
        mesh = r.refine(flags)
        end = clock()
        time_t.append(end - beg)

    import matplotlib.pyplot as pt
    pt.xlabel('Number of elements being refined')
    pt.ylabel('Time taken')
    pt.plot(num_elements, time_t, "o")
    pt.savefig(filename, format='pdf')
    pt.clf()
    print('DONE REFINING')

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            PolynomialWarpAndBlendGroupFactory
    discr = Discretization(
            cl_ctx, mesh, PolynomialWarpAndBlendGroupFactory(order))
    from meshmode.discretization.visualization import make_visualizer
    vis = make_visualizer(queue, discr, order)

    # Remove stale output first so the writers start from a clean slate.
    remove_if_exists("connectivity2.vtu")
    remove_if_exists("geometry2.vtu")
    vis.write_vtk_file("geometry2.vtu", [
        ("f", discr.nodes()[0]),
        ])

    from meshmode.discretization.visualization import \
            write_nodal_adjacency_vtk_file

    write_nodal_adjacency_vtk_file("connectivity2.vtu", mesh)
def main(mesh_name="ellipsoid"):
    """Evaluate a layer potential around a CAD-derived surface and write VTK.

    Meshes the selected STEP geometry with gmsh, builds a QBX layer
    potential source on it, evaluates the double-layer operator ``sym.D`` for
    a cosine density at the points of a ``FieldPlotter`` grid, and writes
    ``layerpot-3d-potential.vts`` and ``layerpot-3d-density.vtu``.

    Reads ``target_order``, ``qbx_order``, ``k`` and ``mode_nr`` from module
    scope.

    :arg mesh_name: ``"ellipsoid"`` or ``"two-cylinders"``.
    :raises ValueError: for any other *mesh_name*.
    """
    import logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.WARNING)  # INFO for more progress info

    import pyopencl as cl
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    # Geometry file and the value substituted into gmsh's
    # Mesh.CharacteristicLengthMax option below.
    if mesh_name == "ellipsoid":
        cad_file_name = "geometries/ellipsoid.step"
        h = 0.6
    elif mesh_name == "two-cylinders":
        cad_file_name = "geometries/two-cylinders-smooth.step"
        h = 0.4
    else:
        raise ValueError("unknown mesh name: %s" % mesh_name)

    from meshmode.mesh.io import generate_gmsh, FileSource
    mesh = generate_gmsh(
            FileSource(cad_file_name), 2, order=2,
            other_options=["-string", "Mesh.CharacteristicLengthMax = %g;" % h],
            target_unit="MM")

    from meshmode.mesh.processing import perform_flips
    # Flip elements--gmsh generates inside-out geometry.
    mesh = perform_flips(mesh, np.ones(mesh.nelements))

    from meshmode.mesh.processing import find_bounding_box
    bbox_min, bbox_max = find_bounding_box(mesh)
    bbox_center = 0.5 * (bbox_min + bbox_max)
    # Half of the largest bounding-box extent.
    bbox_size = max(bbox_max - bbox_min) / 2

    logger.info("%d elements" % mesh.nelements)

    from pytential.qbx import QBXLayerPotentialSource
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory

    density_discr = Discretization(
            actx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))

    qbx = QBXLayerPotentialSource(density_discr, 4 * target_order, qbx_order,
                                  fmm_order=qbx_order + 3,
                                  target_association_tolerance=0.15)

    from pytential.target import PointsTarget
    fplot = FieldPlotter(bbox_center, extent=3.5 * bbox_size, npoints=150)

    from pytential import GeometryCollection
    places = GeometryCollection(
        {
            "qbx": qbx,
            "targets": PointsTarget(actx.from_numpy(fplot.points))
        },
        auto_where="qbx")
    # From here on use the discretization as held by the collection.
    density_discr = places.get_discretization("qbx")

    nodes = thaw(density_discr.nodes(), actx)
    angle = actx.np.arctan2(nodes[1], nodes[0])

    # A truthy module-level ``k`` selects the Helmholtz kernel, otherwise
    # Laplace.
    if k:
        kernel = HelmholtzKernel(3)
    else:
        kernel = LaplaceKernel(3)

    #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None))
    op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None)
    #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None)

    # The ``if 0`` branch (a density with five randomly chosen unit entries)
    # is disabled; the cosine density in the else branch is what runs.
    if 0:
        from random import randrange
        sigma = actx.zeros(density_discr.ndofs, angle.entry_dtype)
        for _ in range(5):
            sigma[randrange(len(sigma))] = 1

        from arraycontext import unflatten
        sigma = unflatten(angle, sigma, actx)
    else:
        sigma = actx.np.cos(mode_nr * angle)

    # For the Helmholtz kernel each component of sigma is cast to complex128.
    if isinstance(kernel, HelmholtzKernel):
        for i, elem in np.ndenumerate(sigma):
            sigma[i] = elem.astype(np.complex128)

    fld_in_vol = actx.to_numpy(
        bind(places, op, auto_where=("qbx", "targets"))(actx, sigma=sigma, k=k))

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    fplot.write_vtk_file("layerpot-3d-potential.vts",
                         [("potential", fld_in_vol)])

    bdry_normals = bind(places, sym.normal(
        density_discr.ambient_dim))(actx).as_vector(dtype=object)

    from meshmode.discretization.visualization import make_visualizer
    bdry_vis = make_visualizer(actx, density_discr, target_order)
    bdry_vis.write_vtk_file("layerpot-3d-density.vtu", [
        ("sigma", sigma),
        ("bdry_normals", bdry_normals),
        ])
import cv2 import numpy import pyopencl from proc_tex.OpenCLCellNoise3D import OpenCLCellNoise3D from proc_tex.texture_transforms import tex_scale_to_region, tex_to_dtype from proc_tex.texture_transforms_opencl import tex_3d_to_sphere_map if __name__ == '__main__': cl_context = pyopencl.create_some_context() texture = tex_3d_to_sphere_map(OpenCLCellNoise3D(cl_context, 4, 1), cl_context) texture = tex_to_dtype(tex_scale_to_region(texture), numpy.uint16, scale=65535) eval_pts = texture.gen_eval_pts((1024, 1024), numpy.array([[0, 1], [0, 1]])) image = texture.to_image(None, None, eval_pts=eval_pts) # cv2.imshow('image', image) # cv2.waitKey(0) # cv2.destroyAllWindows() cv2.imwrite('./example.png', image) texture.to_video(None, None, 120, 30, './example.webm', pix_fmt='gray16le', codec_params=['-lossless', '0'],
def find_mode():
    """Search for a value of ``beta`` with Muller's method.

    Builds the ``DielectricSDRep2DBoundaryOperator`` ("te" mode) on a
    scaled curve mesh, assembles its matrix for a given ``beta`` and feeds
    ``1 / (x . solve(mat, y))`` for two fixed random vectors to
    ``pytential.muller.muller``, starting from three random guesses
    between ``k0`` and ``k1``.

    Prints the literal string ``"beta"``; the computed ``beta`` and
    ``niter`` are neither printed nor returned.
    """
    import warnings
    # Turn complex-to-real cast warnings into errors.
    # NOTE(review): newer NumPy exposes this class as
    # ``np.exceptions.ComplexWarning`` -- confirm against the NumPy version
    # in use.
    warnings.simplefilter("error", np.ComplexWarning)

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    k0 = 1.4447
    k1 = k0 * 1.02
    beta_sym = sym.var("beta")

    from pytential.symbolic.pde.scalar import (  # noqa
            DielectricSRep2DBoundaryOperator as SRep,
            DielectricSDRep2DBoundaryOperator as SDRep)
    pde_op = SDRep(mode="te",
                   k_vacuum=1,
                   interfaces=((0, 1, sym.DEFAULT_SOURCE), ),
                   domain_k_exprs=(k0, k1),
                   beta=beta_sym,
                   use_l2_weighting=False)

    u_sym = pde_op.make_unknown("u")
    op = pde_op.operator(u_sym)

    # {{{ discretization setup

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    curve_f = partial(ellipse, 1)

    target_order = 7
    qbx_order = 4
    nelements = 30

    from meshmode.mesh.processing import affine_map
    mesh = make_curve_mesh(curve_f, np.linspace(0, 1, nelements + 1),
                           target_order)
    lambda_ = 1.55
    circle_radius = 3.4 * 2 * np.pi / lambda_
    # Scale the curve uniformly by circle_radius.
    mesh = affine_map(mesh, A=circle_radius * np.eye(2))

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory
    from pytential.qbx import QBXLayerPotentialSource
    density_discr = Discretization(
        cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))

    qbx = QBXLayerPotentialSource(
        density_discr,
        4 * target_order,
        qbx_order,
        # Don't use FMM for now
        fmm_order=False)

    # }}}

    # Two fixed random vectors, drawn once and reused for every beta.
    x_vec = np.random.randn(len(u_sym) * density_discr.nnodes)
    y_vec = np.random.randn(len(u_sym) * density_discr.nnodes)

    def muller_solve_func(beta):
        # Assemble the operator matrix for this beta and return the
        # reciprocal of x . (mat^-1 y).
        from pytential.symbolic.execution import build_matrix
        mat = build_matrix(queue, qbx, op, u_sym, context={"beta": beta}).get()

        return 1 / x_vec.dot(la.solve(mat, y_vec))

    # Three complex-typed starting points drawn uniformly from [k0, k1).
    starting_guesses = (1 + 0j) * (k0 + (k1 - k0) * np.random.rand(3))

    from pytential.muller import muller
    beta, niter = muller(muller_solve_func, z_start=starting_guesses)
    print("beta")
def main(use_profiling=False):
    """Drive the example.

    Builds a regular rectangular mesh, sets up an eager DG discretization
    on it and advances the wave fields with ``rk4_step`` until ``t_final``.
    Every 10th step the infinity norm of the first field is printed and a
    VTK file is written; with *use_profiling* the profiling table is
    printed as well.
    """
    cl_ctx = cl.create_some_context()

    # Profiling needs both a profiling-enabled queue and the profiling
    # array context; otherwise the plain variants are used.
    if not use_profiling:
        queue = cl.CommandQueue(cl_ctx)
        actx_factory = PyOpenCLArrayContext
    else:
        queue = cl.CommandQueue(
            cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx_factory = PyOpenCLProfilingArrayContext
    mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
    actx = actx_factory(queue, allocator=mem_pool)

    dim = 2
    nel_1d = 16
    from meshmode.mesh.generation import generate_regular_rect_mesh

    lower_corner = (-0.5, ) * dim
    upper_corner = (0.5, ) * dim
    mesh = generate_regular_rect_mesh(a=lower_corner,
                                      b=upper_corner,
                                      nelements_per_axis=(nel_1d, ) * dim)

    order = 3

    # no deep meaning here, just a fudge factor per dimension
    fudge_by_dim = {2: 0.7, 3: 0.4}
    if dim not in fudge_by_dim:
        raise ValueError("don't have a stable time step guesstimate")
    dt = fudge_by_dim[dim] / (nel_1d * order**2)

    print("%d elements" % mesh.nelements)

    discr = EagerDGDiscretization(actx, mesh, order=order)

    zero_components = [discr.zeros(actx) for _ in range(discr.dim)]
    fields = flat_obj_array(bump(actx, discr), zero_components)

    vis = make_visualizer(discr)

    def rhs(t, w):
        return wave_operator(discr, c=1, w=w)

    t_final = 3
    t = 0
    istep = 0
    while t < t_final:
        fields = rk4_step(fields, t, dt, rhs)

        report_now = istep % 10 == 0
        if report_now:
            if use_profiling:
                print(actx.tabulate_profiling_data())
            print(istep, t, discr.norm(fields[0], np.inf))
            output_name = "fld-wave-eager-%04d.vtu" % istep
            vis.write_vtk_file(output_name, [
                ("u", fields[0]),
                ("v", fields[1:]),
            ])

        t += dt
        istep += 1
def setUp(self):
    """Create the OpenCL context and command queue used by the test."""
    # answers=[0, 0] pre-answers the context-selection prompts.
    context = create_some_context(answers=[0, 0])
    self.ctx = context
    self.queue = CommandQueue(context)
def demo_cost_model():
    """Calibrate the FMM cost model on small runs and check its prediction.

    Runs the FMM for five particle counts while recording timing data,
    estimates calibration parameters from all but the last run, predicts
    the per-stage cost of the last run and logs measured against predicted
    time for each stage.

    Raises NotImplementedError when ``SUPPORTS_PROCESS_TIME`` is false.
    """
    if not SUPPORTS_PROCESS_TIME:
        raise NotImplementedError(
            "Currently this script uses process time which only works on Python>=3.3"
        )

    from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler

    particle_counts = [1000, 2000, 3000, 4000, 5000]
    nsources_list = list(particle_counts)
    ntargets_list = list(particle_counts)
    dims = 3
    dtype = np.float64

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    traversals = []
    traversals_dev = []
    level_to_orders = []
    timing_results = []

    def fmm_level_to_nterms(tree, ilevel):
        # Same expansion order on every level.
        return 10

    from boxtree.tools import make_normal_particle_array as p_normal
    from pyopencl.clrandom import PhiloxGenerator
    from boxtree import TreeBuilder
    from boxtree.traversal import FMMTraversalBuilder
    from boxtree.fmm import drive_fmm

    for nsources, ntargets in zip(nsources_list, ntargets_list):
        # {{{ Generate sources, targets and target_radii

        sources = p_normal(queue, nsources, dims, dtype, seed=15)
        targets = p_normal(queue, ntargets, dims, dtype, seed=18)

        rng = PhiloxGenerator(queue.context, seed=22)
        radii_dev = rng.uniform(queue, ntargets, a=0, b=0.05, dtype=dtype)
        target_radii = radii_dev.get()

        # }}}

        # {{{ Generate tree and traversal

        tb = TreeBuilder(ctx)
        tree, _ = tb(
            queue, sources, targets=targets, target_radii=target_radii,
            stick_out_factor=0.15, max_particles_in_box=30, debug=True
        )

        tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
        trav_dev, _ = tg(queue, tree, debug=True)
        trav = trav_dev.get(queue=queue)

        traversals.append(trav)
        traversals_dev.append(trav_dev)

        # }}}

        wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms)
        level_to_orders.append(wrangler.level_nterms)

        # drive_fmm fills this dict with per-stage timings.
        timing_data = {}
        src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
        drive_fmm(trav, wrangler, (src_weights,), timing_data=timing_data)

        timing_results.append(timing_data)

    time_field_name = "process_elapsed"

    from boxtree.cost import FMMCostModel
    from boxtree.cost import make_pde_aware_translation_cost_model

    cost_model = FMMCostModel(make_pde_aware_translation_cost_model)

    # Uncalibrated per-stage costs for every case except the last one.
    n_calibration_cases = len(traversals) - 1
    model_results = [
        cost_model.cost_per_stage(
            queue, traversals_dev[icase], level_to_orders[icase],
            FMMCostModel.get_unit_calibration_params(),
        )
        for icase in range(n_calibration_cases)
    ]
    queue.finish()

    params = cost_model.estimate_calibration_params(
        model_results, timing_results[:-1], time_field_name=time_field_name
    )

    # Predict the held-out last case with the calibrated parameters.
    predicted_time = cost_model.cost_per_stage(
        queue, traversals_dev[-1], level_to_orders[-1], params,
    )
    queue.finish()

    stage_names = ["form_multipoles", "eval_direct", "multipole_to_local",
                   "eval_multipoles", "form_locals", "eval_locals",
                   "coarsen_multipoles", "refine_locals"]
    last_timing = timing_results[-1]
    for field in stage_names:
        measured = last_timing[field]["process_elapsed"]
        predicted = predicted_time[field]
        pred_err = (measured - predicted) / measured
        logger.info("actual/predicted time for %s: %.3g/%.3g -> %g %% error",
                    field, measured, predicted, abs(100*pred_err))
def __init__(self):
    """Create an OpenCL context and queue and clear the ``tick`` flag."""
    context = cl.create_some_context()
    self.ctx = context
    self.queue = cl.CommandQueue(context)
    self.tick = False
def main(write_output=True, order=4):
    """Run the wave example on a 2D rectangular mesh with RK4.

    Builds a ``WeakWaveOperator`` with a source that is
    ``sin(source_omega * t)`` times a Gaussian-shaped bump around
    ``source_center``, steps it to ``final_t`` and prints the 2-norm of the
    first state component after every step. A VTK file is written every
    10th step.

    :arg write_output: not read anywhere in this function.
    :arg order: polynomial order passed to ``DiscretizationCollection``.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue)

    dims = 2
    from meshmode.mesh.generation import generate_regular_rect_mesh
    mesh = generate_regular_rect_mesh(a=(-0.5, ) * dims,
                                      b=(0.5, ) * dims,
                                      nelements_per_axis=(16, ) * dims)

    # Hard-coded step sizes; ``dt`` stays undefined for any other dimension.
    if mesh.dim == 2:
        dt = 0.04
    elif mesh.dim == 3:
        dt = 0.02

    print("%d elements" % mesh.nelements)

    discr = DiscretizationCollection(actx, mesh, order=order)

    # Keep only as many centre coordinates as the mesh has dimensions.
    source_center = np.array([0.1, 0.22, 0.33])[:mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import WeakWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE
    op = WeakWaveOperator(
        0.1,
        discr.dim,
        source_f=(
            sym.sin(source_omega * sym_t)
            * sym.exp(-np.dot(sym_source_center_dist, sym_source_center_dist)
                      / source_width**2)),
        dirichlet_tag=BTAG_NONE,
        neumann_tag=BTAG_NONE,
        radiation_tag=BTAG_ALL,
        flux_type="upwind")

    from pytools.obj_array import flat_obj_array
    fields = flat_obj_array(discr.zeros(actx),
                            [discr.zeros(actx) for i in range(discr.dim)])

    # FIXME
    #dt = op.estimate_rk4_timestep(discr, fields=fields)

    op.check_bc_coverage(mesh)

    # print(sym.pretty(op.sym_operator()))
    bound_op = bind(discr, op.sym_operator())

    def rhs(t, w):
        return bound_op(t=t, w=w)

    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    final_t = 10
    # nsteps is only used in the message below.
    nsteps = int(final_t / dt)
    print("dt=%g nsteps=%d" % (dt, nsteps))

    from grudge.shortcuts import make_visualizer
    vis = make_visualizer(discr)

    step = 0

    norm = bind(discr, sym.norm(2, sym.var("u")))

    from time import time
    t_last_step = time()

    for event in dt_stepper.run(t_end=final_t):
        # Only StateComputed events carry a new solution state.
        if isinstance(event, dt_stepper.StateComputed):
            assert event.component_id == "w"

            step += 1

            # Step number, time, norm of u, and wall time since last step.
            print(step, event.t, norm(u=event.state_component[0]),
                  time() - t_last_step)
            if step % 10 == 0:
                vis.write_vtk_file("fld-wave-min-%04d.vtu" % step, [
                    ("u", event.state_component[0]),
                    ("v", event.state_component[1:]),
                ])
            t_last_step = time()
def __call__(self, q, w, scale=1.0, bkg=0.0, threads=0):
    """
    Abeles matrix formalism for calculating reflectivity from a stratified
    medium.
    Uses pyopencl on a GPU to calculate reflectivity. The accuracy of this
    function may not as good as the C and Python based versions.
    Furthermore, it can be tricky to use when using multiprocessing based
    parallelism.

    Parameters
    ----------
    q: array_like
        the q values required for the calculation.
        Q = 4 * Pi / lambda * sin(omega).
        Units = Angstrom**-1
    w: np.ndarray
        coefficients required for the calculation, has shape (2 + N, 4),
        where N is the number of layers
        w[0, 1] - SLD of fronting (/1e-6 Angstrom**-2)
        w[0, 2] - iSLD of fronting (/1e-6 Angstrom**-2)
        w[N, 0] - thickness of layer N
        w[N, 1] - SLD of layer N (/1e-6 Angstrom**-2)
        w[N, 2] - iSLD of layer N (/1e-6 Angstrom**-2)
        w[N, 3] - roughness between layer N-1/N
        w[-1, 1] - SLD of backing (/1e-6 Angstrom**-2)
        w[-1, 2] - iSLD of backing (/1e-6 Angstrom**-2)
        w[-1, 3] - roughness between backing and last layer
    scale: float
        Multiply all reflectivities by this value.
    bkg: float
        Linear background to be added to all reflectivities
    threads: int, optional
        <THIS OPTION IS CURRENTLY IGNORED>

    Returns
    -------
    Reflectivity: np.ndarray
        Calculated reflectivity values for each q value.
    """
    import pyopencl as cl

    # Build the context and program lazily on first use and cache them on
    # the instance.
    if self.ctx is None or self.prg is None:
        self.ctx = cl.create_some_context(interactive=False)
        pth = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(pth, "abeles_pyopencl.cl"), "r") as f:
            src = f.read()
        self.prg = cl.Program(self.ctx, src).build()

    # np.asfarray was removed in NumPy 2.0; with its default dtype it was
    # the same as asarray(..., dtype=float64).
    qvals = np.asarray(q, dtype=np.float64)
    flatq = qvals.ravel()

    nlayers = len(w) - 2
    # Flat parameter vector: 8 header values, then four values per layer.
    coefs = np.empty((nlayers * 4 + 8))
    coefs[0] = nlayers
    coefs[1] = scale
    coefs[2:4] = w[0, 1:3]
    coefs[4:6] = w[-1, 1:3]
    coefs[6] = bkg
    coefs[7] = w[-1, 3]
    if nlayers:
        coefs[8::4] = w[1:-1, 0]
        coefs[9::4] = w[1:-1, 1]
        coefs[10::4] = w[1:-1, 2]
        coefs[11::4] = w[1:-1, 3]

    mf = cl.mem_flags
    with cl.CommandQueue(self.ctx) as queue:
        q_g = cl.Buffer(
            self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=flatq
        )
        coefs_g = cl.Buffer(
            self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=coefs
        )
        ref_g = cl.Buffer(self.ctx, mf.WRITE_ONLY, flatq.nbytes)

        # One work-item per q value.
        self.prg.abeles(queue, flatq.shape, None, q_g, coefs_g, ref_g)

        reflectivity = np.empty_like(flatq)
        cl.enqueue_copy(queue, reflectivity, ref_g)

    return np.reshape(reflectivity, qvals.shape)
large_node = np.array(extendedMatData['large_nodes']) nn0 = np.array(extendedMatData['rimg_NN0']) nn1 = np.array(extendedMatData['rimg_NN1']) ss = len(nn0) #nn0 = np.transpose(nn0); #nn1 = np.transpose(nn1); nn0 = np.ravel(nn0) nn1 = np.ravel(nn1) load_time = time.time() print("load time", load_time - start_time) extendedData = np.zeros(ss, dtype=np.float32) context = cl.create_some_context() queue = cl.CommandQueue(context) program_extension = cl.Program(context, kernel_extension).build() program_reducing = cl.Program(context, kernel_reducing).build() points = np.zeros((len(large_elem), 3)) get_tpoints(points, large_elem, large_node) subfigure = [] c = 0 while (True): try: plt.cla() ax.set_xlim3d(-150, 150) ax.set_ylim3d(-150, 150)
def main():
    """Search for a portfolio with a genetic algorithm run through OpenCL.

    Generates random signals, prices, spreads and volumes for ``num_sec``
    securities, then runs the ``init``, ``get_fitness``, ``mutate`` and
    ``get_max`` kernels from ``kernel.c`` over ``num_gbl`` work-items in
    groups of ``num_lcl``. The best candidate of each work group is copied
    back, the one with the highest ``fitness * GMV`` is selected on the
    host, and summary statistics are printed.

    ``rnd`` is presumably ``numpy.random`` -- confirm against the file's
    imports.
    """
    # Config
    steps = 40000
    num_gbl = 2048
    num_lcl = 128
    # Integer division: the group count is used as an array dimension, and
    # true division would make it a float on Python 3.
    num_grp = num_gbl // num_lcl
    num_sec = 25
    num_keep = 2

    # Generate some input
    alpha = rnd.uniform(-1, 1, size=num_sec).astype(np.float32)
    prc = rnd.uniform(1, 10, size=num_sec).astype(np.float32) * 10
    bid = prc - np.multiply(rnd.uniform(size=prc.size), prc / 100).astype(
        np.float32)
    ask = prc + np.multiply(rnd.uniform(size=prc.size), prc / 100).astype(
        np.float32)
    adv = rnd.uniform(10000000, size=num_sec).astype(np.float32)
    port_out = np.zeros((num_grp, num_sec), dtype=np.int32)

    def get_fit(p):
        # Signal-weighted sum of the positions, accumulated in float64.
        s = np.float64(0)
        for i in range(len(p)):
            s += alpha[i] * p[i]
        return s

    def get_max(res):
        # Pick the candidate with the largest fitness * gross market value.
        m = None
        f = 0
        for port in res:
            fit = get_fit(port)
            gmv = sum(abs(port[i] * prc[i]) for i in range(len(port)))
            if fit * gmv > f or m is None:
                f = fit * gmv
                m = port
        return f, m

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Create OpenCL buffers
    mf = cl.mem_flags
    sec_rnd = rnd.uniform(size=num_sec).astype(np.float32)
    thd_rnd = rnd.uniform(size=num_gbl).astype(np.float32)
    alpha_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=alpha)
    bid_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=bid)
    ask_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=ask)
    prc_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=prc)
    adv_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=adv)
    port_buf = cl.Buffer(ctx, mf.WRITE_ONLY, port_out.nbytes)
    thd_rnd_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=thd_rnd)
    sec_rnd_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=sec_rnd)
    port_scratch_buf = cl.Buffer(ctx, mf.READ_WRITE, num_gbl * num_sec * 4)
    fit_buf = cl.Buffer(ctx, mf.READ_WRITE, num_gbl * 8)
    keep_buf = cl.LocalMemory(num_keep * 4)

    # Read the kernel source with a context manager so the file is closed.
    with open('kernel.c') as kernel_file:
        kernel_src = kernel_file.read()
    prg = cl.Program(ctx, kernel_src).build()

    # Init buffers
    e = prg.init(queue, (num_gbl, ), (num_lcl, ), port_scratch_buf,
                 thd_rnd_buf, fit_buf, np.int32(num_sec), np.int32(num_gbl),
                 np.int32(rnd.randint(0, max(num_sec, num_gbl))))
    e = cl.enqueue_barrier(queue, wait_for=[e])

    # Init fitness
    e = prg.get_fitness(queue, (num_gbl, ), (num_lcl, ), alpha_buf, bid_buf,
                        ask_buf, prc_buf, adv_buf, port_scratch_buf, fit_buf,
                        thd_rnd_buf, np.int32(num_sec), np.int32(num_gbl),
                        wait_for=[e])
    e = cl.enqueue_barrier(queue, wait_for=[e])

    for i in range(0, steps - 1):
        # Fresh random numbers for every generation.
        sec_rnd = rnd.uniform(size=num_sec).astype(np.float32)
        thd_rnd = rnd.uniform(size=num_gbl).astype(np.float32)
        thd_rnd_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=thd_rnd)
        sec_rnd_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=sec_rnd)

        # Reap, mutate, and recombinate
        e = prg.mutate(queue, (num_gbl, ), (num_lcl, ), port_scratch_buf,
                       fit_buf, sec_rnd_buf, thd_rnd_buf, keep_buf,
                       np.int32(num_sec), np.int32(num_gbl),
                       np.int32(num_keep),
                       np.int32(rnd.randint(0, max(num_sec, num_gbl))),
                       wait_for=[e])
        e = cl.enqueue_barrier(queue, wait_for=[e])

        # Recompute fitness
        e = prg.get_fitness(queue, (num_gbl, ), (num_lcl, ), alpha_buf,
                            bid_buf, ask_buf, prc_buf, adv_buf,
                            port_scratch_buf, fit_buf, thd_rnd_buf,
                            np.int32(num_sec), np.int32(num_gbl),
                            wait_for=[e])
        e = cl.enqueue_barrier(queue, wait_for=[e])

    # Get the top portfolios from each work group
    port_buf = cl.Buffer(ctx, mf.WRITE_ONLY, port_out.nbytes)
    e = prg.get_max(queue, (num_gbl, ), (num_lcl, ), port_scratch_buf,
                    port_buf, fit_buf, np.int32(num_sec), wait_for=[e])
    e = cl.enqueue_barrier(queue, wait_for=[e])
    e = cl.enqueue_copy(queue, port_out, port_buf, wait_for=[e])
    e = cl.enqueue_barrier(queue, wait_for=[e])

    # (Meta-)Select the one we want
    f, port_out = get_max(port_out)

    print('Signal:')
    print(alpha)
    print('Prices:')
    print(prc)
    print('ADV:')
    print(adv)
    print('Spreads:')
    print(ask - bid)
    print('Portfolio:')
    print(port_out)
    print('Fitness:', f)
    print(
        'Max Participation:',
        max(abs(port_out[i] * prc[i] / adv[i]) for i in range(len(port_out))))
    print('GMV:', sum(abs(port_out[i] * prc[i]) for i in range(len(port_out))))
    print('NMV:', sum(port_out[i] * prc[i] for i in range(len(port_out))))
    print('Peason R bt alpha and port:', pearsonr(port_out, alpha))
import time, math import numpy as np import pyopencl as cl import pygame # Default values for pyopencl #platform = cl.get_platforms()[0] #device = platform.get_devices()[0] #ctx = cl.Context([device]) # Manually enter settings each time ctx = cl.create_some_context(interactive=True) # Returns array of pixel rgb values for Mandelbrot set # xMin, xMax, yMin and yMax are values for the actual frame of the set, width and height is the size of the image in pixels # Higher maxIterations will result in better quality, but will take longer def mandelbrot(xMin, xMax, yMin, yMax, width, height, maxIterations): # Set up pixel values as array r1 = np.linspace(xMin, xMax, width, dtype=np.float64) r2 = np.linspace(yMin, yMax, height, dtype=np.float64) c = r1 + r2[:, None] * 1j c = np.ravel(c) # Set up context global ctx queue = cl.CommandQueue(ctx) output = np.empty(c.shape, dtype=np.uint32) # Mandelbrot program
def main(write_output=True, order=4):
    """Run the MPI-distributed wave example and optionally write VTK output.

    The manager rank builds a 2D regular rectangular mesh, partitions it with
    pymetis and sends one part to every rank. Each rank then sets up a
    ``WeakWaveOperator`` driven by a time-oscillating Gaussian source and
    integrates it with RK4 up to ``final_t = 10``, printing the step number,
    time, 2-norm of ``u`` and wall time per step.

    :arg write_output: if true (the default), write a parallel VTK file every
        10th time step. If false, no visualizer is created and nothing is
        written. (Previously this flag was accepted but ignored.)
    :arg order: order of the discretization; also used as ``vis_order`` of
        the visualizer.
    :raises ValueError: if the local mesh is neither 2D nor 3D, because no
        time step is defined for other dimensions.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue)

    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()

    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
    mesh_dist = MPIMeshDistributor(comm)

    # NOTE: "is_mananger_rank" (sic) is the method name the original code calls.
    if mesh_dist.is_mananger_rank():
        dims = 2
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(
            a=(-0.5,)*dims,
            b=(0.5,)*dims,
            n=(16,)*dims)

        print("%d elements" % mesh.nelements)

        part_per_element = get_partition_by_pymetis(mesh, num_parts)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

        # The full mesh is no longer needed once the parts have been sent.
        del mesh

    else:
        local_mesh = mesh_dist.receive_mesh_part()

    discr = DGDiscretizationWithBoundaries(actx, local_mesh, order=order,
                                           mpi_communicator=comm)

    # Fixed time steps per dimension; fail early and clearly for anything
    # else instead of hitting an unbound ``dt`` further down.
    if local_mesh.dim == 2:
        dt = 0.04
    elif local_mesh.dim == 3:
        dt = 0.02
    else:
        raise ValueError(
            "no time step defined for mesh dimension %d" % local_mesh.dim)

    source_center = np.array([0.1, 0.22, 0.33])[:local_mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(local_mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import WeakWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE

    op = WeakWaveOperator(0.1, discr.dim,
            source_f=(
                sym.sin(source_omega*sym_t)
                * sym.exp(
                    -np.dot(sym_source_center_dist, sym_source_center_dist)
                    / source_width**2)),
            dirichlet_tag=BTAG_NONE,
            neumann_tag=BTAG_NONE,
            radiation_tag=BTAG_ALL,
            flux_type="upwind")

    from pytools.obj_array import flat_obj_array
    fields = flat_obj_array(
        discr.zeros(actx),
        [discr.zeros(actx) for i in range(discr.dim)])

    # FIXME
    #dt = op.estimate_rk4_timestep(discr, fields=fields)

    op.check_bc_coverage(local_mesh)

    # print(sym.pretty(op.sym_operator()))
    bound_op = bind(discr, op.sym_operator())

    def rhs(t, w):
        return bound_op(t=t, w=w)

    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    final_t = 10
    nsteps = int(final_t/dt)
    print("dt=%g nsteps=%d" % (dt, nsteps))

    # Only build the visualizer when output was actually requested.
    vis = None
    if write_output:
        from grudge.shortcuts import make_visualizer
        vis = make_visualizer(discr, vis_order=order)

    step = 0

    norm = bind(discr, sym.norm(2, sym.var("u")))

    from time import time
    t_last_step = time()

    for event in dt_stepper.run(t_end=final_t):
        if isinstance(event, dt_stepper.StateComputed):
            assert event.component_id == "w"

            step += 1

            print(step, event.t, norm(u=event.state_component[0]),
                  time()-t_last_step)
            if write_output and step % 10 == 0:
                # "{{rank:03d}}" stays a literal "{rank:03d}" placeholder in
                # the file name pattern; only the step number is filled in here.
                vis.write_parallel_vtk_file(
                    comm,
                    f"fld-wave-min-mpi-{{rank:03d}}-{step:04d}.vtu",
                    [
                        ("u", event.state_component[0]),
                        ("v", event.state_component[1:]),
                    ])
            t_last_step = time()
def get_context():
    """Return the shared OpenCL context, creating it on first use.

    The context is cached in the module-level ``_ctx``; it is only created
    (via ``cl.create_some_context()``) while that global is still ``None``.
    """
    global _ctx
    if _ctx is not None:
        # Already initialised by an earlier call: reuse it.
        return _ctx
    _ctx = cl.create_some_context()
    return _ctx
def main():
    """Solve a Cahn-Hilliard boundary problem on an ellipse and write VTK files.

    Steps, in order: build a curve mesh of an ellipse, wrap it in a refined
    QBX layer potential source, solve the bound ``CahnHilliardOperator`` for
    the unknown "sigma" with GMRES, then evaluate the representation (and a
    double-layer "indicator" field) on a plot grid and write "potential.vts".

    ``nelements``, ``mesh_order``, ``bdry_quad_order``,
    ``bdry_ovsmp_quad_order``, ``qbx_order`` and ``fmm_order`` are not defined
    in this function -- presumably module-level settings; verify in the file.

    If target association fails while evaluating the volume field, the
    failed-target flags are written to "failed-targets.vts" and the exception
    is re-raised.
    """
    import logging
    logging.basicConfig(level=logging.INFO)

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    from functools import partial

    # Curve mesh of ``ellipse`` with its first argument fixed to 2.
    mesh = make_curve_mesh(
            partial(ellipse, 2),
            np.linspace(0, 1, nelements+1),
            mesh_order)

    pre_density_discr = Discretization(
            cl_ctx, mesh,
            InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order))

    from pytential.qbx import (
            QBXLayerPotentialSource, QBXTargetAssociationFailedException)
    # with_refinement() returns a pair; only the first item is used here.
    qbx, _ = QBXLayerPotentialSource(
            pre_density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
            fmm_order=fmm_order,
            expansion_disks_in_tree_have_extent=True,
            ).with_refinement()
    density_discr = qbx.density_discr

    from pytential.symbolic.pde.cahn_hilliard import CahnHilliardOperator
    chop = CahnHilliardOperator(
            # FIXME: Constants?
            lambda1=1.5,
            lambda2=1.25,
            c=1)

    unk = chop.make_unknown("sigma")
    bound_op = bind(qbx, chop.operator(unk))

    # {{{ fix rhs and solve

    nodes = density_discr.nodes().with_queue(queue)

    def g(xvec):
        """Return atan2(y, x) of the two components of *xvec*."""
        x, y = xvec
        return cl.clmath.atan2(y, x)

    # Two boundary-condition components: g on the nodes and its negation.
    bc = sym.make_obj_array([
        # FIXME: Realistic BC
        g(nodes),
        -g(nodes),
        ])

    from pytential.solve import gmres
    gmres_result = gmres(
            bound_op.scipy_op(queue, "sigma", dtype=np.complex128),
            bc, tol=1e-8, progress=True,
            stall_iterations=0,
            hard_failure=True)

    # }}}

    # {{{ postprocess/visualize

    sigma = gmres_result.solution

    from sumpy.visualization import FieldPlotter
    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=500)

    targets = cl.array.to_device(queue, fplot.points)

    # Copies of the source with a target association tolerance of 0.05 and,
    # for the indicator, QBX order 2.
    qbx_stick_out = qbx.copy(target_association_tolerance=0.05)

    indicator_qbx = qbx_stick_out.copy(qbx_order=2)

    from sumpy.kernel import LaplaceKernel
    # Density of all ones, fed to the Laplace double layer below.
    ones_density = density_discr.zeros(queue)
    ones_density.fill(1)
    indicator = bind(
            (indicator_qbx, PointsTarget(targets)),
            sym.D(LaplaceKernel(2), sym.var("sigma")))(
            queue, sigma=ones_density).get()

    try:
        fld_in_vol = bind(
                (qbx_stick_out, PointsTarget(targets)),
                chop.representation(unk))(queue, sigma=sigma).get()
    except QBXTargetAssociationFailedException as e:
        # Dump which targets failed before propagating the error.
        fplot.write_vtk_file(
                "failed-targets.vts",
                [
                    ("failed", e.failed_target_flags.get(queue))
                    ]
                )
        raise

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    fplot.write_vtk_file(
            "potential.vts",
            [
                ("potential", fld_in_vol),
                ("indicator", indicator),
                ]
            )
def setup(proc_id):
    """Create an OpenCL context for *proc_id* and return it in a dict.

    The context is requested non-interactively by passing ``[0, proc_id]`` as
    the ``answers`` to ``pyopencl.create_some_context``.

    Returns a dict with the single key ``'cl_context'``.
    """
    return {
        'cl_context': pyopencl.create_some_context(answers=[0, proc_id]),
    }
def require_init_gpu():
    """Initialise the module-level OpenCL context and queue once.

    Does nothing when ``cl_queue`` has already been set; otherwise creates
    ``cl_ctx`` with the canned prompt answers ``[0, 2]`` and a command queue
    on it.
    """
    global cl_ctx, cl_queue
    if cl_queue is not None:
        # Already initialised by an earlier call.
        return
    # change if you don't have mac
    cl_ctx = cl.create_some_context(answers=[0, 2])
    cl_queue = cl.CommandQueue(cl_ctx)
def main():
    """Solve a 2D dielectric interface problem (TM mode) and write the fields.

    An ellipse is discretized, a ``DielectricSDRep2DBoundaryOperator`` with
    one interface between domains 0 and 1 is set up, the incoming field of a
    single point source is used to build the right-hand side, and the system
    is solved with GMRES. The resulting fields in both domains, the incoming
    field, their combination and a domain indicator are written to
    "potential.vts".

    Uses the module-level ``logger`` (not defined in this function).

    :raises NotImplementedError: if a boundary-condition term has a direction
        other than ``dir_none`` or ``dir_normal``.
    """
    logging.basicConfig(level=logging.INFO)

    nelements = 60
    qbx_order = 3
    k_fac = 4
    k0 = 3*k_fac
    k1 = 2.9*k_fac
    mesh_order = 10
    bdry_quad_order = mesh_order
    bdry_ovsmp_quad_order = bdry_quad_order * 4
    fmm_order = qbx_order * 2

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    from functools import partial
    mesh = make_curve_mesh(
        partial(ellipse, 3),
        np.linspace(0, 1, nelements+1),
        mesh_order)

    density_discr = Discretization(
        cl_ctx, mesh,
        InterpolatoryQuadratureSimplexGroupFactory(bdry_quad_order))

    # Lazy %-args: the message is only formatted if INFO is enabled.
    logger.info("%d elements", mesh.nelements)

    # from meshmode.discretization.visualization import make_visualizer
    # bdry_vis = make_visualizer(queue, density_discr, 20)

    # {{{ solve bvp

    from sumpy.kernel import HelmholtzKernel
    kernel = HelmholtzKernel(2)

    beta = 2.5*k_fac
    K0 = np.sqrt(k0**2-beta**2)
    K1 = np.sqrt(k1**2-beta**2)

    from pytential.symbolic.pde.scalar import DielectricSDRep2DBoundaryOperator
    pde_op = DielectricSDRep2DBoundaryOperator(
        mode='tm',
        k_vacuum=1,
        interfaces=((0, 1, sym.DEFAULT_SOURCE),),
        domain_k_exprs=(k0, k1),
        beta=beta)

    op_unknown_sym = pde_op.make_unknown("unknown")

    representation0_sym = pde_op.representation(op_unknown_sym, 0)
    representation1_sym = pde_op.representation(op_unknown_sym, 1)

    from pytential.qbx import QBXLayerPotentialSource
    qbx = QBXLayerPotentialSource(
        density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
        fmm_order=fmm_order
        )

    bound_pde_op = bind(qbx, pde_op.operator(op_unknown_sym))

    # in inner domain
    sources_1 = make_obj_array(list(np.array([
        [-1.5, 0.5]
        ]).T.copy()))
    strengths_1 = np.array([1])

    from sumpy.p2p import P2P
    pot_p2p = P2P(cl_ctx, [kernel], exclude_self=False)

    # Incoming field of the point source, evaluated at the boundary nodes.
    _, (Einc,) = pot_p2p(queue, density_discr.nodes(), sources_1, [strengths_1],
                         out_host=False, k=K0)

    sqrt_w = bind(density_discr, sym.sqrt_jac_q_weight())(queue)

    # The ``np.object`` alias was removed in NumPy 1.24; the builtin
    # ``object`` is the same dtype and works on every NumPy version.
    bvp_rhs = np.zeros(len(pde_op.bcs), dtype=object)
    for i_bc, terms in enumerate(pde_op.bcs):
        for term in terms:
            assert term.i_interface == 0
            assert term.field_kind == pde_op.field_kind_e

            if term.direction == pde_op.dir_none:
                bvp_rhs[i_bc] += (
                    term.coeff_outer * (-Einc)
                    )
            elif term.direction == pde_op.dir_normal:
                # no jump in normal derivative
                bvp_rhs[i_bc] += 0*Einc
            else:
                raise NotImplementedError("direction spec in RHS")

        bvp_rhs[i_bc] *= sqrt_w

    from pytential.solve import gmres
    gmres_result = gmres(
        bound_pde_op.scipy_op(queue, "unknown", dtype=np.complex128,
                              domains=[sym.DEFAULT_TARGET]*2, K0=K0, K1=K1),
        bvp_rhs, tol=1e-6, progress=True,
        hard_failure=True, stall_iterations=0)

    # }}}

    unknown = gmres_result.solution

    # {{{ visualize

    # Second source for the indicator; note that its FMM order is qbx_order,
    # not fmm_order. (QBXLayerPotentialSource is already imported above.)
    lap_qbx = QBXLayerPotentialSource(
        density_discr, fine_order=bdry_ovsmp_quad_order, qbx_order=qbx_order,
        fmm_order=qbx_order
        )

    from sumpy.visualization import FieldPlotter
    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=300)
    from pytential.target import PointsTarget
    fld0 = bind(
        (qbx, PointsTarget(fplot.points)),
        representation0_sym)(queue, unknown=unknown, K0=K0).get()
    fld1 = bind(
        (qbx, PointsTarget(fplot.points)),
        representation1_sym)(queue, unknown=unknown, K1=K1).get()
    ones = cl.array.empty(queue, density_discr.nnodes, np.float64)
    # As in the original, the return value of ``ones.fill(1)`` is what gets
    # passed as the density.
    dom1_indicator = -bind(
        (lap_qbx, PointsTarget(fplot.points)),
        sym.D(0, sym.var("sigma")))(
        queue, sigma=ones.fill(1)).get()
    _, (fld_inc_vol,) = pot_p2p(queue, fplot.points, sources_1, [strengths_1],
                                out_host=True, k=K0)

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    fplot.write_vtk_file(
        "potential.vts",
        [
            ("fld0", fld0),
            ("fld1", fld1),
            ("fld_inc_vol", fld_inc_vol),
            ("fld_total", (
                (fld_inc_vol + fld0)*(1-dom1_indicator)
                + fld1*dom1_indicator
                )),
            ("dom1_indicator", dom1_indicator),
            ]
        )
harmless, albeit annoying. """ from __future__ import print_function import os import warnings import logging import time import numpy as np # type: ignore try: #raise NotImplementedError("OpenCL not yet implemented for new kernel template") import pyopencl as cl # type: ignore # Ask OpenCL for the default context so that we know that one exists cl.create_some_context(interactive=False) except Exception as exc: warnings.warn("OpenCL startup failed with ***" + str(exc) + "***; using C compiler instead") raise RuntimeError("OpenCL not available") from pyopencl import mem_flags as mf from pyopencl.characterize import get_fast_inaccurate_build_options from . import generate from .kernel import KernelModel, Kernel # pylint: disable=unused-import try: from typing import Tuple, Callable, Any from .modelinfo import ModelInfo
def main():
    """Plot the difference between FMM-accelerated and unaccelerated QBX.

    A layer potential with density ``cos(mode_nr * angle)`` on a starfish
    curve is evaluated on a plot grid twice: once with ``fmm_order=False``
    and once with ``fmm_order=10``. If matplotlib can be imported, log10 of
    the absolute difference is saved to "fmm-error-order-<qbx_order>.pdf";
    otherwise the function returns after the computation.
    """
    import logging
    logging.basicConfig(level=logging.WARNING)  # INFO for more progress info

    context = cl.create_some_context()
    queue = cl.CommandQueue(context)
    actx = PyOpenCLArrayContext(queue)

    target_order, qbx_order, nelements, mode_nr = 16, 3, 60, 0

    # k selects the kernel: Helmholtz when nonzero, Laplace otherwise.
    k = 0
    kernel = HelmholtzKernel(2) if k else LaplaceKernel(2)

    curve_params = np.linspace(0, 1, nelements + 1)
    # alternative curve: lambda t: ellipse(1, t)
    mesh = make_curve_mesh(starfish, curve_params, target_order)

    from pytential.qbx import QBXLayerPotentialSource
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
        InterpolatoryQuadratureSimplexGroupFactory

    group_factory = InterpolatoryQuadratureSimplexGroupFactory(target_order)
    pre_density_discr = Discretization(actx, mesh, group_factory)

    direct_qbx = QBXLayerPotentialSource(
        pre_density_discr,
        fine_order=2 * target_order,
        qbx_order=qbx_order,
        fmm_order=False,
        target_association_tolerance=.05,
    )

    from pytential.target import PointsTarget
    fplot = FieldPlotter(np.zeros(2), extent=5, npoints=600)

    from pytential import GeometryCollection
    geometries = {
        "unaccel_qbx": direct_qbx,
        "qbx": direct_qbx.copy(fmm_order=10),
        "targets": PointsTarget(fplot.points),
    }
    places = GeometryCollection(geometries)
    density_discr = places.get_discretization("unaccel_qbx")

    nodes = thaw(actx, density_discr.nodes())
    angle = actx.np.arctan2(nodes[1], nodes[0])

    from pytential import bind, sym

    # Keyword arguments of the layer potential; "k" is only added for
    # the Helmholtz case.
    op_kwargs = dict(qbx_forced_limit=None)
    op_kwargs.update({"k": sym.var("k")} if k else {})

    # alternatives:
    #   sym.d_dx(2, sym.S(kernel, sym.var("sigma"), **op_kwargs))
    #   sym.D(kernel, sym.var("sigma"), **op_kwargs)
    op = sym.S(kernel, sym.var("sigma"), **op_kwargs)

    sigma = actx.np.cos(mode_nr * angle)

    if isinstance(kernel, HelmholtzKernel):
        # Cast every component of the density to complex.
        for idx, component in np.ndenumerate(sigma):
            sigma[idx] = component.astype(np.complex128)

    def evaluate(source_name):
        """Evaluate ``op`` from geometry *source_name* at the plot targets."""
        bound = bind(places, op, auto_where=(source_name, "targets"))
        return bound(actx, sigma=sigma, k=k).get()

    direct_field = evaluate("unaccel_qbx")
    fmm_field = evaluate("qbx")
    error = fmm_field - direct_field

    try:
        import matplotlib
    except ImportError:
        # Plotting is optional.
        return

    matplotlib.use("Agg")
    # 1e-17 keeps log10 finite where the two fields agree exactly.
    image = fplot.show_scalar_in_matplotlib(np.log10(np.abs(error) + 1e-17))

    from matplotlib.colors import Normalize
    image.set_norm(Normalize(vmin=-12, vmax=0))

    import matplotlib.pyplot as pt
    from matplotlib.ticker import NullFormatter
    axes = pt.gca()
    axes.xaxis.set_major_formatter(NullFormatter())
    axes.yaxis.set_major_formatter(NullFormatter())

    colorbar = pt.colorbar(shrink=0.9)
    colorbar.set_label(r"$\log_{10}(\mathrm{Error})$")

    pt.savefig("fmm-error-order-%d.pdf" % qbx_order)
def generateAN(wavelen, modes):
    """Run the ``pi`` OpenCL kernel once per mode and return the integrals.

    The kernel source is read from "generateAN.cl" in the current working
    directory. For every mode ``n`` in ``range(modes)`` the kernel writes one
    partial sum per work group; the result for that mode is the sum of those
    partial sums times the step size.

    The integration interval is ``[-(d + t/2), d + t/2]`` with
    ``d = 3 * wavelen`` and ``t = 6 * wavelen``.

    All step and work-group counts are kept as integers. With true division
    they became floats, and a fractional work-group count was truncated for
    the launch while the step size was still derived from the untruncated
    count.

    :param wavelen: wavelength; sets the integration bounds and is passed to
        the kernel.
    :param modes: number of modes to evaluate.
    :return: list with one integral value per mode.
    """
    with open('generateAN.cl', 'r') as myfile:
        integratePI = myfile.read()

    # Default number of steps (adjusted below to what the device prefers)
    in_nsteps = 512 * 512
    # Default number of iterations; integer division keeps it an int
    niters = 262144 // 2048

    # Create context, queue and build program
    context = pycl.create_some_context()
    queue = pycl.CommandQueue(context)
    program = pycl.Program(context, integratePI).build()
    pi = program.pi
    pi.set_scalar_arg_dtypes([
        numpy.int32, numpy.int32, numpy.float32, numpy.float32, numpy.float32,
        None, None
    ])

    # Get the max work group size for the kernel pi on our device
    device = context.devices[0]
    work_group_size = pi.get_work_group_info(
        pycl.kernel_work_group_info.WORK_GROUP_SIZE, device)

    # Now that we know the size of the work groups, we can set the number of
    # work groups, the actual number of steps, and the step size. At least
    # one work group is always launched, and nsteps is derived from the
    # integer group count so the step size matches what is really computed.
    nwork_groups = max(1, in_nsteps // (work_group_size * niters))
    print(nwork_groups)
    nsteps = work_group_size * niters * nwork_groups

    d = 3 * wavelen
    t = 6 * wavelen

    # Define bounds
    start = -d - t / 2
    end = d + t / 2

    step_size = (end - start) / float(nsteps)
    print(step_size)

    # Host vector to hold the partial sums (one per work group)
    h_psum = numpy.empty(nwork_groups, dtype=numpy.float32)

    print("%s work groups of size %s" % (nwork_groups, work_group_size))
    print("Integration steps %s" % nsteps)

    d_partial_sums = pycl.Buffer(context, pycl.mem_flags.WRITE_ONLY,
                                 h_psum.nbytes)

    # Start the timer
    rtime = time()

    # Execute the kernel over the entire range of our 1d input data set
    # using the maximum number of work group items for this device.
    # Set the global and local size as tuples
    global_size = (nwork_groups * work_group_size, )
    local_size = (work_group_size, )
    localmem = pycl.LocalMemory(
        numpy.dtype(numpy.float32).itemsize * work_group_size)
    print(niters)

    AN = []
    for n in range(modes):
        pi(queue, global_size, local_size, n, niters, step_size, start,
           wavelen, localmem, d_partial_sums)

        pycl.enqueue_copy(queue, h_psum, d_partial_sums)

        # Complete the sum and compute the final integral value
        AN.append(h_psum.sum() * step_size)

    # Stop the timer
    rtime = time() - rtime
    print(rtime)
    return AN
def run(double_precision=False):
    """Benchmark batched 2D FFTs (out of place, then in place) and report.

    For every axes order and every C/Fortran layout combination of input and
    output, a transform is created, enqueued ``n_run`` times and timed. The
    out-of-place results are compared against ``npfftn`` of the host data;
    mismatches and ``GpyFFT_Error`` are printed and do not stop the run.

    :param double_precision: use complex128 data and 1e-8 tolerances instead
        of complex64 data and 1e-3 tolerances.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    dtype = np.complex128 if double_precision else np.complex64
    tolerance = 1e-8 if double_precision else 1e-3

    n_run = 100  # set to 1 for proper testing

    if n_run > 1:
        # faster than 1024x1024?
        nd_dataC = np.random.normal(size=(4, 1024, 1024)).astype(dtype)
    else:
        nd_dataC = np.ones((4, 1024, 1024), dtype=dtype)  # set n_run to 1

    nd_dataF = np.asfortranarray(nd_dataC)
    dataC = cla.to_device(queue, nd_dataC)
    dataF = cla.to_device(queue, nd_dataF)

    nd_result = np.zeros_like(nd_dataC, dtype=dtype)
    resultC = cla.to_device(queue, nd_result)
    resultF = cla.to_device(queue, np.asfortranarray(nd_result))

    axes_list = [(1, 2), (2, 1)]  # batched 2d transforms

    def time_transform(transform):
        """Enqueue *transform* n_run times; return (ms per run, Gflops)."""
        tic = timeit.default_timer()
        for _ in range(n_run):
            events = transform.enqueue()
            #events = transform.enqueue(False)
        # As in the original, only the events of the last enqueue are
        # waited on before the clock is stopped.
        for event in events:
            event.wait()
        toc = timeit.default_timer()
        t_ms = 1e3 * (toc - tic) / n_run
        n_points = np.prod(transform.t_shape)
        gflops = (5e-9 * np.log2(n_points) * n_points
                  * transform.batchsize / (1e-3 * t_ms))
        return t_ms, gflops

    print('out of place transforms', dataC.shape, dataC.dtype)
    print('axes in out')
    for axes in axes_list:
        for data in (dataC, dataF):
            for result in (resultC, resultF):
                # Reset for every combination so the ``finally`` report never
                # sees an unbound name or a stale value from a previous
                # combination when something unexpected is raised.
                t_ms, gflops = 0, 0
                try:
                    transform = FFT(context, queue, data, result, axes=axes)
                    #transform.plan.transpose_result = True
                    # not implemented for some transforms
                    # (works e.g. for out of place, (2,1) C C)
                    print(
                        '%-10s %3s %3s' % (
                            axes,
                            'C' if data.flags.c_contiguous else 'F',
                            'C' if result.flags.c_contiguous else 'F',
                        ),
                        end=' ',
                    )
                    t_ms, gflops = time_transform(transform)

                    npfft_result = npfftn(nd_dataC, axes=axes)
                    if transform.plan.transpose_result:
                        npfft_result = np.swapaxes(
                            npfft_result, axes[0], axes[1])
                    max_error = np.max(abs(result.get() - npfft_result))
                    print('%8.1e' % max_error, end=' ')
                    assert_allclose(result.get(), npfft_result,
                                    atol=tolerance, rtol=tolerance)
                    #assert_array_almost_equal(abs(result.get() - npfftn(data.get(), axes = axes)),
                    #                          1e-4)
                except GpyFFT_Error as e:
                    print(e)
                    t_ms, gflops = 0, 0
                except AssertionError as e:
                    # A mismatch is reported but the timing is still printed.
                    print(e)
                finally:
                    print('%5.2fms %6.2f Gflops' % (t_ms, gflops))

    print('in place transforms', nd_dataC.shape, nd_dataC.dtype)
    for axes in axes_list:
        for nd_data in (nd_dataC, nd_dataF):
            data = cla.to_device(queue, nd_data)
            transform = FFT(context, queue, data, axes=axes)
            #transform.plan.transpose_result = True #not implemented
            # NOTE (original author): inplace transform fails for n_run > 1
            t_ms, gflops = time_transform(transform)
            print(
                '%-10s %3s %5.2fms %6.2f Gflops' %
                (axes, 'C' if data.flags.c_contiguous else 'F', t_ms, gflops))
def simple_wave_entrypoint(dim=2, num_elems=256, order=4, num_steps=30,
                           log_filename="grudge.dat"):
    """Run a distributed wave simulation and log/print profiling data.

    The manager rank builds a regular rectangular mesh with ``n`` points per
    axis, partitions it with pymetis and distributes the parts. Every rank
    then integrates a ``StrongWaveOperator`` with RK4 for ``num_steps`` steps
    of ``dt = 0.04``, ticking a ``LogManager`` around each computed state,
    and finally prints the profile data collected by the bound operator.

    :arg dim: spatial dimension of the mesh.
    :arg num_elems: target size of the mesh; ``n`` is the largest integer
        with ``n**dim <= num_elems``.
    :arg order: order of the discretization.
    :arg num_steps: number of time steps (``t_end = dt * num_steps``).
    :arg log_filename: file the ``LogManager`` writes to.
    """
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()

    # Integer dim-th root of num_elems. Truncating the float root directly is
    # wrong whenever the root is an integer that floating point represents
    # from below (64 ** (1/3) == 3.9999999999999996 would give 3, not 4), so
    # round first and step down only if that overshoots.
    n = int(round(num_elems ** (1. / dim)))
    if n ** dim > num_elems:
        n -= 1

    from meshmode.distributed import MPIMeshDistributor
    mesh_dist = MPIMeshDistributor(comm)

    # NOTE: "is_mananger_rank" (sic) is the method name the original code calls.
    if mesh_dist.is_mananger_rank():
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(a=(-0.5, ) * dim,
                                          b=(0.5, ) * dim,
                                          n=(n, ) * dim)

        from pymetis import part_graph
        _, p = part_graph(num_parts,
                          xadj=mesh.nodal_adjacency.neighbors_starts.tolist(),
                          adjncy=mesh.nodal_adjacency.neighbors.tolist())
        part_per_element = np.array(p)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element,
                                               num_parts)
    else:
        local_mesh = mesh_dist.receive_mesh_part()

    vol_discr = DGDiscretizationWithBoundaries(cl_ctx, local_mesh,
                                               order=order,
                                               mpi_communicator=comm)

    source_center = np.array([0.1, 0.22, 0.33])[:local_mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(local_mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import StrongWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE
    op = StrongWaveOperator(
        -0.1, vol_discr.dim,
        source_f=(
            sym.sin(source_omega * sym_t)
            * sym.exp(-np.dot(sym_source_center_dist, sym_source_center_dist)
                      / source_width**2)),
        dirichlet_tag=BTAG_NONE,
        neumann_tag=BTAG_NONE,
        radiation_tag=BTAG_ALL,
        flux_type="upwind")

    from pytools.obj_array import join_fields
    fields = join_fields(
        vol_discr.zeros(queue),
        [vol_discr.zeros(queue) for i in range(vol_discr.dim)])

    from pytools.log import LogManager, \
        add_general_quantities, \
        add_run_info, \
        IntervalTimer, EventCounter
    # NOTE: LogManager hangs when using a file on a shared directory.
    logmgr = LogManager(log_filename, "w", comm)
    add_run_info(logmgr)
    add_general_quantities(logmgr)
    log_quantities =\
        {"rank_data_swap_timer": IntervalTimer("rank_data_swap_timer",
                        "Time spent evaluating RankDataSwapAssign"),
         "rank_data_swap_counter": EventCounter("rank_data_swap_counter",
                        "Number of RankDataSwapAssign instructions evaluated"),
         "exec_timer": IntervalTimer("exec_timer",
                        "Total time spent executing instructions"),
         "insn_eval_timer": IntervalTimer("insn_eval_timer",
                        "Time spend evaluating instructions"),
         "future_eval_timer": IntervalTimer("future_eval_timer",
                        "Time spent evaluating futures"),
         "busy_wait_timer": IntervalTimer("busy_wait_timer",
                        "Time wasted doing busy wait")}
    for quantity in log_quantities.values():
        logmgr.add_quantity(quantity)

    bound_op = bind(vol_discr, op.sym_operator())

    def rhs(t, w):
        # The accumulated profile data is carried from call to call as an
        # attribute on the function itself.
        val, rhs.profile_data = bound_op(queue, profile_data=rhs.profile_data,
                                         log_quantities=log_quantities,
                                         t=t, w=w)
        return val
    rhs.profile_data = {}

    dt = 0.04
    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    # One tick_before/tick_after pair brackets every computed state.
    logmgr.tick_before()
    for event in dt_stepper.run(t_end=dt * num_steps):
        if isinstance(event, dt_stepper.StateComputed):
            logmgr.tick_after()
            logmgr.tick_before()
    logmgr.tick_after()

    def print_profile_data(data):
        """Print the share of each timing category for this rank."""
        print("""execute() for rank %d: \tInstruction Evaluation: %f%% \tFuture Evaluation: %f%% \tBusy Wait: %f%% \tTotal: %f seconds""" %
              (comm.Get_rank(),
               data['insn_eval_time'] / data['total_time'] * 100,
               data['future_eval_time'] / data['total_time'] * 100,
               data['busy_wait_time'] / data['total_time'] * 100,
               data['total_time']))

    print_profile_data(rhs.profile_data)
    logmgr.close()
def __init__(self, gene_mat, pop, adj_mat, bound=None, secretion=None,
             reception=None, receptors=None, init_env=None, secr_amount=1.0,
             leak=1.0, max_con=1000.0, max_dist=None, opencl=False):
    """
    Init of Stops
    Parameters:
     - gene_mat - matrix of gene interactions [GENE_NUM, GENE_NUM]
     - pop - array with initial population [POP_SIZE, GENE_NUM]
     - adj_mat - matrix with distances between each cell in population
       [POP_SIZE, POP_SIZE]
     - bound - vector of max value of each gene [GENE_NUM]; defaults to
       all ones
     - secretion - vector of length LIG_NUM where secretion[i] contains
       index of a gene which must be on to secrete ligand i; defaults to an
       empty vector (no ligands)
     - reception - vector of length LIG_NUM where reception[i] contains
       index of a gene which will be set to on when ligand i is accepted
     - receptors - vector of length LIG_NUM where receptors[i] contains
       index of a gene which has to be on to accept ligand i; special value
       -1 (the default for every ligand) means that there is no need for
       specific gene expression for the ligand
     - init_env - initial ligand values used for every cell's environment;
       defaults to zeros of length LIG_NUM
     - secr_amount - amount of ligand secreted to the environment each time
     - leak - amount of ligand leaking from the environment each time
     - max_con - maximal ligand concentration
     - max_dist - maximal distance between a cell and an environment needed
       for the cell to accept ligands from the environment; defaults to the
       largest entry of adj_mat
     - opencl - if set to True opencl is used to boost the speed
    """
    self.gene_mat = numpy.array(gene_mat).astype(numpy.float32)
    self.pop = numpy.array(pop).astype(numpy.float32)
    self.adj_mat = numpy.array(adj_mat).astype(numpy.float32)
    self.secr_amount = secr_amount
    self.leak = leak
    self.max_con = max_con
    self.row_size = self.gene_mat.shape[0]
    self.pop_size = self.pop.shape[0]
    self.max_dist = numpy.max(adj_mat) if max_dist is None else max_dist

    # NOTE: all optional vectors are tested with ``is not None``. ``!= None``
    # compares numpy arrays element-wise and then fails in the ``if``.
    if bound is not None:
        self.bound = numpy.array(bound).astype(numpy.float32)
    else:
        # bound default - all ones
        self.bound = numpy.ones(self.row_size).astype(numpy.float32)

    if secretion is not None:
        self.secretion = numpy.array(secretion).astype(numpy.int32)
    else:
        self.secretion = numpy.array([]).astype(numpy.int32)

    if reception is not None:
        self.reception = numpy.array(reception).astype(numpy.int32)
    else:
        self.reception = numpy.array([]).astype(numpy.int32)

    # Use the normalised array: the raw argument is None by default and
    # len(None) raises TypeError.
    self.max_lig = len(self.secretion)

    if init_env is None:
        self.init_env = numpy.zeros(self.max_lig)
    else:
        self.init_env = init_env

    # One copy of the initial environment per cell.
    self.env = numpy.array([self.init_env] * self.pop.shape[0]).astype(
        numpy.float32)

    if receptors is not None:
        self.receptors = numpy.array(receptors).astype(numpy.int32)
    else:
        # receptors - default value "-1" - no receptor for ligand is necessary
        self.receptors = numpy.array([-1] * self.max_lig).astype(
            numpy.int32)

    self._random = numpy.random.random

    self.opencl = opencl

    self.pop_hit = numpy.zeros(
        (self.pop_size, self.max_lig)).astype(numpy.int32)

    if opencl:
        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)
        self.mf = cl.mem_flags
        #init kernel
        self.program = self.__prepare_kernel()
        # 112 bytes of random-generator state are reserved per cell.
        self.rand_state_buf = cl.Buffer(self.ctx, self.mf.READ_WRITE,
                                        size=self.pop.shape[0] * 112)
        # The drawn value can exceed the uint32 range; reduce it modulo 2**32
        # explicitly instead of relying on an overflowing conversion, which
        # NumPy 2 rejects with OverflowError.
        seed = numpy.random.randint(int(4e10), dtype=numpy.int64) % 2**32
        self.program.init_ranlux(self.queue, (self.pop.shape[0], 1), None,
                                 numpy.uint32(seed), self.rand_state_buf)
        # prepare multiplication matrix
        adj_mat_buf = cl.Buffer(self.ctx,
                                self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
                                hostbuf=self.adj_mat)
        self.mul_mat_buf = cl.Buffer(self.ctx, self.mf.READ_WRITE,
                                     size=self.adj_mat.nbytes)
        self.program.init_mul_mat(self.queue, (self.pop.shape[0], 1), None,
                                  self.mul_mat_buf, adj_mat_buf,
                                  numpy.float32(self.max_dist))
    else:
        # Compare against the resolved self.max_dist: the raw ``max_dist``
        # argument is None by default and cannot be ordered against numbers.
        dist_limit = self.max_dist
        self.mul_mat = mmap(
            lambda x: 1. / x if x != 0 and x <= dist_limit else 0.,
            adj_mat)
        n_density = numpy.sum(self.mul_mat, axis=0)
        # NOTE(review): a column whose density is 0 is still divided by
        # zero here (unchanged from the original) -- decide how to handle it.
        self.mul_mat = self.mul_mat / n_density
        self.mul_mat = self.mul_mat.astype(numpy.float32)