def __init__(self):
    """Create a default OpenCL context and a command queue bound to it."""
    ctx = cl.clCreateContextFromType()
    self.context = ctx
    self.queue = cl.clCreateCommandQueue(ctx)
def __init__(self):
    """Set up an OpenCL context, a command queue on it, and the queue's device."""
    self.context = cl.clCreateContextFromType()
    # To enable profiling, pass
    # properties=cl.cl_command_queue_properties.CL_QUEUE_PROFILING_ENABLE here.
    self.queue = cl.clCreateCommandQueue(context=self.context)
    self.device = self.queue.device
def transform(self, py_ast, program_config):
    """
    Convert the Python AST to a C AST according to the directions given in program_config.
    """
    # program_config carries the argument description and the autotuner's choices.
    arg_config, tuner_config = program_config
    # DotWriter dumps intermediate trees for debugging; each pass below is logged.
    dot = DotWriter()
    # HACK: reset the per-class id counters so generated names are
    # deterministic across repeated calls to transform.
    ComputedVector._next_id = 0
    CopiedVector._next_id = 0
    # set up OpenCL context and memory spaces: one queue per device, and a
    # memory object per space (index 0 is host main memory).
    import pycl
    context = pycl.clCreateContextFromType(pycl.CL_DEVICE_TYPE_ALL)
    queues = [pycl.clCreateCommandQueue(context, dev) for dev in context.devices]
    c_func = ElementwiseFunction(context, queues)
    memories = [MainMemory()] + [OclMemory(q) for q in queues]
    main_memory = memories[0]
    ptrs = arg_config['ptrs']
    # dtype/length describe the vectors being operated on; all ptrs are
    # presumably homogeneous since only ptrs[0] is inspected — TODO confirm.
    dtype, length = ptrs[0]._dtype_, arg_config['len']
    # pull stuff out of autotuner
    distribute_directives = tuner_config['distribute']
    reassoc_directives = tuner_config['reassociate']
    # 'locs' holds indices into `memories`, i.e. where each value lives.
    locs = [memories[loc] for loc in tuner_config['locs']]
    fusion_directives = tuner_config['fusion']
    parallelize_directives = tuner_config['parallelize']
    dot.write(py_ast)
    # run basic conversions (Python AST -> generic C-like AST)
    proj = PyBasicConversions().visit(py_ast)
    dot.write(proj)
    # run platform-independent transformations (algebraic rewrites chosen
    # by the autotuner)
    proj = ApplyDistributiveProperty(distribute_directives).visit(proj)
    dot.write(proj)
    proj = ApplyAssociativeProperty(reassoc_directives).visit(proj)
    dot.write(proj)
    # set parameter types, assign memory locations, and materialize
    # intermediates/copies between memory spaces. NOTE(review): pass order
    # matters here — each visitor assumes the annotations of the previous one.
    proj = VectorFinder(ptrs, main_memory).visit(proj)
    dot.write(proj)
    proj = LocationTagger(locs).visit(proj)
    dot.write(proj)
    proj = InsertIntermediates(main_memory).visit(proj)
    dot.write(proj)
    proj = CopyInserter(main_memory).visit(proj)
    dot.write(proj)
    proj = DoFusion(fusion_directives).visit(proj)
    dot.write(proj)
    proj = AllocateIntermediates(dtype, length).visit(proj)
    dot.write(proj, "postintermed")
    # Rewrite the body of py_op as a parallel task schedule.
    py_op = proj.find(FunctionDecl, name="py_op")
    schedules = FindParallelism(parallelize_directives).visit(py_op)
    py_op.defn = parallelize_tasks(schedules)
    dot.write(proj, "postparallel")
    # Outline kernels and lower them to concrete calls/loops/copies.
    proj = KernelOutliner(length).visit(proj)
    dot.write(proj)
    proj = LowerKernelCalls().visit(proj)
    dot.write(proj)
    proj = RefConverter().visit(proj)
    dot.write(proj)
    proj = LowerLoopsAndCopies(length).visit(proj)
    dot.write(proj)
    # ArgZipper collects the extra runtime arguments (and the answer slot)
    # that the generated entry point needs beyond the user-visible ones.
    zipper = ArgZipper()
    proj = zipper.visit( Lifter().visit(proj) )
    c_func.extra_args = zipper.extra_args
    c_func.answer = zipper.answer
    dot.write(proj)
    fn = proj.find(FunctionDecl)
    # Returns the finalized concrete function wrapping the generated C AST.
    return c_func.finalize("py_op", proj, fn.get_type())
def __init__(self):
    """Create an OpenCL context and a command queue on the chosen GPU."""
    # The queue targets the device selected at module load time (TARGET_GPU).
    self.context = cl.clCreateContextFromType()
    self.queue = cl.clCreateCommandQueue(self.context, device=TARGET_GPU)
# ### Specialist-Written Code ### # # The code below is written by an industry SPECIALIST. This code is meant # to be more complicated and requires specialized knowledge to write. # # # Global Constants # WORK_GROUP_SIZE = 1024 devices = cl.clCreateContextFromType().devices + cl.clCreateContext().devices TARGET_GPU = devices[1] ITERATIONS = 0 class ConcreteReduction(ConcreteSpecializedFunction): def __init__(self): self.context = cl.clCreateContextFromType() self.queue = cl.clCreateCommandQueue(self.context, device=TARGET_GPU) def finalize(self, kernel, tree, entry_name, entry_type): self.kernel = kernel self._c_function = self._compile(entry_name, tree, entry_type) return self def __call__(self, A):