def loadKernel(self, device):
    """Load and compile the OpenCL kernel and initialize the device.

    Sets self.context, self.defines, self.rateDivisor, self.WORKSIZE and
    self.kernel. Compiled binaries are cached on disk, keyed by a hash of
    the platform/device/defines/source, for fast startup. On compile or
    patch failure this reports via self.interface.fatal and returns early.
    """
    self.context = cl.Context([device], None, None)

    # Get the maximum worksize of the device.
    # NOTE(review): the original queried self.device here while the rest of
    # the method uses the `device` parameter; use the parameter consistently
    # (presumably they are the same object — confirm against __init__).
    maxWorkSize = device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)

    # If the user didn't specify their own worksize, use the maximum
    # supported worksize of the device.
    if self.WORKSIZE is None:
        self.interface.error(
            'WORKSIZE not supplied, using HW max. of ' + str(maxWorkSize))
        self.WORKSIZE = maxWorkSize
    else:
        # Clamp a worksize larger than the device's maximum.
        if self.WORKSIZE > maxWorkSize:
            self.interface.error(
                'WORKSIZE out of range, using HW max. of '
                + str(maxWorkSize))
            self.WORKSIZE = maxWorkSize
        # The kernel requires a power-of-two worksize.
        if (self.WORKSIZE & (self.WORKSIZE - 1)) != 0:
            self.interface.error(
                'WORKSIZE invalid, using HW max. of ' + str(maxWorkSize))
            self.WORKSIZE = maxWorkSize

    # These definitions are required for the kernel to function.
    self.defines += (' -DOUTPUT_SIZE=' + str(self.OUTPUT_SIZE))
    self.defines += (' -DOUTPUT_MASK=' + str(self.OUTPUT_SIZE - 1))
    self.defines += (' -DWORKSIZE=' + str(self.WORKSIZE))

    # If the user wants to mine with vectors, enable the appropriate code
    # in the kernel source.
    if self.VECTORS:
        self.defines += ' -DVECTORS'
        self.rateDivisor = 2
    elif self.VECTORS4:
        self.defines += ' -DVECTORS4'
        self.rateDivisor = 4
    else:
        self.rateDivisor = 1

    # Some AMD devices support a special "bitalign" instruction that makes
    # bitwise rotation (required for SHA-256) much faster.
    if device.extensions.find('cl_amd_media_ops') != -1:
        self.defines += ' -DBITALIGN'
        # Enable the experimental BFI_INT instruction optimization.
        if self.BFI_INT:
            self.defines += ' -DBFI_INT'

    # Locate and read the OpenCL source code in the kernel's directory.
    kernelFileDir, pyfile = os.path.split(__file__)
    kernelFilePath = os.path.join(kernelFileDir, 'kernel.cl')
    # Context manager guarantees the file is closed even if read() fails.
    with open(kernelFilePath, 'r') as kernelFile:
        kernel = kernelFile.read()

    # For fast startup, we cache the compiled OpenCL code. The name of the
    # cache is determined as the hash of a few important,
    # compilation-specific pieces of information.
    m = md5()
    m.update(device.platform.name)
    m.update(device.platform.version)
    m.update(device.name)
    m.update(self.defines)
    m.update(kernel)
    cacheName = '%s.elf' % m.hexdigest()
    fileName = os.path.join(kernelFileDir, cacheName)

    # Finally, the actual work of loading the kernel...
    try:
        binary = open(fileName, 'rb')
    except IOError:
        binary = None
    try:
        if binary is None:
            # Cache miss: compile from source.
            self.kernel = cl.Program(
                self.context, kernel).build(self.defines)
            # Apply BFI_INT if enabled.
            if self.BFI_INT:
                # Patch the binary output from the compiler.
                patcher = BFIPatcher(self.interface)
                binaryData = patcher.patch(self.kernel.binaries[0])
                self.interface.debug("Applied BFI_INT patch")
                # Reload the kernel with the patched binary.
                self.kernel = cl.Program(
                    self.context, [device],
                    [binaryData]).build(self.defines)
            # Write the kernel binaries to the cache file.
            with open(fileName, 'wb') as binaryW:
                binaryW.write(self.kernel.binaries[0])
        else:
            # Cache hit: load the previously-compiled binary.
            binaryData = binary.read()
            self.kernel = cl.Program(
                self.context, [device], [binaryData]).build(self.defines)
    except cl.LogicError:
        self.interface.fatal("Failed to compile OpenCL kernel!")
        return
    except PatchError:
        self.interface.fatal('Failed to apply BFI_INT patch to kernel! '
                             'Is BFI_INT supported on this hardware?')
        return
    finally:
        if binary:
            binary.close()

    # Unload the compiler to reduce memory usage.
    cl.unload_compiler()
def loadKernel(self, device):
    """Load the kernel and initialize the device.

    Compiles (or loads from the on-disk cache) the OpenCL kernel, then
    derives the final WORKSIZE from the kernel's reported work-group
    maximum and hands it to the interface via setWorkFactor.
    """
    self.context = cl.Context([device], None, None)

    # These definitions are required for the kernel to function.
    self.defines += (' -DOUTPUT_SIZE=' + str(self.OUTPUT_SIZE))
    self.defines += (' -DOUTPUT_MASK=' + str(self.OUTPUT_SIZE - 1))

    # If the user wants to mine with vectors, enable the appropriate code
    # in the kernel source.
    if self.VECTORS:
        self.defines += ' -DVECTORS'

    # Some AMD devices support a special "bitalign" instruction that makes
    # bitwise rotation (required for SHA-256) much faster.
    if (device.extensions.find('cl_amd_media_ops') != -1):
        self.defines += ' -DBITALIGN'
        # Enable the experimental BFI_INT instruction optimization.
        if self.BFI_INT:
            self.defines += ' -DBFI_INT'
    else:
        # Since BFI_INT requires cl_amd_media_ops, disable it.
        if self.BFI_INT:
            self.BFI_INT = False

    # Locate and read the OpenCL source code in the kernel's directory.
    kernelFileDir, pyfile = os.path.split(__file__)
    kernelFilePath = os.path.join(kernelFileDir, 'kernel.cl')
    # Context manager guarantees the file is closed even if read() fails.
    with open(kernelFilePath, 'r') as kernelFile:
        kernel = kernelFile.read()

    # For fast startup, we cache the compiled OpenCL code. The name of the
    # cache is determined as the hash of a few important,
    # compilation-specific pieces of information.
    m = md5()
    m.update(device.platform.name)
    m.update(device.platform.version)
    m.update(device.name)
    m.update(self.defines)
    m.update(kernel)
    cacheName = '%s.elf' % m.hexdigest()
    fileName = os.path.join(kernelFileDir, cacheName)

    # Finally, the actual work of loading the kernel...
    try:
        binary = open(fileName, 'rb')
    except IOError:
        binary = None
    try:
        if binary is None:
            # Cache miss: compile from source.
            self.kernel = cl.Program(
                self.context, kernel).build(self.defines)
            # Apply BFI_INT if enabled.
            if self.BFI_INT:
                # Patch the binary output from the compiler.
                patcher = BFIPatcher(self.interface)
                binaryData = patcher.patch(self.kernel.binaries[0])
                self.interface.debug("Applied BFI_INT patch")
                # Reload the kernel with the patched binary.
                self.kernel = cl.Program(
                    self.context, [device],
                    [binaryData]).build(self.defines)
            # Write the kernel binaries to the cache file.
            with open(fileName, 'wb') as binaryW:
                binaryW.write(self.kernel.binaries[0])
        else:
            # Cache hit: load the previously-compiled binary.
            binaryData = binary.read()
            self.kernel = cl.Program(
                self.context, [device], [binaryData]).build(self.defines)
    except cl.LogicError:
        self.interface.fatal("Failed to compile OpenCL kernel!")
        return
    except PatchError:
        self.interface.fatal('Failed to apply BFI_INT patch to kernel! '
                             'Is BFI_INT supported on this hardware?')
        return
    finally:
        if binary:
            binary.close()

    # Unload the compiler to reduce memory usage.
    cl.unload_compiler()

    # If the user didn't specify their own worksize, use the maximum
    # supported by the device.
    maxSize = self.kernel.search.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

    if self.WORKSIZE is None:
        self.WORKSIZE = maxSize
    else:
        if self.WORKSIZE > maxSize:
            self.interface.log('Warning: Worksize exceeds the maximum of '
                               + str(maxSize) + ', using default.')
        if self.WORKSIZE < 1:
            self.interface.log('Warning: Invalid worksize, using default.')

        self.WORKSIZE = min(self.WORKSIZE, maxSize)
        self.WORKSIZE = max(self.WORKSIZE, 1)

        # If the worksize is not a power of 2, round down to the nearest
        # power of two.
        # BUG FIX: the original computed math.log(X) where X is undefined,
        # raising NameError for any non-power-of-two worksize; round down
        # from the current worksize instead.
        if (self.WORKSIZE & (self.WORKSIZE - 1)) != 0:
            self.WORKSIZE = 1 << int(math.floor(
                math.log(self.WORKSIZE) / math.log(2)))

    self.interface.setWorkFactor(self.WORKSIZE)
def test_get_info(self, platform, device): failure_count = [0] CRASH_QUIRKS = [ (("NVIDIA Corporation", "NVIDIA CUDA", "OpenCL 1.0 CUDA 3.0.1"), [ (cl.Event, cl.event_info.COMMAND_QUEUE), ]), ] QUIRKS = [] plat_quirk_key = (platform.vendor, platform.name, platform.version) def find_quirk(quirk_list, cl_obj, info): for entry_plat_key, quirks in quirk_list: if entry_plat_key == plat_quirk_key: for quirk_cls, quirk_info in quirks: if (isinstance(cl_obj, quirk_cls) and quirk_info == info): return True return False def do_test(cl_obj, info_cls, func=None, try_attr_form=True): if func is None: def func(info): cl_obj.get_info(info) for info_name in dir(info_cls): if not info_name.startswith("_") and info_name != "to_string": info = getattr(info_cls, info_name) if find_quirk(CRASH_QUIRKS, cl_obj, info): print "not executing get_info", type(cl_obj), info_name print "(known crash quirk for %s)" % platform.name continue try: func(info) except: msg = "failed get_info", type(cl_obj), info_name if find_quirk(QUIRKS, cl_obj, info): msg += ("(known quirk for %s)" % platform.name) else: failure_count[0] += 1 if try_attr_form: try: getattr(cl_obj, info_name.lower()) except: print "failed attr-based get_info", type( cl_obj), info_name if find_quirk(QUIRKS, cl_obj, info): print "(known quirk for %s)" % platform.name else: failure_count[0] += 1 do_test(platform, cl.platform_info) do_test(device, cl.device_info) ctx = cl.Context([device]) do_test(ctx, cl.context_info) props = 0 if (device.queue_properties & cl.command_queue_properties.PROFILING_ENABLE): profiling = True props = cl.command_queue_properties.PROFILING_ENABLE queue = cl.CommandQueue(ctx, properties=props) do_test(queue, cl.command_queue_info) prg = cl.Program( ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg, cl.program_info) do_test(prg, cl.program_build_info, lambda info: prg.get_build_info(device, info), try_attr_form=False) cl.unload_compiler() # just for the heck of 
it mf = cl.mem_flags n = 2000 a_buf = cl.Buffer(ctx, 0, n * 4) do_test(a_buf, cl.mem_info) kernel = prg.sum do_test(kernel, cl.kernel_info) evt = kernel(queue, (n, ), None, a_buf) do_test(evt, cl.event_info) if profiling: evt.wait() do_test(evt, cl.profiling_info, lambda info: evt.get_profiling_info(info), try_attr_form=False) if device.image_support: smp = cl.Sampler(ctx, True, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp, cl.sampler_info) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) assert img.shape == (128, 256) img.depth img.image.depth do_test(img, cl.image_info, lambda info: img.get_image_info(info)) if failure_count[0]: raise RuntimeError( "get_info testing had %d errors " "(If you compiled against OpenCL 1.1 but are testing a 1.0 " "implementation, you can safely ignore this.)" % failure_count[0])
def test_get_info(self, platform, device):
    """Exercise get_info on every queryable OpenCL object type.

    Failures are counted rather than aborting immediately, so one broken
    info query doesn't hide the rest.
    """
    failure_count = [0]

    # Known driver bugs that crash the process outright when the given
    # info is queried on the given platform; these are skipped entirely.
    CRASH_QUIRKS = [
        (("NVIDIA Corporation", "NVIDIA CUDA",
            "OpenCL 1.0 CUDA 3.0.1"),
            [
                (cl.Event, cl.event_info.COMMAND_QUEUE),
                ]),
        ]
    # Known non-fatal failures that shouldn't count against the test.
    QUIRKS = []

    plat_quirk_key = (
            platform.vendor, platform.name, platform.version)

    def find_quirk(quirk_list, cl_obj, info):
        # True if (cl_obj, info) is a known quirk on this platform.
        for entry_plat_key, quirks in quirk_list:
            if entry_plat_key == plat_quirk_key:
                for quirk_cls, quirk_info in quirks:
                    if (isinstance(cl_obj, quirk_cls)
                            and quirk_info == info):
                        return True
        return False

    def do_test(cl_obj, info_cls, func=None, try_attr_form=True):
        # Query every public info constant of info_cls on cl_obj, both via
        # get_info and (optionally) via the lower-cased attribute form.
        if func is None:
            def func(info):
                cl_obj.get_info(info)

        for info_name in dir(info_cls):
            if not info_name.startswith("_") and info_name != "to_string":
                info = getattr(info_cls, info_name)

                if find_quirk(CRASH_QUIRKS, cl_obj, info):
                    print("not executing get_info", type(cl_obj), info_name)
                    print("(known crash quirk for %s)" % platform.name)
                    continue

                try:
                    func(info)
                except:
                    # BUG FIX: the original built a tuple in `msg`, then
                    # attempted `msg += <str>` (TypeError on the quirk
                    # path) and never printed it; report directly instead.
                    print("failed get_info", type(cl_obj), info_name)
                    if find_quirk(QUIRKS, cl_obj, info):
                        print("(known quirk for %s)" % platform.name)
                    else:
                        failure_count[0] += 1

                if try_attr_form:
                    try:
                        getattr(cl_obj, info_name.lower())
                    except:
                        print("failed attr-based get_info", type(cl_obj),
                                info_name)
                        if find_quirk(QUIRKS, cl_obj, info):
                            print("(known quirk for %s)" % platform.name)
                        else:
                            failure_count[0] += 1

    do_test(platform, cl.platform_info)
    do_test(device, cl.device_info)

    ctx = cl.Context([device])
    do_test(ctx, cl.context_info)

    # BUG FIX: `profiling` was only assigned inside the conditional below,
    # so `if profiling:` later raised NameError on devices without
    # profiling support; initialize it explicitly.
    profiling = False
    props = 0
    if (device.queue_properties
            & cl.command_queue_properties.PROFILING_ENABLE):
        profiling = True
        props = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(ctx, properties=props)
    do_test(queue, cl.command_queue_info)

    prg = cl.Program(ctx, """
        __kernel void sum(__global float *a)
        { a[get_global_id(0)] *= 2; }
        """).build()
    do_test(prg, cl.program_info)
    do_test(prg, cl.program_build_info,
            lambda info: prg.get_build_info(device, info),
            try_attr_form=False)

    cl.unload_compiler()  # just for the heck of it

    n = 2000
    a_buf = cl.Buffer(ctx, 0, n*4)
    do_test(a_buf, cl.mem_info)

    kernel = prg.sum
    do_test(kernel, cl.kernel_info)

    evt = kernel(queue, (n,), None, a_buf)
    do_test(evt, cl.event_info)

    if profiling:
        evt.wait()
        do_test(evt, cl.profiling_info,
                lambda info: evt.get_profiling_info(info),
                try_attr_form=False)

    if device.image_support:
        smp = cl.Sampler(ctx, True,
                cl.addressing_mode.CLAMP,
                cl.filter_mode.NEAREST)
        do_test(smp, cl.sampler_info)

        img_format = cl.get_supported_image_formats(
                ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0]

        img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256))
        assert img.shape == (128, 256)

        # Exercise both spellings of the image-info attribute access.
        img.depth
        img.image.depth
        do_test(img, cl.image_info,
                lambda info: img.get_image_info(info))
else: binaryData = binary.read() self.kernel = cl.Program( self.context, [device], [binaryData]).build(self.defines) except cl.LogicError: self.interface.fatal("Failed to compile OpenCL kernel!") return except PatchError: self.interface.fatal('Failed to apply BFI_INT patch to kernel! ' 'Is BFI_INT supported on this hardware?') return finally: if binary: binary.close() cl.unload_compiler() # If the user didn't specify their own worksize, use the maxium # supported by the device. maxSize = self.kernel.search.get_work_group_info( cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device) if self.WORKSIZE is None: self.WORKSIZE = maxSize else: if self.WORKSIZE > maxSize: self.interface.log('Warning: Worksize exceeds the maximum of ' + str(maxSize) + ', using default.') if self.WORKSIZE < 1: self.interface.log('Warning: Invalid worksize, using default.')
def loadKernel(self, device):
    """Load the kernel and initialize the device.

    phatk2 only supports cl_amd_media_ops-capable (ATI 5xxx and newer)
    GPUs; on anything else it reports a fatal error and returns without
    compiling. The WORKSIZE is validated against the kernel's reported
    work-group maximum after compilation.
    """
    self.context = cl.Context([device], None, None)

    # If the user didn't specify their own worksize, use 256.
    if self.WORKSIZE is None:
        self.WORKSIZE = 256
    else:
        # If the worksize is not a power of 2, round down to the nearest
        # power of two.
        # BUG FIX: the original computed math.log(X) where X is undefined,
        # raising NameError for any non-power-of-two worksize; round down
        # from the supplied worksize instead.
        if (self.WORKSIZE & (self.WORKSIZE - 1)) != 0:
            self.WORKSIZE = 1 << int(math.floor(
                math.log(self.WORKSIZE) / math.log(2)))

    # These definitions are required for the kernel to function.
    self.defines += (' -DOUTPUT_SIZE=' + str(self.OUTPUT_SIZE))
    self.defines += (' -DOUTPUT_MASK=' + str(self.OUTPUT_SIZE - 1))
    self.defines += (' -DWORKSIZE=' + str(self.WORKSIZE))

    # If the user wants to mine with vectors, enable the appropriate code
    # in the kernel source.
    if self.VECTORS:
        self.defines += ' -DVECTORS'
        self.rateDivisor = 2
    elif self.VECTORS4:
        self.defines += ' -DVECTORS4'
        self.rateDivisor = 4
    else:
        self.rateDivisor = 1

    # Some AMD devices support a special "bitalign" instruction that makes
    # bitwise rotation (required for SHA-256) much faster.
    if (device.extensions.find('cl_amd_media_ops') != -1):
        self.defines += ' -DBITALIGN'
        # Enable the experimental BFI_INT instruction optimization.
        if self.BFI_INT:
            self.defines += ' -DBFI_INT'
    else:
        # Since phatk and phatk2 will error out on Nvidia GPUs,
        # make sure the user knows that they need to use poclbm.
        self.interface.fatal("GPU not supported! phatk2 is designed for "
                             "ATI 5xxx and newer only. Try -k poclbm instead.")
        return

    # Locate and read the OpenCL source code in the kernel's directory.
    kernelFileDir, pyfile = os.path.split(__file__)
    kernelFilePath = os.path.join(kernelFileDir, 'kernel.cl')
    # Context manager guarantees the file is closed even if read() fails.
    with open(kernelFilePath, 'r') as kernelFile:
        kernel = kernelFile.read()

    # For fast startup, we cache the compiled OpenCL code. The name of the
    # cache is determined as the hash of a few important,
    # compilation-specific pieces of information.
    m = md5()
    m.update(device.platform.name)
    m.update(device.platform.version)
    m.update(device.name)
    m.update(self.defines)
    m.update(kernel)
    cacheName = '%s.elf' % m.hexdigest()
    fileName = os.path.join(kernelFileDir, cacheName)

    # Finally, the actual work of loading the kernel...
    try:
        binary = open(fileName, 'rb')
    except IOError:
        binary = None
    try:
        if binary is None:
            # Cache miss: compile from source.
            self.kernel = cl.Program(
                self.context, kernel).build(self.defines)
            # Apply BFI_INT if enabled.
            if self.BFI_INT:
                # Patch the binary output from the compiler.
                patcher = BFIPatcher(self.interface)
                binaryData = patcher.patch(self.kernel.binaries[0])
                self.interface.debug("Applied BFI_INT patch")
                # Reload the kernel with the patched binary.
                self.kernel = cl.Program(
                    self.context, [device],
                    [binaryData]).build(self.defines)
            # Write the kernel binaries to the cache file.
            with open(fileName, 'wb') as binaryW:
                binaryW.write(self.kernel.binaries[0])
        else:
            # Cache hit: load the previously-compiled binary.
            binaryData = binary.read()
            self.kernel = cl.Program(
                self.context, [device], [binaryData]).build(self.defines)
    except cl.LogicError:
        self.interface.fatal("Failed to compile OpenCL kernel!")
        return
    except PatchError:
        self.interface.fatal('Failed to apply BFI_INT patch to kernel! '
                             'Is BFI_INT supported on this hardware?')
        return
    finally:
        if binary:
            binary.close()

    # Unload the compiler to reduce memory usage.
    cl.unload_compiler()

    # Since this can't be run before compiling the kernel, all we can do is
    # check to make sure the selected size is not too large.
    maxSize = self.kernel.search.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

    if self.WORKSIZE > maxSize:
        self.interface.fatal('Maximum WORKSIZE on the selected device is '
                             + str(maxSize))