def main():
    ctx = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(ctx)
    print("command latency: %g s" % latency)
    print("profiling overhead: %g s -> %.1f %%"
          % (prof_overhead, 100 * prof_overhead / latency))

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue) / 1e9))

    for tx_type in [
            perf.HostToDeviceTransfer,
            perf.DeviceToHostTransfer,
            perf.DeviceToDeviceTransfer]:
        print("----------------------------------------")
        print(tx_type.__name__)
        print("----------------------------------------")

        print("latency: %g s" % perf.transfer_latency(queue, tx_type))
        for i in range(6, 28, 2):
            bs = 1 << i
            print("bandwidth @ %d bytes: %g GB/s"
                  % (bs, perf.transfer_bandwidth(queue, tx_type, bs) / 1e9))
def main():
    ctx = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(ctx)
    print("command latency: %g s" % latency)
    print("profiling overhead: %g s -> %.1f %%" % (
        prof_overhead, 100*prof_overhead/latency))

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9))

    for tx_type in [
            perf.HostToDeviceTransfer,
            perf.DeviceToHostTransfer,
            perf.DeviceToDeviceTransfer]:
        print("----------------------------------------")
        print(tx_type.__name__)
        print("----------------------------------------")

        print("latency: %g s" % perf.transfer_latency(queue, tx_type))
        for i in range(6, 28, 2):
            bs = 1 << i
            print("bandwidth @ %d bytes: %g GB/s" % (
                bs, perf.transfer_bandwidth(queue, tx_type, bs)/1e9))
def main():
    ctx = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(ctx)
    print("command latency: %g s" % latency)
    print("profiling overhead: {:g} s -> {:.1f} %".format(
        prof_overhead, 100 * prof_overhead / latency))

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue) / 1e9))

    for tx_type in [
            perf.HostToDeviceTransfer,
            perf.DeviceToHostTransfer,
            perf.DeviceToDeviceTransfer]:
        print("----------------------------------------")
        print(tx_type.__name__)
        print("----------------------------------------")

        print("latency: %g s" % perf.transfer_latency(queue, tx_type))
        for i in range(6, 31, 2):
            bs = 1 << i
            try:
                result = "%g GB/s" % (
                    perf.transfer_bandwidth(queue, tx_type, bs) / 1e9)
            except Exception as e:
                result = "exception: %s" % e.__class__.__name__
            print("bandwidth @ %d bytes: %s" % (bs, result))
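# Added sketch (not part of the snippets above): the main() variants assume
# module-level imports and an entry-point guard; `cl` and `perf` are the
# aliases those snippets already use.
import pyopencl as cl
import pyopencl.characterize.performance as perf


if __name__ == "__main__":
    main()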
def profile_opencl_device(platform_id=0, device_id=0, verbose=False):
    # Interactor and clperf (pyopencl.characterize.performance) are expected
    # to be imported/defined elsewhere in the enclosing module.
    interact = Interactor(__file__.split(os.sep)[-1])
    interact.set_verbosity(verbose)

    platform = cl.get_platforms()[platform_id]
    device = platform.get_devices()[device_id]
    context = cl.Context([device])
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    interact('Collecting profiling info for the following device:')
    interact('Platform:\t' + platform.name)
    interact('Device:\t' + device.name)
    interact('Version:\t' + device.version.strip())
    interact('Please wait, this may take a while...')

    # Latencies and overhead below are converted from seconds to milliseconds.
    prof_overhead, latency = clperf.get_profiling_overhead(context)
    h2d_latency = clperf.transfer_latency(queue, clperf.HostToDeviceTransfer) * 1000
    d2h_latency = clperf.transfer_latency(queue, clperf.DeviceToHostTransfer) * 1000
    d2d_latency = clperf.transfer_latency(queue, clperf.DeviceToDeviceTransfer) * 1000

    device_profile = {
        'profiling overhead (time)': prof_overhead * 1000,
        'profiling overhead (percentage)': f'{(100 * prof_overhead / latency):.2f}%',
        'command latency': latency * 1000,
        'host-to-device transfer latency': h2d_latency,
        'device-to-host transfer latency': d2h_latency,
        'device-to-device transfer latency': d2d_latency
    }

    for tx_type, tx_type_name in zip(
            [clperf.HostToDeviceTransfer,
             clperf.DeviceToHostTransfer,
             clperf.DeviceToDeviceTransfer],
            ['host-device', 'device-host', 'device-device']):
        tx_type_bw = tx_type_name + ' bandwidth'
        device_profile[tx_type_bw] = {}
        # Block sizes from 64 bytes up to 1 GiB; very large allocations may
        # fail on some devices, hence the exception handling.
        for i in range(6, 31, 2):
            bs = 1 << i
            try:
                bw = str(clperf.transfer_bandwidth(queue, tx_type, bs) / 1e9) + ' GB/s'
            except Exception as e:
                bw = 'exception: ' + e.__class__.__name__
            device_profile[tx_type_bw][f'{bs} bytes'] = bw

    return device_profile
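# Hypothetical usage sketch (not from the original module) for
# profile_opencl_device() above: call it and pretty-print the returned dict.
# It assumes Interactor and clperf are available as referenced in the function.
import json

if __name__ == "__main__":
    profile = profile_opencl_device(platform_id=0, device_id=0, verbose=True)
    print(json.dumps(profile, indent=2, default=str))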
def run_perf_tests(self) -> None:
    """Run PyOpenCL performance characterization tests and log the results.

    Raises:
        RuntimeError: if no command queue has been created.
    """
    if self.queue is not None:
        prof_overhead, latency = perf.get_profiling_overhead(self.ctx)
        logging.debug("command latency: %g s" % latency)
        logging.debug("profiling overhead: %g s -> %.1f %%"
                      % (prof_overhead, 100 * prof_overhead / latency))
        logging.debug("empty kernel: %g s"
                      % perf.get_empty_kernel_time(self.queue))
        logging.debug("float32 add: %g GOps/s"
                      % (perf.get_add_rate(self.queue) / 1e9))

        for tx_type in [
                perf.HostToDeviceTransfer,
                perf.DeviceToHostTransfer,
                perf.DeviceToDeviceTransfer]:
            logging.debug("----------------------------------------")
            logging.debug(tx_type.__name__)
            logging.debug("----------------------------------------")

            logging.debug("latency: %g s"
                          % perf.transfer_latency(self.queue, tx_type))
            for i in range(6, 30, 2):
                bs = 1 << i
                try:
                    result = "%g GB/s" % (perf.transfer_bandwidth(
                        self.queue, tx_type, bs) / 1e9)
                except Exception as e:
                    result = "exception: %s" % e.__class__.__name__
                logging.debug("bandwidth @ %d bytes: %s" % (bs, result))
    else:
        raise RuntimeError("perf tests cannot be executed without a queue")
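# Minimal, assumed wrapper (not from the original source) showing the only
# attributes run_perf_tests() relies on: a context in self.ctx and a
# profiling-enabled command queue in self.queue. The class name is hypothetical.
import logging
import pyopencl as cl
import pyopencl.characterize.performance as perf

logging.basicConfig(level=logging.DEBUG)


class DeviceBench:  # hypothetical host object for the method above
    def __init__(self) -> None:
        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(
            self.ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)


bench = DeviceBench()
run_perf_tests(bench)  # the function above accepts any object with ctx/queue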
import pyopencl as cl
import pyopencl.characterize.performance as performance

context = cl.create_some_context()
queue = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)

overhead, latency = performance.get_profiling_overhead(context)
print("\n\nCommand Latency: {} s".format(latency))
print("Profiling Overhead: {} s -> {}".format(overhead, 100 * overhead / latency))

# XXX Both these lines break the program on a Mac XXX
print("Empty Kernel: {} s".format(performance.get_empty_kernel_time(queue)))
print("Float32 Add: {} GOps/s\n".format(performance.get_add_rate(queue) / 1e9))

for transfer_type in [
        performance.HostToDeviceTransfer,
        performance.DeviceToHostTransfer,
        performance.DeviceToDeviceTransfer]:
    print("\n" + transfer_type.__name__)
    print("  Latency: {0} s".format(
        performance.transfer_latency(queue, transfer_type)))

    for exponent in range(6, 28, 2):
        bytes = 1 << exponent  # This bit shift << operation does 'two to the exponent' (2^exponent)
        print("  Bandwidth at {0} Bytes: {1} GB/s".format(
            bytes,
            performance.transfer_bandwidth(queue, transfer_type, bytes) / 1e9))
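# Assumed usage note (not in the original script): cl.create_some_context()
# consults the PYOPENCL_CTX environment variable, so the device can be chosen
# non-interactively, e.g.
#   PYOPENCL_CTX=0 python script.py      # platform 0 (illustrative filename)
#   PYOPENCL_CTX=0:1 python script.py    # platform 0, device 1
# or from within Python, before the context is created:
import os
os.environ.setdefault("PYOPENCL_CTX", "0")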
# XXX Find out more about pyopencl.characterize - why does this exist? XXX
# imports (as in the preceding snippet)
import pyopencl as cl
import pyopencl.characterize.performance as performance

context = cl.create_some_context()
queue = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)

overhead, latency = performance.get_profiling_overhead(context)
print("\n\nCommand Latency: {} s".format(latency))
print("Profiling Overhead: {} s -> {}".format(overhead, 100 * overhead / latency))

# XXX Both these lines break the program on a Mac XXX
print("Empty Kernel: {} s".format(performance.get_empty_kernel_time(queue)))
print("Float32 Add: {} GOps/s\n".format(performance.get_add_rate(queue) / 1e9))

for transfer_type in [
    performance.HostToDeviceTransfer,
    performance.DeviceToHostTransfer,
    performance.DeviceToDeviceTransfer,
]:
    print("\n" + transfer_type.__name__)
    print("  Latency: {0} s".format(performance.transfer_latency(queue, transfer_type)))

    for exponent in range(6, 28, 2):
        bytes = 1 << exponent  # This bit shift << operation does 'two to the exponent' (2^exponent)
        print(
            "  Bandwidth at {0} Bytes: {1} GB/s".format(
                bytes,
                performance.transfer_bandwidth(queue, transfer_type, bytes) / 1e9
            )
        )