コード例 #1
0
def main():
    """Print an OpenCL performance characterization for the chosen device.

    Reports profiling overhead, command latency, empty-kernel time, float32
    add throughput, and transfer latency/bandwidth for each transfer kind.
    """
    context = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(context)
    print("command latency: %g s" % latency)
    print("profiling overhead: %g s -> %.1f %%" %
          (prof_overhead, 100 * prof_overhead / latency))

    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)
    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue) / 1e9))

    transfer_kinds = (
        perf.HostToDeviceTransfer,
        perf.DeviceToHostTransfer,
        perf.DeviceToDeviceTransfer,
    )
    for transfer_cls in transfer_kinds:
        print("----------------------------------------")
        print(transfer_cls.__name__)
        print("----------------------------------------")

        print("latency: %g s" % perf.transfer_latency(queue, transfer_cls))
        # Block sizes 2**6 .. 2**26, stepping by factors of four.
        for exponent in range(6, 28, 2):
            block_size = 2 ** exponent
            bandwidth = perf.transfer_bandwidth(queue, transfer_cls, block_size)
            print("bandwidth @ %d bytes: %g GB/s" %
                  (block_size, bandwidth / 1e9))
コード例 #2
0
ファイル: dump-performance.py プロジェクト: AI42/pyopencl
def main():
    """Measure and print latency, overhead, kernel, and bandwidth figures
    for an OpenCL context chosen interactively."""
    ctx = cl.create_some_context()

    overhead, cmd_latency = perf.get_profiling_overhead(ctx)
    print("command latency: %g s" % cmd_latency)
    overhead_pct = 100 * overhead / cmd_latency
    print("profiling overhead: %g s -> %.1f %%" % (overhead, overhead_pct))

    # Profiling must be enabled on the queue for the timing helpers below.
    props = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(ctx, properties=props)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue) / 1e9))

    separator = "----------------------------------------"
    for transfer in (perf.HostToDeviceTransfer,
                     perf.DeviceToHostTransfer,
                     perf.DeviceToDeviceTransfer):
        print(separator)
        print(transfer.__name__)
        print(separator)

        print("latency: %g s" % perf.transfer_latency(queue, transfer))
        for power in range(6, 28, 2):
            size = 1 << power  # 2**power bytes
            gbps = perf.transfer_bandwidth(queue, transfer, size) / 1e9
            print("bandwidth @ %d bytes: %g GB/s" % (size, gbps))
コード例 #3
0
ファイル: dump-performance.py プロジェクト: zeta1999/pyopencl
def main():
    """Characterize OpenCL device performance, probing transfer bandwidth
    up to 2**30 bytes; a failing size reports the exception class name
    instead of aborting the run."""
    ctx = cl.create_some_context()

    prof_overhead, latency = perf.get_profiling_overhead(ctx)
    print("command latency: %g s" % latency)
    print("profiling overhead: {:g} s -> {:.1f} %".format(
        prof_overhead, 100 * prof_overhead / latency))

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
    print("float32 add: %g GOps/s" % (perf.get_add_rate(queue) / 1e9))

    transfers = (perf.HostToDeviceTransfer,
                 perf.DeviceToHostTransfer,
                 perf.DeviceToDeviceTransfer)
    for tx in transfers:
        print("----------------------------------------")
        print(tx.__name__)
        print("----------------------------------------")

        print("latency: %g s" % perf.transfer_latency(queue, tx))
        for shift in range(6, 31, 2):
            nbytes = 1 << shift
            # Large allocations may exceed device limits; report rather
            # than crash.
            try:
                gb_per_s = perf.transfer_bandwidth(queue, tx, nbytes) / 1e9
                result = "%g GB/s" % gb_per_s
            except Exception as err:
                result = "exception: %s" % err.__class__.__name__
            print("bandwidth @ %d bytes: %s" % (nbytes, result))
コード例 #4
0
ファイル: hostcode.py プロジェクト: zehanort/oclude
def profile_opencl_device(platform_id=0, device_id=0, verbose=False):
    """Collect latency and bandwidth profiling info for one OpenCL device.

    Args:
        platform_id: Index into ``cl.get_platforms()``.
        device_id: Index into the platform's device list.
        verbose: Passed to the Interactor's verbosity setting.

    Returns:
        A dict of latency figures (in milliseconds) plus one nested dict of
        bandwidth strings per transfer direction.
    """
    interact = Interactor(__file__.split(os.sep)[-1])
    interact.set_verbosity(verbose)

    platform = cl.get_platforms()[platform_id]
    device = platform.get_devices()[device_id]
    context = cl.Context([device])
    queue = cl.CommandQueue(
        context, properties=cl.command_queue_properties.PROFILING_ENABLE)

    interact('Collecting profiling info for the following device:')
    interact('Platform:\t' + platform.name)
    interact('Device:\t' + device.name)
    interact('Version:\t' + device.version.strip())
    interact('Please wait, this may take a while...')

    prof_overhead, latency = clperf.get_profiling_overhead(context)

    # All latency values are converted from seconds to milliseconds.
    ms = 1000
    device_profile = {
        'profiling overhead (time)': prof_overhead * ms,
        'profiling overhead (percentage)': f'{(100 * prof_overhead / latency):.2f}%',
        'command latency': latency * ms,
        'host-to-device transfer latency':
            clperf.transfer_latency(queue, clperf.HostToDeviceTransfer) * ms,
        'device-to-host transfer latency':
            clperf.transfer_latency(queue, clperf.DeviceToHostTransfer) * ms,
        'device-to-device transfer latency':
            clperf.transfer_latency(queue, clperf.DeviceToDeviceTransfer) * ms,
    }

    transfer_pairs = (
        (clperf.HostToDeviceTransfer, 'host-device'),
        (clperf.DeviceToHostTransfer, 'device-host'),
        (clperf.DeviceToDeviceTransfer, 'device-device'),
    )
    for transfer_cls, label in transfer_pairs:
        bandwidths = {}
        # Probe block sizes 2**6 .. 2**30, stepping by factors of four.
        for shift in range(6, 31, 2):
            size = 1 << shift
            try:
                bw = str(clperf.transfer_bandwidth(queue, transfer_cls, size) / 1e9) + ' GB/s'
            except Exception as err:
                # Large sizes may exceed device limits; record the failure.
                bw = 'exception: ' + err.__class__.__name__
            bandwidths[f'{size} bytes'] = bw
        device_profile[label + ' bandwidth'] = bandwidths

    return device_profile
コード例 #5
0
    def run_perf_tests(self) -> None:
        """Run pyopencl performance characterization on this object's queue.

        Logs (at DEBUG level) command latency, profiling overhead,
        empty-kernel time, float32 add throughput, and transfer
        latency/bandwidth for host-to-device, device-to-host, and
        device-to-device transfers.

        Raises:
            RuntimeError: If ``self.queue`` is ``None``.
        """
        # Guard clause: a profiling-enabled queue is required for every test.
        if self.queue is None:
            raise RuntimeError("perf tests cannot be executed without a queue")

        prof_overhead, latency = perf.get_profiling_overhead(self.ctx)
        # Pass format args lazily so logging skips the %-formatting work
        # entirely when DEBUG logging is disabled.
        logging.debug("command latency: %g s", latency)
        logging.debug("profiling overhead: %g s -> %.1f %%",
                      prof_overhead, 100 * prof_overhead / latency)

        logging.debug("empty kernel: %g s",
                      perf.get_empty_kernel_time(self.queue))
        logging.debug("float32 add: %g GOps/s",
                      perf.get_add_rate(self.queue) / 1e9)

        for tx_type in (perf.HostToDeviceTransfer,
                        perf.DeviceToHostTransfer,
                        perf.DeviceToDeviceTransfer):
            logging.debug("----------------------------------------")
            logging.debug(tx_type.__name__)
            logging.debug("----------------------------------------")

            logging.debug("latency: %g s",
                          perf.transfer_latency(self.queue, tx_type))
            # Block sizes 2**6 .. 2**28, stepping by factors of four.
            for i in range(6, 30, 2):
                bs = 1 << i
                try:
                    result = "%g GB/s" % (perf.transfer_bandwidth(
                        self.queue, tx_type, bs) / 1e9)
                except Exception as e:
                    # Large buffers may exceed device limits; log and continue.
                    result = "exception: %s" % e.__class__.__name__
                logging.debug("bandwidth @ %d bytes: %s", bs, result)
コード例 #6
0
import pyopencl as cl
import pyopencl.characterize.performance as performance

# Characterize the selected OpenCL device: profiling overhead, command
# latency, kernel timing, and transfer bandwidth per transfer direction.
context = cl.create_some_context()
queue = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)

overhead, latency = performance.get_profiling_overhead(context)

print("\n\nCommand Latency: {} s".format(latency))
print("Profiling Overhead: {} s -> {}".format(overhead,
                                              100 * overhead / latency))

# XXX Both these lines break the program on a Mac XXX
print("Empty Kernel: {} s".format(performance.get_empty_kernel_time(queue)))
print("Float32 Add: {} GOps/s\n".format(performance.get_add_rate(queue) / 1e9))

for transfer_type in [
        performance.HostToDeviceTransfer, performance.DeviceToHostTransfer,
        performance.DeviceToDeviceTransfer
]:

    print("\n" + transfer_type.__name__)
    print("    Latency: {0} s".format(
        performance.transfer_latency(queue, transfer_type)))
    for exponent in range(6, 28, 2):
        # 1 << exponent computes 2**exponent. Renamed from `bytes`, which
        # shadowed the builtin bytes type.
        num_bytes = 1 << exponent
        print("    Bandwidth at {0} Bytes: {1} GB/s".format(
            num_bytes,
            performance.transfer_bandwidth(queue, transfer_type, num_bytes) / 1e9))
コード例 #7
0
# XXX Find out more about pyopencl.characterize - why does this exist? XXX

# Characterize the selected OpenCL device: profiling overhead, command
# latency, kernel timing, and transfer bandwidth per transfer direction.
context = cl.create_some_context()
queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE)

overhead, latency = performance.get_profiling_overhead(context)

print("\n\nCommand Latency: {} s".format(latency))
print("Profiling Overhead: {} s -> {}".format(overhead, 100 * overhead / latency))

# XXX Both these lines break the program on a Mac XXX
print("Empty Kernel: {} s".format(performance.get_empty_kernel_time(queue)))
print("Float32 Add: {} GOps/s\n".format(performance.get_add_rate(queue) / 1e9))

for transfer_type in [
    performance.HostToDeviceTransfer,
    performance.DeviceToHostTransfer,
    performance.DeviceToDeviceTransfer,
]:

    print("\n" + transfer_type.__name__)
    print("    Latency: {0} s".format(performance.transfer_latency(queue, transfer_type)))
    for exponent in range(6, 28, 2):
        # 1 << exponent computes 2**exponent. Renamed from `bytes`, which
        # shadowed the builtin bytes type.
        num_bytes = 1 << exponent
        print(
            "    Bandwidth at {0} Bytes: {1} GB/s".format(
                num_bytes,
                performance.transfer_bandwidth(queue, transfer_type, num_bytes) / 1e9
            )
        )