コード例 #1
0
    def pretty_print(self,
            abscissa_label="h",
            error_label="Error",
            gliding_mean=2,
            abscissa_format="%s",
            error_format="%s",
            eoc_format="%s"):
        """Return the recorded history as a text table with a running EOC.

        Each ``(abscissa, error)`` entry of ``self.history`` becomes one
        row. The third column holds the value taken from
        ``self.estimate_order_of_convergence(gliding_mean)``; it is left
        empty for the first ``gliding_mean - 1`` rows.

        :arg abscissa_label: header of the first column.
        :arg error_label: header of the second column.
        :arg gliding_mean: window size passed to
            ``estimate_order_of_convergence``.
        :arg abscissa_format: ``%``-format applied to each abscissa.
        :arg error_format: ``%``-format applied to each error.
        :arg eoc_format: ``%``-format applied to each running EOC value.
        :returns: the table as a string; when the history has more than one
            entry, an ``Overall EOC`` line is appended.
        """
        from pytools import Table

        table = Table()
        table.add_row((abscissa_label, error_label, "Running EOC"))

        running_eoc = self.estimate_order_of_convergence(gliding_mean)
        # Index of the first row that gets a running-EOC entry.
        first_eoc_row = gliding_mean - 1

        for row_nr, (abscissa, error) in enumerate(self.history):
            abscissa_cell = abscissa_format % abscissa
            error_cell = error_format % error
            if row_nr >= first_eoc_row:
                eoc_cell = eoc_format % (running_eoc[row_nr - first_eoc_row, 1])
            else:
                eoc_cell = ""

            table.add_row((abscissa_cell, error_cell, eoc_cell))

        if len(self.history) <= 1:
            return str(table)

        return "%s\n\nOverall EOC: %s" % (
                str(table),
                self.estimate_order_of_convergence()[0, 1])
コード例 #2
0
ファイル: convergence.py プロジェクト: inducer/pytools
    def _to_table(self, *,
            abscissa_label="h",
            error_label="Error",
            gliding_mean=2,
            abscissa_format="%s",
            error_format="%s",
            eoc_format="%s"):
        """Build a :class:`pytools.Table` of the history with a running EOC.

        The table starts with a header row, followed by one row per
        ``(abscissa, error)`` entry of ``self.history``. The running-EOC
        cell is empty for the first ``gliding_mean - 1`` rows. When the
        history has more than one entry, a final ``"Overall"`` row carries
        the value of ``self.estimate_order_of_convergence()[0, 1]``.

        All arguments are keyword-only; the ``*_format`` arguments are
        ``%``-format strings applied to the respective column values.

        :returns: the populated table object (not a string).
        """
        from pytools import Table

        table = Table()
        table.add_row((abscissa_label, error_label, "Running EOC"))

        running_eoc = self.estimate_order_of_convergence(gliding_mean)
        # Index of the first row that gets a running-EOC entry.
        first_eoc_row = gliding_mean - 1

        for row_nr, (abscissa, error) in enumerate(self.history):
            abscissa_cell = abscissa_format % abscissa
            error_cell = error_format % error
            eoc_cell = (
                    eoc_format % (running_eoc[row_nr - first_eoc_row, 1])
                    if row_nr >= first_eoc_row
                    else "")

            table.add_row((abscissa_cell, error_cell, eoc_cell))

        if len(self.history) > 1:
            overall_order = self.estimate_order_of_convergence()[0, 1]
            table.add_row(("Overall", "", eoc_format % overall_order))

        return table
コード例 #3
0
    def pretty_print(self,
                     abscissa_label="N",
                     error_label="Error",
                     gliding_mean=2):
        """Return the recorded history as a text table with a running EOC.

        One row is emitted per ``(abscissa, error)`` entry of
        ``self.history``; values are converted with ``str``. The third
        column is empty for the first ``gliding_mean - 1`` rows and
        otherwise holds the matching entry of
        ``self.estimate_order_of_convergence(gliding_mean)``.

        :returns: the table as a string, followed by an ``Overall EOC``
            line when the history has more than one entry.
        """
        from pytools import Table

        table = Table()
        table.add_row((abscissa_label, error_label, "Running EOC"))

        running_eoc = self.estimate_order_of_convergence(gliding_mean)
        for row_nr, (abscissa, error) in enumerate(self.history):
            abscissa_cell = str(abscissa)
            error_cell = str(error)
            if row_nr >= gliding_mean - 1:
                eoc_cell = str(running_eoc[row_nr - gliding_mean + 1, 1])
            else:
                eoc_cell = ""
            table.add_row((abscissa_cell, error_cell, eoc_cell))

        if len(self.history) <= 1:
            return str(table)

        rendered = str(table)
        overall_line = (
            "\n\nOverall EOC: %s" % self.estimate_order_of_convergence()[0, 1])
        return rendered + overall_line
コード例 #4
0
def tabulate_ascii(rows, col_fmt):
    """Render *rows* as a plain-text table using :class:`pytools.Table`.

    :arg rows: iterable of rows, each passed unchanged to ``Table.add_row``.
    :arg col_fmt: accepted but not used by this function.
    :returns: ``str()`` of the populated table.
    """
    # The column format is deliberately discarded.
    del col_fmt

    from pytools import Table

    table = Table()
    for cells in rows:
        table.add_row(cells)

    return str(table)
コード例 #5
0
def table_from_cursor(cursor):
    """Build a :class:`pytools.Table` from a cursor-like object.

    The header row is the first element of every entry in
    ``cursor.description``; every row obtained by iterating *cursor* is then
    appended unchanged.

    :returns: the populated table object (not a string).
    """
    from pytools import Table

    table = Table()

    header = [col_desc[0] for col_desc in cursor.description]
    table.add_row(header)

    for record in cursor:
        table.add_row(record)

    return table
コード例 #6
0
ファイル: convergence.py プロジェクト: inducer/pytools
    def __str__(self):
        """Return a two-column text table pairing ``self.orders`` with
        ``self.errors`` under the headers ``p`` and ``error``.
        """
        from pytools import Table

        table = Table()
        table.add_row(("p", "error"))

        # zip() stops at the shorter of the two sequences.
        for order, error in zip(self.orders, self.errors):
            row = (str(order), str(error))
            table.add_row(row)

        return str(table)
コード例 #7
0
ファイル: reduction-perf.py プロジェクト: thecobb/PyCUDA
def main():
    """Benchmark PyCUDA's dot-product reduction kernel and print a table.

    For each output dtype (``float32``, ``float64``) and each vector length
    ``2**15 .. 2**26``, two random GPU vectors are created, the dot kernel
    is warmed up, and then run ``timed_runs`` times with a timing wrapper.
    One table row is added per configuration: dtype, input size, mean time
    per call in milliseconds, and achieved memory bandwidth.

    Relies on the module-level names ``numpy`` and ``cuda``.
    """
    from pytools import Table
    from pycuda.curandom import rand as curand
    from pycuda.reduction import get_dot_kernel

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    warmup_runs = 3
    timed_runs = 10

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            a_gpu = curand((sz, ))
            b_gpu = curand((sz, ))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            # One-element list so the nested closure can accumulate into it.
            elapsed = [0]

            def wrap_with_timer(f):
                """Wrap *f* so each call adds its event-measured duration
                (milliseconds) to ``elapsed[0]``."""
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up (untimed)
            for _ in range(warmup_runs):
                krnl(a_gpu, b_gpu)

            for _ in range(timed_runs):
                krnl(a_gpu, b_gpu, kernel_wrapper=wrap_with_timer)

            # Both input vectors are counted once per timed call.
            bytes_moved = a_gpu.nbytes * 2 * timed_runs
            secs = elapsed[0] * 1e-3

            tbl.add_row((str(dtype_out), a_gpu.nbytes / (1 << 20),
                         elapsed[0] / timed_runs, bytes_moved / secs / 1e9))

    print(tbl)
コード例 #8
0
def test_table():
    """Smoke-test :class:`pytools.Table`: fill it with small powers and
    square roots, then print the plain-text and LaTeX renderings.
    """
    from math import sqrt
    from pytools import Table

    table = Table()
    table.add_row(("i", "i^2", "i^3", "sqrt(i)"))

    for n in range(8):
        row = (n, n**2, n**3, sqrt(n))
        table.add_row(row)

    print(table)
    print()
    print(table.latex())
コード例 #9
0
def ascii_table(table_format, header, rows):
    r"""Render *header* and *rows* as a plain-text table.

    Cells that start with ``\num{`` have that prefix and their final
    character removed; all other cells are passed through unchanged.

    :arg table_format: accepted but not used by this function.
    :arg header: row passed unchanged to ``Table.add_row``.
    :arg rows: iterable of rows whose items support ``startswith`` and
        slicing (i.e. strings).
    :returns: ``str()`` of the populated :class:`pytools.Table`.
    """
    from pytools import Table

    def plain_cell(cell):
        # Strip \num{...} formatting
        if cell.startswith(r"\num{"):
            return cell[5:-1]
        return cell

    table = Table()
    table.add_row(header)

    for cells in rows:
        table.add_row([plain_cell(cell) for cell in cells])

    return str(table)
コード例 #10
0
ファイル: cost.py プロジェクト: choward1491/pytential
def test_cost_model(ctx, calibration_params):
    """Compare modeled per-stage cost against measured timings.

    For every geometry from ``test_geometries``, the bound operator is
    evaluated once as a warm-up and then ``RUNS`` times with timing enabled.
    The mean ``"process_elapsed"`` value per stage is printed next to the
    cost-model value in a :class:`pytools.Table`.

    Relies on module-level names (``cl``, ``PyOpenCLArrayContext``,
    ``QBXCostModel``, ``test_geometries``, ``get_bound_op``,
    ``get_test_density``, ``one``, ``RUNS``).
    """
    queue = cl.CommandQueue(ctx)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
    cost_model = QBXCostModel()

    for lpot_source in test_geometries(actx):
        lpot_source = lpot_source.copy(cost_model=cost_model)

        from pytential import GeometryCollection
        places = GeometryCollection(lpot_source)
        density_discr = places.get_discretization(places.auto_source.geometry)

        bound_op = get_bound_op(places)
        sigma = get_test_density(actx, density_discr)

        cost_S, _ = bound_op.cost_per_stage(calibration_params, sigma=sigma)
        model_result = one(cost_S.values())

        # Warm-up run.
        bound_op.eval({"sigma": sigma}, array_context=actx)

        def timed_run():
            """Evaluate once and return the single collected timing entry."""
            timing_data = {}
            bound_op.eval({"sigma": sigma},
                          array_context=actx,
                          timing_data=timing_data)
            return one(timing_data.values())

        per_run_timings = [timed_run() for _ in range(RUNS)]

        # Mean elapsed process time per stage over all timed runs.
        timing_result = {
            param: sum(
                run_timing[param]["process_elapsed"]
                for run_timing in per_run_timings) / RUNS
            for param in model_result
        }

        from pytools import Table
        table = Table()
        table.add_row(["stage", "actual (s)", "predicted (s)"])
        for stage in model_result:
            table.add_row([
                stage,
                f"{timing_result[stage]:.2f}",
                f"{model_result[stage]:.2f}",
            ])

        print(table)
コード例 #11
0
def test_table():
    """Smoke-test :class:`pytools.Table` rendering and ``merge_tables``.

    Prints the plain-text and LaTeX renderings of a small table, then merges
    three copies of it (skipping column 0) and prints the result as GitHub
    markdown.
    """
    from math import sqrt
    from pytools import Table

    table = Table()
    table.add_row(("i", "i^2", "i^3", "sqrt(i)"))

    for n in range(8):
        row = (n, n**2, n**3, sqrt(n))
        table.add_row(row)

    print(table)
    print()
    print(table.latex())

    # {{{ test merging

    from pytools import merge_tables
    merged = merge_tables(table, table, table, skip_columns=(0, ))
    print(merged.github_markdown())

    # }}}
コード例 #12
0
def test_cost_model(ctx, cost_model):
    """Compare predicted per-stage times against measured timings.

    For every geometry from ``test_geometries``, the bound operator is
    evaluated once as a warm-up and then ``RUNS`` times with timing enabled.
    The mean ``"process_elapsed"`` value per stage is printed next to the
    predicted time in a :class:`pytools.Table`.

    Relies on module-level names (``cl``, ``test_geometries``,
    ``get_bound_op``, ``get_test_density``, ``one``, ``RUNS``).
    """
    queue = cl.CommandQueue(ctx)

    for lpot_source in test_geometries(queue):
        lpot_source = lpot_source.copy(cost_model=cost_model)
        bound_op = get_bound_op(lpot_source)
        sigma = get_test_density(queue, lpot_source)

        cost_S = bound_op.get_modeled_cost(queue, sigma=sigma)
        modeled_cost = one(cost_S.values())
        model_result = modeled_cost.get_predicted_times(merge_close_lists=True)

        # Warm-up run.
        bound_op.eval(queue, {"sigma": sigma})

        def timed_run():
            """Evaluate once and return the single collected timing entry."""
            timing_data = {}
            bound_op.eval(queue, {"sigma": sigma}, timing_data=timing_data)
            return one(timing_data.values())

        per_run_timings = [timed_run() for _ in range(RUNS)]

        # Mean elapsed process time per stage over all timed runs.
        timing_result = {}
        for param in model_result:
            total_elapsed = sum(
                run_timing[param]["process_elapsed"]
                for run_timing in per_run_timings)
            timing_result[param] = total_elapsed / RUNS

        from pytools import Table
        table = Table()
        table.add_row(["stage", "actual (s)", "predicted (s)"])
        for stage in model_result:
            actual_cell = "%.2f" % timing_result[stage]
            predicted_cell = "%.2f" % model_result[stage]
            table.add_row([stage, actual_cell, predicted_cell])

        print(table)
コード例 #13
0
def main():
    """Benchmark ``GPUArray.mul_add`` over a range of sizes and print a table.

    For vector lengths ``2**10 .. 2**27`` of ``float32``, ``mul_add`` is
    called ``count`` times with a timing callback. One row is added per
    size: array size, mean time per call, and achieved memory bandwidth.

    Relies on the module-level name ``numpy``.
    """
    from pytools import Table

    tbl = Table()
    tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]"))

    import pycuda.gpuarray as gpuarray

    # they're floats, i.e. 4 bytes each
    for power in range(10, 28):
        size = 1 << power
        print(size)

        a = gpuarray.empty((size, ), dtype=numpy.float32)
        b = gpuarray.empty_like(a)
        a.fill(1)
        b.fill(2)

        # Fewer repetitions for the large arrays.
        count = 10 if power > 20 else 100

        # One-element list so the nested callback can accumulate into it.
        elapsed = [0]

        def add_timer(_, time):
            elapsed[0] += time()

        for _ in range(count):
            a.mul_add(1, b, 2, add_timer)

        # Factor 3: presumably two arrays read and one written per call
        # -- TODO confirm against mul_add's implementation.
        bytes_moved = a.nbytes * count * 3

        tbl.add_row((a.nbytes / (1 << 20), elapsed[0] / count,
                     bytes_moved / elapsed[0] / 1e9))

    print(tbl)
コード例 #14
0
def main():
    """Compare GPU and CPU random-number generation throughput.

    For sizes ``2**10 .. 2**24``, ``curandom.rand`` (GPU) and
    ``numpy.random.rand`` (CPU) are each called ``count`` times. A table of
    size, mean time per call, size/time for both, and the GPU-vs-CPU ratio
    is printed.

    Relies on the module-level names ``drv``, ``curandom``, ``numpy`` and
    ``Table``.
    """
    sizes = []
    times_gpu = []
    times_cpu = []

    for power in range(10, 25):  # 24
        size = 1 << power
        print(size)
        sizes.append(size)

        # Fewer repetitions for the large arrays.
        count = 100 if power > 20 else 1000

        # --- GPU: fill an array with random numbers ---------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for _ in range(count):
            curandom.rand((size, ))

        end.record()
        end.synchronize()

        # time_till() result is scaled by 1e-3 to obtain seconds
        secs = start.time_till(end) * 1e-3
        times_gpu.append(secs / count)

        # --- CPU: fill an array with random numbers ---------------------
        # NOTE(review): the CPU section is also timed with CUDA events, as
        # in the original; a host clock may be more appropriate -- confirm.
        start = drv.Event()
        end = drv.Event()
        start.record()

        for _ in range(count):
            numpy.random.rand(size).astype(numpy.float32)

        end.record()
        end.synchronize()

        secs = start.time_till(end) * 1e-3
        times_cpu.append(secs / count)

    # calculate pseudo flops (elements generated per second)
    flops_gpu = [s / t for s, t in zip(sizes, times_gpu)]
    flops_cpu = [s / t for s, t in zip(sizes, times_cpu)]

    # print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU",
                 "Size/Time CPU", "GPU vs CPU speedup"))
    for s, t_gpu, f_gpu, t_cpu, f_cpu in zip(
            sizes, times_gpu, flops_gpu, times_cpu, flops_cpu):
        tbl.add_row((s, t_gpu, f_gpu, t_cpu, f_cpu, f_gpu / f_cpu))
    print(tbl)
コード例 #15
0
def main():
    """Compare GPU and CPU elementwise-addition throughput.

    For sizes from ``2**10`` up to a limit derived from the free GPU memory,
    ``a + b`` is evaluated ``count`` times on GPU arrays (timed with CUDA
    events) and on NumPy arrays (timed with ``time.time``). A table of size,
    mean time per addition, size/time for both, and the GPU-vs-CPU ratio is
    printed.

    Relies on the module-level names ``drv`` and ``numpy``.
    """
    import pycuda.gpuarray as gpuarray
    from pycuda.tools import bitlog2
    from time import time

    sizes = []
    times_gpu = []
    times_cpu = []

    max_power = bitlog2(drv.mem_get_info()[0]) - 2
    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)

        a = gpuarray.zeros((size,), dtype=numpy.float32)
        b = gpuarray.zeros((size,), dtype=numpy.float32)
        b.fill(1)

        # Fewer repetitions for the large arrays.
        count = 100 if power > 20 else 1000

        # gpu -----------------------------------------------------------------
        start_evt = drv.Event()
        end_evt = drv.Event()
        start_evt.record()

        for _ in range(count):
            a + b

        end_evt.record()
        end_evt.synchronize()

        gpu_secs = start_evt.time_till(end_evt) * 1e-3
        times_gpu.append(gpu_secs / count)

        # Drop the GPU arrays before the CPU part.
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        cpu_start = time()
        for _ in range(count):
            a_cpu + b_cpu
        cpu_secs = time() - cpu_start

        times_cpu.append(cpu_secs / count)

    # calculate pseudo flops (elements added per second)
    flops_gpu = [s / t for s, t in zip(sizes, times_gpu)]
    flops_cpu = [s / t for s, t in zip(sizes, times_cpu)]

    from pytools import Table

    tbl = Table()
    header = (
        "Size",
        "Time GPU",
        "Size/Time GPU",
        "Time CPU",
        "Size/Time CPU",
        "GPU vs CPU speedup",
    )
    tbl.add_row(header)

    columns = zip(sizes, times_gpu, flops_gpu, times_cpu, flops_cpu)
    for s, t_gpu, f_gpu, t_cpu, f_cpu in columns:
        tbl.add_row((s, t_gpu, f_gpu, t_cpu, f_cpu, f_gpu / f_cpu))
    print(tbl)
コード例 #16
0
    def __call__(self, eval_dependency, lift_plan):
        """Run the flux-gather kernel and return the per-flux output arrays.

        Steps, in order:

        1. Allocate one GPU array per entry of ``self.fluxes``.
        2. Obtain the kernel, its block size and the texture-reference map
           from ``self.get_kernel``.
        3. Evaluate every dependency in ``self.all_deps`` via
           *eval_dependency* and bind it to its texture reference.
        4. Launch the gather kernel (timed when ``discr.instrumented``).
        5. Optionally print debugging output, depending on ``discr.debug``.

        :arg eval_dependency: callable mapping a dependency expression to
            its field value.
        :arg lift_plan: plan object used to size the output arrays.
        :returns: list of GPU arrays, one per flux.
        """
        discr = self.discr
        fplan = self.plan
        given = fplan.given
        # Unpacking fails unless there is exactly one element group.
        elgroup, = discr.element_groups

        # One output array per flux, allocated from the discretization's pool.
        all_fluxes_on_faces = [
            gpuarray.empty(given.matmul_preimage_shape(lift_plan),
                           dtype=given.float_type,
                           allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))
        ]

        fdata = self.flux_face_data_block(elgroup)
        ilist_data = self.index_list_data()

        block, gather, texref_map = self.get_kernel(fdata,
                                                    ilist_data,
                                                    for_benchmark=False)

        # Bind every dependency field to its texture reference.
        for dep_expr in self.all_deps:
            dep_field = eval_dependency(dep_expr)

            from hedge.tools import is_zero
            # A symbolic zero is replaced by an actual zero-filled array:
            # a boundary array if the dependency has a tag, else a volume one.
            if is_zero(dep_field):
                if dep_expr in self.dep_to_tag:
                    dep_field = discr.boundary_zeros(self.dep_to_tag[dep_expr])
                else:
                    dep_field = discr.volume_zeros()

            assert dep_field.dtype == given.float_type, "Wrong types: %s: %s, %s: %s" % (
                dep_expr, dep_field.dtype, given, given.float_type)
            dep_field.bind_to_texref_ext(texref_map[dep_expr],
                                         allow_double_hack=True)

        # A real debug buffer is only allocated when both debug flags are set;
        # otherwise a stand-in object is passed to the kernel.
        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            debugbuf = gpuarray.zeros((10000, ), dtype=given.float_type)
        else:
            from hedge.backends.cuda.tools import FakeGPUArray
            debugbuf = FakeGPUArray()

        if discr.instrumented:
            # Timed launch; the result is handed to the gather timer.
            discr.flux_gather_timer.add_timer_callable(
                gather.prepared_timed_call(
                    (len(discr.blocks), 1), block, debugbuf.gpudata,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces)))

            # Account for the global-memory traffic of this launch.
            discr.gmem_bytes_gather.add(
                len(discr.blocks) * fdata.block_bytes + given.float_size() * (
                    # fetch
                    len(self.fluxes) * 2 * fdata.fp_count * fplan.dofs_per_face

                    # store
                    + len(discr.blocks) * len(self.fluxes) *
                    fplan.microblocks_per_block() *
                    fplan.aligned_face_dofs_per_microblock()))
        else:
            # Untimed launch with the same arguments.
            gather.prepared_call(
                (len(discr.blocks), 1), block, debugbuf.gpudata,
                fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces))

        # Debug output 1: dump the start of the debug buffer (rank 0 only).
        if set(["cuda_flux", "cuda_debugbuf"]) <= discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                copied_debugbuf = debugbuf.get()
                print "DEBUG", len(discr.blocks)
                numpy.set_printoptions(linewidth=130)
                #print numpy.reshape(copied_debugbuf, (32, 16))
                print copied_debugbuf[:50]

                #for i in range(len(discr.blocks)*6):
                #print i, copied_debugbuf[i*16:(i+1)*16]
                #print i, [x-10000 for x in sorted(copied_debugbuf[i*16:(i+1)*16]) if x != 0]

                wait_for_keypress(discr)

        # Debug output 2: print a structural summary of the flux arrays
        # (rank 0 only).
        if "cuda_flux" in discr.debug:
            from hedge.tools import get_rank, wait_for_keypress
            if get_rank(discr) == 0:
                numpy.set_printoptions(linewidth=130,
                                       precision=2,
                                       threshold=10**6)
                # Hard-wired switch: the else branch (raw array dump) is
                # currently unreachable.
                if True:

                    cols = []
                    for k in range(len(all_fluxes_on_faces)):
                        my_fof = all_fluxes_on_faces[k].get()

                        def sstruc(a):
                            # Encode each entry of *a* as one character:
                            # "0" exact zero, "-" magnitude below 1e-10,
                            # "N" NaN, "*" the value 17, "#" anything else.
                            result = ""
                            for i in a:
                                if i == 0:
                                    result += "0"
                                elif abs(i) < 1e-10:
                                    result += "-"
                                elif numpy.isnan(i):
                                    result += "N"
                                elif i == 17:
                                    result += "*"
                                else:
                                    result += "#"

                            return result

                        useful_sz = given.block_count \
                                * given.microblocks_per_block \
                                * lift_plan.aligned_preimage_dofs_per_microblock

                        # Summarize the first useful_sz entries in chunks of 16.
                        my_col = []
                        i = 0
                        while i < useful_sz:
                            my_col.append(sstruc(my_fof[i:i + 16]))
                            i += 16

                        cols.append(my_col)

                    # One table column per flux, one row per 16-entry chunk.
                    from pytools import Table
                    tbl = Table()
                    tbl.add_row(["num"] + range(len(cols)))
                    i = 0
                    for row in zip(*cols):
                        tbl.add_row((i, ) + row)
                        i += 1
                    print tbl
                else:
                    for i in range(len(all_fluxes_on_faces)):
                        print i
                        print all_fluxes_on_faces[i].get()

                wait_for_keypress(discr)
                #print "B", [la.norm(fof.get()) for fof in all_fluxes_on_faces]

        return all_fluxes_on_faces