Example #1
def subprocess_main(args):
    seed = args.DETAIL_seed
    cuda = (args.DETAIL_device == _GPU)

    with open(args.DETAIL_result_file, "ab") as f:
        for dtype_str in _DTYPES_TO_TEST[args.pr]:
            dtype = _DTYPE_STR_TO_DTYPE[dtype_str]
            iterator = unary.UnaryOpFuzzer(seed=seed, dtype=dtype,
                                           cuda=cuda).take(_RUNS_PER_LOOP)
            for i, (tensors, tensor_parameters, params) in enumerate(iterator):
                params["dtype_str"] = dtype_str
                stmt, label = construct_stmt_and_label(args.pr, params)
                timer = Timer(
                    stmt=stmt,
                    globals=tensors,
                    label=label,
                    description=f"[{i}, seed={seed}] ({dtype_str}), stmt = {stmt}",
                    env=args.DETAIL_env,
                )

                measurement = timer.blocked_autorange(
                    min_run_time=_MIN_RUN_SEC)
                measurement.metadata = {
                    "tensor_parameters": tensor_parameters,
                    "params": params,
                }
                print(measurement)
                pickle.dump(measurement, f)
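The result file is opened in append mode and receives one pickled Measurement per loop iteration, so a consumer has to keep unpickling until the file is exhausted. A minimal read-back sketch, assuming a hypothetical path for the result file:

import pickle

measurements = []
with open("unary_results.pkl", "rb") as f:  # hypothetical path
    while True:
        try:
            measurements.append(pickle.load(f))
        except EOFError:
            break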
Example #2
def benchmark_filterer(
    dataset: Dataset,
    filterer: Hint[Filterer],
    filterer_kwargs: Optional[Mapping[str, Any]] = None,
) -> Iterable[Mapping[str, Any]]:
    """Benchmark a filterer."""
    filterer_kwargs = filterer_kwargs or {}

    # include some metadata into each entry
    kwargs = dict(
        dataset=dataset.get_normalized_name(),
        filterer=filterer,
        **filterer_kwargs,
    )

    filterer_cls = filterer_resolver.lookup(filterer)
    tqdm.write(f'[{filterer_cls.__name__}] measure creation (=indexing) time')
    timer = TorchTimer(stmt="filterer_cls(triples_factory=factory, **kwargs)",
                       globals=dict(
                           filterer_cls=filterer_cls,
                           factory=dataset.training,
                           kwargs=filterer_kwargs,
                       ))
    measurement = timer.blocked_autorange()
    yield dict(
        operation="index",
        subset="train",
        time=measurement.median,
        num_triples=dataset.training.num_triples,
        **kwargs,
    )

    # instantiate filterer for further tests
    filterer = filterer_resolver.make(filterer,
                                      pos_kwargs=filterer_kwargs,
                                      triples_factory=dataset.training)
    for key, value in dataset.factory_dict.items():
        if key == 'training':
            continue
        tqdm.write(f'[{filterer}] measure inference time ({key})')
        timer = TorchTimer(stmt="filterer(mapped_triples)",
                           globals=dict(
                               filterer=filterer,
                               mapped_triples=value.mapped_triples,
                           ))
        measurement = timer.blocked_autorange()

        # check for correctness
        error_rate = float(
            (~filterer(value.mapped_triples)[1]).float().mean().item())
        yield dict(
            operation="inference",
            subset=key,
            time=measurement.median,
            num_triples=value.num_triples,
            observed_error_rate=error_rate,
            **kwargs,
        )
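A hypothetical driver for this generator, assuming `my_dataset` is an already loaded pykeen Dataset and that the filterer key "bloom" (a placeholder here) resolves to an available filterer; the yielded rows collect naturally into a pandas DataFrame:

import pandas as pd

rows = list(benchmark_filterer(dataset=my_dataset, filterer="bloom"))
df = pd.DataFrame(rows)
print(df.groupby(["operation", "subset"])["time"].median())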
Example #3
def time_with_torch_timer(fn, args, string_id, kwargs=None):
    # Default to an empty kwargs dict (avoids a mutable default argument).
    kwargs = kwargs or {}
    print("################################################")
    print(f"#### Torch Timer for {string_id} starts #########")
    print("################################################")
    ref = fn(*args, **kwargs)
    gO = torch.rand_like(ref)
    env = {"args": args, "gO": gO, "kwargs": kwargs, "fn": fn}
    # Statement used to clear accumulated gradients between timed iterations.
    grad_none = "for x in args: x.grad = None"
    fn_call = "fn(*args, **kwargs)"
    # Measure end-to-end fwd time (reported in microseconds)
    timer = Timer(stmt=fn_call, globals=env)
    fwd_latency = round(timer.timeit(1000).mean * 10**6, 3)
    timer_blocked = timer.blocked_autorange()
    print(f"Forward = {fwd_latency}")

    # Measure end-to-end fwd + bwd time; newlines keep each statement at the
    # top level of the compiled stmt.
    timer = Timer(
        stmt=f"{grad_none}\nfwd = {fn_call}\nfwd.backward(gO)",
        globals=env,
    )
    fwd_bwd_latency = round(timer.timeit(1000).mean * 10**6, 3)
    timer_blocked = timer.blocked_autorange()
    # print(f"Forward + sum + Backward = {fwd_sum_bwd_latency}")

    bwd_latency = round(fwd_bwd_latency - fwd_latency, 3)
    print(f"Backward = {bwd_latency}")

    print("################################################")
    print(f"#### Torch Timer for {string_id} ends ###############")
    print("################################################\n\n\n\n")
Example #4
def _subprocess_main(seed=0,
                     num_threads=1,
                     sub_label="N/A",
                     result_file=None,
                     env=None):
    import torch
    from torch.utils.benchmark import Timer

    conda_prefix = os.getenv("CONDA_PREFIX")
    assert conda_prefix
    if not torch.__file__.startswith(conda_prefix):
        raise ValueError(
            f"PyTorch mismatch: `import torch` resolved to `{torch.__file__}`, "
            f"which is not in the correct conda env: {conda_prefix}")

    torch.manual_seed(seed)
    results = []
    for n in [4, 8, 16, 32, 64, 128, 256, 512, 1024, 7, 96, 150, 225]:
        dtypes = (("Single", torch.float32), ("Double", torch.float64))
        shapes = (
            # Square MatMul
            ((n, n), (n, n), "(n x n) x (n x n)", "Matrix-Matrix Product"),

            # Matrix-Vector product
            ((n, n), (n, 1), "(n x n) x (n x 1)", "Matrix-Vector Product"),
        )
        for (dtype_name, dtype), (x_shape, y_shape, shape_str,
                                  blas_type) in it.product(dtypes, shapes):
            t = Timer(
                stmt="torch.mm(x, y)",
                label=f"torch.mm {shape_str} {blas_type} ({dtype_name})",
                sub_label=sub_label,
                description=f"n = {n}",
                env=os.path.split(env or "")[1] or None,
                globals={
                    "x": torch.rand(x_shape, dtype=dtype),
                    "y": torch.rand(y_shape, dtype=dtype),
                },
                num_threads=num_threads,
            ).blocked_autorange(min_run_time=MIN_RUN_TIME)
            results.append(t)

    if result_file is not None:
        with open(result_file, "wb") as f:
            pickle.dump(results, f)
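Because the measurements are pickled as a single list, the parent process can unpickle them in one call and summarize them with torch.utils.benchmark.Compare. A minimal sketch, assuming `result_file` is the path written above:

import pickle
from torch.utils.benchmark import Compare

with open(result_file, "rb") as f:
    results = pickle.load(f)

compare = Compare(results)
compare.trim_significant_figures()
compare.print()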
Example #5
def run(n, stmt, fuzzer_cls):
    float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
    double_iter = fuzzer_cls(seed=0, dtype=torch.float64).take(n)
    raw_results = []
    for i, (float_values, int_values) in enumerate(zip(float_iter, double_iter)):
        float_tensors, float_tensor_params, float_params = float_values
        int_tensors, int_tensor_params, int_params = int_values
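        # The two fuzzers share a seed and are expected to generate identically
        # sized and strided tensors; note that the float64 data reuses the
        # "int_*" variable names here.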

        assert_dicts_equal(float_params, int_params)
        assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])

        float_measurement, int_measurement = [
            Timer(
                stmt,
                globals=tensors,
            ).blocked_autorange(min_run_time=_MEASURE_TIME)
            for tensors in (float_tensors, int_tensors)
        ]

        descriptions = []
        for name in float_tensors:
            shape_str = "(" + ", ".join([
                f"2 ** {int(np.log2(i))}"
                if 2 ** int(np.log2(i)) == i and i > 1
                else str(i)
                for i in float_tensors[name].shape
            ]) + ")"
            sparse_dim = float_tensor_params[name]["sparse_dim"]
            sparse_dim_str = str(sparse_dim)
            is_coalesced = float_tensor_params[name]["is_coalesced"]
            is_coalesced_str = "True" if is_coalesced else "False"
            descriptions.append((name, shape_str, sparse_dim_str, is_coalesced_str))
        raw_results.append((float_measurement, int_measurement, descriptions))

        print(f"\r{i + 1} / {n}", end="")
    print()

    parsed_results, name_len, shape_len, sparse_dim_len, is_coalesced_len = [], 0, 0, 0, 0
    for float_measurement, int_measurement, descriptions in raw_results:
        t_float = float_measurement.median * 1e6
        t_int = int_measurement.median * 1e6
        rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
        parsed_results.append((t_float, t_int, rel_diff, descriptions))
        for name, shape, sparse_dim, is_coalesced in descriptions:
            name_len = max(name_len, len(name))
            shape_len = max(shape_len, len(shape))
            sparse_dim_len = max(sparse_dim_len, len(sparse_dim))
            is_coalesced_len = max(is_coalesced_len, len(is_coalesced))

    parsed_results.sort(key=lambda x: x[2])

    print(f"stmt: {stmt}")
    print(f" diff    faster{'':>17}{' ' * name_len} ", end="")
    print(f"{'shape'.ljust(shape_len)}{'':>12}{'sparse_dim'.ljust(sparse_dim_len)}", end="")
    print(f"          is_coalesced\n{'-' * 100}")
    for results, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
        for t_float, t_int, rel_diff, descriptions in results:
            time_str = [f"{rel_diff * 100:>4.1f}%    {'int' if t_int < t_float else 'float':<20}"]
            time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
            for t_str, (name, shape, sparse_dim, is_coalesced) in zip(time_str, descriptions):
                name = f"{name}:".ljust(name_len + 1)
                shape = shape.ljust(shape_len + 10)
                sparse_dim = sparse_dim.ljust(sparse_dim_len)
                print(f"{t_str} {name}  {shape}|     {sparse_dim}      |   {is_coalesced}")
        print(spacer)
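The helper assert_dicts_equal is not defined in this excerpt; a plausible sketch of what it checks (the upstream implementation may differ):

def assert_dicts_equal(d0, d1):
    # The fuzzers share a seed, so the fuzzed parameters should match exactly
    # (any dtype entry is skipped, since it differs between the two runs).
    assert set(d0.keys()) == set(d1.keys()), f"{d0} != {d1}"
    assert all(d0[k] == d1[k] for k in d0 if k != "dtype"), f"{d0} != {d1}"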
Example #6
def time_cuda(fn, inputs, test_runs):
    t = Timer(stmt="fn(*inputs)", globals={"fn": fn, "inputs": inputs})
    times = t.blocked_autorange()
    return times.median * 1000  # time in ms
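A hypothetical call site for this helper; note that test_runs is accepted but not used above, since blocked_autorange decides the number of runs on its own:

import torch

if torch.cuda.is_available():
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    print(time_cuda(torch.mm, (a, b), test_runs=10), "ms")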
Example #7
            workload = parallel_workload

        if args.use_script:
            traced_workload = torch.jit.trace(workload, (input_x, ))
            workload = traced_workload

        if profiling_enabled:

            def payload():
                x = None
                with torch.autograd.profiler.profile(
                        use_cuda=args.with_cuda,
                        with_stack=args.with_stack,
                        use_kineto=args.use_kineto,
                        use_cpu=not args.cuda_only) as prof:
                    x = workload(input_x)
                return x
        else:

            def payload():
                return workload(input_x)

        t = Timer(
            "payload()",
            globals={
                "payload": payload
            },
            timer=timeit.default_timer,
        ).blocked_autorange(min_run_time=args.timer_min_run_time)
        print(t)
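The same timing pattern in isolation, a minimal sketch with a stand-in workload instead of the script's argparse-driven setup:

import timeit
import torch
from torch.utils.benchmark import Timer

def payload():
    x = torch.randn(256, 256)
    return x @ x

t = Timer(
    "payload()",
    globals={"payload": payload},
    timer=timeit.default_timer,
).blocked_autorange(min_run_time=1.0)
print(t)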
Example #8
def run(n, stmt, fuzzer_cls):
    float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
    int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)
    raw_results = []
    for i, (float_values, int_values) in enumerate(zip(float_iter, int_iter)):
        float_tensors, float_tensor_params, float_params = float_values
        int_tensors, int_tensor_params, int_params = int_values

        # This benchmark assumes that the two fuzzers generate identically
        # sized and strided Tensors, since the same seed is used.
        assert_dicts_equal(float_params, int_params)
        assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])

        float_measurement, int_measurement = [
            Timer(
                stmt,
                globals=tensors,
            ).blocked_autorange(min_run_time=_MEASURE_TIME)
            for tensors in (float_tensors, int_tensors)
        ]

        descriptions = []
        for name in float_tensors:
            shape_str = "(" + ", ".join([
                f"2 ** {int(np.log2(i))}"
                if 2**int(np.log2(i)) == i and i > 1 else str(i)
                for i in float_tensors[name].shape
            ]) + ")"
            order = float_tensor_params[name]["order"]
            order_str = ("" if all(
                order == np.arange(len(order))) else str(tuple(order)))
            steps = float_tensor_params[name]["steps"]
            steps_str = str(steps) if sum(steps) > len(steps) else ""
            descriptions.append((name, shape_str, order_str, steps_str))
        raw_results.append((float_measurement, int_measurement, descriptions))

        print(f"\r{i + 1} / {n}", end="")
    print()

    parsed_results, name_len, shape_len, order_len, steps_len = [], 0, 0, 0, 0
    for float_measurement, int_measurement, descriptions in raw_results:
        t_float = float_measurement.median * 1e6
        t_int = int_measurement.median * 1e6
        rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
        parsed_results.append((t_float, t_int, rel_diff, descriptions))
        for name, shape, order, steps in descriptions:
            name_len = max(name_len, len(name))
            shape_len = max(shape_len, len(shape))
            order_len = max(order_len, len(order))
            steps_len = max(steps_len, len(steps))

    parsed_results.sort(key=lambda x: x[2])

    print(f"stmt: {stmt}")
    print(f" diff    faster{'':>17}{' ' * name_len} ", end="")
    print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}",
          end="")
    print(f"          steps\n{'-' * 100}")
    for results, spacer in [(parsed_results[:10], "..."),
                            (parsed_results[-10:], "")]:
        for t_float, t_int, rel_diff, descriptions in results:
            time_str = [
                f"{rel_diff * 100:>4.1f}%    {'int' if t_int < t_float else 'float':<20}"
            ]
            time_str.extend(
                ["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
            for t_str, (name, shape, order,
                        steps) in zip(time_str, descriptions):
                name = f"{name}:".ljust(name_len + 1)
                shape = shape.ljust(shape_len + 10)
                order = order.ljust(order_len)
                print(f"{t_str} {name}  {shape}|     {order}      |   {steps}")
        print(spacer)
Example #9
# transforms can give us different interesting quantities.
#
# functorch provides ``jacrev`` as a convenience function that performs
# the vmap-vjp composition to compute jacobians. ``jacrev`` accepts an argnums
# argument that says which argument we would like to compute Jacobians with
# respect to.
from functorch import jacrev
ft_jacobian = jacrev(predict, argnums=2)(weight, bias, x)
assert torch.allclose(ft_jacobian, jacobian)

# Let's compare the performance of the two ways to compute jacobian.
# The functorch version is much faster (and becomes even faster the more outputs
# there are). In general, we expect that vectorization via ``vmap`` can help
# eliminate overhead and give better utilization of your hardware.
from torch.utils.benchmark import Timer
without_vmap = Timer(stmt="compute_jac(xp)", globals=globals())
with_vmap = Timer(stmt="jacrev(predict, argnums=2)(weight, bias, x)",
                  globals=globals())
print(without_vmap.timeit(500))
print(with_vmap.timeit(500))

# It's pretty easy to flip the problem around and say we want to compute
# Jacobians of the parameters to our model (weight, bias) instead of the input.
ft_jac_weight, ft_jac_bias = jacrev(predict, argnums=(0, 1))(weight, bias, x)

######################################################################
# reverse-mode Jacobian (jacrev) vs forward-mode Jacobian (jacfwd)
# --------------------------------------------------------------------
# We offer two APIs to compute jacobians: jacrev and jacfwd:
# - jacrev uses reverse-mode AD. As you saw above it is a composition of our
#   vjp and vmap transforms.
# - jacfwd uses forward-mode AD. It is implemented as a composition of our
#   jvp and vmap transforms.
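#
# A minimal sketch of calling jacfwd, assuming the same `predict`, `weight`,
# `bias`, and `x` as above; both transforms compute the same Jacobian.
from functorch import jacfwd
ft_jacobian_fwd = jacfwd(predict, argnums=2)(weight, bias, x)
assert torch.allclose(ft_jacobian_fwd, ft_jacobian)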
Example #10
# A `Timer` serves as a task definition.
#

from torch.utils.benchmark import Timer

timer = Timer(
    # The computation which will be run in a loop and timed.
    stmt="x * y",

    # `setup` will be run before calling the measurement loop, and is used to
    # populate any state which is needed by `stmt`
    setup="""
        x = torch.ones((128,))
        y = torch.ones((128,))
    """,

    # Alternately, `globals` can be used to pass variables from the outer scope.
    # -------------------------------------------------------------------------
    # globals={
    #     "x": torch.ones((128,)),
    #     "y": torch.ones((128,)),
    # },

    # Control the number of threads that PyTorch uses. (Default: 1)
    num_threads=1,
)

###############################################################################
# 2. Wall time: `Timer.blocked_autorange(...)`
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
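# A minimal sketch of taking a wall-time measurement with the `timer` defined
# above (the `min_run_time` value here is illustrative):

m = timer.blocked_autorange(min_run_time=1)
print(m)
print(f"Median: {m.median * 1e6:.2f} us")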
Example #11
def main():
    parser = argparse.ArgumentParser(prog="tensor_product_benchmark")
    parser.add_argument("--jit", type=t_or_f, default=True)
    parser.add_argument("--irreps",
                        type=str,
                        default="8x0e + 8x1e + 8x2e + 8x3o")
    parser.add_argument("--irreps-in1", type=str, default=None)
    parser.add_argument("--irreps-in2", type=str, default=None)
    parser.add_argument("--irreps-out", type=str, default=None)
    parser.add_argument("--cuda", type=t_or_f, default=True)
    parser.add_argument("--backward", type=t_or_f, default=True)
    parser.add_argument("--opt-ein", type=t_or_f, default=True)
    parser.add_argument("--specialized-code", type=t_or_f, default=True)
    parser.add_argument("--elementwise", action='store_true')
    parser.add_argument("-n", type=int, default=1000)
    parser.add_argument("--batch", type=int, default=10)

    args = parser.parse_args()

    device = 'cuda' if (torch.cuda.is_available() and args.cuda) else 'cpu'
    args.cuda = device == 'cuda'

    print("======= Benchmark with settings: ======")
    for key, val in vars(args).items():
        print(f"{key:>18} : {val}")
    print("=" * 40)

    irreps_in1 = Irreps(args.irreps_in1 if args.irreps_in1 else args.irreps)
    irreps_in2 = Irreps(args.irreps_in2 if args.irreps_in2 else args.irreps)
    irreps_out = Irreps(args.irreps_out if args.irreps_out else args.irreps)

    if args.elementwise:
        tp = ElementwiseTensorProduct(irreps_in1,
                                      irreps_in2,
                                      _specialized_code=args.specialized_code,
                                      _optimize_einsums=args.opt_ein)
        if args.backward:
            print(
                "Elementwise TP has no weights, cannot backward. Setting --backward False."
            )
            args.backward = False
    else:
        tp = FullyConnectedTensorProduct(
            irreps_in1,
            irreps_in2,
            irreps_out,
            _specialized_code=args.specialized_code,
            _optimize_einsums=args.opt_ein)
    tp = tp.to(device=device)
    assert len(tp.instructions) > 0, "Bad irreps, no instructions"
    print(f"Tensor product: {tp}")
    print("Instructions:")
    for ins in tp.instructions:
        print(f"  {ins}")

    # from https://pytorch.org/docs/master/_modules/torch/utils/benchmark/utils/timer.html#Timer.timeit
    warmup = max(int(args.n // 100), 1)

    inputs = iter([(irreps_in1.randn(args.batch, -1).to(device=device),
                    irreps_in2.randn(args.batch, -1).to(device=device))
                   for _ in range(args.n + warmup)])

    # compile
    if args.jit:
        tp = compile(tp)

    print("starting...")

    # tanh() forces it to realize the grad as a full size matrix rather than expanded (stride 0) ones
    t = Timer(
        stmt=("tp.zero_grad()\n"
              "out = tp(*next(inputs))\n" +
              ("out.tanh().sum().backward()\n" if args.backward else '')),
        globals={
            'tp': tp,
            'inputs': inputs
        })

    perloop = t.timeit(args.n)

    print()
    print(perloop)