def test_codegen_with_DictOfNamedArrays(ctx_factory):  # noqa
    """Check code generation for a ``DictOfNamedArrays`` with two outputs.

    Two integer placeholders are passed straight through as the named
    outputs ``x_out`` and ``y_out``. The generated program is run twice:
    once with default options (outputs unpacked positionally) and once with
    ``return_dict=True`` (outputs looked up by name).

    :arg ctx_factory: zero-argument callable returning the OpenCL context
        to run on.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    namespace = pt.Namespace()
    # ``np.int`` was merely an alias for the builtin ``int``; the alias was
    # deprecated and later removed from NumPy, so use the builtin directly.
    x = Placeholder(namespace, "x", (5, ), int)
    y = Placeholder(namespace, "y", (5, ), int)

    x_in = np.array([1, 2, 3, 4, 5])
    y_in = np.array([6, 7, 8, 9, 10])

    result = pt.DictOfNamedArrays(dict(x_out=x, y_out=y))

    # Without return_dict.
    prog = pt.generate_loopy(result, target=pt.PyOpenCLTarget(queue))
    _, (x_out, y_out) = prog(x=x_in, y=y_in)
    assert (x_out == x_in).all()
    assert (y_out == y_in).all()

    # With return_dict.
    prog = pt.generate_loopy(result,
            target=pt.PyOpenCLTarget(queue),
            options=lp.Options(return_dict=True))
    _, outputs = prog(x=x_in, y=y_in)
    assert (outputs["x_out"] == x_in).all()
    assert (outputs["y_out"] == y_in).all()
def test_dict_of_named_array_codegen_avoids_recomputation():
    """Check that the store for ``z`` reads the already-computed ``y``.

    ``z`` is defined in terms of ``y`` and both are requested as outputs, so
    the instruction writing ``z`` is expected to list ``y`` among the names
    it reads.
    """
    namespace = pt.Namespace()
    x = pt.make_placeholder(namespace, shape=(10, 4), dtype=float, name="x")

    y = 2 * x
    z = y + 4 * x
    named_outputs = pt.DictOfNamedArrays({"y": y, "z": z})

    program = pt.generate_loopy(named_outputs).program
    names_read_by_z_store = \
        program.id_to_insn["z_store"].read_dependency_names()

    assert "y" in names_read_by_z_store
def main():
    """Run a small distributed computation over the module-level ``comm``.

    Each rank wraps a random 4x4 integer array, pairs a send of it to the
    previous rank with a receive from the next rank, and adds the received
    array to its own. The resulting graph is partitioned, its symbolic tags
    are numbered, code is generated per partition, and the partitions are
    executed on an OpenCL queue.
    """
    n_ranks = comm.Get_size()
    my_rank = comm.Get_rank()

    rng = np.random.default_rng()
    x_in = rng.integers(100, size=(4, 4))
    x = pt.make_data_wrapper(x_in)

    # Symbolic communication tag; turned into a number further below.
    halo_tag = (main, "x")
    incoming = make_distributed_recv(
        src_rank=(my_rank + 1) % n_ranks,
        comm_tag=halo_tag,
        shape=(4, 4),
        dtype=int)
    halo = staple_distributed_send(
        x,
        dest_rank=(my_rank - 1) % n_ranks,
        comm_tag=halo_tag,
        stapled_to=incoming)

    y = x + halo

    # Find the partition
    outputs = pt.DictOfNamedArrays({"out": y})
    parts = find_distributed_partition(outputs)
    parts, _ = number_distributed_tags(comm, parts, base_tag=42)
    code_per_part = generate_code_for_partition(parts)

    # Flip to a true value to view the partition graph interactively.
    if 0:
        from pytato.visualization import show_dot_graph
        show_dot_graph(parts)

    # Sanity check
    from pytato.visualization import get_dot_graph_from_partition
    get_dot_graph_from_partition(parts)

    # Execute the distributed partition
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    context = execute_distributed_partition(parts, code_per_part, queue, comm)

    final_res = context["out"].get(queue)

    # NOTE(review): bcast hands every rank the root rank's result; since the
    # RNG is unseeded, this comparison presumably only holds when the job
    # has at most two ranks -- confirm.
    ref_res = comm.bcast(final_res)
    np.testing.assert_allclose(ref_res, final_res)

    if my_rank == 0:
        print("Distributed test succeeded.")
def _do_test_distributed_execution_basic(ctx_factory):
    """Exchange one array around a ring of ranks and check the summed result.

    Every rank builds the same 4x4 integer array (fixed seed), sends it to
    the previous rank, receives the next rank's copy and adds the two. As
    all ranks hold identical data, the expected result is ``x_in * 2``.

    :arg ctx_factory: zero-argument callable returning the OpenCL context
        on which the partitions are executed.
    """
    from mpi4py import MPI  # pylint: disable=import-error
    comm = MPI.COMM_WORLD

    rank = comm.Get_rank()
    size = comm.Get_size()

    rng = np.random.default_rng(seed=27)

    x_in = rng.integers(100, size=(4, 4))
    x = pt.make_data_wrapper(x_in)

    halo = staple_distributed_send(x, dest_rank=(rank - 1) % size, comm_tag=42,
            stapled_to=make_distributed_recv(
                src_rank=(rank + 1) % size, comm_tag=42, shape=(4, 4),
                dtype=int))

    y = x + halo

    # Find the partition
    outputs = pt.DictOfNamedArrays({"out": y})
    distributed_parts = find_distributed_partition(outputs)
    prg_per_partition = generate_code_for_partition(distributed_parts)

    # Execute the distributed partition
    # Use the context supplied by the caller rather than creating an
    # unrelated one, so that the requested device is actually exercised.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    context = execute_distributed_partition(distributed_parts, prg_per_partition,
                                             queue, comm)

    final_res = context["out"].get(queue)

    # All ranks generate the same random numbers (same seed).
    np.testing.assert_allclose(x_in * 2, final_res)
def main():
    """Partition a small expression graph and compare with direct execution.

    Builds ``stack([x @ x.T, 2 * x, 42 + x]) + 55`` from a random 2x2 array,
    partitions it using ``get_partition_id`` bound to a topological order,
    executes the partitions, and checks the outcome against the same
    expression compiled without partitioning.
    """
    host_data = np.random.randn(2, 2)
    x = pt.make_data_wrapper(host_data)
    stacked = pt.stack([x @ x.T, 2 * x, 42 + x])
    y = stacked + 55

    # Running the mapper over y fills in its topological order.
    topo_mapper = TopoSortMapper()
    topo_mapper(y)

    from functools import partial
    part_id_of = partial(get_partition_id, topo_mapper.topological_order)

    # Find the partitions
    outputs = pt.DictOfNamedArrays({"out": y})
    partition = find_partition(outputs, part_id_of)

    # Show the partitions
    from pytato.visualization import get_dot_graph_from_partition
    get_dot_graph_from_partition(partition)

    # Execute the partitions
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    code_per_part = generate_code_for_partition(partition)
    context = execute_partition(partition, code_per_part, queue)

    final_res = [context[name] for name in outputs.keys()]

    # Execute the unpartitioned code for comparison
    prg = pt.generate_loopy(y)
    _, direct_outputs = prg(queue)
    (out, ) = direct_outputs

    np.testing.assert_allclose([out], final_res)

    print("Partitioning test succeeded.")
import numpy as np
import pytato as pt

# Symbolic problem: a square (n x n) float64 placeholder and two matrix
# products built from it, bundled as named outputs.
n = pt.make_size_param("n")
a = pt.make_placeholder(name="a", shape=(n, n), dtype=np.float64)

a2a = a @ (2 * a)
aat = a @ a.T
result = pt.DictOfNamedArrays({"a2a": a2a, "aat": aat})


# {{{ execute

import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
executable = pt.generate_loopy(result, cl_device=queue.device)

# Concrete input for the placeholder named "a".
a_host = np.random.randn(20, 20)
_, out = executable(queue, a=a_host)

# Compare both named outputs against the same expressions in numpy.
assert np.allclose(out["a2a"], a_host @ (2 * a_host))
assert np.allclose(out["aat"], a_host @ a_host.T)

# }}}


# {{{ generate OpenCL code

prg = pt.generate_loopy(result)

import loopy as lp

generated = lp.generate_code_v2(prg.program)
print(generated.device_code())

# }}}
def _do_test_distributed_execution_random_dag(ctx_factory):
    """Compare distributed execution of random DAGs with a numpy reference.

    For each of ten seeds, a random DAG is generated with an extra generator
    that wraps a sub-DAG in a send to the previous rank stapled to a receive
    from the next rank. The DAG is partitioned, its tags are numbered, and
    it is executed. The reference is generated from the same seed with numpy
    evaluation, the extra generator replaced by a plain ``make_random_dag``
    call. Finally it is asserted that the communication generator ran at
    least once.

    :arg ctx_factory: zero-argument callable returning the OpenCL context.
    """
    from mpi4py import MPI  # pylint: disable=import-error
    comm = MPI.COMM_WORLD

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    size = comm.Get_size()
    rank = comm.Get_rank()

    from testlib import RandomDAGContext, make_random_dag

    axis_len = 4
    # Paired with each additional generator below; presumably a relative
    # weight for picking it -- confirm against testlib.
    comm_fake_prob = 500

    gen_comm_called = False

    ntests = 10
    for i in range(ntests):
        seed = 120 + i
        print(f"Step {i} {seed}")

        # {{{ compute value with communication

        # Reset for every DAG; gen_comm bumps it once per communication node.
        comm_tag = 17

        def gen_comm(rdagc):
            nonlocal gen_comm_called, comm_tag
            gen_comm_called = True

            comm_tag += 1
            symbolic_tag = (comm_tag, _RandomDAGTag)

            payload = make_random_dag(rdagc)
            receive = make_distributed_recv(
                src_rank=(rank + 1) % size,
                comm_tag=symbolic_tag,
                shape=payload.shape,
                dtype=payload.dtype)
            return staple_distributed_send(
                payload,
                dest_rank=(rank - 1) % size,
                comm_tag=symbolic_tag,
                stapled_to=receive)

        dag_ctx_with_comm = RandomDAGContext(
            np.random.default_rng(seed=seed),
            axis_len=axis_len,
            use_numpy=False,
            additional_generators=[(comm_fake_prob, gen_comm)])
        dag_with_comm = make_random_dag(dag_ctx_with_comm)

        parts = find_distributed_partition(
            pt.DictOfNamedArrays({"result": dag_with_comm}))

        # Transform symbolic tags into numeric ones for MPI
        parts, _ = number_distributed_tags(comm, parts, base_tag=comm_tag)

        code_per_part = generate_code_for_partition(parts)
        context = execute_distributed_partition(
            parts, code_per_part, queue, comm)

        res_comm = context["result"]

        # }}}

        # {{{ compute ref value without communication

        def gen_no_comm(rdagc):
            return make_random_dag(rdagc)

        # compiled evaluation (i.e. use_numpy=False) fails for some of these
        # graphs, cf. https://github.com/inducer/pytato/pull/255
        dag_ctx_reference = RandomDAGContext(
            np.random.default_rng(seed=seed),
            axis_len=axis_len,
            use_numpy=True,
            additional_generators=[(comm_fake_prob, gen_no_comm)])
        reference = make_random_dag(dag_ctx_reference)

        # }}}

        # Anything that is not already a numpy array is fetched via .get().
        if not isinstance(res_comm, np.ndarray):
            res_comm = res_comm.get(queue=queue)

        np.testing.assert_allclose(res_comm, reference)

    assert gen_comm_called