Example #1
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(incr.options(max_retries=0).bind(), workflow_id="incr")

    assert cnt_file.read_text() == "1"

    from ray.exceptions import RaySystemError

    # TODO(suquark): We should prevent Ray from raising "RaySystemError" in
    #   workflows, because "RaySystemError" does not inherit from the
    #   underlying error, so users and developers cannot catch the expected
    #   error. I find this issue very annoying.
    with pytest.raises((RaySystemError, ValueError)):
        workflow.get_output("incr")

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises((RaySystemError, ValueError)):
        workflow.get_output("incr")
    assert workflow.resume("incr") == 10
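Note: as the TODO in Example #1 points out, the failure may surface as RaySystemError rather than the original ValueError. A minimal defensive sketch (assuming the same "incr" workflow as above) is to catch both types, since RaySystemError does not subclass the user's exception:

from ray.exceptions import RaySystemError

try:
    workflow.get_output("incr")
except (RaySystemError, ValueError) as e:
    # Handlers that care about the underlying error must list both types
    # explicitly until the wrapping issue is fixed.
    print(f"workflow failed with {type(e).__name__}: {e}")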
Example #2
def test_get_output_3(workflow_start_regular, tmp_path):
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @workflow.step
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(ray.exceptions.RaySystemError):
        incr.options(max_retries=1).step().run("incr")

    assert cnt_file.read_text() == "1"

    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))

    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
Example #3
def test_output_with_name(workflow_start_regular):
    @ray.remote
    def double(v):
        return 2 * v

    inner_task = double.options(**workflow.options(name="inner")).bind(1)
    outer_task = double.options(**workflow.options(name="outer")).bind(inner_task)
    result = workflow.create(outer_task).run_async("double")
    inner = workflow.get_output("double", name="inner")
    outer = workflow.get_output("double", name="outer")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4

    @workflow.options(name="double")
    @ray.remote
    def double_2(s):
        return s * 2

    inner_task = double_2.bind(1)
    outer_task = double_2.bind(inner_task)
    workflow_id = "double_2"
    result = workflow.create(outer_task).run_async(workflow_id)

    inner = workflow.get_output(workflow_id, name="double")
    outer = workflow.get_output(workflow_id, name="double_1")

    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4
Example #4
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @workflow.step
    def double(v):
        return 2 * v

    # Get the result from the named step after the workflow has finished
    assert 4 == double.options(name="outer").step(
        double.options(name="inner").step(1)).run("double")
    assert ray.get(workflow.get_output("double", name="inner")) == 2
    assert ray.get(workflow.get_output("double", name="outer")) == 4
Example #5
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock_path):
                return 2 * v
        else:
            return 2 * v

    # Get the result from the named step while the workflow is still running
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.create(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(
                1, lock_path),
            lock_path,
        )).run_async("double-2")

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]),
         wait.remote([inner]),
         wait.remote([outer])],
        timeout=1)
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the job finishes, we'll be able to get the result.
    lock.release()
    assert 4 == ray.get(output)

    # Sometimes "inner" has not been generated yet when we call run_async,
    # so there is a race condition here.
    try:
        v = ray.get(inner)
    except Exception:
        v = None
    if v is not None:
        assert 2 == v
    assert 4 == ray.get(outer)

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")
    assert 2 == ray.get(inner)
    assert 4 == ray.get(outer)
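The try/except above works around a race: the named "inner" output may not be registered yet immediately after run_async. A hedged workaround sketch (get_named_output_with_retry is an illustrative helper, not part of the workflow API) polls until the name resolves:

import time

def get_named_output_with_retry(workflow_id, name, attempts=10, delay=0.5):
    # Illustrative helper: retry until the named task has been created by
    # the running workflow, then return its value.
    for _ in range(attempts):
        try:
            return ray.get(workflow.get_output(workflow_id, name=name))
        except Exception:
            time.sleep(delay)
    raise TimeoutError(f"output {name!r} never became available")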
Example #6
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v):
        return 2 * v

    # Get the result from the named step after the workflow has finished
    assert 4 == workflow.create(
        update_workflow_options(double, name="outer").bind(
            update_workflow_options(double,
                                    name="inner").bind(1))).run("double")
    assert ray.get(workflow.get_output("double", name="inner")) == 2
    assert ray.get(workflow.get_output("double", name="outer")) == 4
Example #7
def test_get_named_step_duplicate(workflow_start_regular):
    @workflow.step(name="f")
    def f(n, dep):
        return n

    inner = f.step(10, None)
    outer = f.step(20, inner)
    assert 20 == outer.run("duplicate")
    # The outer step is checkpointed first, so its name gets no suffix
    assert ray.get(workflow.get_output("duplicate", name="f")) == 20
    # The inner step is checkpointed after the outer; because the name is a
    # duplicate, the suffix _1 is added automatically
    assert ray.get(workflow.get_output("duplicate", name="f_1")) == 10
Example #8
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v):
        return 2 * v

    # Get the result from the named step after the workflow has finished
    assert 4 == workflow.run(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(1)),
        workflow_id="double",
    )
    assert workflow.get_output("double", name="inner") == 2
    assert workflow.get_output("double", name="outer") == 4
Example #9
def test_get_named_step_duplicate(workflow_start_regular):
    @workflow.options(name="f")
    @ray.remote
    def f(n, dep):
        return n

    inner = f.bind(10, None)
    outer = f.bind(20, inner)
    assert 20 == workflow.run(outer, workflow_id="duplicate")
    # The inner step is checkpointed first, so its name gets no suffix
    assert workflow.get_output("duplicate", name="f") == 10
    # The outer step is checkpointed after the inner; because the name is a
    # duplicate, the suffix _1 is added automatically
    assert workflow.get_output("duplicate", name="f_1") == 20
Example #10
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed, we
    properly re-poll for the event."""

    from ray._private import storage

    storage_uri = storage._storage_uri

    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash."""

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
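For reference, the listener contract this crash test exercises can be sketched with a minimal implementation (SimpleListener is illustrative; only the two overridden methods come from the workflow.EventListener interface shown above):

import asyncio

class SimpleListener(workflow.EventListener):
    async def poll_for_event(self):
        # Must be safe to re-invoke: if the cluster dies before the event
        # is checkpointed, the workflow re-polls after resuming.
        await asyncio.sleep(1)
        return "event-payload"

    async def event_checkpointed(self, event):
        # Called only after the event has been durably checkpointed.
        pass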
Example #11
def test_get_output_1(workflow_start_regular, tmp_path):
    @workflow.step
    def simple(v):
        return v

    assert 0 == simple.step(0).run("simple")
    assert 0 == ray.get(workflow.get_output("simple"))
Example #12
def test_workflow_lifetime_1(workflow_start_cluster):
    # Case 1: driver exits normally
    address, storage_uri = workflow_start_cluster
    with patch.dict(os.environ, {"RAY_ADDRESS": address}):
        ray.init()
        run_string_as_driver(driver_script.format(5))
        assert workflow.get_output("driver_terminated") == 20
Example #13
def test_workflow_lifetime_1(call_ray_start, reset_workflow):
    # Case 1: driver exits normally
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        run_string_as_driver(driver_script.format(5))
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20
Example #14
def test_get_output_4(workflow_start_regular, tmp_path):
    """Test getting output of a workflow tasks that are dynamically generated."""
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def recursive(n):
        if n <= 0:
            with FileLock(lock_path):
                return 42
        return workflow.continuation(
            recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1)
        )

    workflow_id = "test_get_output_4"
    lock.acquire()
    obj = workflow.create(
        recursive.options(**workflow.options(name="10")).bind(10)
    ).run_async(workflow_id)

    outputs = [workflow.get_output(workflow_id, name=str(i)) for i in range(11)]
    outputs.append(obj)

    import time

    # Wait so that 'get_output' is scheduled before the workflow executes
    time.sleep(3)
    lock.release()
    assert ray.get(outputs) == [42] * len(outputs)
Example #15
def test_crash_after_commit(workflow_start_regular_shared):
    _storage = storage.get_global_storage()
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash. Here we must call `event_checkpointed`
       twice, because there's no way to know if we called it after
       checkpointing.

    """
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
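Because a crash can replay event_checkpointed (the situation this test forces), its side effects should be idempotent. A minimal sketch reusing the test's utils marker helpers:

class IdempotentListener(workflow.EventListener):
    async def poll_for_event(self):
        pass

    async def event_checkpointed(self, event):
        # Guard the side effect with a marker so a post-crash replay of
        # this callback is a no-op.
        if not utils.check_global_mark("handled"):
            utils.set_global_mark("handled")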
Example #16
def test_get_output_1(workflow_start_regular, tmp_path):
    @ray.remote
    def simple(v):
        return v

    assert 0 == workflow.create(simple.bind(0)).run("simple")
    assert 0 == ray.get(workflow.get_output("simple"))
Example #17
def test_task_id_generation(workflow_start_regular_shared, request):
    @ray.remote
    def simple(x):
        return x + 1

    x = simple.options(**workflow.options(name="simple")).bind(-1)
    n = 20
    for i in range(1, n):
        x = simple.options(**workflow.options(name="simple")).bind(x)

    workflow_id = "test_task_id_generation"
    ret = workflow.create(x).run_async(workflow_id=workflow_id)
    outputs = [workflow.get_output(workflow_id, name="simple")]
    for i in range(1, n):
        outputs.append(workflow.get_output(workflow_id, name=f"simple_{i}"))
    assert ray.get(ret) == n - 1
    assert ray.get(outputs) == list(range(n))
Example #18
def test_get_named_step_output_error(workflow_start_regular, tmp_path):
    @workflow.step
    def double(v, error):
        if error:
            raise Exception()
        return v + v

    # Force it to fail for the outer step
    with pytest.raises(Exception):
        double.options(name="outer").step(
            double.options(name="inner").step(1, False), True).run("double")

    # The inner step should already have been executed.
    assert 2 == ray.get(workflow.get_output("double", name="inner"))
    outer = workflow.get_output("double", name="outer")
    with pytest.raises(Exception):
        ray.get(outer)
Example #19
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    proc = run_string_as_driver_nonblocking(driver_script.format(100))
    time.sleep(10)
    proc.kill()
    time.sleep(1)
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20
Example #20
def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20
Example #21
def test_workflow_lifetime_2(workflow_start_cluster):
    # Case 2: driver terminated
    address, storage_uri = workflow_start_cluster
    with patch.dict(os.environ, {"RAY_ADDRESS": address}):
        ray.init()
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        assert workflow.get_output("driver_terminated") == 20
Example #22
def test_get_named_step_output_error(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, error):
        if error:
            raise Exception()
        return v + v

    # Force it to fail for the outer step
    with pytest.raises(Exception):
        workflow.run(
            double.options(**workflow.options(name="outer")).bind(
                double.options(**workflow.options(name="inner")).bind(
                    1, False), True),
            workflow_id="double",
        )

    # The inner step should already have been executed.
    assert 2 == workflow.get_output("double", name="inner")
    with pytest.raises(Exception):
        workflow.get_output("double", name="outer")
Example #23
def test_get_non_exist_output(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")

    @ray.remote
    def simple():
        with FileLock(lock_path):
            return "hello"

    workflow_id = "test_get_non_exist_output"

    with FileLock(lock_path):
        dag = simple.options(**workflow.options(name="simple")).bind()
        ret = workflow.create(dag).run_async(workflow_id=workflow_id)
        exist = workflow.get_output(workflow_id, name="simple")
        non_exist = workflow.get_output(workflow_id, name="non_exist")

    assert ray.get(ret) == "hello"
    assert ray.get(exist) == "hello"
    with pytest.raises(ValueError, match="non_exist"):
        ray.get(non_exist)
Example #24
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            with FileLock(lock_path):
                return 2 * v
        else:
            return 2 * v

    # Get the result from the named step while the workflow is still running
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.create(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(1, lock_path),
            lock_path,
        )
    ).run_async("double-2")

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing is finished.
    ready, waiting = ray.wait(
        [wait.remote([output]), wait.remote([inner]), wait.remote([outer])], timeout=1
    )
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the job finishes, we'll be able to get the result.
    lock.release()
    assert [4, 2, 4] == ray.get([output, inner, outer])

    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")
    assert [2, 4] == ray.get([inner, outer])
Example #25
def test_get_output_2(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @workflow.step
    def simple(v):
        with FileLock(lock_path):
            return v

    lock.acquire()
    obj = simple.step(0).run_async("simple")
    obj2 = workflow.get_output("simple")
    lock.release()
    assert ray.get([obj, obj2]) == [0, 0]
Example #26
def test_get_output_5(workflow_start_regular, tmp_path):
    """Test getting output of a workflow task immediately after executing it
    asynchronously."""

    @ray.remote
    def simple():
        return 314

    workflow_id = "test_get_output_5_{}"

    outputs = []
    for i in range(20):
        workflow.create(simple.bind()).run_async(workflow_id.format(i))
        outputs.append(workflow.get_output(workflow_id.format(i)))

    assert ray.get(outputs) == [314] * len(outputs)
Example #27
def test_get_named_step_default(workflow_start_regular, tmp_path):
    @workflow.step
    def factorial(n, r=1):
        if n == 1:
            return r
        return factorial.step(n - 1, r * n)

    import math
    assert math.factorial(5) == factorial.step(5).run("factorial")
    for i in range(5):
        step_name = ("test_basic_workflows_2."
                     "test_get_named_step_default.locals.factorial")
        if i != 0:
            step_name += "_" + str(i)
        # All outputs will be 120
        assert math.factorial(5) == ray.get(
            workflow.get_output("factorial", name=step_name))
Example #28
def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]
    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i
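The queue.Full branch above suggests a simple backpressure pattern. A hedged sketch (submit_with_backpressure is illustrative, and it assumes run_async raises queue.Full at capacity, as workflow.run does in this test):

import queue
import time

def submit_with_backpressure(dag, workflow_id, retry_delay=1.0):
    # Illustrative: instead of failing the submission, wait for the
    # scheduler queue to drain and retry.
    while True:
        try:
            return workflow.run_async(dag, workflow_id=workflow_id)
        except queue.Full:
            time.sleep(retry_delay)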
Example #29
def test_get_named_step_default(workflow_start_regular, tmp_path):
    @ray.remote
    def factorial(n, r=1):
        if n == 1:
            return r
        return workflow.continuation(factorial.bind(n - 1, r * n))

    import math

    assert math.factorial(5) == workflow.run(factorial.bind(5),
                                             workflow_id="factorial")
    for i in range(5):
        step_name = ("python.ray.workflow.tests.test_basic_workflows_2."
                     "test_get_named_step_default.locals.factorial")
        if i != 0:
            step_name += "_" + str(i)
        # All outputs will be 120
        assert math.factorial(5) == workflow.get_output("factorial",
                                                        name=step_name)
Example #30
def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [
        workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
        for i in range(4)
    ]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]