Ejemplo n.º 1
0
def test_wait_failure_recovery_2(workflow_start_regular_shared):
    """Cancel a workflow that uses "workflow.wait", then resume it.

    The input steps spin until the global mark is set, so they cannot finish
    before the workflow is cancelled. After the mark is set and the workflow
    is resumed, the ready list must be ``[2, 1]``.
    """
    workflow_id = "wait_failure_recovery_2"

    @workflow.step
    def sleep_identity(x: int):
        # Hold the step until the test sets the global mark.
        while not utils.check_global_mark():
            time.sleep(0.1)
        time.sleep(x)
        return x

    @workflow.step
    def identity(x):
        return x

    pending = [sleep_identity.step(delay) for delay in (2, 5, 1)]
    waited = workflow.wait(pending, num_returns=2, timeout=None)
    utils.unset_global_mark()
    _ = identity.step(waited).run_async(workflow_id=workflow_id)
    # Wait until "workflow.wait" has been running before cancelling.
    time.sleep(10)
    workflow.cancel(workflow_id)
    time.sleep(2)

    utils.set_global_mark()
    ready, _unready = ray.get(workflow.resume(workflow_id))
    assert ready == [2, 1]
Ejemplo n.º 2
0
def test_wait_recovery_step_id(workflow_start_regular_shared):
    """Check that recovery reuses the step id of "workflow.wait".

    The first run fails because the global mark is unset. After resuming,
    asking the workflow storage for a fresh "workflow.wait" step id must
    still yield an index of at most 1.
    """
    workflow_id = "test_wait_recovery_step_id"

    @workflow.step
    def identity(x: int):
        # Failure point: the step raises while the global mark is unset.
        assert utils.check_global_mark()
        return x

    waited = workflow.wait([identity.step(42)], num_returns=1, timeout=None)
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = waited.run(workflow_id=workflow_id)
    utils.set_global_mark()
    ready, _unready = ray.get(workflow.resume(workflow_id))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage

    wf_storage = workflow_storage.WorkflowStorage(
        workflow_id, storage.get_global_storage())
    next_index = wf_storage.gen_step_id("workflow.wait")
    # Recovery must not have consumed a new step id.
    assert next_index <= 1
Ejemplo n.º 3
0
def test_wait_for_multiple_events(workflow_start_regular_shared):
    """A workflow with several event arguments must wait for all of them
    concurrently.

    Both listeners announce themselves through a global mark and then block
    on the shared "trigger_event" mark; the test only fires that mark once
    both listeners have been observed running.
    """
    class EventListener1(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener1")
            while True:
                if utils.check_global_mark("trigger_event"):
                    return "event1"
                await asyncio.sleep(0.1)

    class EventListener2(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener2")
            while True:
                if utils.check_global_mark("trigger_event"):
                    return "event2"
                await asyncio.sleep(0.1)

    @ray.remote
    def trivial_step(arg1, arg2):
        return f"{arg1} {arg2}"

    first_event = workflow.wait_for_event(EventListener1)
    second_event = workflow.wait_for_event(EventListener2)

    dag = trivial_step.bind(first_event, second_event)
    promise = workflow.create(dag).run_async()

    # Spin until both listeners have started polling.
    while (not utils.check_global_mark("listener1")
           or not utils.check_global_mark("listener2")):
        time.sleep(0.1)

    utils.set_global_mark("trigger_event")
    assert ray.get(promise) == "event1 event2"
Ejemplo n.º 4
0
def test_wait_failure_recovery_1(workflow_start_regular_shared):
    """Recover a step that consumes the output of "workflow.wait".

    ``filter_all_2`` fails while the global mark is unset; once the mark is
    set, resuming the workflow must return the expected ready/unready lists.
    """
    workflow_id = "wait_failure_recovery"

    @workflow.step
    def get_all(ready, unready):
        return ready, unready

    @workflow.step
    def filter_all_2(wait_results):
        assert wait_results[0] == [1, 3, 2]
        # Failure point: raises until the global mark is set.
        assert utils.check_global_mark()
        ready, unready = wait_results
        return get_all.step(ready, unready)

    @workflow.step
    def composite_2():
        waited = wait_multiple_steps.step()
        return filter_all_2.step(waited)

    utils.unset_global_mark()

    with pytest.raises(RaySystemError):
        composite_2.step().run(workflow_id=workflow_id)

    utils.set_global_mark()

    resumed = workflow.resume(workflow_id)
    ready, unready = ray.get(resumed)
    assert ready == [1, 3, 2]
    assert unready == [10, 12]
Ejemplo n.º 5
0
def test_recovery_simple(workflow_start_regular):
    """Fail a three-stage workflow, then resume it twice.

    The first run is expected to raise and leave the workflow RESUMABLE.
    Resuming after the global mark is set must produce the full output, and
    a second resume with the mark unset again must return the same output.
    """
    workflow_id = "test_recovery_simple"
    expected = "foo(x[append1])[append2]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple(x):
        appended = append1.bind(x)
        maybe_failed = the_failed_step.bind(appended)
        finished = append2.bind(maybe_failed)
        return workflow.continuation(finished)

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
Ejemplo n.º 6
0
def test_event_as_workflow(workflow_start_regular_shared):
    """Run an event listener as a workflow of its own.

    The workflow must report RUNNING while the global mark is unset and must
    finish with ``None`` once the mark is set.
    """
    workflow_id = "wf"

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            # Poll the global mark once per second until it is set.
            while True:
                if utils.check_global_mark():
                    break
                await asyncio.sleep(1)

    utils.unset_global_mark()
    promise = workflow.wait_for_event(MyEventListener).run_async(workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.RUNNING

    utils.set_global_mark()
    assert ray.get(promise) is None
Ejemplo n.º 7
0
def test_recovery_simple_1(workflow_start_regular):
    """Fail a single-step workflow, then resume it twice.

    The first run is expected to raise ``WorkflowExecutionError`` and leave
    the workflow FAILED. Both resumes must return ``"foo(x)"``.
    """
    workflow_id = "test_recovery_simple_1"
    expected = "foo(x)"

    utils.unset_global_mark()
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.run(the_failed_step.bind("x"), workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    assert workflow.resume(workflow_id) == expected
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert workflow.resume(workflow_id) == expected
Ejemplo n.º 8
0
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure the event is polled again after the cluster dies mid-wait.

    The listener records every ``poll_for_event`` call through global marks.
    The cluster is force-stopped after the first poll has started but before
    it can return. Once Ray is restarted and the workflow resumed, a second
    poll must have been recorded (the "second" mark).
    """
    from ray._private import storage

    # Remember the storage location so the restarted cluster reuses it.
    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            # Imported here so the coroutine carries its own dependency.
            import asyncio

            # poll_for_event must not be called again once
            # `event_checkpointed` has run, even after a crash.
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            # Tell the test that the cluster may now be killed.
            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                # Yield to the event loop instead of blocking it.
                await asyncio.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
Ejemplo n.º 9
0
def test_recovery_complex(workflow_start_regular):
    """Fail the ``complex`` workflow, then resume it twice.

    The first run is expected to raise ``RaySystemError``. Both resumes must
    return the same nested ``join(...)`` string.
    """
    workflow_id = "test_recovery_complex"
    expected = (
        "join(join(foo(x[append1]), [source1][append2]), "
        "join(x, [source1]))"
    )

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        complex.step("x").run(workflow_id=workflow_id)
    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
Ejemplo n.º 10
0
def test_recovery_simple(workflow_start_regular):
    """Fail the ``simple`` workflow, then resume it twice.

    The first run is expected to raise ``RaySystemError`` and leave the
    workflow RESUMABLE. Both resumes must return the same output string.
    """
    workflow_id = "test_recovery_simple"
    expected = "foo(x[append1])[append2]"

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        simple.step("x").run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
Ejemplo n.º 11
0
def test_checkpoint_dag_recovery_partial(workflow_start_regular_shared):
    """Time a failing run of ``checkpoint_dag.bind(False)`` and its recovery.

    The first run is expected to raise ``RaySystemError``. After the global
    mark is set, the resumed result must be close to the mean of
    ``np.arange(SIZE)``. Both durations are printed.
    """
    workflow_id = "checkpoint_partial_recovery"

    utils.unset_global_mark()

    began = time.time()
    with pytest.raises(RaySystemError):
        workflow.create(checkpoint_dag.bind(False)).run(workflow_id=workflow_id)
    run_duration_partial = time.time() - began

    utils.set_global_mark()

    began = time.time()
    result = ray.get(workflow.resume(workflow_id))
    recover_duration_partial = time.time() - began
    assert np.isclose(result, np.arange(SIZE).mean())
    print(f"[partial] run_duration = {run_duration_partial}, "
          f"recover_duration = {recover_duration_partial}")
Ejemplo n.º 12
0
def test_checkpoint_dag_recovery_whole(workflow_start_regular_shared):
    """Time a failing run of ``checkpoint_dag.bind(True)`` and its recovery.

    The first run is expected to raise ``WorkflowExecutionError``. After the
    global mark is set, the resumed result must be close to the mean of
    ``np.arange(SIZE)``. Both durations are printed.
    """
    workflow_id = "checkpoint_whole_recovery"

    utils.unset_global_mark()

    began = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(checkpoint_dag.bind(True), workflow_id=workflow_id)
    run_duration_whole = time.time() - began

    utils.set_global_mark()

    began = time.time()
    result = workflow.resume(workflow_id)
    recover_duration_whole = time.time() - began
    assert np.isclose(result, np.arange(SIZE).mean())

    print(f"[whole] run_duration = {run_duration_whole}, "
          f"recover_duration = {recover_duration_whole}")
Ejemplo n.º 13
0
def test_recovery_complex(workflow_start_regular):
    """Fail a multi-branch DAG workflow, then resume it twice.

    The first run is expected to raise ``WorkflowExecutionError`` and leave
    the workflow FAILED. Both resumes must return the same nested
    ``join(...)`` string.
    """
    workflow_id = "test_recovery_complex"
    expected = (
        "join(join(foo(x[append1]), [source1][append2]), "
        "join(x, [source1]))"
    )

    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def join(x, y):
        return f"join({x}, {y})"

    @ray.remote
    def complex(x1):
        src = source1.bind()
        right = join.bind(x1, src)
        appended = append1.bind(x1)
        maybe_failed = the_failed_step.bind(appended)
        src_appended = append2.bind(src)
        left = join.bind(maybe_failed, src_appended)
        return workflow.continuation(join.bind(left, right))

    utils.unset_global_mark()
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(complex.bind("x")).run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
Ejemplo n.º 14
0
def test_checkpoint_dag_recovery_skip(workflow_start_regular_shared):
    """Time a failing run of ``checkpoint_dag`` with checkpointing disabled.

    The DAG is built with ``checkpoint=False`` workflow options. The first
    run is expected to raise ``WorkflowExecutionError``. After the global
    mark is set, the resumed result must be close to the mean of
    ``np.arange(SIZE)``. Both durations are printed.
    """
    workflow_id = "checkpoint_skip_recovery"

    utils.unset_global_mark()

    began = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        # Build the DAG inside the context so any failure is still captured.
        no_checkpoint = workflow.options(checkpoint=False)
        dag = checkpoint_dag.options(**no_checkpoint).bind(False)
        workflow.create(dag).run(workflow_id=workflow_id)
    run_duration_skipped = time.time() - began

    utils.set_global_mark()

    began = time.time()
    result = ray.get(workflow.resume(workflow_id))
    recover_duration_skipped = time.time() - began
    assert np.isclose(result, np.arange(SIZE).mean())

    print(f"[skipped] run_duration = {run_duration_skipped}, "
          f"recover_duration = {recover_duration_skipped}")
Ejemplo n.º 15
0
def test_recovery_simple_2(workflow_start_regular):
    """Fail a workflow whose only step is a continuation, then resume twice.

    The first run is expected to raise ``WorkflowExecutionError`` and leave
    the workflow FAILED. Both resumes must return ``"foo(x)"``.
    """
    workflow_id = "test_recovery_simple_2"
    expected = "foo(x)"

    @ray.remote
    def simple(x):
        failing = the_failed_step.bind(x)
        return workflow.continuation(failing)

    utils.unset_global_mark()
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    status = workflow.get_status(workflow_id)
    assert status == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
    utils.unset_global_mark()

    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == expected
Ejemplo n.º 16
0
 async def event_checkpointed(self, event):
     """Set "committed", then stall on the first call only.

     The first call sets the "first" mark and sleeps for a very long time.
     A call made after "first" is already set sets "second" and returns.
     """
     utils.set_global_mark("committed")
     if not utils.check_global_mark("first"):
         utils.set_global_mark("first")
         await asyncio.sleep(1000000)
         return
     utils.set_global_mark("second")
Ejemplo n.º 17
0
        async def poll_for_event(self):
            """Record this poll via global marks and wait for "resume".

            Sets "second" when "first" is already set (a repeated poll),
            then sets "first" and "time_to_die" and waits for "resume".

            Raises:
                AssertionError: if the "committed" mark is already set.
            """
            # Imported here so the coroutine carries its own dependency.
            import asyncio

            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                # Yield to the event loop instead of blocking it.
                await asyncio.sleep(0.1)
Ejemplo n.º 18
0
 async def poll_for_event(self):
     """Set "listener2", wait for "trigger_event", then return "event2"."""
     utils.set_global_mark("listener2")
     while True:
         if utils.check_global_mark("trigger_event"):
             return "event2"
         await asyncio.sleep(0.1)
Ejemplo n.º 19
0
 async def event_checkpointed(self, event):
     """Set the "committed" global mark; ``event`` itself is not used."""
     utils.set_global_mark("committed")
Ejemplo n.º 20
0
 def triggers_event():
     """Set the global mark, then block until "event_returning" is set."""
     utils.set_global_mark()
     while True:
         if utils.check_global_mark("event_returning"):
             break
         time.sleep(0.1)
Ejemplo n.º 21
0
 def never_ends(x):
     """Set the global mark, sleep for a very long time, then return ``x``."""
     utils.set_global_mark()
     # Long enough that the call does not return during a test.
     effectively_forever = 1000000
     time.sleep(effectively_forever)
     return x
Ejemplo n.º 22
0
 def triggers_event():
     """Set the global mark (called with no mark name) and return."""
     utils.set_global_mark()
Ejemplo n.º 23
0
def test_checkpoint_dag_recovery(workflow_start_regular):
    """Time failure and recovery of ``checkpoint_dag2`` in three set-ups.

    After three warm-up runs, the test runs the DAG with checkpointing
    skipped, then as ``step(False)``, then as ``step(True)``. Each run is
    expected to raise ``RaySystemError`` while the global mark is unset and
    to recover to the expected value once it is set. All timings are printed.
    """
    expected_mean = 8388607.5

    utils.set_global_mark()
    # warm up to ensure precise timing
    for _ in range(3):
        warmup = checkpoint_dag2.step(True).run()
        assert np.isclose(warmup, expected_mean)

    # Phase 1: checkpointing disabled.
    utils.unset_global_mark()

    began = time.time()
    with pytest.raises(RaySystemError):
        checkpoint_dag2.options(checkpoint=False).step(False).run(
            workflow_id="checkpoint_skip2"
        )
    run_duration_skipped = time.time() - began

    utils.set_global_mark()

    began = time.time()
    result = ray.get(workflow.resume("checkpoint_skip2"))
    recover_duration_skipped = time.time() - began
    assert np.isclose(result, expected_mean)

    # Phase 2: step(False).
    utils.unset_global_mark()

    began = time.time()
    with pytest.raises(RaySystemError):
        checkpoint_dag2.step(False).run(workflow_id="checkpoint_partial2")
    run_duration_partial = time.time() - began

    utils.set_global_mark()

    began = time.time()
    result = ray.get(workflow.resume("checkpoint_partial2"))
    recover_duration_partial = time.time() - began
    assert np.isclose(result, expected_mean)

    # Phase 3: step(True).
    utils.unset_global_mark()

    began = time.time()
    with pytest.raises(RaySystemError):
        checkpoint_dag2.step(True).run(workflow_id="checkpoint_whole2")
    run_duration_whole = time.time() - began

    utils.set_global_mark()

    began = time.time()
    result = ray.get(workflow.resume("checkpoint_whole2"))
    recover_duration_whole = time.time() - began
    assert np.isclose(result, expected_mean)

    print(
        f"[skipped] run_duration = {run_duration_skipped}, "
        f"recover_duration = {recover_duration_skipped}"
    )
    print(
        f"[partial] run_duration = {run_duration_partial}, "
        f"recover_duration = {recover_duration_partial}"
    )
    print(
        f"[whole] run_duration = {run_duration_whole}, "
        f"recover_duration = {recover_duration_whole}"
    )
Ejemplo n.º 24
0
 async def poll_for_event(self):
     """Wait for the global mark, then set "event_returning"."""
     while True:
         if utils.check_global_mark():
             break
         await asyncio.sleep(0.1)
     utils.set_global_mark("event_returning")