Beispiel #1
0
def test_crash_after_commit(workflow_start_regular_shared):
    _storage = storage.get_global_storage()
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash. Here we must call `event_checkpointed`
       twice, because there's no way to know if we called it after
       checkpointing.

    """
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
Beispiel #2
0
def test_wait_for_multiple_events(workflow_start_regular_shared):
    """If a workflow has multiple event arguments, it should wait for them at the
    same time.
    """
    class EventListener1(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener1")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event1"

    class EventListener2(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener2")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event2"

    @ray.remote
    def trivial_step(arg1, arg2):
        return f"{arg1} {arg2}"

    event1_promise = workflow.wait_for_event(EventListener1)
    event2_promise = workflow.wait_for_event(EventListener2)

    promise = workflow.create(trivial_step.bind(event1_promise,
                                                event2_promise)).run_async()

    while not (utils.check_global_mark("listener1")
               and utils.check_global_mark("listener2")):
        time.sleep(0.1)

    utils.set_global_mark("trigger_event")
    assert ray.get(promise) == "event1 event2"
Beispiel #3
0
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)
Beispiel #4
0
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed, we
    properly re-poll for the event."""

    from ray._private import storage

    storage_uri = storage._storage_uri

    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
       returns, even after a crash."""

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
Beispiel #5
0
 async def event_checkpointed(self, event):
     utils.set_global_mark("committed")
     if utils.check_global_mark("first"):
         utils.set_global_mark("second")
     else:
         utils.set_global_mark("first")
         await asyncio.sleep(1000000)
Beispiel #6
0
def identity2(x):
    if not utils.check_global_mark():
        import os

        os.kill(os.getpid(), 9)
    return x
Beispiel #7
0
def test_delete(workflow_start_regular):
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.create(never_ends.bind("hello world")).run_async("never_finishes")

    # Make sure the step is actualy executing before killing the cluster
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)
    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        result = workflow.get_output("never_finishes")
        ray.get(result)

    workflow.delete("never_finishes")

    with pytest.raises(ValueError):
        ouput = workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.create(
        basic_step.bind("hello world")).run(workflow_id="finishes")
    assert result == "hello world"
    ouput = workflow.get_output("finishes")
    assert ray.get(ouput) == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ValueError):
        ouput = workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.create(
        basic_step.bind("123")).run(workflow_id="finishes") == "123"
Beispiel #8
0
 async def poll_for_event(self):
     while not utils.check_global_mark():
         await asyncio.sleep(0.1)
     # Give the other step time to finish.
     await asyncio.sleep(1)
Beispiel #9
0
 async def poll_for_event(self):
     utils.set_global_mark("listener2")
     while not utils.check_global_mark("trigger_event"):
         await asyncio.sleep(0.1)
     return "event2"
Beispiel #10
0
 def identity(x: int):
     # block the step by a global mark
     assert utils.check_global_mark()
     return x
Beispiel #11
0
 async def poll_for_event(self):
     while not utils.check_global_mark():
         await asyncio.sleep(1)
Beispiel #12
0
 async def poll_for_event(self):
     assert not utils.check_global_mark("committed")
Beispiel #13
0
 def filter_all_2(wait_results):
     assert wait_results[0] == [1, 3, 2]
     # failure point
     assert utils.check_global_mark()
     ready, unready = wait_results
     return get_all.step(ready, unready)
Beispiel #14
0
 def triggers_event():
     utils.set_global_mark()
     while not utils.check_global_mark("event_returning"):
         time.sleep(0.1)
Beispiel #15
0
 async def poll_for_event(self):
     while not utils.check_global_mark():
         await asyncio.sleep(0.1)
     utils.set_global_mark("event_returning")
Beispiel #16
0
def the_failed_step(x):
    if not utils.check_global_mark():
        import os

        os.kill(os.getpid(), 9)
    return "foo(" + x + ")"
Beispiel #17
0
 def sleep_identity(x: int):
     # block the step by a global mark
     while not utils.check_global_mark():
         time.sleep(0.1)
     time.sleep(x)
     return x