Example #1
def test_cancellation(tmp_path, workflow_start_regular):
    lock_a = tmp_path / "lock_a"
    lock_b = tmp_path / "lock_b"

    @ray.remote
    def simple():
        with filelock.FileLock(lock_a):
            with filelock.FileLock(lock_b):
                pass

    workflow_id = "test_cancellation"

    with filelock.FileLock(lock_b):
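        # While this outer lock on lock_b is held, simple() blocks inside the task, so the workflow stays RUNNING.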
        r = workflow.run_async(simple.bind(), workflow_id=workflow_id)
        try:
            ray.get(r, timeout=5)
        except GetTimeoutError:
            pass
        else:
            assert False

        assert workflow.get_status(workflow_id) == WorkflowStatus.RUNNING

        workflow.cancel(workflow_id)
        with pytest.raises(workflow.WorkflowCancellationError):
            ray.get(r)
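        # Cancellation should have released lock_a held by the task, so acquiring it within the timeout should succeed.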
        lock = filelock.FileLock(lock_a)
        lock.acquire(timeout=5)

        assert workflow.get_status(workflow_id) == WorkflowStatus.CANCELED
Example #2
def test_workflow_queuing_3(shutdown_only, tmp_path):
    """This test ensures the queuing workflow is indeed pending."""
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=1, max_pending_workflows=1)

    import time
    import filelock
    from ray.exceptions import GetTimeoutError

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        (tmp_path / str(x)).write_text(str(x))
        with filelock.FileLock(lock_path):
            return x

    workflow_id = "test_workflow_queuing_3"

    with filelock.FileLock(lock_path):
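        # While the lock is held, the first workflow runs and blocks; the second exceeds max_running_workflows=1 and stays PENDING.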
        wf_1 = workflow.run_async(long_running.bind(1),
                                  workflow_id=f"{workflow_id}_1")
        wf_2 = workflow.run_async(long_running.bind(2),
                                  workflow_id=f"{workflow_id}_2")
        time.sleep(5)
        assert (tmp_path / str(1)).exists()
        assert not (tmp_path / str(2)).exists()
        assert workflow.get_status(
            workflow_id=f"{workflow_id}_1") == workflow.RUNNING
        assert workflow.get_status(
            workflow_id=f"{workflow_id}_2") == workflow.PENDING
        with pytest.raises(GetTimeoutError):
            ray.get(wf_2, timeout=5)

    assert ray.get([wf_1, wf_2]) == [1, 2]
Example #3
def test_recovery_simple(workflow_start_regular):
    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple(x):
        x = append1.bind(x)
        y = the_failed_step.bind(x)
        z = append2.bind(y)
        return workflow.continuation(z)

    utils.unset_global_mark()
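    # the_failed_step (a module-level task in the test file) crashes its worker while the global mark is unset.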
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    assert workflow.get_status(
        workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
Example #4
def test_event_as_workflow(workflow_start_regular_shared):
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            while not utils.check_global_mark():
                await asyncio.sleep(1)

    utils.unset_global_mark()
    promise = workflow.wait_for_event(MyEventListener).run_async("wf")
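    # run_async returns immediately; the listener keeps polling until the global mark is set.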

    assert workflow.get_status("wf") == workflow.WorkflowStatus.RUNNING

    utils.set_global_mark()
    assert ray.get(promise) is None
Example #5
def test_recovery_simple_1(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_1"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.run(the_failed_step.bind("x"), workflow_id=workflow_id)

    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
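    # With the mark set, the_failed_step succeeds on resume, so the workflow completes.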
    assert workflow.resume(workflow_id) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert workflow.resume(workflow_id) == "foo(x)"
Example #6
def test_recovery_simple(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        simple.step("x").run(workflow_id=workflow_id)

    assert workflow.get_status(
        workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
Example #7
def test_recovery_complex(workflow_start_regular):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def join(x, y):
        return f"join({x}, {y})"

    @ray.remote
    def complex(x1):
        x2 = source1.bind()
        v = join.bind(x1, x2)
        y = append1.bind(x1)
        y = the_failed_step.bind(y)
        z = append2.bind(x2)
        u = join.bind(y, z)
        return workflow.continuation(join.bind(u, v))

    utils.unset_global_mark()
    workflow_id = "test_recovery_complex"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(complex.bind("x")).run(workflow_id=workflow_id)

    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
Example #8
def test_run_off_main_thread(workflow_start_regular):
    @workflow.step
    def fake_data(num: int):
        return list(range(num))

    succ = False

    # Start new thread here ⚠️
    def run():
        global succ
        # Setup the workflow.
        data = fake_data.step(10)
        assert data.run(workflow_id="run") == list(range(10))

    import threading

    t = threading.Thread(target=run)
    t.start()
    t.join()
    assert workflow.get_status("run") == workflow.SUCCESSFUL
Example #9
def test_run_off_main_thread(workflow_start_regular_shared):
    @ray.remote
    def fake_data(num: int):
        return list(range(num))

    succ = False

    # Start new thread here ⚠️
    def run():
        global succ
        # Setup the workflow.
        assert workflow.run(fake_data.bind(10),
                            workflow_id="run") == list(range(10))

    import threading

    t = threading.Thread(target=run)
    t.start()
    t.join()
    assert workflow.get_status("run") == workflow.SUCCESSFUL
Example #10
def test_recovery_simple_2(workflow_start_regular):
    @ray.remote
    def simple(x):
        return workflow.continuation(the_failed_step.bind(x))

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_2"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)

    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint

    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
Example #11
def test_workflow_manager_simple(workflow_start_regular):
    assert [] == workflow.list_all()
    with pytest.raises(workflow.common.WorkflowNotFoundError):
        workflow.get_status("X")
Example #12
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For sync between jobs
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]
    # Test list all, it should list all jobs running
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release lock and make sure all tasks finished
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r
    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0
    # Half of them failed and half succeed
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all
    resumed = workflow.resume_all(include_failed=True)
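    # Of the 50 workflows that failed, "0" has since succeeded and "2" was canceled, leaving 48 to resume.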
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48
Example #13
def test_workflow_manager_simple(workflow_start_regular):
    assert [] == workflow.list_all()
    with pytest.raises(ValueError):
        workflow.get_status("X")
Example #14
def test_workflow_manager_simple(workflow_start_regular):
    from ray.workflow.exceptions import WorkflowNotFoundError

    assert [] == workflow.list_all()
    with pytest.raises(WorkflowNotFoundError):
        workflow.get_status("X")