Example #1
0
def test_run_or_resume_during_running(workflow_start_regular_shared):
    """Check that a workflow id already in use cannot be run or resumed.

    An asynchronous run is started under the id ``running_workflow``. A
    second ``run_async`` with the same id, and a ``workflow.resume`` of that
    id, are each expected to raise ``RuntimeError``. The first run must still
    deliver its result afterwards.
    """
    workflow_id = "running_workflow"
    first_run = simple_sequential.step().run_async(workflow_id=workflow_id)

    # A duplicate launch under the same id is expected to be rejected ...
    with pytest.raises(RuntimeError):
        simple_sequential.step().run_async(workflow_id=workflow_id)
    # ... and so is an attempt to resume it.
    with pytest.raises(RuntimeError):
        workflow.resume(workflow_id=workflow_id)

    expected = "[source1][append1][append2]"
    assert ray.get(first_run) == expected
Example #2
0
def test_run_or_resume_during_running():
    """Check that a workflow id already in use cannot be run or resumed.

    Ray is initialised in the ``workflow`` namespace, a workflow is started
    under the id ``running_workflow``, and both a second ``workflow.run`` and
    a ``workflow.resume`` with that id are expected to raise ``ValueError``.
    The first run must still produce its result.
    """
    ray.init(namespace="workflow")
    try:
        output = workflow.run(
            simple_sequential.step(), workflow_id="running_workflow")

        with pytest.raises(ValueError):
            workflow.run(
                simple_sequential.step(), workflow_id="running_workflow")
        with pytest.raises(ValueError):
            workflow.resume(workflow_id="running_workflow")

        assert ray.get(output) == "[source1][append1][append2]"
    finally:
        # Shut Ray down even when an assertion fails, so the runtime started
        # above is not left behind for whatever runs next.
        ray.shutdown()
Example #3
0
def test_recovery_simple(workflow_start_regular):
    """Check that a failed ``simple`` workflow can be resumed, twice.

    With the global mark unset the first run is expected to raise
    ``RaySystemError``. After the mark is set, resuming must return the
    final value; resuming again with the mark unset must return the same
    value.
    """
    wf_id = "test_recovery_simple"
    expected = "foo(x[append1])[append2]"

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        simple.step("x").run(workflow_id=wf_id)

    utils.set_global_mark()
    assert ray.get(workflow.resume(wf_id)) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert ray.get(workflow.resume(wf_id)) == expected
Example #4
0
def test_recovery_complex(workflow_start_regular):
    """Check that a failed ``complex`` workflow can be resumed, twice.

    With the global mark unset the first run is expected to raise
    ``RaySystemError``. After the mark is set, resuming must return the
    fully joined result; resuming again with the mark unset must return the
    same value.
    """
    wf_id = "test_recovery_complex"
    expected = (
        "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))")

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        complex.step("x").run(workflow_id=wf_id)

    utils.set_global_mark()
    assert ray.get(workflow.resume(wf_id)) == expected

    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert ray.get(workflow.resume(wf_id)) == expected
Example #5
0
def test_resume_different_storage(ray_start_regular):
    """Check that ``resume`` reads the workflow from the storage it is given.

    The id ``const`` is run twice: once with the default storage
    (``constant_1``) and once with a temporary directory as storage
    (``constant_2``). Resuming ``const`` against the temporary directory is
    expected to yield ``31416``.
    """
    constant_1.step().run(workflow_id="const")
    # The context manager removes the directory even if the run or the
    # assertion below fails; the previous mkdtemp/rmtree pair leaked it then.
    with tempfile.TemporaryDirectory() as tmp_dir:
        constant_2.step().run(workflow_id="const", storage=tmp_dir)
        assert ray.get(workflow.resume(workflow_id="const",
                                       storage=tmp_dir)) == 31416
Example #6
0
def test_get_output_3(workflow_start_regular, tmp_path):
    """Check ``get_output`` and ``resume`` on a workflow whose step failed.

    The step ``incr`` bumps a counter stored in a file on every invocation
    and raises ``ValueError`` while an error-marker file exists. The test
    asserts that:

    * the initial run raises ``RaySystemError`` and the counter reads "1";
    * ``get_output`` raises ``RaySystemError`` and leaves the counter at "1";
    * ``get_output`` still raises after the marker is removed;
    * ``resume`` returns 10.
    """
    counter_path = tmp_path / "counter"
    failure_marker = tmp_path / "error"
    counter_path.write_text("0")
    failure_marker.touch()

    @workflow.step
    def incr():
        # Record the invocation first, then decide whether to fail.
        seen = int(counter_path.read_text())
        counter_path.write_text(str(seen + 1))
        if not failure_marker.exists():
            return 10
        raise ValueError()

    system_error = ray.exceptions.RaySystemError

    with pytest.raises(system_error):
        incr.step().run("incr")
    assert counter_path.read_text() == "1"

    # Asking for the output must raise and must not bump the counter.
    with pytest.raises(system_error):
        ray.get(workflow.get_output("incr"))
    assert counter_path.read_text() == "1"

    failure_marker.unlink()
    # Removing the marker alone does not change what get_output reports.
    with pytest.raises(system_error):
        ray.get(workflow.get_output("incr"))
    assert ray.get(workflow.resume("incr")) == 10
Example #7
0
def test_recovery_simple():
    """Check that a failed ``simple`` workflow can be resumed, twice.

    With the global mark unset, running the workflow and fetching its output
    is expected to raise ``ObjectLostError``. After the mark is set, resuming
    must return the final value; resuming again with the mark unset must
    return the same value.
    """
    ray.init()
    try:
        utils.unset_global_mark()
        workflow_id = "test_recovery_simple"
        with pytest.raises(ObjectLostError):
            # internally we get WorkerCrashedError
            output = workflow.run(simple.step("x"), workflow_id=workflow_id)
            ray.get(output)
        utils.set_global_mark()
        output = workflow.resume(workflow_id)
        assert ray.get(output) == "foo(x[append1])[append2]"
        utils.unset_global_mark()
        # resume from workflow output checkpoint
        output = workflow.resume(workflow_id)
        assert ray.get(output) == "foo(x[append1])[append2]"
    finally:
        # Shut Ray down even when an assertion fails, so the runtime started
        # above is not left behind for whatever runs next.
        ray.shutdown()
Example #8
0
 def resume(num_records_replayed):
     """Rebuild storage from the first N debug records, then resume.

     Deletes everything under the ``complex_workflow`` key prefix in the
     wrapped storage, replays records ``0 .. num_records_replayed - 1`` from
     ``debug_store`` concurrently, and returns the result of resuming the
     workflow ``complex_workflow``.
     """
     prefix = debug_store.wrapped_storage.make_key("complex_workflow")
     wipe = debug_store.wrapped_storage.delete_prefix(prefix)
     asyncio_run(wipe)

     pending = []
     for index in range(num_records_replayed):
         pending.append(debug_store.replay(index))
     asyncio_run(asyncio.gather(*pending))

     resumed = workflow.resume(workflow_id="complex_workflow")
     return ray.get(resumed)
Example #9
0
def test_recovery_complex():
    """Check that a failed ``complex`` workflow can be resumed, twice.

    With the global mark unset, running the workflow and fetching its output
    is expected to raise ``RayTaskError``. After the mark is set, resuming
    must return the fully joined result; resuming again with the mark unset
    must return the same value.
    """
    ray.init()
    try:
        utils.unset_global_mark()
        workflow_id = "test_recovery_complex"
        with pytest.raises(RayTaskError):
            # internally we get WorkerCrashedError
            output = workflow.run(complex.step("x"), workflow_id=workflow_id)
            ray.get(output)
        utils.set_global_mark()
        output = workflow.resume(workflow_id)
        r = ("join(join(foo(x[append1]), [source1][append2]), "
             "join(x, [source1]))")
        assert ray.get(output) == r
        utils.unset_global_mark()
        # resume from workflow output checkpoint
        output = workflow.resume(workflow_id)
        assert ray.get(output) == r
    finally:
        # Shut Ray down even when an assertion fails, so the runtime started
        # above is not left behind for whatever runs next.
        ray.shutdown()
Example #10
0
def test_recovery_cluster_failure():
    """Check that a workflow survives a full cluster restart.

    Starts a head node through the ``ray`` CLI, launches ``driver_script`` in
    a separate driver process, stops the cluster after ten seconds and kills
    the driver. A fresh ``ray.init()`` followed by resuming the workflow
    ``cluster_failure`` is expected to yield 20.
    """
    # Pass an argv list without a shell and fail fast on a non-zero exit:
    # the earlier one-element list with shell=True ignored the exit status.
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    ray.init()
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        # Shut Ray down even when the assertion fails.
        ray.shutdown()
Example #11
0
def test_recovery_cluster_failure():
    """Check that a workflow survives a full cluster restart.

    Starts a head node through the ``ray`` CLI, launches ``driver_script`` in
    a separate driver process, stops the cluster after ten seconds and kills
    the driver. After ``workflow.init()``, resuming the workflow
    ``cluster_failure`` is expected to yield 20.
    """
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init()
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        # Shut Ray down even when the assertion fails.
        ray.shutdown()
Example #12
0
def test_recovery_cluster_failure():
    """Check that a workflow survives a full cluster restart.

    Starts a head node through the ``ray`` CLI, runs ``workflows_to_fail.py``
    (located next to this file) in a child interpreter, stops the cluster
    after ten seconds and kills the child. A fresh ``ray.init()`` followed by
    resuming the workflow ``cluster_failure`` is expected to yield 20.
    """
    # Pass an argv list without a shell and fail fast on a non-zero exit:
    # the earlier one-element list with shell=True ignored the exit status.
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    script = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                          "workflows_to_fail.py")
    proc = subprocess.Popen([sys.executable, script])
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    # Reap the killed child so it does not linger as a zombie process.
    proc.wait()
    time.sleep(1)
    ray.init()
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        # Shut Ray down even when the assertion fails.
        ray.shutdown()
Example #13
0
def test_recovery_cluster_failure(reset_workflow, tmp_path):
    """Check that a workflow survives a full cluster restart.

    Starts a head node through the ``ray`` CLI, launches ``driver_script``
    (formatted with ``tmp_path``) in a separate driver process, stops the
    cluster after ten seconds and kills the driver. After
    ``workflow.init`` is pointed at ``tmp_path``, resuming the workflow
    ``cluster_failure`` is expected to yield 20.
    """
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    try:
        assert ray.get(workflow.resume("cluster_failure")) == 20
    finally:
        # Reset the global storage and shut Ray down even when the assertion
        # fails, so neither setting carries over past this test.
        workflow.storage.set_global_storage(None)
        ray.shutdown()
Example #14
0
def test_recovery_non_exists_workflow():
    """Resuming an unknown workflow id must raise WorkflowNotResumableError."""
    ray.init()
    try:
        with pytest.raises(WorkflowNotResumableError):
            workflow.resume("this_workflow_id_does_not_exist")
    finally:
        # Shut Ray down even when the expected exception is not raised.
        ray.shutdown()
Example #15
0
def test_recovery_non_exists_workflow(workflow_start_regular):
    """Resuming an unknown workflow id and fetching it must raise ValueError."""
    missing_id = "this_workflow_id_does_not_exist"
    with pytest.raises(ValueError):
        pending = workflow.resume(missing_id)
        ray.get(pending)
Example #16
0
def test_recovery_non_exists_workflow():
    """Resuming an unknown workflow id and fetching it must raise RayTaskError.

    Ray is initialised in the ``workflow`` namespace for the duration of the
    test.
    """
    ray.init(namespace="workflow")
    try:
        with pytest.raises(RayTaskError):
            ray.get(workflow.resume("this_workflow_id_does_not_exist"))
    finally:
        # Shut Ray down even when the expected exception is not raised.
        ray.shutdown()
Example #17
0
def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    """Check run and resume against storage configured via ``workflow.init``.

    The workflow storage is pointed at ``tmp_path``, ``constant`` is run
    under the id ``const``, and resuming that id is expected to yield
    ``31416``.
    """
    workflow.init(storage=str(tmp_path))
    try:
        constant.step().run(workflow_id="const")
        assert ray.get(workflow.resume(workflow_id="const")) == 31416
    finally:
        # Reset the global storage even when the run or assertion fails, so
        # the tmp_path setting does not carry over past this test.
        workflow.storage.set_global_storage(None)
Example #18
0
def test_actor_writer_2(workflow_start_regular, tmp_path):
    """Exercise a virtual actor's writer and readonly methods under failures.

    ``SyncCounter`` guards ``incr`` (writer) and ``val`` (readonly) with file
    locks and raises ``ValueError`` while the matching ``*.err`` flag file
    exists. The test injects errors through those flag files, blocks ``incr``
    by holding its lock, and finally resumes the actor's workflow with
    ``workflow.resume("sync_counter")``.
    """
    # Lock files: one per method plus a global one taken by both methods.
    g_lock = str(Path(tmp_path / "g.lock"))
    incr_lock = str(Path(tmp_path / "incr.lock"))
    val_lock = str(Path(tmp_path / "val.lock"))

    # Flag files: while one exists, the matching code path raises ValueError.
    val_err = str(Path(tmp_path / "val.err"))
    incr_err = str(Path(tmp_path / "incr.err"))

    @workflow.virtual_actor
    class SyncCounter:
        def __init__(self, val_lock: str, incr_lock: str, g_lock: str,
                     val_err: str, incr_err: str):
            self.val_lock = val_lock
            self.incr_lock = incr_lock
            self.g_lock = g_lock

            self.val_err = val_err
            self.incr_err = incr_err
            self.v = 0
            # Construction fails while the val error flag is present.
            if Path(self.val_err).exists():
                raise ValueError()

        @workflow.virtual_actor.readonly
        def val(self):
            # Takes the val lock, then the global lock.
            with FileLock(self.val_lock), FileLock(self.g_lock):
                if Path(self.val_err).exists():
                    raise ValueError()
                return self.v

        def incr(self, create_incr_err=False):
            # Takes the incr lock, then the global lock.
            with FileLock(self.incr_lock), FileLock(self.g_lock):
                if Path(self.incr_err).exists():
                    raise ValueError()
                if create_incr_err:
                    # The flag is created after the check above, so this call
                    # still increments; later calls hit the check and raise.
                    # NOTE(review): this uses the enclosing test's `incr_err`
                    # rather than `self.incr_err`; both hold the same path
                    # here - confirm the closure is intentional.
                    Path(incr_err).touch()
                self.v += 1
                return self.v

        def __getstate__(self):
            return (self.v, self.val_lock, self.incr_lock, self.g_lock,
                    self.val_err, self.incr_err)

        def __setstate__(self, state):
            (self.v, self.val_lock, self.incr_lock, self.g_lock, self.val_err,
             self.incr_err) = state

    # trigger error in init
    Path(val_err).touch()
    actor = SyncCounter.get_or_create("sync_counter", val_lock, incr_lock,
                                      g_lock, val_err, incr_err)
    with pytest.raises(Exception):
        actor.incr.run()
    Path(val_err).unlink()

    # With the flag removed, nine async increments must return 2..10.
    assert ray.get([actor.incr.run_async()
                    for _ in range(9)]) == list(range(2, 11))

    # From here on `incr_lock` is a FileLock object, not a path. Holding it
    # keeps every `incr` call from entering its critical section.
    incr_lock = FileLock(incr_lock)
    incr_lock.acquire()

    objs = [actor.incr.run_async() for _ in range(10)]
    # `val` does not take the incr lock, so it is expected to answer (with
    # the value 10) while the increments above are held back.
    assert 10 == actor.val.run()
    Path(val_err).touch()
    with pytest.raises(Exception):
        actor.val.run()
    Path(val_err).unlink()
    incr_lock.release()
    # Once the lock is released the ten queued increments must return 11..20.
    assert ray.get(objs) == list(range(11, 21))

    # test error cases
    actor.incr.run_async()  # 21
    actor.incr.run_async()  # 22
    actor.incr.run_async(create_incr_err=True)  # 23
    actor.incr.run_async()  # 24
    s5 = actor.incr.run_async()  # 25
    with pytest.raises(Exception):
        ray.get(s5)

    # The test expects the counter to have stopped at 23, the call that
    # created the incr error flag.
    assert 23 == actor.val.run()
    Path(incr_err).unlink()
    # After the flag is removed, resuming is expected to bring it to 25.
    obj = workflow.resume("sync_counter")
    assert 25 == ray.get(obj)[0]
    assert 25 == actor.val.run()
Example #19
0
def test_wf_in_actor_chain(workflow_start_regular, tmp_path):
    """Exercise a virtual-actor method that chains itself as workflow steps.

    ``Counter.incr(n)`` increments the counter under file lock ``n`` and, for
    ``n > 0``, returns ``self.incr.step(n - 1)``. The test holds all five
    locks, releases them one at a time while polling ``val``, forces one link
    of the chain to fail through a flag file, then resumes the workflow
    ``counter`` and expects the counter to reach 5.
    """
    # One lock file per chain link (indices 0..4).
    file_lock = [str(tmp_path / str(i)) for i in range(5)]
    # While this file exists, `incr` raises after incrementing.
    fail_flag = tmp_path / "fail"

    @workflow.virtual_actor
    class Counter:
        def __init__(self):
            self._counter = 0

        def incr(self, n):
            # Link `n` cannot proceed until the test releases lock `n`.
            with FileLock(file_lock[n]):
                self._counter += 1
                if fail_flag.exists():
                    raise Exception()

            if n == 0:
                return self._counter
            else:
                # Continue the chain with the next lower index.
                return self.incr.step(n - 1)

        @workflow.virtual_actor.readonly
        def val(self):
            return self._counter

        def __getstate__(self):
            return self._counter

        def __setstate__(self, v):
            self._counter = v

    # Hold every lock up front so the test controls when each link runs.
    locks = [FileLock(f) for f in file_lock]
    for lock in locks:
        lock.acquire()

    c = Counter.get_or_create("counter")
    ray.get(c.ready())
    # Start the chain at the highest index (4).
    final_ret = c.incr.run_async(len(file_lock) - 1)
    # Release locks 4, 3 and 2 in turn; after each release, poll `val` (up to
    # 60 times, sleeping one second between polls) until it equals i + 1.
    for i in range(0, len(file_lock) - 2):
        locks[-i - 1].release()
        val = c.val.run()
        for _ in range(0, 60):
            if val == i + 1:
                break
            val = c.val.run()
            time.sleep(1)
        assert val == i + 1

    fail_flag.touch()
    # locks[1 - len(file_lock)] is locks[1]: release it with the fail flag set.
    locks[1 - len(file_lock)].release()
    # Fail the pipeline
    with pytest.raises(Exception):
        ray.get(final_ret)

    fail_flag.unlink()
    workflow.resume("counter")
    # After resume, it'll start from the place where it failed
    # This loop runs once (i == 4) and releases locks[0], the last held lock.
    for i in range(len(file_lock) - 1, len(file_lock)):
        locks[-i - 1].release()
        val = c.val.run()
        for _ in range(0, 60):
            if val == i + 1:
                break
            val = c.val.run()
            time.sleep(1)
        assert val == i + 1

    assert c.val.run() == 5
Example #20
0
def test_wf_in_actor(workflow_start_regular, tmp_path):
    """Exercise virtual-actor methods that return workflow steps.

    ``Session.session_start`` returns the ``start_session`` step directly.
    ``Session.session_start_with_status`` sets the status to ``"STARTING"``
    and feeds ``start_session`` (run with ``catch_exceptions=True``) into
    ``update_session``. A counter file records how many times
    ``start_session`` got past its failure check; a flag file makes it raise;
    a lock file lets the test hold it just before it returns.
    """
    fail_flag = tmp_path / "fail"
    cnt = tmp_path / "count"
    cnt.write_text(str(0))
    lock_file = tmp_path / "lock"

    @workflow.step
    def start_session():
        # Fail before touching the counter while the flag file exists.
        if fail_flag.exists():
            raise Exception()
        v = int(cnt.read_text()) + 1
        cnt.write_text(str(v))
        # The counter is already bumped here; the return waits on the lock.
        with FileLock(str(lock_file)):
            return "UP"

    @workflow.virtual_actor
    class Session:
        def __init__(self):
            self._session_status = "DOWN"

        @workflow.virtual_actor.readonly
        def get_status(self):
            return self._session_status

        def update_session(self, up):
            # `up` is unpacked as a (result, error) pair; the status becomes
            # the result when error is None, otherwise the error itself.
            (ret, err) = up
            if err is None:
                self._session_status = ret
            else:
                self._session_status = err
            return self._session_status

        def session_start(self):
            step = start_session.step()
            return step

        def session_start_with_status(self):
            self._session_status = "STARTING"
            return self.update_session.step(
                start_session.options(catch_exceptions=True).step())

        def __getstate__(self):
            return self._session_status

        def __setstate__(self, state):
            self._session_status = state

    actor = Session.get_or_create("session_id")
    # First attempt: the flag makes start_session raise.
    fail_flag.touch()
    with pytest.raises(Exception):
        actor.session_start.run()
    fail_flag.unlink()
    ray.get(workflow.resume("session_id"))
    # After resume, it'll rerun start_session which will
    # generate 1
    assert cnt.read_text() == "1"
    assert actor.session_start.run() == "UP"
    assert cnt.read_text() == "2"
    assert actor.session_start_with_status.run() == "UP"
    assert cnt.read_text() == "3"

    # Now test a new session.
    actor = Session.get_or_create("session_id")
    fail_flag.touch()
    # With catch_exceptions=True the failure comes back as a value: the
    # method is expected to return an Exception instance instead of raising,
    # and the counter must stay at 3.
    assert isinstance(actor.session_start_with_status.run(), Exception)
    assert cnt.read_text() == "3"
    # Hold the lock so start_session bumps the counter but cannot return yet.
    lock = FileLock(str(lock_file))
    lock.acquire()
    fail_flag.unlink()
    ret = actor.session_start_with_status.run_async()
    # Wait up to 60 polls, one second apart, for the counter to reach 4.
    for i in range(0, 60):
        if cnt.read_text() == "4":
            break
        time.sleep(1)
    assert cnt.read_text() == "4"
    # This means when return from session_start_with_status,
    # the session got updated
    assert actor.get_status.run() == "STARTING"
    lock.release()
    assert ray.get(ret) == "UP"