Code Example #1
    def test_configurable_scheduler(self):
        e = LocalDaskExecutor(scheduler="synchronous")
        assert e.scheduler == "synchronous"

        def check_scheduler(val):
            assert dask.config.get("scheduler") == val

        with dask.config.set(scheduler="threads"):
            check_scheduler("threads")
            with e.start():
                e.submit(check_scheduler, "synchronous")
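
Note: the behavior tested above is that the executor pins its own scheduler for submitted work even when the ambient dask configuration says otherwise. A minimal standalone sketch of dask's configuration precedence (plain dask, no Prefect):

import dask

with dask.config.set(scheduler="threads"):
    assert dask.config.get("scheduler") == "threads"
    # an inner dask.config.set overrides the outer context
    with dask.config.set(scheduler="synchronous"):
        assert dask.config.get("scheduler") == "synchronous"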
Code Example #2
    def test_temporary_pool_created_of_proper_size_and_kind(
            self, scheduler, num_workers):
        from dask.system import CPU_COUNT
        from multiprocessing.pool import Pool, ThreadPool

        e = LocalDaskExecutor(scheduler, num_workers=num_workers)
        with e.start():
            if scheduler == "synchronous":
                assert e._pool is None
            else:
                sol = num_workers or CPU_COUNT
                kind = ThreadPool if scheduler == "threads" else Pool
                assert isinstance(e._pool, kind)
                assert e._pool._processes == sol
        assert e._pool is None
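
Note: a sketch of the two pool kinds this test distinguishes, using only the public standard-library API (the `_pool` and `_processes` attributes checked above are implementation details of this Prefect version):

from multiprocessing.pool import Pool, ThreadPool

if __name__ == "__main__":
    tp = ThreadPool(2)  # thread-backed pool; tasks may close over local state
    assert tp.map(len, ["a", "bb"]) == [1, 2]
    tp.close(); tp.join()

    pp = Pool(2)  # process-backed pool; arguments must be picklable
    assert pp.map(abs, [-1, -2]) == [1, 2]
    pp.close(); pp.join()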
Code Example #3
File: pipeline.py  Project: ak-gupta/nbaspa
def run_pipeline(flow: Flow, data_dir: str, output_dir: str,
                 **kwargs) -> Optional[State]:
    """Run the pipeline.

    Parameters
    ----------
    flow : Flow
        The generated flow.
    data_dir : str
        The directory containing the data.
    output_dir : str
        The output location for the data.
    **kwargs
        Additional parameters

    Returns
    -------
    State
        The output of ``flow.run``.
    """
    output = flow.run(
        parameters={
            "data_dir": data_dir,
            "output_dir": output_dir,
            **kwargs
        },
        executor=LocalDaskExecutor(scheduler="processes"),
    )

    return output
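
A hypothetical invocation, with placeholder directory names (not taken from the project):

# state = run_pipeline(flow, data_dir="nba-data", output_dir="nba-output")
# assert state is None or state.is_successful()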
Code Example #4
    def test_temporary_pool_created_of_proper_size_and_kind(
            self, scheduler, num_workers):
        from dask.system import CPU_COUNT
        from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

        e = LocalDaskExecutor(scheduler, num_workers=num_workers)
        with e.start():
            if scheduler == "synchronous":
                assert e._pool is None
            else:
                sol = num_workers or CPU_COUNT
                kind = (ThreadPoolExecutor
                        if scheduler == "threads" else ProcessPoolExecutor)
                assert e._pool._max_workers == sol
                assert isinstance(e._pool, kind)
        assert e._pool is None
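
Note: this variant targets `concurrent.futures` pools rather than `multiprocessing.pool` (compare Code Example #2); `_max_workers` is the private counterpart of the `max_workers` constructor argument. A public-API sketch:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=2) as pool:
    assert list(pool.map(abs, [-1, -2])) == [1, 2]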
Code Example #5
    def test_submit(self):
        e = LocalDaskExecutor()
        with e.start():
            assert e.submit(lambda: 1).compute(scheduler="sync") == 1
            assert e.submit(lambda x: x, 1).compute(scheduler="sync") == 1
            assert e.submit(lambda x: x, x=1).compute(scheduler="sync") == 1
            assert e.submit(lambda: prefect).compute(scheduler="sync") is prefect
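
Note: the objects returned by `submit` here are computed lazily, which is why each assertion calls `.compute()`. The same pattern with plain `dask.delayed`:

import dask

lazy = dask.delayed(lambda x: x + 1)(1)      # builds a task, runs nothing yet
assert lazy.compute(scheduler="sync") == 2   # evaluates on demand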
Code Example #6
    def test_captures_prefect_signals(self):
        e = LocalDaskExecutor()

        @prefect.task(timeout=2)
        def succeed():
            raise SUCCESS()

        with prefect.Flow("signal-test", executor=e) as flow:
            succeed()

        res = flow.run()
        assert res.is_successful()
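
Note: `SUCCESS` is a Prefect signal (in Prefect 0.x/1.x it lives at `prefect.engine.signals.SUCCESS`); raising it inside a task marks the task as succeeded rather than failed, which is what the final assertion verifies.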
Code Example #7
File: test_executors.py  Project: tuanchris/prefect
    def test_interrupt_stops_running_tasks_quickly(self, scheduler):
        # Windows implements `queue.get` using polling,
        # which means we can set an exception to interrupt the call to `get`.
        # Python 3 on other platforms requires sending SIGINT to the main thread.
        if os.name == "nt":
            from _thread import interrupt_main
        else:
            main_thread = threading.get_ident()

            def interrupt_main():
                import signal

                signal.pthread_kill(main_thread, signal.SIGINT)

        def long_task():
            for i in range(50):
                time.sleep(0.1)

        e = LocalDaskExecutor(scheduler)
        try:
            interrupter = threading.Timer(0.5, interrupt_main)
            interrupter.start()
            start = time.time()
            with e.start():
                e.wait(e.submit(long_task))
        except KeyboardInterrupt:
            pass
        except Exception:
            assert False, "Failed to interrupt"
        stop = time.time()
        if stop - start > 4:
            assert False, "Failed to interrupt"
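
Note: a minimal POSIX-only sketch of the interrupt mechanism used above — a timer thread delivers SIGINT to the main thread, which surfaces there as KeyboardInterrupt:

import signal
import threading
import time

main_thread = threading.get_ident()
threading.Timer(0.2, lambda: signal.pthread_kill(main_thread, signal.SIGINT)).start()
try:
    time.sleep(5)  # interrupted well before 5 seconds elapse
except KeyboardInterrupt:
    print("interrupted")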
Code Example #8
File: prefect.py  Project: oysteoh/ert
def _get_executor(custom_port_range, name="local"):
    # See https://github.com/equinor/ert/pull/2757#discussion_r794368854
    _, port, sock = find_available_port(custom_range=custom_port_range)
    sock.close()  # do this explicitly, not relying on GC

    if name == "local":
        cluster_kwargs = {
            "silence_logs": "debug",
        }
        return LocalDaskExecutor(**cluster_kwargs)
    elif name == "lsf":
        LSFJob._submit_job = _eq_submit_job
        cluster_kwargs = {
            "queue": "mr",
            "project": None,
            "cores": 1,
            "memory": "1GB",
            "use_stdin": True,
            "n_workers": 2,
            "silence_logs": "debug",
            "scheduler_options": {
                "port": port
            },
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.LSFCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    elif name == "pbs":
        cluster_kwargs = {
            "n_workers": 10,
            "queue": "normal",
            "project": "ERT-TEST",
            "local_directory": "$TMPDIR",
            "cores": 1,
            "memory": "32gb",
            "resource_spec": "select=1:ncpus=1:mem=32gb",
            "scheduler_options": {
                "port": port
            },
            "extra": ["--worker-port", "51820:51840"],
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.PBSCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    else:
        raise ValueError(f"Unknown executor name {name}")
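
A hypothetical call, assuming the caller supplies the port range (`find_available_port` and `_eq_submit_job` are project internals):

# executor = _get_executor(custom_port_range=range(51820, 51841), name="lsf")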
Code Example #9
    def test_submit_sets_task_name(self):
        e = LocalDaskExecutor()
        with e.start():
            f = e.submit(lambda x: x + 1, 1, extra_context={"task_name": "inc"})
            (res,) = e.wait([f])
            assert f.key.startswith("inc-")
            assert res == 2

            f = e.submit(
                lambda x: x + 1, 1, extra_context={"task_name": "inc", "task_index": 1}
            )
            (res,) = e.wait([f])
            assert f.key.startswith("inc-1-")
            assert res == 2
Code Example #10
    def test_only_compute_once(self, scheduler, tmpdir):
        e = LocalDaskExecutor(scheduler)

        def inc(x, path):
            if os.path.exists(path):
                raise ValueError("Should only run once!")
            with open(path, "wb"):
                pass
            return x + 1

        with e.start():
            f1 = e.submit(inc, 0, str(tmpdir.join("f1")))
            f2 = e.submit(inc, f1, str(tmpdir.join("f2")))
            f3 = e.submit(inc, f2, str(tmpdir.join("f3")))
            assert e.wait([f1]) == [1]
            assert e.wait([f2]) == [2]
            assert e.wait([f3]) == [3]
            assert e.wait([f1, f2, f3]) == [1, 2, 3]
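
Note: passing the future `f1` as an argument to the next `submit` chains the computations, and a shared node runs only once per evaluation — the same shape as a plain delayed graph:

import dask

@dask.delayed
def inc(x):
    return x + 1

f1 = inc(0)
f2 = inc(f1)                           # f2 depends on f1
assert dask.compute(f1, f2) == (1, 2)  # f1 executes once for both results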
Code Example #11
def _get_executor(name="local"):
    if name == "local":
        cluster_kwargs = {
            "silence_logs": "debug",
            "scheduler_options": {
                "port": find_open_port()
            },
        }
        return LocalDaskExecutor(**cluster_kwargs)
    elif name == "lsf":
        LSFJob._submit_job = _eq_submit_job
        cluster_kwargs = {
            "queue": "mr",
            "project": None,
            "cores": 1,
            "memory": "1GB",
            "use_stdin": True,
            "n_workers": 2,
            "silence_logs": "debug",
            "scheduler_options": {
                "port": find_open_port()
            },
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.LSFCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    elif name == "pbs":
        cluster_kwargs = {
            "n_workers": 10,
            "queue": "normal",
            "project": "ERT-TEST",
            "local_directory": "$TMPDIR",
            "cores": 4,
            "memory": "16GB",
            "resource_spec": "select=1:ncpus=4:mem=16GB",
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.PBSCluster",
            cluster_kwargs=cluster_kwargs,
            debug=True,
        )
    else:
        raise ValueError(f"Unknown executor name {name}")
Code Example #12
def test_create_environment_populated():
    def f():
        pass

    executor = LocalDaskExecutor()
    environment = LocalEnvironment(
        executor=executor,
        labels=["test"],
        on_start=f,
        on_exit=f,
        metadata={"test": "here"},
    )
    assert environment.executor is executor
    assert environment.labels == set(["test"])
    assert environment.on_start is f
    assert environment.on_exit is f
    assert environment.metadata == {"test": "here"}
    assert environment.logger.name == "prefect.LocalEnvironment"
Code Example #13
    def test_interrupt_stops_running_tasks_quickly(self, scheduler):
        # TODO: remove this skip
        if scheduler == "processes":
            pytest.skip(
                "This test periodically hangs for some reason on circleci, but passes "
                "locally. We should debug this later, but squashing it for now"
            )

        main_thread = threading.get_ident()

        def interrupt():

            if sys.platform == "win32":
                # signal.pthread_kill is not available on Windows
                from _thread import interrupt_main

                interrupt_main()
            else:
                import signal

                signal.pthread_kill(main_thread, signal.SIGINT)

        def long_task():
            for i in range(50):
                time.sleep(0.1)

        e = LocalDaskExecutor(scheduler)
        try:
            interrupter = threading.Timer(0.5, interrupt)
            interrupter.start()
            start = time.time()
            with e.start():
                e.wait(e.submit(long_task))
        except KeyboardInterrupt:
            pass  # Don't exit test on the interrupt

        stop = time.time()

        # Defining "quickly" here as 4 seconds generally and 6 seconds on
        # Windows which tends to be a little slower
        assert (stop - start) < (6 if sys.platform == "win32" else 4)
Code Example #14
File: test_executors.py  Project: zschumacher/prefect
    def test_interrupt_stops_running_tasks_quickly(self, scheduler):
        # TODO: remove this skip
        if scheduler == "processes":
            pytest.skip(
                "This test periodically hangs for some reason on circleci, but passes "
                "locally. We should debug this later, but squashing it for now"
            )

        # Windows implements `queue.get` using polling,
        # which means we can set an exception to interrupt the call to `get`.
        # Python 3 on other platforms requires sending SIGINT to the main thread.
        if os.name == "nt":
            from _thread import interrupt_main
        else:
            main_thread = threading.get_ident()

            def interrupt_main():
                import signal

                signal.pthread_kill(main_thread, signal.SIGINT)

        def long_task():
            for i in range(50):
                time.sleep(0.1)

        e = LocalDaskExecutor(scheduler)
        try:
            interrupter = threading.Timer(0.5, interrupt_main)
            interrupter.start()
            start = time.time()
            with e.start():
                e.wait(e.submit(long_task))
        except KeyboardInterrupt:
            pass
        except Exception:
            assert False, "Failed to interrupt"
        stop = time.time()
        if stop - start > 4:
            assert False, "Failed to interrupt"
Code Example #15
    def test_scheduler_defaults_to_threads(self):
        e = LocalDaskExecutor()
        assert e.scheduler == "threads"
Code Example #16
File: conftest.py  Project: omarbelkady/prefect
def threaded_local():
    "Multithreaded executor using local dask (not distributed cluster)"
    yield LocalDaskExecutor(scheduler="threads")
Code Example #17
    def test_is_pickleable_after_start(self):
        e = LocalDaskExecutor()
        with e.start():
            post = cloudpickle.loads(cloudpickle.dumps(e))
            assert isinstance(post, LocalDaskExecutor)
            assert post._pool is None
Code Example #18
    def test_is_pickleable(self):
        e = LocalDaskExecutor()
        post = cloudpickle.loads(cloudpickle.dumps(e))
        assert isinstance(post, LocalDaskExecutor)
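
Note: both pickling tests reduce to a cloudpickle round trip; the pattern in isolation:

import cloudpickle

obj = {"scheduler": "threads"}
assert cloudpickle.loads(cloudpickle.dumps(obj)) == obj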
Code Example #19

with Flow("Image ETL") as flow:
    Path("src/pipeline/temp").touch()
    image_path = Path("src/pipeline/temp/image-data.img")
    gif_path = Path("src/pipeline/temp/comb.gif")

    DATA_URL = Parameter(
        "DATA_URL",
        default="https://github.com/cicdw/image-data/blob/master/all-images.img?raw=true",
    )
    DATA_FILE = Parameter("DATA_FILE", default=image_path)

    # Extract
    command = curl_cmd(DATA_URL, DATA_FILE)
    curl = download(command=command)

    # Transform
    # we use the `upstream_tasks` keyword to specify non-data dependencies
    images = load_and_split(fname=DATA_FILE, upstream_tasks=[curl])

    # Load
    frames = write_to_disk.map(images)
    combine_to_gif(frames, gif_path)

if __name__ == "__main__":
    flow.visualize()
    flow.executor = LocalDaskExecutor(scheduler="threads", num_workers=4)
    flow.run()
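
Note: assigning `flow.executor` just before `flow.run()` is equivalent to passing `executor=` to the `Flow` constructor (as other examples on this page do); `num_workers=4` caps the thread pool used for the mapped `write_to_disk` tasks.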
Code Example #20
def mproc_local():
    "Multiprocessing executor using local dask (not distributed cluster)"
    yield LocalDaskExecutor(scheduler="processes")
Code Example #21
def sync():
    "Synchronous dask (not dask.distributed) executor"
    yield LocalDaskExecutor(scheduler="sync")
Code Example #22
def test_prefect_executors(train_data, grid_search, parallel_columns):
    from dask.distributed import Client
    from prefect.executors import DaskExecutor
    from prefect.executors import LocalDaskExecutor
    from prefect.executors import LocalExecutor

    client = Client()

    executors = {
        "dask_already_running": DaskExecutor(address=client.scheduler.address),
        "local": LocalExecutor(),
        "local_dask": LocalDaskExecutor(),
        # DaskExecutor() with no address spins up a temporary local cluster,
        # included just to check the interface
        "dask_create_on_call": DaskExecutor(),
    }

    for executor_name, executor in executors.items():
        flow, state = run_model_selection(
            df=train_data,
            grid_search=grid_search,
            target_col_name="Quantity",
            frequency="D",
            partition_columns=["Product"],
            parallel_over_columns=parallel_columns,
            include_rules=None,
            exclude_rules=None,
            country_code_column="Holidays_code",
            output_path="",
            persist_cv_results=False,
            persist_cv_data=False,
            persist_model_reprs=False,
            persist_best_model=False,
            persist_partition=False,
            persist_model_selector_results=False,
            visualize_success=False,
            executor=executor,
        )
        assert state.is_successful()

        results = select_model_general(
            df=train_data,
            grid_search=grid_search,
            target_col_name="Quantity",
            frequency="D",
            partition_columns=["Product"],
            parallel_over_columns=parallel_columns,
            executor=executor,
            include_rules=None,
            exclude_rules=None,
            country_code_column="Holidays_code",
            output_path="",
            persist_cv_results=False,
            persist_cv_data=False,
            persist_model_reprs=False,
            persist_best_model=False,
            persist_partition=False,
            persist_model_selector_results=False,
        )

        assert len(results) == len(train_data[parallel_columns +
                                              ["Product"]].drop_duplicates())
        assert isinstance(results[0], ModelSelectorResult)

        if executor_name == "dask_already_running":
            client.shutdown()

    if client.status != "closed":
        client.shutdown()
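
Note: the "dask_already_running" case connects to a cluster that is already up by address. A minimal sketch of that wiring (assumes `dask.distributed` and `prefect` are installed):

from dask.distributed import Client
from prefect.executors import DaskExecutor

client = Client(processes=False)  # small in-process cluster, for illustration only
executor = DaskExecutor(address=client.scheduler.address)
client.close()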
Code Example #23
    def executor(self) -> Executor:
        return LocalDaskExecutor(scheduler="processes")
Code Example #24
@task  #(log_stdout=True)
def run_thing1():
    if things.thing1():
        return True


@task  #(log_stdout=True)
def run_thing2():
    if things.thing2():
        return True


if __name__ == "__main__":
    with Flow("test_flow",
              executor=LocalDaskExecutor(scheduler="processes")) as flow:
        if run_thing1():
            thing2_var = run_thing2()

    flow.storage = Docker(
        base_image="prefect_logger_repro",
        local_image=True,
        build_kwargs={"labels": {
            "logging_job_name": "test_flow"
        }},
        ignore_healthchecks=True,
        registry_url="artifactory.aq.tc/prefect/",
    )
    flow.run_config = DockerRun()
    #flow.run()
    flow.register(project_name="default")
Code Example #25
def test_create_fargate_task_environment_with_executor():
    executor = LocalDaskExecutor()
    environment = FargateTaskEnvironment(executor=executor)
    assert environment.executor is executor
Code Example #26
    def test_wait(self):
        e = LocalDaskExecutor()
        with e.start():
            assert e.wait(1) == 1
            assert e.wait(prefect) is prefect
            assert e.wait(e.submit(lambda: 1)) == 1
            assert e.wait(e.submit(lambda x: x, 1)) == 1
            assert e.wait(e.submit(lambda x: x, x=1)) == 1
            assert e.wait(e.submit(lambda: prefect)) is prefect
Code Example #27
File: flu_a.py  Project: vsmetansky/v_m
from prefect import Flow
from prefect.executors import LocalDaskExecutor

from v_m.flows import init_schedule, init_dates, init_date_parameters
from v_m.tasks import flu_a as tasks
from v_m.constants import flu_a as const

dates = init_dates(weeks=const.LOOKBACK)
start, stop = init_date_parameters(dates)
schedule = init_schedule(start, stop, dates, weeks=const.INTERVAL)
executor = LocalDaskExecutor()

with Flow(name='flu_a', executor=executor, schedule=schedule) as flow:
    date_range = tasks.process_parameters(start, stop)
    df = tasks.extract(date_range)
    df = tasks.transform(df)
    tasks.load(df)

if __name__ == '__main__':
    flow.run(run_on_schedule=False)