def test_configurable_scheduler(self):
    """Check that the ``scheduler`` argument is stored and used.

    The executor is built with the ``"synchronous"`` scheduler while the
    ambient dask config is set to ``"threads"``; a check for
    ``"synchronous"`` is then submitted through the started executor.
    """
    executor = LocalDaskExecutor(scheduler="synchronous")
    assert executor.scheduler == "synchronous"

    def check_scheduler(expected):
        # Compare the scheduler currently configured in dask with `expected`.
        configured = dask.config.get("scheduler")
        assert configured == expected

    with dask.config.set(scheduler="threads"):
        # The ambient configuration is what we just set ...
        check_scheduler("threads")
        # ... and the check for the executor's own scheduler is submitted
        # while the executor is running.
        with executor.start():
            executor.submit(check_scheduler, "synchronous")
def test_temporary_pool_created_of_proper_size_and_kind(self, scheduler, num_workers):
    """Check the pool that exists only while the executor is started.

    Inside ``start()`` the ``"synchronous"`` scheduler must have no pool;
    any other scheduler must have a ``ThreadPool`` (for ``"threads"``) or a
    ``Pool`` sized to ``num_workers`` (falling back to ``CPU_COUNT``).
    After ``start()`` exits the pool must be ``None`` again.
    """
    from multiprocessing.pool import Pool, ThreadPool

    from dask.system import CPU_COUNT

    executor = LocalDaskExecutor(scheduler, num_workers=num_workers)
    with executor.start():
        if scheduler != "synchronous":
            expected_kind = Pool
            if scheduler == "threads":
                expected_kind = ThreadPool
            expected_size = num_workers or CPU_COUNT
            assert isinstance(executor._pool, expected_kind)
            assert executor._pool._processes == expected_size
        else:
            assert executor._pool is None

    # The pool is temporary: nothing may be left once the context has exited.
    assert executor._pool is None
def run_pipeline(flow: Flow, data_dir: str, output_dir: str, **kwargs) -> Optional[State]:
    """Execute ``flow`` on a process-based local dask executor.

    Parameters
    ----------
    flow : Flow
        The generated flow.
    data_dir : str
        Directory containing the input data; passed as the ``data_dir``
        flow parameter.
    output_dir : str
        Output location for the data; passed as the ``output_dir`` flow
        parameter.
    **kwargs
        Additional flow parameters, merged after the two above.

    Returns
    -------
    Optional[State]
        Whatever ``flow.run`` returns.
    """
    parameters = {"data_dir": data_dir, "output_dir": output_dir}
    parameters.update(kwargs)
    executor = LocalDaskExecutor(scheduler="processes")
    return flow.run(parameters=parameters, executor=executor)
def test_temporary_pool_created_of_proper_size_and_kind(self, scheduler, num_workers):
    """Check the pool that exists only while the executor is started.

    Inside ``start()`` the ``"synchronous"`` scheduler must have no pool;
    any other scheduler must have a ``ThreadPoolExecutor`` (for
    ``"threads"``) or a ``ProcessPoolExecutor`` whose ``_max_workers``
    equals ``num_workers`` (falling back to ``CPU_COUNT``). After
    ``start()`` exits the pool must be ``None`` again.
    """
    from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

    from dask.system import CPU_COUNT

    executor = LocalDaskExecutor(scheduler, num_workers=num_workers)
    with executor.start():
        if scheduler != "synchronous":
            expected_size = num_workers or CPU_COUNT
            expected_kind = ProcessPoolExecutor
            if scheduler == "threads":
                expected_kind = ThreadPoolExecutor
            assert executor._pool._max_workers == expected_size
            assert isinstance(executor._pool, expected_kind)
        else:
            assert executor._pool is None

    # The pool is temporary: nothing may be left once the context has exited.
    assert executor._pool is None
def test_submit(self):
    """Check that ``submit`` returns objects that compute to the call result.

    Covers a no-argument call, a positional argument, a keyword argument
    and a non-trivial return value (the ``prefect`` module itself).
    """
    executor = LocalDaskExecutor()
    with executor.start():
        no_args = executor.submit(lambda: 1)
        assert no_args.compute(scheduler="sync") == 1

        positional = executor.submit(lambda x: x, 1)
        assert positional.compute(scheduler="sync") == 1

        keyword = executor.submit(lambda x: x, x=1)
        assert keyword.compute(scheduler="sync") == 1

        module = executor.submit(lambda: prefect)
        assert module.compute(scheduler="sync") is prefect
def test_captures_prefect_signals(self):
    """Check that a task raising ``SUCCESS`` yields a successful flow run."""
    executor = LocalDaskExecutor()

    @prefect.task(timeout=2)
    def succeed():
        # Raising the signal (rather than returning) is the point of the test.
        raise SUCCESS()

    with prefect.Flow("signal-test", executor=executor) as flow:
        succeed()

    final_state = flow.run()
    assert final_state.is_successful()
def test_interrupt_stops_running_tasks_quickly(self, scheduler):
    """Check that interrupting the main thread aborts a running task fast.

    A timer interrupts the main thread 0.5 s after it is started, while the
    executor is waiting on a task that would otherwise sleep for about five
    seconds. The test passes when control returns within 4 seconds, either
    normally or via ``KeyboardInterrupt``; any other exception fails it.
    """
    # Windows implements `queue.get` using polling,
    # which means we can set an exception to interrupt the call to `get`.
    # Python 3 on other platforms requires sending SIGINT to the main thread.
    if os.name == "nt":
        from _thread import interrupt_main
    else:
        main_thread = threading.get_ident()

        def interrupt_main():
            import signal

            signal.pthread_kill(main_thread, signal.SIGINT)

    def long_task():
        for _ in range(50):
            time.sleep(0.1)

    e = LocalDaskExecutor(scheduler)
    interrupter = threading.Timer(0.5, interrupt_main)
    # Taken before the `try` so it is always bound when the elapsed time is
    # computed below.
    start = time.time()
    try:
        interrupter.start()
        with e.start():
            e.wait(e.submit(long_task))
    except KeyboardInterrupt:
        pass
    except Exception as exc:
        raise AssertionError("Failed to interrupt") from exc
    finally:
        # If we got here before the timer fired, make sure it never does:
        # a late SIGINT would otherwise hit whatever runs next on the main
        # thread. Cancelling an already-fired timer is a no-op.
        interrupter.cancel()
    stop = time.time()
    assert stop - start <= 4, "Failed to interrupt"
def _get_executor(custom_port_range, name="local"):
    """Return the executor matching ``name``.

    A free port is looked up in ``custom_port_range`` first (for every
    ``name``, including unknown ones); it is used as the scheduler port of
    the ``"lsf"`` and ``"pbs"`` clusters.

    Parameters
    ----------
    custom_port_range
        Forwarded to ``find_available_port`` as ``custom_range``.
    name : str
        One of ``"local"``, ``"lsf"`` or ``"pbs"``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known executor names.
    """
    # See https://github.com/equinor/ert/pull/2757#discussion_r794368854
    _, port, sock = find_available_port(custom_range=custom_port_range)
    sock.close()  # do this explicitly, not relying on GC

    if name == "local":
        return LocalDaskExecutor(silence_logs="debug")

    if name == "lsf":
        # Side effect: replaces the job submission hook on LSFJob.
        LSFJob._submit_job = _eq_submit_job
        lsf_kwargs = {
            "queue": "mr",
            "project": None,
            "cores": 1,
            "memory": "1GB",
            "use_stdin": True,
            "n_workers": 2,
            "silence_logs": "debug",
            "scheduler_options": {"port": port},
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.LSFCluster",
            cluster_kwargs=lsf_kwargs,
            debug=True,
        )

    if name == "pbs":
        pbs_kwargs = {
            "n_workers": 10,
            "queue": "normal",
            "project": "ERT-TEST",
            "local_directory": "$TMPDIR",
            "cores": 1,
            "memory": "32gb",
            "resource_spec": "select=1:ncpus=1:mem=32gb",
            "scheduler_options": {"port": port},
            "extra": ["--worker-port", "51820:51840"],
        }
        return DaskExecutor(
            cluster_class="dask_jobqueue.PBSCluster",
            cluster_kwargs=pbs_kwargs,
            debug=True,
        )

    raise ValueError(f"Unknown executor name {name}")
def test_submit_sets_task_name(self):
    """Check that ``extra_context`` drives the key of the submitted work.

    With only ``task_name`` the key must start with ``"<name>-"``; with a
    ``task_index`` as well it must start with ``"<name>-<index>-"``. The
    computed value is checked in both cases.
    """
    executor = LocalDaskExecutor()
    with executor.start():
        # Name only.
        named = executor.submit(
            lambda x: x + 1, 1, extra_context={"task_name": "inc"}
        )
        [value] = executor.wait([named])
        assert named.key.startswith("inc-")
        assert value == 2

        # Name plus index.
        indexed = executor.submit(
            lambda x: x + 1,
            1,
            extra_context={"task_name": "inc", "task_index": 1},
        )
        [value] = executor.wait([indexed])
        assert indexed.key.startswith("inc-1-")
        assert value == 2
def test_only_compute_once(self, scheduler, tmpdir):
    """Check that waiting repeatedly never re-runs an already computed task.

    Each ``inc`` call leaves a marker file behind and raises if its marker
    already exists, so any second execution of the same task fails the test.
    """
    executor = LocalDaskExecutor(scheduler)

    def inc(x, path):
        if os.path.exists(path):
            raise ValueError("Should only run once!")
        # Create the (empty) marker file recording that this call ran.
        open(path, "wb").close()
        return x + 1

    with executor.start():
        first = executor.submit(inc, 0, str(tmpdir.join("f1")))
        second = executor.submit(inc, first, str(tmpdir.join("f2")))
        third = executor.submit(inc, second, str(tmpdir.join("f3")))

        # Wait on each link of the chain, then on all of them together.
        assert executor.wait([first]) == [1]
        assert executor.wait([second]) == [2]
        assert executor.wait([third]) == [3]
        assert executor.wait([first, second, third]) == [1, 2, 3]
def _get_executor(name="local"): if name == "local": cluster_kwargs = { "silence_logs": "debug", "scheduler_options": { "port": find_open_port() }, } return LocalDaskExecutor(**cluster_kwargs) elif name == "lsf": LSFJob._submit_job = _eq_submit_job cluster_kwargs = { "queue": "mr", "project": None, "cores": 1, "memory": "1GB", "use_stdin": True, "n_workers": 2, "silence_logs": "debug", "scheduler_options": { "port": find_open_port() }, } return DaskExecutor( cluster_class="dask_jobqueue.LSFCluster", cluster_kwargs=cluster_kwargs, debug=True, ) elif name == "pbs": cluster_kwargs = { "n_workers": 10, "queue": "normal", "project": "ERT-TEST", "local_directory": "$TMPDIR", "cores": 4, "memory": "16GB", "resource_spec": "select=1:ncpus=4:mem=16GB", } return DaskExecutor( cluster_class="dask_jobqueue.PBSCluster", cluster_kwargs=cluster_kwargs, debug=True, ) else: raise ValueError(f"Unknown executor name {name}")
def test_create_environment_populated():
    """Check that every constructor argument is exposed on the environment."""

    def callback():
        pass

    executor = LocalDaskExecutor()
    metadata = {"test": "here"}
    environment = LocalEnvironment(
        executor=executor,
        labels=["test"],
        on_start=callback,
        on_exit=callback,
        metadata=metadata,
    )

    assert environment.executor is executor
    # Labels are given as a list but compared as a set.
    assert environment.labels == {"test"}
    assert environment.on_start is callback
    assert environment.on_exit is callback
    assert environment.metadata == {"test": "here"}
    assert environment.logger.name == "prefect.LocalEnvironment"
def test_interrupt_stops_running_tasks_quickly(self, scheduler):
    """Check that interrupting the main thread aborts a running task fast.

    A timer interrupts the main thread 0.5 s after it is started, while the
    executor is waiting on a task that would otherwise sleep for about five
    seconds. The test passes when control returns within 4 seconds
    (6 on Windows), either normally or via ``KeyboardInterrupt``.
    """
    # TODO: remove this skip
    if scheduler == "processes":
        pytest.skip(
            "This test periodically hangs for some reason on circleci, but passes "
            "locally. We should debug this later, but squashing it for now"
        )

    main_thread = threading.get_ident()

    def interrupt():
        if sys.platform == "win32":
            # `signal.pthread_kill` is not available on Windows, so fall
            # back to `interrupt_main` there.
            from _thread import interrupt_main

            interrupt_main()
        else:
            import signal

            signal.pthread_kill(main_thread, signal.SIGINT)

    def long_task():
        for _ in range(50):
            time.sleep(0.1)

    e = LocalDaskExecutor(scheduler)
    interrupter = threading.Timer(0.5, interrupt)
    # Taken before the `try` so it is always bound when the elapsed time is
    # computed below.
    start = time.time()
    try:
        interrupter.start()
        with e.start():
            e.wait(e.submit(long_task))
    except KeyboardInterrupt:
        pass  # Don't exit test on the interrupt
    finally:
        # If we got here before the timer fired, make sure it never does:
        # a late interrupt would otherwise hit whatever runs next on the
        # main thread. Cancelling an already-fired timer is a no-op.
        interrupter.cancel()
    stop = time.time()

    # Defining "quickly" here as 4 seconds generally and 6 seconds on
    # Windows which tends to be a little slower
    assert (stop - start) < (6 if sys.platform == "win32" else 4)
def test_interrupt_stops_running_tasks_quickly(self, scheduler):
    """Check that interrupting the main thread aborts a running task fast.

    A timer interrupts the main thread 0.5 s after it is started, while the
    executor is waiting on a task that would otherwise sleep for about five
    seconds. The test passes when control returns within 4 seconds, either
    normally or via ``KeyboardInterrupt``; any other exception fails it.
    """
    # TODO: remove this skip
    if scheduler == "processes":
        pytest.skip(
            "This test periodically hangs for some reason on circleci, but passes "
            "locally. We should debug this later, but squashing it for now"
        )

    # Windows implements `queue.get` using polling,
    # which means we can set an exception to interrupt the call to `get`.
    # Python 3 on other platforms requires sending SIGINT to the main thread.
    if os.name == "nt":
        from _thread import interrupt_main
    else:
        main_thread = threading.get_ident()

        def interrupt_main():
            import signal

            signal.pthread_kill(main_thread, signal.SIGINT)

    def long_task():
        for _ in range(50):
            time.sleep(0.1)

    e = LocalDaskExecutor(scheduler)
    interrupter = threading.Timer(0.5, interrupt_main)
    # Taken before the `try` so it is always bound when the elapsed time is
    # computed below.
    start = time.time()
    try:
        interrupter.start()
        with e.start():
            e.wait(e.submit(long_task))
    except KeyboardInterrupt:
        pass
    except Exception as exc:
        raise AssertionError("Failed to interrupt") from exc
    finally:
        # If we got here before the timer fired, make sure it never does:
        # a late SIGINT would otherwise hit whatever runs next on the main
        # thread. Cancelling an already-fired timer is a no-op.
        interrupter.cancel()
    stop = time.time()
    assert stop - start <= 4, "Failed to interrupt"
def test_scheduler_defaults_to_threads(self):
    """Check that an executor built without arguments uses ``"threads"``."""
    default_executor = LocalDaskExecutor()
    scheduler_name = default_executor.scheduler
    assert scheduler_name == "threads"
def threaded_local():
    """Multithreaded executor using local dask (not distributed cluster).

    Yields a single ``LocalDaskExecutor`` configured with the ``"threads"``
    scheduler.
    """
    executor = LocalDaskExecutor(scheduler="threads")
    yield executor
def test_is_pickleable_after_start(self):
    """Check that a started executor round-trips through cloudpickle.

    The restored copy must be a ``LocalDaskExecutor`` whose ``_pool`` is
    ``None``.
    """
    executor = LocalDaskExecutor()
    with executor.start():
        payload = cloudpickle.dumps(executor)
        restored = cloudpickle.loads(payload)
        assert isinstance(restored, LocalDaskExecutor)
        assert restored._pool is None
def test_is_pickleable(self):
    """Check that a fresh executor round-trips through cloudpickle."""
    executor = LocalDaskExecutor()
    payload = cloudpickle.dumps(executor)
    restored = cloudpickle.loads(payload)
    assert isinstance(restored, LocalDaskExecutor)
with Flow("Image ETL") as flow:
    # Working directory for the downloaded image data and the generated GIF.
    # Both paths below live *inside* it, so it has to be created as a
    # directory (a plain `touch()` would create a regular file, or fail when
    # the parent directories are missing).
    temp_dir = Path("src/pipeline/temp")
    temp_dir.mkdir(parents=True, exist_ok=True)
    image_path = temp_dir / "image-data.img"
    gif_path = temp_dir / "comb.gif"

    DATA_URL = Parameter(
        "DATA_URL",
        default="https://github.com/cicdw/image-data/blob/master/all-images.img?raw=true",
    )
    DATA_FILE = Parameter("DATA_FILE", default=image_path)

    # Extract
    command = curl_cmd(DATA_URL, DATA_FILE)
    curl = download(command=command)

    # Transform
    # we use the `upstream_tasks` keyword to specify non-data dependencies
    images = load_and_split(fname=DATA_FILE, upstream_tasks=[curl])

    # Load
    frames = write_to_disk.map(images)
    combine_to_gif(frames, gif_path)

if __name__ == "__main__":
    flow.visualize()
    # Run the flow on four local dask worker threads.
    flow.executor = LocalDaskExecutor(scheduler="threads", num_workers=4)
    flow.run()
def mproc_local():
    """Multiprocessing executor using local dask (not distributed cluster).

    Yields a single ``LocalDaskExecutor`` configured with the
    ``"processes"`` scheduler.
    """
    executor = LocalDaskExecutor(scheduler="processes")
    yield executor
def sync():
    """Synchronous dask (not dask.distributed) executor.

    Yields a single ``LocalDaskExecutor`` configured with the ``"sync"``
    scheduler.
    """
    executor = LocalDaskExecutor(scheduler="sync")
    yield executor
def test_prefect_executors(train_data, grid_search, parallel_columns):
    """Run model selection through several executors and check the results.

    For each executor, ``run_model_selection`` must end in a successful
    state and ``select_model_general`` must return one
    ``ModelSelectorResult`` per distinct combination of
    ``parallel_columns`` and ``"Product"`` in ``train_data``.

    The dask ``Client`` created here is always shut down, even when an
    assertion or one of the calls fails part-way through.
    """
    from dask.distributed import Client
    from prefect.executors import DaskExecutor
    from prefect.executors import LocalDaskExecutor
    from prefect.executors import LocalExecutor

    def selection_kwargs(executor):
        # Arguments shared by both entry points. Built fresh on every call so
        # each invocation gets its own dict and its own list objects.
        return dict(
            df=train_data,
            grid_search=grid_search,
            target_col_name="Quantity",
            frequency="D",
            partition_columns=["Product"],
            parallel_over_columns=parallel_columns,
            include_rules=None,
            exclude_rules=None,
            country_code_column="Holidays_code",
            output_path="",
            persist_cv_results=False,
            persist_cv_data=False,
            persist_model_reprs=False,
            persist_best_model=False,
            persist_partition=False,
            persist_model_selector_results=False,
            executor=executor,
        )

    client = Client()
    try:
        executors = {
            "dask_already_running": DaskExecutor(address=client.scheduler.address),
            "local": LocalExecutor(),
            "local_dask": LocalDaskExecutor(),
            # no address given, so this one creates its own temporary cluster;
            # included just to check the interface
            "dask_create_on_call": DaskExecutor(),
        }

        for executor_name, executor in executors.items():
            flow, state = run_model_selection(
                visualize_success=False, **selection_kwargs(executor)
            )
            assert state.is_successful()

            results = select_model_general(**selection_kwargs(executor))
            expected_count = len(
                train_data[parallel_columns + ["Product"]].drop_duplicates()
            )
            assert len(results) == expected_count
            assert isinstance(results[0], ModelSelectorResult)

            # The pre-existing cluster is only needed by the executor that
            # connects to it; release it as soon as that executor is done.
            if executor_name == "dask_already_running":
                client.shutdown()
    finally:
        if client.status != "closed":
            client.shutdown()
def executor(self) -> Executor:
    """Return a new ``LocalDaskExecutor`` using the ``"processes"`` scheduler."""
    process_executor = LocalDaskExecutor(scheduler="processes")
    return process_executor
@task  # (log_stdout=True)
def run_thing1():
    """Return ``True`` when ``things.thing1()`` is truthy, otherwise ``None``."""
    return True if things.thing1() else None


@task  # (log_stdout=True)
def run_thing2():
    """Return ``True`` when ``things.thing2()`` is truthy, otherwise ``None``."""
    return True if things.thing2() else None


if __name__ == "__main__":
    with Flow("test_flow", executor=LocalDaskExecutor(scheduler="processes")) as flow:
        if run_thing1():
            thing2_var = run_thing2()

    # Package the flow as a Docker image carrying a job-name label.
    image_labels = {"logging_job_name": "test_flow"}
    flow.storage = Docker(
        base_image="prefect_logger_repro",
        local_image=True,
        build_kwargs={"labels": image_labels},
        ignore_healthchecks=True,
        registry_url="artifactory.aq.tc/prefect/",
    )
    flow.run_config = DockerRun()

    # flow.run()
    flow.register(project_name="default")
def test_create_fargate_task_environment_with_executor():
    """Check that the environment keeps the exact executor it was given."""
    given_executor = LocalDaskExecutor()
    environment = FargateTaskEnvironment(executor=given_executor)
    stored_executor = environment.executor
    assert stored_executor is given_executor
def test_wait(self):
    """Check ``wait`` on plain values and on submitted work.

    Plain values must come back unchanged; submitted calls must resolve to
    the value the callable returns.
    """
    executor = LocalDaskExecutor()
    with executor.start():
        # Values that were never submitted.
        assert executor.wait(1) == 1
        assert executor.wait(prefect) is prefect

        # Submitted calls: no argument, positional, keyword, module result.
        no_args = executor.submit(lambda: 1)
        assert executor.wait(no_args) == 1

        positional = executor.submit(lambda x: x, 1)
        assert executor.wait(positional) == 1

        keyword = executor.submit(lambda x: x, x=1)
        assert executor.wait(keyword) == 1

        module = executor.submit(lambda: prefect)
        assert executor.wait(module) is prefect
from prefect import Flow
from prefect.executors import LocalDaskExecutor
from v_m.flows import init_schedule, init_dates, init_date_parameters
from v_m.tasks import flu_a as tasks
from v_m.constants import flu_a as const

# Module-level setup for the `flu_a` flow: everything below runs at import
# time and the resulting names are module attributes.
dates = init_dates(weeks=const.LOOKBACK)
start, stop = init_date_parameters(dates)
schedule = init_schedule(start, stop, dates, weeks=const.INTERVAL)
# Default-configured local dask executor used by the flow.
executor = LocalDaskExecutor()

with Flow(name='flu_a', executor=executor, schedule=schedule) as flow:
    # Task chain: process_parameters -> extract -> transform -> load.
    date_range = tasks.process_parameters(start, stop)
    df = tasks.extract(date_range)
    df = tasks.transform(df)
    tasks.load(df)

if __name__ == '__main__':
    # When executed as a script, run once immediately instead of following
    # the attached schedule.
    flow.run(run_on_schedule=False)