Example #1
def test_simple_map(monkeypatch):

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    with prefect.Flow(name="test", result_handler=JSONResultHandler()) as flow:
        t1 = plus_one.map([0, 1, 2])

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id)
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id) for t in flow.tasks if t is not t1
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        state = CloudFlowRunner(flow=flow).run(return_tasks=flow.tasks,
                                               executor=LocalExecutor())

    assert state.is_successful()
    assert client.flow_runs[flow_run_id].state.is_successful()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    # there should be a total of 4 task runs corresponding to the mapped task
    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)
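The helper task `plus_one` mapped above (and used again in the retry examples below) is not part of this excerpt; a minimal sketch of what it presumably looks like, given the integer inputs it is mapped over:

import prefect

@prefect.task
def plus_one(x):
    # Presumed behaviour: increment each mapped input by one
    return x + 1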
Example #2
    def run(
        self, clean: bool = False, debug: bool = False, **kwargs,
    ):
        """
        Run a flow with your steps.

        Parameters
        ----------
        clean: bool
            Should the local staging directory be cleaned prior to this run.
            Default: False (Do not clean)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc.
            Default: False (Do not debug)

        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/

        Basic prefect example:
        https://docs.prefect.io/core/
        """
        # Initialize steps
        raw = steps.Raw()

        # Choose executor
        if debug:
            exe = LocalExecutor()
        else:
            # Set up connection to computation cluster
            cluster = LocalCluster()

            # Inform of Dask UI
            log.info(f"Cluster dashboard available at: {cluster.dashboard_link}")

            # Create dask executor
            exe = DaskExecutor(cluster.scheduler_address)

        # Configure your flow
        with Flow("{{ cookiecutter.project_slug }}") as flow:
            # If you want to clean the local staging directories pass clean
            # If you want to utilize some debugging functionality pass debug
            # If you don't utilize any of these, just pass the parameters you need.
            raw(
                clean=clean,
                debug=debug,
                **kwargs,  # Allows us to pass `--n {some integer}` or other params
            )

        # Run flow and get ending state
        state = flow.run(executor=exe)

        # Get and display any outputs you want to see on your local terminal
        log.info(raw.get_result(state, flow))
Example #3
    def test_running_state_finishes(self):
        flow = Flow(name="test", tasks=[Task()])
        new_state = FlowRunner(flow=flow).get_flow_run_state(
            state=Running(),
            task_states={},
            task_contexts={},
            return_tasks=set(),
            task_runner_state_handlers=[],
            executor=LocalExecutor(),
        )
        assert new_state.is_successful()
Example #4
    def test_determine_final_state_preserves_running_states_when_tasks_still_running(
        self,
    ):
        task = Task()
        flow = Flow(name="test", tasks=[task])
        old_state = Running()
        new_state = FlowRunner(flow=flow).get_flow_run_state(
            state=old_state,
            task_states={task: Retrying(start_time=pendulum.now("utc").add(days=1))},
            task_contexts={},
            return_tasks=set(),
            task_runner_state_handlers=[],
            executor=LocalExecutor(),
        )
        assert new_state is old_state
Example #5
    def test_determine_final_state_has_final_say(self):
        class MyFlowRunner(FlowRunner):
            def determine_final_state(self, *args, **kwargs):
                return Failed("Very specific error message")

        flow = Flow(name="test", tasks=[Task()])
        new_state = MyFlowRunner(flow=flow).get_flow_run_state(
            state=Running(),
            task_states={},
            task_contexts={},
            return_tasks=set(),
            task_runner_state_handlers=[],
            executor=LocalExecutor(),
        )
        assert new_state.is_failed()
        assert new_state.message == "Very specific error message"
Example #6
def prepare_executor(executor_type, executor_address=None):
    """Instantiate a prefect executor"""
    if executor_type == 'dask':
        if executor_address is not None:
            executor = DaskExecutor(executor_address)
        else:
            executor = DaskExecutor(local_processes=True)
    elif executor_type == "synchronous":
        executor = SynchronousExecutor()
    elif executor_type == 'local':
        executor = LocalExecutor()
    else:
        # Should not happen if click parameters are done correctly, but
        # kept for completeness
        raise ValueError(f'Unknown executor type "{executor_type}".')

    return executor
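For context, the executor returned by `prepare_executor` would typically be handed straight to `Flow.run`, as the other examples on this page do. A minimal hedged sketch, assuming `prepare_executor` from the example above is in scope (the flow and task below are illustrative, not part of the original):

import prefect
from prefect import Flow

@prefect.task
def say_hello():
    print("hello")

with Flow("prepare-executor-demo") as flow:
    say_hello()

# "local" could equally be "synchronous", or "dask" with a scheduler address
state = flow.run(executor=prepare_executor("local"))
assert state.is_successful()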
Example #7
def test_can_queue_successfully_and_run(monkeypatch):
    @prefect.task
    def return_one():
        return 1

    with prefect.Flow("test-queues-work!") as flow:
        t1 = return_one()

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())

    client = QueueingMockCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(
                id=task_run_id_1, task_slug=flow.slugs[t1], flow_run_id=flow_run_id
            ),
        ]
        + [
            TaskRun(
                id=str(uuid.uuid4()), task_slug=flow.slugs[t], flow_run_id=flow_run_id
            )
            for t in flow.tasks
            if t not in [t1]
        ],
        monkeypatch=monkeypatch,
        num_times_in_queue=6,
    )

    with prefect.context(flow_run_id=flow_run_id):
        run_state = CloudFlowRunner(flow=flow).run(
            executor=LocalExecutor(), return_tasks=flow.tasks
        )

    assert run_state.is_successful()

    # Pending -> Running -> Queued (4x) -> Success
    # State transitions that result in `set_flow_run_state` calls are from
    # Pending -> Running and Running -> Success, all others
    # are from Running -> Queued or Queued -> Queued
    assert client.call_count["set_flow_run_state"] == 2 + (client.num_times_in_queue)
Example #8
def test_prefect_executors(train_data, grid_search, parallel_columns):
    try:
        from prefect.engine.executors import DaskExecutor
        from prefect.engine.executors import LocalDaskExecutor
        from prefect.engine.executors import LocalExecutor
        from dask.distributed import Client
    except ImportError:
        print("`prefect` not installed, skipping the test...")
    else:
        client = Client()

        executors = {
            "dask_already_running":
            DaskExecutor(address=client.scheduler.address),
            "local": LocalExecutor(),
            "local_dask": LocalDaskExecutor(),
            "dask_create_on_call": DaskExecutor(
            ),  # this spins up LocalDaskExecutor, but just to check the interface
        }

        for executor_name, executor in executors.items():
            flow, state = run_model_selection(
                df=train_data,
                grid_search=grid_search,
                target_col_name="Quantity",
                frequency="D",
                partition_columns=["Product"],
                parallel_over_columns=parallel_columns,
                include_rules=None,
                exclude_rules=None,
                country_code_column="Holidays_code",
                output_path="",
                persist_cv_results=False,
                persist_cv_data=False,
                persist_model_reprs=False,
                persist_best_model=False,
                persist_partition=False,
                persist_model_selector_results=False,
                visualize_success=False,
                executor=executor,
            )
            assert state.is_successful()

            results = select_model_general(
                df=train_data,
                grid_search=grid_search,
                target_col_name="Quantity",
                frequency="D",
                partition_columns=["Product"],
                parallel_over_columns=parallel_columns,
                executor=executor,
                include_rules=None,
                exclude_rules=None,
                country_code_column="Holidays_code",
                output_path="",
                persist_cv_results=False,
                persist_cv_data=False,
                persist_model_reprs=False,
                persist_best_model=False,
                persist_partition=False,
                persist_model_selector_results=False,
            )

            assert len(results) == len(
                train_data[parallel_columns + ["Product"]].drop_duplicates())
            assert isinstance(results[0], ModelSelectorResult)

            if executor_name == "dask_already_running":
                client.shutdown()

        if client.status != "closed":
            client.shutdown()
Example #9
    def run(
        self,
        dataset: str,
        include_raw: bool = False,
        batch_size: Optional[int] = None,
        distributed: bool = False,
        n_workers: int = 10,
        worker_cpu: int = 8,
        worker_mem: str = "120GB",
        overwrite: bool = False,
        debug: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.

        Parameters
        ----------
        dataset: str
            The dataset to use for the pipeline.

        include_raw: bool
            A boolean option to determine if the raw data should be included in the
            Quilt package.
            Default: False (Do not include the raw data)

        batch_size: Optional[int]
            An optional batch size to provide to each step for processing their items.
            Default: None (auto batch size depending on CPU / threads available)

        distributed: bool
            A boolean option to determine if the jobs should be distributed to a SLURM
            cluster when possible.
            Default: False (Do not distribute)

        n_workers: int
            Number of workers to request (when distributed is enabled).
            Default: 10

        worker_cpu: int
            Number of cores to provide per worker (when distributed is enabled).
            Default: 8

        worker_mem: str
            Amount of memory to provide per worker (when distributed is enabled).
            Default: 120GB

        overwrite: bool
            If this pipeline has already partially or completely run, should it
            overwrite the previous files or not.
            Default: False (Do not overwrite or regenerate files)

        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc. Additionally, if debug is True, any mapped
            operation will run on threads instead of processes.
            Default: False (Do not debug)
        """
        # Initialize steps
        raw = steps.Raw()
        standardize_fov_array = steps.StandardizeFOVArray()
        single_cell_features = steps.SingleCellFeatures()
        single_cell_images = steps.SingleCellImages()
        diagnostic_sheets = steps.DiagnosticSheets()

        # Cluster / distributed defaults
        distributed_executor_address = None

        # Choose executor
        if debug:
            exe = LocalExecutor()
            log.info("Debug flagged. Will use threads instead of Dask.")
        else:
            if distributed:
                # Create or get log dir
                # Do not include ms
                log_dir_name = datetime.now().isoformat().split(".")[0]
                log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
                # Log dir settings
                log_dir.mkdir(parents=True, exist_ok=True)

                # Create cluster
                log.info("Creating SLURMCluster")
                cluster = SLURMCluster(
                    cores=worker_cpu,
                    memory=worker_mem,
                    queue="aics_cpu_general",
                    walltime="9-23:00:00",
                    local_directory=str(log_dir),
                    log_directory=str(log_dir),
                )

                # Spawn workers
                cluster.scale(jobs=n_workers)
                log.info("Created SLURMCluster")

                # Use the port from the created connector to set executor address
                distributed_executor_address = cluster.scheduler_address

                # Only compute an automatic batch size if one was not provided
                if batch_size is None:
                    # Batch size is n_workers * worker_cpu * 0.75
                    # We could just do n_workers * worker_cpu but 3/4 of that is safer
                    batch_size = int(n_workers * worker_cpu * 0.75)

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")
            else:
                # Create local cluster
                log.info("Creating LocalCluster")
                cluster = LocalCluster()
                log.info("Created LocalCluster")

                # Set distributed_executor_address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")

            # Use dask cluster
            exe = DaskExecutor(distributed_executor_address)

        # Configure your flow
        with Flow("actk") as flow:
            if include_raw:
                dataset = raw(dataset, **kwargs)

            standardized_fov_paths_dataset = standardize_fov_array(
                dataset=dataset,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                overwrite=overwrite,
                debug=debug,
                # Allows us to pass `--desired_pixel_sizes [{float},{float},{float}]`
                **kwargs,
            )

            single_cell_features_dataset = single_cell_features(
                dataset=standardized_fov_paths_dataset,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                overwrite=overwrite,
                debug=debug,
                # Allows us to pass `--cell_ceiling_adjustment {int}`
                **kwargs,
            )

            single_cell_images_dataset = single_cell_images(
                dataset=single_cell_features_dataset,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                overwrite=overwrite,
                debug=debug,
                # Allows us to pass `--cell_ceiling_adjustment {int}`
                **kwargs,
            )

            diagnostic_sheets(
                dataset=single_cell_images_dataset,
                distributed_executor_address=distributed_executor_address,
                overwrite=overwrite,
                # Allows us to pass `--metadata {str}`,
                # `--feature {str}'`
                **kwargs,
            )

        # Run flow and get ending state, log duration
        start = datetime.now()
        state = flow.run(executor=exe)
        duration = datetime.now() - start
        log.info(f"Total duration of pipeline: "
                 f"{duration.seconds // 60 // 60}:"
                 f"{duration.seconds // 60}:"
                 f"{duration.seconds % 60}")

        # Get and display any outputs you want to see on your local terminal
        log.info(single_cell_images_dataset.get_result(state, flow))
Example #10
def test_deep_map_with_a_retry(monkeypatch):
    """
    Creates a situation in which a deeply-mapped Flow encounters a one-time error in one
    of the middle layers. Running the flow a second time should resolve the error.

    DOES NOT WORK WITH DASK EXECUTORS because of the need for shared state on second run
    """

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())
    task_run_id_3 = str(uuid.uuid4())

    with prefect.Flow(name="test", result_handler=JSONResultHandler()) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)
        t3 = plus_one.map(t2)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(seconds=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1, task_slug=t1.slug, flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2, task_slug=t2.slug, flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_3, task_slug=t3.slug, flow_run_id=flow_run_id),
        ]
        + [
            TaskRun(id=str(uuid.uuid4()), task_slug=t.slug, flow_run_id=flow_run_id)
            for t in flow.tasks
            if t not in [t1, t2, t3]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()
    assert client.task_runs[task_run_id_3].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2, t3]:
        assert (
            len([tr for tr in client.task_runs.values() if tr.task_slug == t.slug]) == 4
        )

    # t2's first child task should be retrying
    t2_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t2.slug and tr.map_index == 0
    )
    assert isinstance(t2_0.state, Retrying)

    # t3's first child task should be pending
    t3_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t3.slug and tr.map_index == 0
    )
    assert t3_0.state.is_pending()

    # RUN A SECOND TIME with an artificially updated start time
    failed_id = [
        t_id
        for t_id, tr in client.task_runs.items()
        if tr.task_slug == t2.slug and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t2.slug and tr.map_index == 0
    )
    assert t2_0.state.is_successful()

    # t3's first child task should be successful
    t3_0 = next(
        tr
        for tr in client.task_runs.values()
        if tr.task_slug == t3.slug and tr.map_index == 0
    )
    assert t3_0.state.is_successful()
Example #11
def test_non_keyed_states_are_hydrated_correctly_with_retries(
        monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly - for mapped tasks, even non-data dependencies
    can affect the number of children spawned.
    """
    @prefect.task
    def return_list():
        return [1, 2, 3]

    @prefect.task(max_retries=1, retry_delay=datetime.timedelta(minutes=20))
    def fail_once():
        if prefect.context.get("task_run_count", 0) < 2:
            raise SyntaxError("bad")
        else:
            return 100

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries",
                      result=LocalResult(dir=tmpdir)) as flow:
        t1 = fail_once.map(upstream_tasks=[return_list])

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(
                id=task_run_id_2,
                task_slug=flow.slugs[return_list],
                flow_run_id=flow_run_id,
            ),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, return_list]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_successful()

    # there should be a total of 4 task runs corresponding to each mapped task
    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)

    # t1's first child task should be retrying
    assert all([
        isinstance(tr.state, Retrying) for tr in client.task_runs.values()
        if (tr.task_slug == flow.slugs[t1] and tr.map_index != -1)
    ])

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    for idx, tr in client.task_runs.items():
        if tr.task_slug == flow.slugs[t1] and tr.map_index != -1:
            tr.state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)
    assert all(tr.state.is_successful() for tr in client.task_runs.values())
Example #12
    def test_is_pickleable_after_start(self):
        e = LocalExecutor()
        with e.start():
            post = cloudpickle.loads(cloudpickle.dumps(e))
            assert isinstance(post, LocalExecutor)
Example #13
def main():

    p = argparse.ArgumentParser(prog="process", description="Process the FOV pipeline")
    p.add_argument(
        "-s",
        "--save_dir",
        action="store",
        default=Path("./results/"),
        help="Save directory for results",
    )
    p.add_argument(
        "--dataset",
        type=str,
        default="quilt",
        help='Which dataset to use, current can be "quilt", or "labkey"',
    )

    p.add_argument(
        "--n_fovs", type=int, default=100, help="Number of fov's per cell line to use.",
    )
    p.add_argument(
        "--overwrite", type=utils.str2bool, default=False, help="overwite saved results"
    )
    p.add_argument(
        "--use_current_results",
        type=utils.str2bool,
        default=False,
        help="Dont do any processing. just make figures. Set to True by default so you don't overwrite your stuff.",
    )

    # distributed stuff
    p.add_argument(
        "--distributed",
        type=utils.str2bool,
        default=False,
        help="Use Prefect/Dask to do distributed compute.",
    )
    p.add_argument(
        "--port",
        type=int,
        default=99999,
        help="Port over which to communicate with the Dask scheduler.",
    )

    args = p.parse_args()
    args = vars(args)

    distributed = args.pop("distributed")
    port = args.pop("port")

    # For distributed instructions see:
    # https://github.com/AllenCellModeling/fov_processing_pipeline/blob/master/docs/distributed_instructions.md
    if distributed:
        from prefect.engine.executors import DaskExecutor

        executor = DaskExecutor(address=f"tcp://localhost:{port}")

    else:
        executor = LocalExecutor()

    args["executor"] = executor

    process(**args)
Example #14
    def run(
        self,
        clean: bool = False,
        debug: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.
        Parameters
        ----------
        clean: bool
            Should the local staging directory be cleaned prior to this run.
            Default: False (Do not clean)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc.
            Default: False (Do not debug)
        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/
        Basic prefect example:
        https://docs.prefect.io/core/
        """
        # Initialize steps
        select_data = steps.SelectData()
        compute_cell_metrics = steps.ComputeCellMetrics()
        gather_test_visualize = steps.GatherTestVisualize()

        # Choose executor
        if debug:
            exe = LocalExecutor()
        else:

            # Create local cluster
            log.info("Creating LocalCluster")
            current_mem_gb = psutil.virtual_memory().available / 2**30
            n_workers = int(current_mem_gb // 4)
            cluster = LocalCluster(n_workers=n_workers)
            log.info("Created LocalCluster")

            # Set distributed_executor_address
            distributed_executor_address = cluster.scheduler_address

            # Batch size on local cluster
            batch_size = int(psutil.cpu_count() // n_workers)

            # Log dashboard URI
            log.info(f"Dask dashboard available at: {cluster.dashboard_link}")

            # Use dask cluster
            exe = DaskExecutor(distributed_executor_address)

        # Configure your flow
        with Flow("polar_express") as flow:
            # If you want to clean the local staging directories pass clean
            # If you want to utilize some debugging functionality pass debug
            # If you don't utilize any of these, just pass the parameters you need.

            # step 1: select cells and store in annotation file
            selected_cells_manifest = select_data(
                clean=clean,
                debug=debug,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                **kwargs,  # Allows us to pass `--n {some integer}` or other params
            )

            # step 2: compute metrics for each of the cells
            cell_metrics_manifest = compute_cell_metrics(
                selected_cells_manifest,
                clean=clean,
                debug=debug,
                distributed_executor_address=distributed_executor_address,
                batch_size=batch_size,
                **kwargs,  # Allows us to pass `--n {some integer}` or other params
            )

            # step 3: gather the computed metrics and create visualizations
            gather_test_visualize(
                cell_metrics_manifest,
                clean=clean,
                debug=debug,
                **kwargs,  # Allows us to pass `--n {some integer}` or other params
            )

        # Run flow and get ending state
        state = flow.run(executor=exe)

        # Get and display any outputs you want to see on your local terminal
        log.info(select_data.get_result(state, flow))
        log.info(compute_cell_metrics.get_result(state, flow))
        log.info(gather_test_visualize.get_result(state, flow))
Example #15
    load = postgres.load_datafile.map(datafile=downloads)
    # commit new data to database and clean up
    complete = postgres.complete_load()

    # make sure prep runs before load
    flow.add_edge(upstream_task=prep, downstream_task=load)
    # make sure load runs before complete
    flow.add_edge(upstream_task=load, downstream_task=complete)

if __name__ == "__main__":
    logger = prefect.context.get("logger")

    dask = prefect.config.dask
    mode = prefect.config.mode
    reset_db = prefect.config.reset_db

    all_datasets = dict(prefect.config.socrata.datasets)
    years = list(prefect.config.data.years)

    # use only year datasets if in full mode otherwise use all w/since
    if mode == 'full':
        run_datasets = dict((k, all_datasets[k]) for k in years)
    else:
        run_datasets = all_datasets

    logger.info(
        f"Starting \"{mode}\" flow for {', '.join(run_datasets.keys())}"
        f" {'and resetting db' if reset_db else ''}")
    state = flow.run(datasets=list(run_datasets.values()),
                     executor=LocalDaskExecutor() if dask else LocalExecutor())
Example #16
    def run(
        self,
        distributed: bool = False,
        overwrite: bool = False,
        debug: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.

        Parameters
        ----------
        distributed: bool
            A boolean option to determine if the jobs should be distributed to a SLURM
            cluster when possible.
            Default: False (Do not distribute)
        overwrite: bool
            If this pipeline has already partially or completely run, should it
            overwrite the previous files or not.
            Default: False (Do not overwrite or regenerate files)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc. Additionally, if debug is True, any mapped
            operation will run on threads instead of processes.
            Default: False (Do not debug)

        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/

        Basic prefect example:
        https://docs.prefect.io/core/
        """
        # Initialize steps
        validate_dataset = steps.ValidateDataset()
        prep_analysis_sc = steps.PrepAnalysisSingleCellDs()
        # run_mito_class = steps.MitoClass()
        # merge_data_for_cfe = steps.MergeDataset()

        # Choose executor
        if debug:
            exe = LocalExecutor()
            distributed_executor_address = None
            log.info("Debug flagged. Will use threads instead of Dask.")
        else:
            if distributed:
                # Create or get log dir
                # Do not include ms
                log_dir_name = datetime.now().isoformat().split(".")[0]
                log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
                # Log dir settings
                log_dir.mkdir(parents=True, exist_ok=True)

                # Create cluster
                log.info("Creating SLURMCluster")
                cluster = SLURMCluster(
                    cores=1,
                    memory="15GB",
                    queue="aics_gpu_general",
                    walltime="10:00:00",
                    local_directory=str(log_dir),
                    log_directory=str(log_dir),
                )

                # Spawn workers
                cluster.scale(180)
                log.info("Created SLURMCluster")

                # Use the port from the created connector to set executor address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")
            else:
                # Create local cluster
                log.info("Creating LocalCluster")
                cluster = LocalCluster()
                log.info("Created LocalCluster")

                # Set distributed_executor_address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")

            # Use dask cluster
            exe = DaskExecutor(distributed_executor_address)

        # Configure your flow
        with Flow("cvapipe") as flow:
            # Allows us to pass `--raw_dataset {some path}`
            validated_data_path = validate_dataset(**kwargs)

            prep_analysis_sc(
                dataset=validated_data_path,
                distributed_executor_address=distributed_executor_address,
                **kwargs,
            )

            # mitotic classifier was implemented with plt.
            # PLT has its own distributed handler, which is not quite
            # compatible with prefect + dask
            """
            cell_data_with_annotation = run_mito_class(
                dataset=single_cell_data_path,
                **kwargs,
            )

            cell_data_cfe = merge_data_for_cfe(
                dataset_with_annotation=cell_data_with_annotation,
                dataset_from_labkey=validated_data_path,
                **kwargs,
            )
            """

        # Run flow and get ending state
        state = flow.run(executor=exe)

        # Get and display any outputs you want to see on your local terminal
        log.info(validate_dataset.get_result(state, flow))
Example #17
def process(
    save_dir: Path,
    overwrite: bool,
    use_current_results: bool,
    n_fovs: int = 100,
    dataset: str = "quilt",
    executor=LocalExecutor(),
):
    """
    Dask/Prefect distributed command for running pipeline
    """

    save_dir = str(save_dir.resolve())

    log.info("Saving in {}".format(save_dir))

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # This is the main function
    with Flow("FOV_processing_pipeline") as flow:
        # for every FOV, do the processing steps

        ###########
        # load data
        ###########
        data = wrappers.save_load_data(
            save_dir, n_fovs=n_fovs, overwrite=overwrite, dataset=dataset
        )

        # we have to unpack this way because of Prefect-reasons
        cell_data = data[0]
        fov_data = data[1]

        ###########
        # get all of the save paths
        ###########
        paths = wrappers.get_save_paths(save_dir, fov_data)

        # we have to unpack this way because of Prefect-reasons
        summary_path = paths[0]
        stats_paths = paths[1]
        proj_paths = paths[2]

        ###########
        # Summary Table
        ###########
        wrappers.cell_data_to_summary_table(cell_data, summary_path)

        ###########
        # The per-fov map step
        ###########
        fov_rows = wrappers.get_data_rows(fov_data)

        if not use_current_results:
            process_fov_row_map = wrappers.process_fov_row.map(
                fov_row=fov_rows,
                stats_path=stats_paths,
                proj_path=proj_paths,
                overwrite=unmapped(overwrite),
            )
            upstream_tasks = [process_fov_row_map]
        else:
            upstream_tasks = None

        ###########
        # Load relevant data as a reduce step
        ###########
        df_stats = wrappers.load_stats(
            fov_data, stats_paths, upstream_tasks=upstream_tasks
        )

        ###########
        # QC data based on previous thresholds, etc
        ###########
        df_stats_qc = wrappers.qc_stats(df_stats, save_dir)

        if not use_current_results:

            ###########
            # Make Plots
            ###########
            wrappers.stats2plots(
                df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
            )

            ###########
            # Make diagnostic images
            ###########
            wrappers.im2diagnostics(
                fov_data, proj_paths, parent_dir=save_dir, upstream_tasks=[df_stats]
            )

        ###########
        # Do data splits for the data that survived QC
        ###########
        splits_dict = wrappers.data_splits(
            df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
        )

    state = flow.run(executor=executor)

    fov_data = state.result[flow.get_tasks(name="save_load_data")[0]].result[1]
    df_stats = state.result[flow.get_tasks(name="load_stats")[0]].result
    splits_dict = state.result[flow.get_tasks(name="data_splits")[0]].result

    log.info("Done!")

    return fov_data, df_stats, splits_dict
Example #18
    # Scrape the website
    tsx_imb_df = get_tsx_moc_imb(tsx_url)

    # Get the connection string from prefect cloud 
    conn_str = PrefectSecret("moc_pgdb_conn")
    
    # Partition the df into n_conn chunks
    tsx_imb_df_lst = partition_df(tsx_imb_df, n_conn)

    df_shape = df_to_db.map(tsx_imb_df_lst, tbl_name=unmapped(imb_tbl_nm), conn_str=unmapped(conn_str))

if __name__ == "__main__":

    # Inputs
    tsx_url = 'https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html'
    backup_url = "https://web.archive.org/web/20200414202757/https://api.tmxmoney.com/mocimbalance/en/TSX/moc.html"

    # Script
    from prefect.engine.executors import LocalExecutor

    tsx_imb_fl.visualize()
    fl_state = tsx_imb_fl.run(
        parameters=dict(
            tsx_url=backup_url,
            n_conn=4
        ), 
        executor=LocalExecutor()

    )
    tsx_imb_fl.visualize(flow_state=fl_state)
Example #19
def local():
    "Local, immediate execution executor"
    yield LocalExecutor()
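This is presumably registered as a pytest fixture (the `@pytest.fixture` decorator is not shown in the excerpt). A hedged sketch of how a test might consume it, with the flow and task below as illustrative assumptions:

from prefect import Flow, task

@task
def add_one(x):
    return x + 1

def test_runs_with_local_executor(local):
    # pytest injects the LocalExecutor yielded by the fixture above
    with Flow("fixture-demo") as flow:
        add_one(1)
    assert flow.run(executor=local).is_successful()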
Example #20
    def test_wait(self):
        """LocalExecutor's wait() method just returns its input"""
        assert LocalExecutor().wait(1) == 1
        assert LocalExecutor().wait(prefect) is prefect
Example #21
    def test_submit(self):
        """LocalExecutor directly executes the function"""
        assert LocalExecutor().submit(lambda: 1) == 1
        assert LocalExecutor().submit(lambda x: x, 1) == 1
        assert LocalExecutor().submit(lambda x: x, x=1) == 1
        assert LocalExecutor().submit(lambda: prefect) is prefect
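Taken together with Example #20, these two tests describe the whole LocalExecutor contract: submit runs the callable immediately in the calling process and wait is a passthrough. A small sketch illustrating that contract (the lambdas are illustrative only):

from prefect.engine.executors import LocalExecutor

executor = LocalExecutor()
futures = [executor.submit(lambda i=i: i * 2) for i in range(3)]
# With LocalExecutor the "futures" are already plain results, and wait returns its input
assert executor.wait(futures) == futures == [0, 2, 4]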
Example #22
    def run(
        self,
        distributed: bool = False,
        clean: bool = False,
        debug: bool = False,
        structs: list = ["Nuc"],
        flow_viz: bool = False,
        **kwargs,
    ):
        """
        Run a flow with your steps.
        Parameters
        ----------
        distributed: bool
            A boolean option to determine if the jobs should be distributed to a remote
            cluster when possible.
            Default: False (Do not distribute)
        clean: bool
            Should the local staging directory be cleaned prior to this run.
            Default: False (Do not clean)
        debug: bool
            A debug flag for the developer to use to manipulate how much data runs,
            how it is processed, etc.
            Default: False (Do not debug)
        structs: List
            List of structure data to run pipeline on. Currently, only
            'Nuc' (nuclear membrane) and 'Cell' (cell membrane) are supported.
        flow_viz: bool
            Make flow chart to visualize pipeline - requires conda install of graphviz.

        Notes
        -----
        Documentation on prefect:
        https://docs.prefect.io/core/
        Basic prefect example:
        https://docs.prefect.io/core/
        """

        # Initialize steps
        if "Nuc" in structs:
            loaddata_nuc = steps.LoadData()
            shparam_nuc = steps.Shparam(step_name="shparam_nuc")
            avgshape_nuc = steps.Avgshape(step_name="avgshape_nuc")
            nma_nuc = steps.Nma(step_name="nma_nuc")

        if "Cell" in structs:
            single_cell = steps.Singlecell(step_name="single_cell")
            shparam_cell = steps.Shparam(step_name="shparam_cell")
            avgshape_cell = steps.Avgshape(step_name="avgshape_cell")
            nma_cell = steps.Nma(step_name="nma_cell")

        if "Nuc" in structs and "Cell" in structs:
            compare_nuc_cell = steps.CompareNucCell()

        # Choose executor
        if debug:
            exe = LocalExecutor()
            distributed_executor_address = None
            log.info(f"Debug flagged. Will use threads instead of Dask.")
        else:
            if distributed:
                # Create or get log dir
                # Do not include ms
                log_dir_name = datetime.now().isoformat().split(".")[0]
                log_dir = Path(f".dask_logs/{log_dir_name}").expanduser()
                # Log dir settings
                log_dir.mkdir(parents=True, exist_ok=True)

                # Configure dask config
                dask.config.set({
                    "scheduler.work-stealing": False,
                    "logging.distributed.worker": "info",
                })

                # Create cluster
                log.info("Creating SLURMCluster")
                cluster = SLURMCluster(
                    cores=4,
                    memory="20GB",
                    queue="aics_cpu_general",
                    walltime="10:00:00",
                    local_directory=str(log_dir),
                    log_directory=str(log_dir),
                )
                log.info("Created SLURMCluster")

                # Scale cluster
                cluster.scale(60)

                # Use the port from the created connector to set executor address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")
            else:
                # Create local cluster
                log.info("Creating LocalCluster")
                cluster = LocalCluster()
                log.info("Created LocalCluster")

                # Set distributed_executor_address
                distributed_executor_address = cluster.scheduler_address

                # Log dashboard URI
                log.info(
                    f"Dask dashboard available at: {cluster.dashboard_link}")

            # Use dask cluster
            exe = DaskExecutor(distributed_executor_address)

        try:
            # Configure your flow
            with Flow("mti_nma") as flow:
                # If your step utilizes dask pass the executor address
                # If you want to clean the local staging directories pass clean
                # If you want to utilize some debugging functionality pass debug
                # If you don't utilize any of these, just pass the parameters you need.

                if "Nuc" in structs:
                    struct = "Nuc"

                    ld_nuc_df = loaddata_nuc(
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )
                    sh_nuc_df = shparam_nuc(
                        sc_df=ld_nuc_df,
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )
                    avg_nuc_df = avgshape_nuc(
                        sh_df=sh_nuc_df,
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )
                    nma_nuc_df = nma_nuc(
                        avg_df=avg_nuc_df,
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )

                if "Cell" in structs:
                    struct = "Cell"

                    sc_cell_df = single_cell(
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )
                    sh_cell_df = shparam_cell(
                        sc_df=sc_cell_df,
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )
                    avg_cell_df = avgshape_cell(
                        sh_df=sh_cell_df,
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )
                    nma_cell_df = nma_cell(
                        avg_df=avg_cell_df,
                        distributed_executor_address=distributed_executor_address,
                        clean=clean,
                        debug=debug,
                        struct=struct,
                        **kwargs,
                    )

                # If nucleus and cell membrane were analyzed, draw comparison plot
                if "Nuc" in structs and "Cell" in structs:
                    compare_nuc_cell(nma_nuc_df, nma_cell_df)

            # Run flow, get ending state, and visualize pipeline
            flow.run(executor=exe)

            # Create pipeline visualization if flag is True
            # Note:
            # Flag False by default as a required package is not pip-installable
            # To use this feature, first `conda install graphviz`
            if flow_viz:
                flow.visualize()

        # Catch any error and kill the remote dask cluster
        except Exception as err:
            log.error(f"Something went wrong during pipeline run: {err}")
Example #23
    def test_is_pickleable(self):
        e = LocalExecutor()
        post = cloudpickle.loads(cloudpickle.dumps(e))
        assert isinstance(post, LocalExecutor)
Example #24
@task
def transform(df):

    insert_loc = 0
    for col_name in df.columns[0:4]:
        df.insert(insert_loc + 1, col_name + '_mean',
                  df.groupby('species')[col_name].transform(np.mean))
        df.insert(insert_loc + 2, col_name + '_stdev',
                  df.groupby('species')[col_name].transform(np.std))
        insert_loc += 3

    return df


@task
def load(df):

    conn = create_engine(
        'postgresql://*****:*****@redshift-cluster-1.cbcap9uylzfk.us-east-2.redshift.amazonaws.com:5439/dev'
    )
    df.to_sql('iris_data', conn, index=False)


with Flow("ETL") as flow:
    e = extract()
    t = transform(e)
    l = load(t)

state = flow.run(executor=LocalExecutor())
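The `extract` task used in the ETL flow above is not included in the excerpt. A hypothetical sketch, assuming the classic iris dataset given the `species` column and the `iris_data` table used downstream (the data source URL is an assumption for illustration only):

import pandas as pd
from prefect import task

@task
def extract():
    # Hypothetical source; the excerpt only shows that the frame has a 'species'
    # column plus four numeric measurement columns.
    url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
    return pd.read_csv(url)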
def generate_movies(
    img: Union[str, Path],
    distributed_executor_port: Optional[Union[str, int]] = None,
    save_path: Optional[Union[str, Path]] = None,
    operating_dim: str = Dimensions.Time,
    overwrite: bool = False,
    fps: int = 12,
    quality: int = 6,
    save_format: str = "mp4",
    save_workflow: bool = False,
    normalization_func: Callable = single_channel_percentile_norm,
    normalization_kwargs: Dict[str, Any] = {},
    projection_func: Callable = single_channel_max_project,
    projection_kwargs: Dict[str, Any] = {},
    S: Optional[Union[int, slice]] = None,
    C: Optional[Union[int, slice]] = None,
    B: Union[int, slice] = 0,
) -> Path:
    """
    Generate a movie for every scene and channel pair found in a file through an
    operating dimension.

    Parameters
    ----------
    img: Union[str, Path]
        Path to a CZI file to read and generate movies for.
    distributed_executor_port: Optional[Union[str, int]]
        If provided a port to use for connecting to the distributed scheduler. All image
        computation and workflow tasks will be distributed using Dask.
        Default: None
    save_path: Optional[Union[str, Path]]
        A specific path to save the generated movies to.
        Default: A directory named after the provided file.
    operating_dim: str
        Which dimension to operating through for each frame of the movie.
        Default: Dimensions.Time ("T")
    overwrite: bool
        Should existing files found under the same directory name be overwritten.
        Default: False
    fps: int
        Frames per second of each produced movie.
        Default: 12
    quality: int
        ImageIO's compression system. 0 is high compression, 10 is no compression.
        Default: 6
    save_format: str
        Which movie format should be used for each produced file.
        Default: mp4
        Available: mov, avi, mpg, mpeg, mp4, mkv, wmv
    save_workflow: bool
        Optionally, save a PNG and PDF of the workflow that ran.
        If this is set to True, be sure you have installed graphviz and added
        its executable to your PATH.
        Default: False
    normalization_func: Callable
        A function to normalize the entire movie data prior to projection.
        Default: timelapse_tools.normalization.single_channel_percentile_norm
    normalization_kwargs: Dict[str, Any]
        Any extra arguments to pass to the normalization function.
        Default: {}
    projection_func: Callable
        A function to project the data for at each frame of the movie.
        Default: timelapse_tools.projection.single_channel_max_project
    projection_kwargs: Dict[str, Any]
        Any extra arguments to pass to the projection function.
        Default: {}
    S: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the scenes to process.
        Default: None (process all scenes)
    C: Optional[Union[int, slice]]
        A specific integer or slice to use for selecting down the channels to process.
        Default: None (process all channels)
    B: Union[int, slice]
        A specific integer or slice to use for selecting down the B dimension to process.
        Default: 0
    Returns
    -------
    save_path: Path
        The path to the produced scene-channel pairings of movies.
    """
    if distributed_executor_port:
        from prefect.engine.executors import DaskExecutor

        executor = DaskExecutor(
            address=f"tcp://localhost:{distributed_executor_port}")
    else:
        from prefect.engine.executors import LocalExecutor

        executor = LocalExecutor()

    # Run all processing through prefect + dask for better
    # parallelization and task optimization
    with Flow("czi_to_mp4_conversion") as flow:
        # Convert img to Path
        img = Path(img).expanduser().resolve(strict=True)

        # Determine save path
        save_path = _get_save_path(save_path=save_path,
                                   overwrite=overwrite,
                                   fname=img.with_suffix("").name)

        # Setup and check image and operating dimension provided
        img_details = _img_prep(
            img=img,
            operating_dim=operating_dim,
            # Don't run if save path checking failed
            upstream_tasks=[save_path],
        )

        # Select scene data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.Scene,
            dim_indicies_selected=S,
        )

        # Select channel data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.Channel,
            dim_indicies_selected=C,
        )

        # Select 'B' data
        img_details = _select_dimension(
            img=img_details[0],
            dims=img_details[1],
            dim_name=Dimensions.B,
            dim_indicies_selected=B,
        )

        # Generate all the indicie sets we will need to process
        getitem_indicies = _generate_getitem_indicies(
            img_shape=_get_image_shape(img_details[0]), dims=img_details[1])

        # Generate all the movie selections
        to_process = _generate_process_list(img=img_details[0],
                                            getitem_indicies=getitem_indicies)

        # Generate a list of dictionaries that map dimension to selected data
        selected_indices = _generate_selected_dims_list(
            dims=img_details[1], getitem_indicies=getitem_indicies)

        # Generate movies for each
        _generate_movie.map(
            data=to_process,
            selected_indices=selected_indices,
            dims=unmapped(img_details[1]),
            operating_dim=unmapped(operating_dim),
            save_path=unmapped(save_path),
            fps=unmapped(fps),
            save_format=unmapped(save_format),
            normalization_func=unmapped(normalization_func),
            normalization_kwargs=unmapped(normalization_kwargs),
            projection_func=unmapped(projection_func),
            projection_kwargs=unmapped(projection_kwargs),
        )

    # Run the flow
    state = flow.run(executor=executor)

    # Get resulting path
    save_path = state.result[flow.get_tasks(name="_get_save_path")[0]].result

    # Save the flow viz to the same save_path
    if save_workflow:
        flow.visualize(filename=str(save_path / "workflow.png"))

    return save_path
def test_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly.
    """

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries",
                      result=LocalResult(dir=tmpdir)) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2,
                    task_slug=flow.slugs[t2],
                    flow_run_id=flow_run_id),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, t2]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2]:
        assert (len([
            tr for tr in client.task_runs.values()
            if tr.task_slug == flow.slugs[t]
        ]) == 4)

    # t2's first child task should be retrying
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert isinstance(t2_0.state, Retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    failed_id = [
        t_id for t_id, tr in client.task_runs.items()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert t2_0.state.is_successful()