Example #1
    def test_local_result_creates_necessary_dirs(self, tmp_dir):
        os_independent_template = os.path.join("mydir", "mysubdir", "{thing}.txt")
        result = LocalResult(dir=tmp_dir, location=os_independent_template)
        new_result = result.write("so-much-data", thing=42)
        assert new_result.location == os.path.join("mydir", "mysubdir", "42.txt")
        assert new_result.value == "so-much-data"
Example #2
    def test_local_result_writes_to_dir(self, tmp_dir, res):
        result = LocalResult(dir=tmp_dir, location="test.txt")
        fpath = result.write(res).location
        assert isinstance(fpath, str)
        assert fpath.endswith("test.txt")

        with open(os.path.join(tmp_dir, fpath), "rb") as f:
            val = f.read()
        assert isinstance(val, bytes)
Example #3
    def test_copy_appropriately_sets_result_target_if_target_provided(self):
        # https://github.com/PrefectHQ/prefect/issues/2588
        @task(target="target", result=LocalResult(dir="."))
        def X():
            pass

        @task
        def Y():
            pass

        with Flow("test"):
            x = X()
            y = Y(task_args=dict(target="target", result=LocalResult(dir=".")))

        assert x.result.location == "target"
        assert y.result.location == "target"
Example #4
    def __init__(
        self,
        directory: str = None,
        validate: bool = True,
        path: str = None,
        stored_as_script: bool = False,
        **kwargs: Any,
    ) -> None:
        directory = directory or os.path.join(prefect.config.home_dir, "flows")
        self.flows = dict()  # type: Dict[str, str]
        self._flows = dict()  # type: Dict[str, "prefect.core.flow.Flow"]

        self.path = path

        if validate:
            abs_directory = os.path.abspath(os.path.expanduser(directory))
            os.makedirs(abs_directory, exist_ok=True)
        else:
            abs_directory = directory

        self.directory = abs_directory
        result = LocalResult(self.directory, validate_dir=validate)
        super().__init__(result=result,
                         stored_as_script=stored_as_script,
                         **kwargs)
Example #5
    def test_getitem_preserves_result_info(self):
        with Flow(name="test") as f:
            z = Task(checkpoint=False)()[0]
            y = Task(checkpoint=True, result=LocalResult(dir="home"))()[1]

        assert z.checkpoint is False
        assert isinstance(y.result, LocalResult)
        assert y.result.dir.endswith("home")
Example #6
    def test_getattr_preserves_result_info(self):
        with Flow(name="test") as f:
            p = Parameter("p")
            z = GetAttr(checkpoint=False)(p, "foo")
            y = GetAttr(checkpoint=True, result=LocalResult(dir="home"))(p, "bar")

        assert z.checkpoint is False
        assert isinstance(y.result, LocalResult)
        assert y.result.dir.endswith("home")
Example #7
    def test_doesnt_raise_for_mapped_tasks_with_correctly_specified_result_location(
            self, location, tmpdir):
        @task(result=LocalResult(dir=tmpdir, location=location))
        def down(x):
            pass

        with Flow("upstream-test") as f:
            result = down.map(x=[1, 2, 3])

        assert healthchecks.result_check([f]) is None
Example #8
    def test_raises_for_mapped_tasks_with_poorly_specified_result_location(
            self, tmpdir):
        @task(result=LocalResult(dir=tmpdir, location="{task_name}.txt"))
        def down(x):
            pass

        with Flow("upstream-test") as f:
            result = down.map(x=[1, 2, 3])

        with pytest.raises(ValueError, match="filename"):
            healthchecks.result_check([f])
Example #9
def load_result(checkpoint_dir: Union[str, Path], date: str, name: str) -> Any:
    """Loads a Prefct checkpointed result from file for the given date.

    Args:
        date (str): date to load the checkpoint from
        name (str): name of the file (stem, e.g. 'p' if file name is 'p.prefect')

    Returns:
        Any
    """
    result_existence = LocalResult(dir=Path(checkpoint_dir).as_posix()).exists(
        location=Path(date, f'{name}.prefect').as_posix()
    )
    assert (
        result_existence
    ), f'Result must exist, checked {Path(checkpoint_dir, date).as_posix()} for {name}.prefect.'
    return (
        LocalResult(dir=Path(checkpoint_dir).as_posix())
        .read(location=Path(date, f'{name}.prefect').as_posix())
        .value
    )
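A minimal usage sketch of the loader above, assuming results were checkpointed under <checkpoint_dir>/<date>/<name>.prefect; the directory, date, and name here are hypothetical:

# Reads checkpoints/2021-01-15/p.prefect and returns the deserialized value
value = load_result('checkpoints', date='2021-01-15', name='p')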
Example #10
def test(e: Optional[Executor]):
    with TemporaryDirectory() as tmpdir:
        flow_result = LocalResult(tmpdir, serializer=JSONSerializer(),
                                  location="{task_name}.json")

        with Flow("write_result", result=flow_result) as f:
            _terminal = task(lambda: 42, checkpoint=True, name="magic")()

        with set_temporary_config({"flows.checkpointing": True}), \
             raise_on_exception():
            f.run(executor=e)

        files = os.listdir(tmpdir)
        assert files == ["magic.json"], files
        with open(os.path.join(tmpdir, files[0]), "rb") as file:
            val = json.load(file)
        assert val == 42
Example #11
    def __init__(
        self, directory: str = None, validate: bool = True, **kwargs: Any
    ) -> None:
        directory = directory or os.path.join(prefect.config.home_dir, "flows")
        self.flows = dict()  # type: Dict[str, str]
        self._flows = dict()  # type: Dict[str, "prefect.core.flow.Flow"]

        if validate:
            abs_directory = os.path.abspath(os.path.expanduser(directory))
            if not os.path.exists(abs_directory):
                os.makedirs(abs_directory)
        else:
            abs_directory = directory

        self.directory = abs_directory
        result = LocalResult(self.directory, validate_dir=validate)
        super().__init__(result=result, **kwargs)
Example #12
def test_task_call_with_self_succeeds():
    import dataclasses

    @dataclasses.dataclass
    class TestClass:
        count: int

        def increment(self):
            self.count = self.count + 1

    seconds_task = task(TestClass.increment,
                        target="{task_slug}_{map_index}",
                        result=LocalResult())
    initial = TestClass(count=0)

    with Flow("test") as flow:
        seconds_task(initial)
    assert flow.run().is_successful()
Example #13
def test_task_runner_treats_unfound_files_as_invalid_caches(client, tmpdir):
    @prefect.task(cache_for=datetime.timedelta(minutes=1), result=PrefectResult())
    def cached_task():
        return 42

    state = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(minutes=2),
        result=LocalResult(location=str(tmpdir / "made_up_data.prefect")),
    )
    old_state = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(days=1),
        result=PrefectResult(location="13"),
    )
    client.get_latest_cached_states = MagicMock(return_value=[state, old_state])

    res = CloudTaskRunner(task=cached_task).run()
    assert client.get_latest_cached_states.called
    assert res.is_successful()
    assert res.is_cached()
    assert res.result == 13
Example #14
from prefect import Flow, task
from prefect.engine.executors import DaskExecutor
from prefect.engine.results import LocalResult
from prefect.environments import LocalEnvironment
from prefect.environments.storage import Local


@task
def generate_list():
    return [1, 2, 3]


@task
def do_something(n):
    return n


@task
def fail(x):
    print(x)
    raise ValueError()


result = LocalResult(location="{task_full_name}.pb")
with Flow(
    "Restart Me",
    storage=Local(
        stored_as_script=True,
        path="/Users/josh/Desktop/code/Dummy-Flows/restartme.py",
    ),
    result=result,
) as flow:
    lst = generate_list()
    d = do_something.map(lst)
    fail(d)

environment = LocalEnvironment(executor=DaskExecutor())
flow.environment = environment
Example #15
            player_stats, team_stats).loc[team_rosters[team_name]]

    def __repr__(self):
        return f'League {self.league_id}'

    @classmethod
    def load_league(cls, file_path: Union[str, Path]):
        with open(file_path, 'r') as f:
            league_config = json.load(f)
        return cls(**league_config)


@task(
    name='Get League Mean Statistics',
    result=LocalResult(
        location=
        "{output_directory}/{date:%m}-{date:%d}-{date:%Y}/league_mean_statistics.prefect"
    ),
    checkpoint=True,
)
def get_league_mean_statistics(team_stats: pd.DataFrame) -> pd.DataFrame:
    """Aggregates team aggregated statistics over the whole league.

    Args:
        team_stats (pd.DataFrame): multi-indexed dataframe with all teams statistics

    Returns:
        pd.DataFrame: 2D dataframes containing traditional mean statistics
            (e.g. PTS, AST) over
            * the last 7 days
            * the last 15 days
            * the last 30 days
    """

Example #16
    def test_build_and_register(self, capsys, monkeypatch, force):
        """Build and register a few flows:
        - 1 new flow
        - 1 updated flow
        - 1 skipped flow
        - 1 error during registration
        - 2 sharing the same storage (which fails to build properly)
        - 2 from a pre-built JSON file
        """
        build_call_count = 0

        class MyModule(Module):
            def build(self):
                nonlocal build_call_count
                build_call_count += 1

        class BadStorage(Module):
            def build(self):
                raise ValueError("whoops!")

        client = MagicMock()
        register_serialized_flow = MagicMock()
        register_serialized_flow.side_effect = [
            ("new-id-1", 1, True),
            ("old-id-2", 2, False),
            ("new-id-3", 3, True),
            ValueError("Oh no!"),
            ("new-id-7", 1, True),
            ("old-id-8", 2, False),
        ]
        monkeypatch.setattr(
            "prefect.cli.build_register.register_serialized_flow",
            register_serialized_flow,
        )

        storage1 = MyModule("testing")
        storage1.result = LocalResult()
        flow1 = Flow("flow 1",
                     storage=storage1,
                     run_config=UniversalRun(labels=["a"]))
        flow2 = Flow(
            "flow 2",
            storage=MyModule("testing"),
            environment=LocalEnvironment(labels=["a"]),
        )
        storage2 = MyModule("testing")
        flow3 = Flow("flow 3", storage=storage2)
        flow4 = Flow("flow 4", storage=storage2)
        storage3 = BadStorage("testing")
        flow5 = Flow("flow 5", storage=storage3)
        flow6 = Flow("flow 6", storage=storage3)
        flow7 = box.Box(
            Flow("flow 7",
                 run_config=UniversalRun(labels=["a"])).serialize(build=False))
        flow8 = box.Box(
            Flow("flow 8", environment=LocalEnvironment(
                labels=["a"])).serialize(build=False))
        flows = [flow1, flow2, flow3, flow4, flow5, flow6, flow7, flow8]

        stats = build_and_register(client,
                                   flows,
                                   "my-project-id",
                                   labels=["b", "c"],
                                   force=force)

        # 3 calls (one for each unique `MyModule` storage object)
        assert build_call_count == 3

        # 6 register calls (8 - 2 that failed to build storage)
        assert register_serialized_flow.call_count == 6
        for flow, (args,
                   kwargs) in zip(flows,
                                  register_serialized_flow.call_args_list):
            assert not args
            assert kwargs["client"] is client
            assert kwargs["serialized_flow"]
            assert kwargs["project_id"] == "my-project-id"
            assert kwargs["force"] == force

        # Stats are recorded properly
        assert dict(stats) == {"registered": 3, "skipped": 2, "errored": 3}

        # Flows are properly configured
        assert flow1.result is storage1.result
        assert flow1.run_config.labels == {"a", "b", "c"}
        assert flow2.environment.labels == {"a", "b", "c"}
        assert isinstance(flow3.run_config, UniversalRun)
        assert flow3.run_config.labels == {"b", "c"}
        assert isinstance(flow4.run_config, UniversalRun)
        assert flow4.run_config.labels == {"b", "c"}
        assert set(flow7["run_config"]["labels"]) == {"a", "b", "c"}
        assert set(flow8["environment"]["labels"]) == {"a", "b", "c"}

        # The output contains a traceback, which will vary between machines
        # We only check that the following fixed sections exist in the output
        parts = [
            ("  Building `MyModule` storage...\n"
             "  Registering 'flow 1'... Done\n"
             "  └── ID: new-id-1\n"
             "  └── Version: 1\n"
             "  Building `MyModule` storage...\n"
             "  Registering 'flow 2'... Skipped (metadata unchanged)\n"
             "  Building `MyModule` storage...\n"
             "  Registering 'flow 3'... Done\n"
             "  └── ID: new-id-3\n"
             "  └── Version: 3\n"
             "  Registering 'flow 4'... Error\n"
             "    Traceback (most recent call last):\n"),
            ("    ValueError: Oh no!\n"
             "\n"
             "  Building `BadStorage` storage...\n"
             "    Error building storage:\n"
             "      Traceback (most recent call last):\n"),
            ("      ValueError: whoops!\n"
             "\n"
             "  Registering 'flow 5'... Error\n"
             "  Registering 'flow 6'... Error\n"
             "  Registering 'flow 7'... Done\n"
             "  └── ID: new-id-7\n"
             "  └── Version: 1\n"
             "  Registering 'flow 8'... Skipped (metadata unchanged)\n"),
        ]
        out, err = capsys.readouterr()
        assert not err
        for part in parts:
            assert part in out
Example #17
from prefect import Flow, task
from prefect.engine.flow_runner import FlowRunner
from nlps_extraction.tasks.baselines import ReadPSInput, CustomTokenizeInput
from nlps_extraction.tasks.baselines.sbert import GenerateKBSBertTask, EvaluateSBertTask
from prefect.engine.results import LocalResult

cache_args = dict(
    target="{task_name}.pkl",
    checkpoint=True,
    result=LocalResult(dir=f"./cache/"),
)

read_input_files = ReadPSInput()
tokenize_data = CustomTokenizeInput()
encode_kb_task = GenerateKBSBertTask()
evaluation_task = EvaluateSBertTask()

MODEL = "sentence-transformers/all-mpnet-base-v2"
with Flow("Running S-BERT baselines") as flow:
    input_files = read_input_files()
    encoded_kb = encode_kb_task(input_files["kb"], MODEL)
    evaluation_task(input_files, encoded_kb, model_name=MODEL)

FlowRunner(flow=flow).run()
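Note that cache_args above is defined but never applied in this snippet; presumably the Task subclasses consume it where they are constructed in the original project. A minimal sketch of how such a dict is typically splatted into a task definition (expensive_step is a hypothetical function, reusing the task import above):

@task(**cache_args)  # shares the target/checkpoint/result config across tasks
def expensive_step(data):
    return data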
Example #18
"""Where all of the tasks for our pipeline go
"""
from pathlib import Path

import prefect
from config_vars import db_name, host, user, result_folder
from prefect import task
from prefect.engine.results import LocalResult
from prefect.tasks.sql_server import SqlServerFetch
from sql import get_manual_override_rows

path = Path(__file__).resolve().parent / result_folder

result_formatter = LocalResult(dir=path,
                               location="{flow_name}/"
                               "{scheduled_start_time:%d-%m_%H-%M-%S}/"
                               "{task_full_name}-{task_run_id}.prefect_result")

# Get our database items
sql_task = SqlServerFetch(db_name=db_name,
                          user=user,
                          host=host,
                          query=get_manual_override_rows,
                          fetch='many',
                          fetch_count=3,
                          result=result_formatter,
                          name="SQL-stuff"
                          # commit: bool = False,
                          )

Example #19
def test_non_keyed_states_are_hydrated_correctly_with_retries(
        monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly - for mapped tasks, even non-data dependencies
    can affect the number of children spawned.
    """
    @prefect.task
    def return_list():
        return [1, 2, 3]

    @prefect.task(max_retries=1, retry_delay=datetime.timedelta(minutes=20))
    def fail_once():
        if prefect.context.get("task_run_count", 0) < 2:
            raise SyntaxError("bad")
        else:
            return 100

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries",
                      result=LocalResult(dir=tmpdir)) as flow:
        t1 = fail_once.map(upstream_tasks=[return_list])

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(
                id=task_run_id_2,
                task_slug=flow.slugs[return_list],
                flow_run_id=flow_run_id,
            ),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, return_list]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_successful()

    # there should be a total of 4 task runs corresponding to each mapped task
    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)

    # t1's first child task should be retrying
    assert all([
        isinstance(tr.state, Retrying) for tr in client.task_runs.values()
        if (tr.task_slug == flow.slugs[t1] and tr.map_index != -1)
    ])

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    for idx, tr in client.task_runs.items():
        if tr.task_slug == flow.slugs[t1] and tr.map_index != -1:
            tr.state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert (len([
        tr for tr in client.task_runs.values()
        if tr.task_slug == flow.slugs[t1]
    ]) == 4)
    assert all(tr.state.is_successful() for tr in client.task_runs.values())
Example #20
from typing import Dict, List

import numpy as np
import pandas as pd

import prefect
from prefect import task
from prefect.engine.results import LocalResult

from .teams import Team


@task(
    name='Get Team Rosters',
    result=LocalResult(
        location=
        "{output_directory}/{date:%m}-{date:%d}-{date:%Y}/team_rosters.prefect"
    ),
    checkpoint=True,
)
def get_team_rosters(season: int, teams: List[Team]) -> Dict[str, List[str]]:
    """Gets rosters for each team using ESPN's fantasy API.

    Args:
        season (int): the season to get roster statistics for
        teams (List[Team]): the teams in the league to consider

    Returns:
        Dict[str, List[str]]: mapping from roster name to roster (list of player names)
    """
    return prefect.context.league.get_team_rosters(season=season, teams=teams)
Example #21
def test_states_are_hydrated_correctly_with_retries(monkeypatch, tmpdir):
    """
    Ensures that retries longer than 10 minutes properly "hydrate" upstream states
    so that mapped tasks retry correctly.
    """

    flow_run_id = str(uuid.uuid4())
    task_run_id_1 = str(uuid.uuid4())
    task_run_id_2 = str(uuid.uuid4())

    with prefect.Flow(name="test-retries",
                      result=LocalResult(dir=tmpdir)) as flow:
        t1 = plus_one.map([-1, 0, 1])
        t2 = invert_fail_once.map(t1)

    t2.max_retries = 1
    t2.retry_delay = datetime.timedelta(minutes=100)

    monkeypatch.setattr("requests.Session", MagicMock())
    monkeypatch.setattr("requests.post", MagicMock())

    client = MockedCloudClient(
        flow_runs=[FlowRun(id=flow_run_id)],
        task_runs=[
            TaskRun(id=task_run_id_1,
                    task_slug=flow.slugs[t1],
                    flow_run_id=flow_run_id),
            TaskRun(id=task_run_id_2,
                    task_slug=flow.slugs[t2],
                    flow_run_id=flow_run_id),
        ] + [
            TaskRun(id=str(uuid.uuid4()),
                    task_slug=flow.slugs[t],
                    flow_run_id=flow_run_id)
            for t in flow.tasks if t not in [t1, t2]
        ],
        monkeypatch=monkeypatch,
    )

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    assert client.flow_runs[flow_run_id].state.is_running()
    assert client.task_runs[task_run_id_1].state.is_mapped()
    assert client.task_runs[task_run_id_2].state.is_mapped()

    # there should be a total of 4 task runs corresponding to each mapped task
    for t in [t1, t2]:
        assert (len([
            tr for tr in client.task_runs.values()
            if tr.task_slug == flow.slugs[t]
        ]) == 4)

    # t2's first child task should be retrying
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert isinstance(t2_0.state, Retrying)

    # RUN A SECOND TIME with an artificially updated start time
    # and remove all in-memory data
    failed_id = [
        t_id for t_id, tr in client.task_runs.items()
        if tr.task_slug == flow.slugs[t2] and tr.map_index == 0
    ].pop()
    client.task_runs[failed_id].state.start_time = pendulum.now("UTC")

    for idx, tr in client.task_runs.items():
        tr.state._result.value = None

    with prefect.context(flow_run_id=flow_run_id):
        CloudFlowRunner(flow=flow).run(executor=LocalExecutor())

    # t2's first child task should be successful
    t2_0 = next(tr for tr in client.task_runs.values()
                if tr.task_slug == flow.slugs[t2] and tr.map_index == 0)
    assert t2_0.state.is_successful()
Example #22
from prefect import Flow, task, unmapped, Parameter
from prefect.engine.results import LocalResult
from prefect.engine.executors import LocalDaskExecutor, DaskExecutor
from prefect.engine.cache_validators import all_parameters

lr = LocalResult(location="{flow_name}-{task_name}-{x}-{y}.pkl",
                 validators=all_parameters)


@task(log_stdout=True, checkpoint=True)
def add(x, y):
    print(f"add ran with {x} {y}")
    try:
        return sum(x) + y
    except TypeError:
        return x + y


with Flow("iterated map", result=lr) as flow:
    y = unmapped(Parameter("y", default=7))
    x = Parameter("x", default=[1, 2, 3])
    mapped_result = add.map(x, y=y)
    out = add(mapped_result, y)

if __name__ == "__main__":
    flow.run(executor=DaskExecutor())
Example #23
# %%
# -------------------------------------------------------
# Pipeline scheduler
# -------------------------------------------------------

schedule = IntervalSchedule(interval=dt.timedelta(days=30)) 



# %%
# -------------------------------------------------------
# Build pipeline 
# -------------------------------------------------------

with Flow(name='malaysia_bank_card_scraping_flow', result=LocalResult(dir="result_config")) as flow: 

    # Step 1: Compile a list of bank names for credit cards. 
    ls_banks_for_card = name_scraping.compile_bank_names_for_card(URL_CARD, '''/html/body/main/section/form/label/select''') 

    # Step 2: Compile a list of credit cards for each bank. 
    dict_cards = name_scraping.compile_credit_cards(
        upstream_tasks=[ls_banks_for_card], 
        ls_banks=ls_banks_for_card, 
        xpath='''/html/body/main/section/ul''',
    ) 

    # Step 3: Run the scrapers. 
    df_card = card_scraping.card_scraping_procedure(
        upstream_tasks=[ls_banks_for_card, dict_cards], 
        url=URL_CARD, 
Example #24
def cat(i: int, result=LocalResult(dir=path)):
    logger = prefect.context.get("logger")
    logger.debug(i)
    return "nine lives"
train_path = settings["rico_sca"]["train"]
dev_path = settings["rico_sca"]["dev"]
test_path = settings["rico_sca"]["test"]

# train_path = settings["sample_rico_sca"]
# dev_path = settings["sample_rico_sca"]
# # test_path = settings["sample_rico_sca"]

# train_path = settings["rico_sca_sample"]["train"]
# dev_path = settings["rico_sca_sample"]["dev"]
# test_path = settings["rico_sca_sample"]["test"]

cache_args = dict(
    target="{task_name}-{task_tags}.pkl",
    checkpoint=True,
    result=LocalResult(dir=f"./cache/datasets/rico/"),
)

prepare_rico_task = PrepareRicoScaPair()
prepare_rico_layout_lm_task = PrepareLayoutLMPairTask()
layout_lm_trainer_task = LayoutLMPair()

INSTRUCTION_TYPE = [2]
#  where: 0 and 3 - Lexical Matching
#             1 - Spatial (Relative to screen)
#             2 - Spatial (Relative to other elements)

with Flow("Running the Transformers for Pair Classification") as flow1:
    with tags("train"):
        train_input = prepare_rico_task(train_path,
                                        type_instructions=INSTRUCTION_TYPE)
Example #26
import prefect
from prefect import task, Flow
from prefect.engine.results import LocalResult

# prefect.config.flows.checkpointing = True


@task(result=LocalResult(location="test.prefect"))
def test():
    print("Hello!")
    return 1


with Flow("test") as flow:
    test()
flow.run()
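After the run above has checkpointed (e.g. with the commented-out checkpointing config enabled), the stored value can presumably be read back through the same result class. A minimal sketch, assuming the default local result directory:

# read the checkpointed return value back from test.prefect
value = LocalResult().read("test.prefect").value
assert value == 1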
Example #27
    def test_build_and_register(self, capsys, monkeypatch, force):
        """Build and register a few flows:
        - 1 new flow
        - 1 updated flow
        - 1 skipped flow
        - 1 error during registration
        - 2 sharing the same storage (which fails to build properly)
        """
        build_call_count = 0

        class MyModule(Module):
            def build(self):
                nonlocal build_call_count
                build_call_count += 1

        class BadStorage(Module):
            def build(self):
                raise ValueError("whoops!")

        client = MagicMock()
        client.graphql.side_effect = [
            GraphQLResult({"data": {"flow": []}}),
            GraphQLResult({"data": {"flow": [{"id": "old-id-2", "version": 1}]}}),
            GraphQLResult({"data": {"flow": [{"id": "old-id-3", "version": 2}]}}),
            GraphQLResult({"data": {"flow": [{"id": "old-id-4", "version": 3}]}}),
        ]
        client.register.side_effect = [
            "new-id-1",
            "old-id-2",
            "new-id-3",
            ValueError("Oh no!"),
        ]

        storage1 = MyModule("testing")
        storage1.result = LocalResult()
        flow1 = Flow("flow 1", storage=storage1, run_config=UniversalRun(labels=["a"]))
        flow2 = Flow(
            "flow 2",
            storage=MyModule("testing"),
            environment=LocalEnvironment(labels=["a"]),
        )
        storage2 = MyModule("testing")
        flow3 = Flow("flow 3", storage=storage2)
        flow4 = Flow("flow 4", storage=storage2)
        storage3 = BadStorage("testing")
        flow5 = Flow("flow 5", storage=storage3)
        flow6 = Flow("flow 6", storage=storage3)
        flows = [flow1, flow2, flow3, flow4, flow5, flow6]

        stats = build_and_register(
            client, flows, "testing", labels=["b", "c"], force=force
        )

        # 3 calls (one for each unique `MyModule` storage object)
        assert build_call_count == 3

        # 4 register calls (6 - 2 that failed to build storage)
        assert client.register.call_count == 4
        for flow, (args, kwargs) in zip(flows, client.register.call_args_list):
            assert not args
            assert kwargs["flow"] is flow
            assert kwargs["project_name"] == "testing"
            assert kwargs["build"] is False
            assert kwargs["no_url"] is True
            if force:
                assert kwargs["idempotency_key"] is None
            else:
                assert kwargs["idempotency_key"]

        # Stats are recorded properly
        assert dict(stats) == {"registered": 2, "skipped": 1, "errored": 3}

        # Flows are properly configured
        assert flow1.result is storage1.result
        assert flow1.run_config.labels == {"a", "b", "c"}
        assert flow2.environment.labels == {"a", "b", "c"}
        assert isinstance(flow3.run_config, UniversalRun)
        assert flow3.run_config.labels == {"b", "c"}
        assert isinstance(flow4.run_config, UniversalRun)
        assert flow4.run_config.labels == {"b", "c"}

        # The output contains a traceback, which will vary between machines
        # We only check that the following fixed sections exist in the output
        parts = [
            (
                "  Building `MyModule` storage...\n"
                "  Registering 'flow 1'... Done\n"
                "  └── ID: new-id-1\n"
                "  └── Version: 1\n"
                "  Building `MyModule` storage...\n"
                "  Registering 'flow 2'... Skipped\n"
                "  Building `MyModule` storage...\n"
                "  Registering 'flow 3'... Done\n"
                "  └── ID: new-id-3\n"
                "  └── Version: 3\n"
                "  Registering 'flow 4'... Error\n"
                "    Traceback (most recent call last):\n"
            ),
            (
                "    ValueError: Oh no!\n"
                "\n"
                "  Building `BadStorage` storage...\n"
                "    Error building storage:\n"
                "      Traceback (most recent call last):\n"
            ),
            (
                "      ValueError: whoops!\n"
                "\n"
                "  Registering 'flow 5'... Error\n"
                "  Registering 'flow 6'... Error\n"
            ),
        ]
        out, err = capsys.readouterr()
        assert not err
        for part in parts:
            assert part in out
Example #28
from typing import Optional
from datetime import datetime
from pathlib import Path
import fsspec
from prefect import task, Flow
from prefect.engine.results import LocalResult
from pyspark.sql.session import SparkSession
from data_source.prefect.tasks import constant
from data_source import catalog
from data_source.core import entry_key_str


# pylint: disable=no-value-for-parameter
@task(
    target="{flow_name}/{task_name}",
    checkpoint=True,
    result=LocalResult(dir="~/.prefect"),
)
def download(ftp_dir, csv_dir, n_mgrel_files):
    csv_dir = Path(csv_dir)
    if not csv_dir.exists():
        csv_dir.mkdir(parents=True, exist_ok=True)

    files = []
    for i in range(n_mgrel_files):
        filename = f'MGREL_{i + 1}.csv.gz'
        path = str(csv_dir / filename)
        url = ftp_dir + '/' + filename
        of = fsspec.open(url)
        of.fs.download(url, path)
        files.append(path)
    return csv_dir
Example #29
from prefect import Task, Flow, task
from prefect.engine.results import LocalResult

@task(target=lambda **kwargs: str(kwargs['task_run_count']))
def get_data():
    """test"""
    return "data"

@task
def print_data(data):
    print(data)


with Flow("using-targets", result=LocalResult(), ) as flow:
    data = get_data()
    print_data(data)

flow.run()
Example #30
from prefect import Flow, Parameter, task
from prefect.engine.results import LocalResult


@task(target="{parameters[val]}")
def get_data(val):
    return [val]


@task
def print_data(data):
    print(data)


with Flow(
        "using-targets",
        result=LocalResult(),
) as flow:
    val = Parameter("val", default="asdf")
    data = get_data(val)
    print_data(data)

flow.run()
# flow.register(project_name="Demo")

# class GetData(Task):
#     def run(self):
#         print(1)

# GetData()

# get_data()
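With the default parameter above, the target template "{parameters[val]}" presumably renders to a file named asdf in the default local result directory, so a second run would reuse the checkpoint instead of re-running get_data. A minimal sketch of the same templating with plain str.format, mirroring how targets are rendered from the runtime context:

# targets are format strings rendered against context; 'parameters' is a dict there
print("{parameters[val]}".format(parameters={"val": "asdf"}))  # -> asdf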