Code example #1
def create_scraper_monitor_flow():
    # Run every day at 15:00 UTC (10:00 US Eastern during standard time)
    schedule = CronSchedule("0 15 * * *")
    with Flow("MonitorFailingScrapers", schedule) as flow:
        slack_webhook_url = EnvVarSecret("SLACK_WEBHOOK_URL")
        run_monitor_scrapers(slack_webhook_url)

    return flow
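The cron string above is evaluated in UTC, so the corresponding Eastern time drifts by an hour across daylight saving. A minimal sketch, assuming Prefect 1.x and pendulum, of pinning a schedule to a timezone instead (the 10:00 America/New_York time below is illustrative, not taken from the original repo):

import pendulum
from prefect.schedules import CronSchedule

# With a timezone-aware start_date, the cron expression is interpreted in that
# timezone, so the run stays at 10:00 Eastern regardless of daylight saving.
eastern_schedule = CronSchedule(
    "0 10 * * *",
    start_date=pendulum.datetime(2021, 1, 1, tz="America/New_York"),
)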
Code example #2
def create_flow_for_table(table_name):
    sched = CronSchedule("50 */2 * * *")
    tn = f"data.{table_name}"
    sn = f"{tn}_id_seq"
    with Flow(f"clean-sql-{table_name}", sched) as flow:
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        ready = truncate_table(connstr, tn)
        reset_sequence(connstr, sn, ready)

    return flow
Code example #3
def main(project_name):
    schedule = CronSchedule('0 * * * *')
    config = Config()
    with Flow('Purple Air hourly download flow', schedule) as flow:
        run_time = get_run_time()
        fetch_result = fetch_results(config)
        transform_result = transform_results(config, fetch_result)
        write_results(config, transform_result, run_time)
    print(flow.register(project_name=project_name))
    flow.run_agent()
Code example #4
def create_cdc_all_states_flow():
    """Creates a flow that runs the CDC data update on all states."""
    sched = CronSchedule("17 */4 * * *")

    flow = Flow("CDCAllStatesDataUpdate", sched)
    for state in ALL_STATES_PLUS_DC:
        task = StartFlowRun(
            flow_name=CDCCovidDataTracker.__name__,
            project_name="can-scrape",
            wait=True,
            parameters={"state": state.abbr},
        )
        flow.add_task(task)

    return flow
Code example #5
def create_main_flow(flows: List[Flow], project_name):
    schedule = CronSchedule("0 */3 * * *")

    with Flow("MainFlow", schedule) as main_flow:
        tasks = []
        for flow in flows:
            task = StartFlowRun(flow_name=flow.name,
                                project_name=project_name,
                                wait=True)
            tasks.append(task)

        parquet_flow = StartFlowRun(flow_name="UpdateParquetFiles",
                                    project_name=project_name,
                                    wait=True)

        for task in tasks:
            task.set_downstream(parquet_flow)

    return main_flow
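A hypothetical usage sketch for the helper above, reusing create_flow_for_table from code example #2; the table names are placeholders, not taken from the original repo:

child_flows = [
    create_flow_for_table("us_cases"),   # placeholder table names
    create_flow_for_table("us_deaths"),
]
for child in child_flows:
    # StartFlowRun looks child flows up by name and project, so they must be registered too.
    child.register(project_name="can-scrape")

main_flow = create_main_flow(child_flows, "can-scrape")
main_flow.register(project_name="can-scrape")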
Code example #6
File: test_sensor.py  Project: vishalbelsare/FlowKit
def test_run_workflow_ignores_schedule(test_logger):
    """
    Test that run_workflow ignores the workflow's schedule.
    """
    function_mock = create_autospec(lambda dummy_param: None)
    # Flow with no more scheduled runs
    with prefect.Flow(
            "Dummy_workflow",
            schedule=CronSchedule("0 0 * * *",
                                  end_date=pendulum.now().subtract(days=2)),
    ) as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    with prefect.context(logger=test_logger):
        run_workflow.run(
            parametrised_workflow=(dummy_workflow,
                                   dict(dummy_param="DUMMY_VALUE")))
    function_mock.assert_called_once_with(dummy_param="DUMMY_VALUE")
Code example #7
def create_flow_for_scraper(ix: int, cls: Type[DatasetBase]):
    sched = CronSchedule(f"{ix % 60} */4 * * *")

    with Flow(cls.__name__, sched) as flow:
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        sentry_dsn = EnvVarSecret("SENTRY_DSN")
        sentry_sdk_task = initialize_sentry(sentry_dsn)

        d = create_scraper(cls)
        fetched = fetch(d)
        normalized = normalize(d)
        validated = validate(d)
        done = put(d, connstr)

        d.set_upstream(sentry_sdk_task)
        normalized.set_upstream(fetched)
        validated.set_upstream(normalized)
        done.set_upstream(validated)

    return flow
Code example #8
File: custom_fields.py  Project: shreyasgm/FlowKit
    def _deserialize(self, value, attr, data,
                     **kwargs) -> "prefect.schedules.schedules.Schedule":
        """
        Deserialise a cron string as a cron schedule.

        Returns
        -------
        Schedule
            Prefect CronSchedule to run a flow according to the schedule
            defined by the input string.
        
        Raises
        ------
        ValidationError
            if the input value is not a valid cron string or None
        """
        cron_string = super()._deserialize(value, attr, data, **kwargs)
        try:
            schedule = CronSchedule(cron_string)
        except ValueError:
            raise ValidationError(f"Invalid cron string: '{cron_string}'.")
        return schedule
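A minimal, self-contained sketch of how such a field might be wired into a marshmallow schema, assuming marshmallow and Prefect 1.x; the field and schema names here are hypothetical, not the ones used in FlowKit:

from marshmallow import Schema, ValidationError, fields
from prefect.schedules import CronSchedule


class CronStringField(fields.String):  # hypothetical field name
    def _deserialize(self, value, attr, data, **kwargs):
        cron_string = super()._deserialize(value, attr, data, **kwargs)
        try:
            return CronSchedule(cron_string)
        except ValueError:
            raise ValidationError(f"Invalid cron string: '{cron_string}'.")


class WorkflowSchema(Schema):  # hypothetical schema
    schedule = CronStringField()


# A valid cron string loads into a CronSchedule; anything else raises ValidationError.
loaded = WorkflowSchema().load({"schedule": "0 0 * * *"})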
Code example #9
File: etl.py  Project: sticknor/hn_app
@task
def frontfill():
    now = datetime.datetime.now()
    pm.execute_notebook(
        "etl/hn_etl_front_fill.ipynb",
        "s3://python-portfolio-notebooks/hn_updates/frontfill" +
        str(now.year) + "-" + str(now.month) + "-" + str(now.day) + ".ipynb",
    )


@task
def test_changes():

    now = datetime.datetime.now()
    pm.execute_notebook(
        "etl/hn_data_test.ipynb",
        "s3://python-portfolio-notebooks/hn_updates/test" + str(now.year) +
        "-" + str(now.month) + "-" + str(now.day) + ".ipynb",
    )


with Flow("ETL", schedule=CronSchedule("0 9 * * *")) as flow:

    frontfill = frontfill()
    backfill = backfill()
    test_changes = test_changes()

if __name__ == "__main__":
    flow.run()
Code example #10
@task(
    retry_delay=timedelta(minutes=1),
    nout=2,
    trigger=triggers.all_finished,
)
def create_parquet(_success):
    ts = prefect.context.scheduled_start_time
    dt_str = pd.to_datetime(ts).strftime("%Y-%m-%dT%H")
    vintage_fn = FN_STR.format(dt_str) + ".parquet"
    fn = FN_STR.format("") + ".parquet"

    df = pd.read_csv(CSV_FN, parse_dates=["dt"])
    df.to_parquet(DATA_PATH / vintage_fn, index=False)
    df.to_parquet(DATA_PATH / fn, index=False)
    return vintage_fn, fn


@task
def get_gcs_cmd(fn):
    return f"gsutil acl ch -u AllUsers:R gs://can-scrape-outputs/final/{fn}"


shell = ShellTask()
with Flow("UpdateParquetFiles", CronSchedule("10 */2 * * *")) as f:
    connstr = EnvVarSecret("COVID_DB_CONN_URI")
    success = export_to_csv(connstr)
    vintage_fn, fn = create_parquet(success)
    shell(get_gcs_cmd(vintage_fn))
    shell(get_gcs_cmd(fn))

f.register(project_name="can-scrape")
Code example #11
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import IntervalSchedule, CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(cron="*/10 * * * *",
                        start_date=pendulum.datetime(2020,
                                                     11,
                                                     25,
                                                     18,
                                                     40,
                                                     tz="America/Sao_Paulo"))


@task
def get_raw_data():
    url = "http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip"
    filebytes = BytesIO(requests.get(url).content)

    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
    return path


@task
Code example #12
    info_keys=["stargazers_count", "subscribers_count"],
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)


@task
def process_stats(stats):
    data = {
        "Stars": stats["stargazers_count"],
        "Watchers": stats["subscribers_count"],
        "Date": pendulum.now("utc").isoformat(),
    }
    return data


airtable = WriteAirtableRow(
    base_key="XXXXXXX",
    table_name="Stars",
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)
daily_schedule = CronSchedule("*/1 * * * *")


with Flow("Collect Repo Stats", schedule=daily_schedule) as flow:
    data = process_stats(repo_stats)
    final = airtable(data)


flow.run()
Code example #13
File: schedules.py  Project: MTES-MCT/monitorfish
    fleet_segments,
    infractions,
    init_species_groups,
    last_positions,
    ports,
    species,
    vessels,
)

################################ Define flow schedules ################################
control_anteriority.flow.schedule = IntervalSchedule(interval=timedelta(
    hours=1))
current_segments.flow.schedule = IntervalSchedule(interval=timedelta(
    minutes=10))
ers.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
fishing_gear_codes.flow.schedule = CronSchedule("0 3 * * *")
last_positions.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
species.flow.schedule = CronSchedule("0 4 * * *")
vessels.flow.schedule = CronSchedule("0 2 * * *")

###################### List flows to register with prefect server #####################
flows_to_register = [
    controllers.flow,
    controls.flow,
    control_anteriority.flow,
    current_segments.flow,
    ers.flow,
    fishing_gear_codes.flow,
    fleet_segments.flow,
    infractions.flow,
    init_species_groups.flow,
Code example #14
File: pref02.py  Project: mari0611/IGTI_Prefect
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests
import sqlalchemy
import pyodbc

schedule = CronSchedule(cron="*/10 * * * *",
                        start_date=pendulum.datetime(2020,
                                                     12,
                                                     1,
                                                     13,
                                                     45,
                                                     tz='America/Sao_Paulo'))


@task
def get_raw_data():
    url = "http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip",
    filebytes = BytesIO(requests.get(url).content)

    # extract the zip contents
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
    return path
Code example #15
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(cron="*/30 * * * *",
                        start_date=pendulum.datetime(2021,
                                                     3,
                                                     12,
                                                     17,
                                                     00,
                                                     tz='America/Sao_Paulo'))


@task
def get_raw_data():
    url = 'http://download.inep.gov.br/microdados/microdados_enem_2019.zip'
    filebytes = BytesIO(requests.get(url).content)

    logger = prefect.context.get('logger')
    logger.info('Dados obtidos')

    # Extract the contents of the zip file
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './DADOS/'
Code example #16
File: etl.py  Project: sungchun12/prefect-examples
from prefect.schedules import CronSchedule


@task
def extract():
    return [1, 2, 3, 50]


@task
def transform(x):
    return [i * 10 for i in x]


@task
def load(y):
    print("Received y: {}".format(y))


with Flow("ETL") as flow:
    e = extract()
    t = transform(e)
    l = load(t)
    schedule = CronSchedule("0 0 * * *")  # setup a cron scheduler

flow_state = flow.run()  # set the flow run to an object to track state

flow.visualize(
    flow_state=flow_state)  # visualize how the data moves throughout the DAG

#%%
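The CronSchedule created inside the with block above is never attached to the flow, so flow.run() executes immediately rather than nightly. A minimal sketch, under the same tasks, of actually wiring the schedule in (the flow and project names below are assumptions, not from the original example):

# Pass the schedule to the Flow constructor so an agent, or flow.run(), honours the nightly cron.
nightly = CronSchedule("0 0 * * *")
with Flow("ETL-scheduled", schedule=nightly) as scheduled_flow:
    e = extract()
    t = transform(e)
    load(t)
# scheduled_flow.register(project_name="examples")  # placeholder project name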
Code example #17
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests
import mysql
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

schedule = CronSchedule(
    cron="*/10 * * * *",  # minute hour day-of-month month day-of-week
    start_date=pendulum.datetime(2020, 11, 26, 14, 25, tz='America/Sao_Paulo'))


@task
def get_raw_data():
    # Assign the download link to the 'url' object
    url = 'http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip'

    # Download the content
    filebytes = BytesIO(requests.get(url).content)

    # Extract the contents of the zip file
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
Code example #18
    repo="PrefectHQ/prefect",
    info_keys=["stargazers_count", "subscribers_count"],
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)


@task
def process_stats(stats):
    data = {
        "Stars": stats["stargazers_count"],
        "Watchers": stats["subscribers_count"],
        "Date": pendulum.now("utc").isoformat(),
    }
    return data


airtable = WriteAirtableRow(
    base_key="XXXXXXX",
    table_name="Stars",
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)
daily_schedule = CronSchedule("0 8 * * *")

with Flow("Collect Repo Stats", schedule=daily_schedule) as flow:
    data = process_stats(repo_stats)
    final = airtable(data)

flow.run()
Code example #19
    'labels': ['ecs-agent', 'ooi', 'prod'],
    'run_task_kwargs': {
        'cluster': 'prefectECSCluster',
        'launchType': 'FARGATE',
    },
}

project_name = "ooi-harvest"
data_org = "ooi-data"
config_json = yaml.safe_load(CONFIG_PATH.open())
flow_run_name = "-".join([
    config_json['instrument'],
    config_json['stream']['method'],
    config_json['stream']['name'],
])
schedule = CronSchedule(config_json['workflow_config']['schedule'])
run_config = ECSRun(**RUN_OPTIONS)

parent_run_opts = dict(**copy.deepcopy(RUN_OPTIONS))
parent_run_opts.update({'cpu': '0.5 vcpu', 'memory': '2 GB'})
parent_run_config = ECSRun(**parent_run_opts)

with Flow(flow_run_name, schedule=schedule,
          run_config=parent_run_config) as parent_flow:
    flow_run = create_flow_run(
        flow_name="stream_harvest",
        run_name=flow_run_name,
        project_name=project_name,
        parameters={
            'config': config_json,
            'error_test': False,
Code example #20
File: deploy.py  Project: dylanbhughes/reddit-daily
from prefect import Client
from prefect.schedules import CronSchedule
from reddit_daily import flow

c = Client()
s = CronSchedule("0 * * * *")

flow.schedule = s

flow.deploy(project="Dylan's Project")
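flow.deploy comes from an older Prefect API; on current Prefect 1.x the registration call is flow.register. A sketch of the same deployment against the newer API, assuming the same project name:

from prefect.schedules import CronSchedule
from reddit_daily import flow

flow.schedule = CronSchedule("0 * * * *")  # hourly, on the hour
flow.register(project_name="Dylan's Project")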
Code example #21
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(cron='*/10 * * * *',
                        start_date=pendulum.datetime(2020,
                                                     12,
                                                     5,
                                                     14,
                                                     tz="America/Sao_Paulo"))


@task
def get_raw_date():
    url = 'http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip'
    filebytes = BytesIO(requests.get(url).content)

    zipped = zipfile.ZipFile(filebytes)
    zipped.extractall()
    return './microdados_enade_2019/2019/3.DADOS/'


@task
def apply_filters(path):
    interested_cols = [
Code example #22
import sys
import prefect
from prefect import task, Flow, Parameter
from prefect.schedules import CronSchedule

sys.path.append('../pyoilfundy')

from pyoilfundy import fundyproducts as p

daily_7_sched = CronSchedule('0 7 * * 1-5')


def register_products_dash_flow():
    with Flow('product_by_region', schedule=daily_7_sched) as f:
        commods = ['lpg', 'naphtha', 'gasoline', 'diesel', 'jet', 'fueloil']
        p.make_specified_product_dash.map(commods)

    f.register(project_name='pyoilfundy')
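The helper above only builds and registers the flow when it is invoked; a minimal usage sketch (the entry-point guard is an assumption, not from the original file):

if __name__ == "__main__":
    # Build the product_by_region flow and register it with the pyoilfundy project.
    register_products_dash_flow()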