def create_scraper_monitor_flow():
    # Run every day at 15:00 UTC (10:00 US Eastern during standard time)
    schedule = CronSchedule("0 15 * * *")
    with Flow("MonitorFailingScrapers", schedule) as flow:
        slack_webhook_url = EnvVarSecret("SLACK_WEBHOOK_URL")
        run_monitor_scrapers(slack_webhook_url)

    return flow


def create_flow_for_table(table_name):
    sched = CronSchedule("50 */2 * * *")
    tn = f"data.{table_name}"
    sn = f"{tn}_id_seq"
    with Flow(f"clean-sql-{table_name}", sched) as flow:
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        ready = truncate_table(connstr, tn)
        reset_sequence(connstr, sn, ready)

    return flow
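Both factories above return flows without registering them. A minimal sketch of wiring them up against a Prefect 1.x backend; the table name is a placeholder, and the project name "can-scrape" is borrowed from the later examples:

# Hypothetical registration step for the factory-built flows above.
# "us_covid_sources" is an illustrative table name only.
if __name__ == "__main__":
    create_scraper_monitor_flow().register(project_name="can-scrape")
    create_flow_for_table("us_covid_sources").register(project_name="can-scrape")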
Example #3
def main(project_name):
    schedule = CronSchedule('0 * * * *')
    config = Config()
    with Flow('Purple Air hourly download flow', schedule) as flow:
        run_time = get_run_time()
        fetch_result = fetch_results(config)
        transform_result = transform_results(config, fetch_result)
        write_results(config, transform_result, run_time)
    print(flow.register(project_name=project_name))
    flow.run_agent()
Example #4
def create_cdc_all_states_flow():
    """Creates a flow that runs the CDC data update on all states."""
    sched = CronSchedule("17 */4 * * *")

    flow = Flow("CDCAllStatesDataUpdate", sched)
    for state in ALL_STATES_PLUS_DC:
        task = StartFlowRun(
            flow_name=CDCCovidDataTracker.__name__,
            project_name="can-scrape",
            wait=True,
            parameters={"state": state.abbr},
        )
        flow.add_task(task)

    return flow
def create_main_flow(flows: List[Flow], project_name):
    schedule = CronSchedule("0 */3 * * *")

    with Flow("MainFlow", schedule) as main_flow:
        tasks = []
        for flow in flows:
            task = StartFlowRun(flow_name=flow.name,
                                project_name=project_name,
                                wait=True)
            tasks.append(task)

        parquet_flow = StartFlowRun(flow_name="UpdateParquetFiles",
                                    project_name=project_name,
                                    wait=True)

        for task in tasks:
            task.set_downstream(parquet_flow)

    return main_flow
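Because StartFlowRun launches child flows by name through the Prefect backend, every flow passed to create_main_flow (and the "UpdateParquetFiles" flow) must already be registered under the same project. A minimal sketch, with the flow list and project name assumed for illustration:

# Sketch only: the child flows and project name are placeholders.
child_flows = [create_flow_for_table("us_covid_sources")]
for child in child_flows:
    child.register(project_name="can-scrape")
create_main_flow(child_flows, project_name="can-scrape").register(project_name="can-scrape")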
Example #6
def test_run_workflow_ignores_schedule(test_logger):
    """
    Test that run_workflow ignores the workflow's schedule.
    """
    function_mock = create_autospec(lambda dummy_param: None)
    # Flow with no more scheduled runs
    with prefect.Flow(
            "Dummy_workflow",
            schedule=CronSchedule("0 0 * * *",
                                  end_date=pendulum.now().subtract(days=2)),
    ) as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    with prefect.context(logger=test_logger):
        run_workflow.run(
            parametrised_workflow=(dummy_workflow,
                                   dict(dummy_param="DUMMY_VALUE")))
    function_mock.assert_called_once_with(dummy_param="DUMMY_VALUE")
Example #7
def create_flow_for_scraper(ix: int, cls: Type[DatasetBase]):
    sched = CronSchedule(f"{ix % 60} */4 * * *")

    with Flow(cls.__name__, sched) as flow:
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        sentry_dsn = EnvVarSecret("SENTRY_DSN")
        sentry_sdk_task = initialize_sentry(sentry_dsn)

        d = create_scraper(cls)
        fetched = fetch(d)
        normalized = normalize(d)
        validated = validate(d)
        done = put(d, connstr)

        d.set_upstream(sentry_sdk_task)
        normalized.set_upstream(fetched)
        validated.set_upstream(normalized)
        done.set_upstream(validated)

    return flow
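The minute field ix % 60 staggers each scraper flow across the hour so they do not all start at once while keeping the same four-hour cadence. A small sketch of the intended call pattern; the list of scraper classes is hypothetical:

# Hypothetical: scraper_classes would be the DatasetBase subclasses to schedule.
scraper_classes = [CDCCovidDataTracker]
flows = [create_flow_for_scraper(ix, cls) for ix, cls in enumerate(scraper_classes)]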
Example #8
    def _deserialize(self, value, attr, data,
                     **kwargs) -> "prefect.schedules.schedules.Schedule":
        """
        Deserialise a cron string as a cron schedule.

        Returns
        -------
        Schedule
            Prefect CronSchedule to run a flow according to the schedule
            defined by the input string.
        
        Raises
        ------
        ValidationError
            if the input value is not a valid cron string or None
        """
        cron_string = super()._deserialize(value, attr, data, **kwargs)
        try:
            schedule = CronSchedule(cron_string)
        except ValueError:
            raise ValidationError(f"Invalid cron string: '{cron_string}'.")
        return schedule
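This _deserialize follows the marshmallow Field interface. A hedged sketch of how such a field might be used in a schema; the wrapper class and schema below are assumptions, not taken from the original:

from marshmallow import Schema, ValidationError, fields
from prefect.schedules import CronSchedule


class CronScheduleField(fields.String):
    # Hypothetical field class; the body mirrors the _deserialize shown above.
    def _deserialize(self, value, attr, data, **kwargs):
        cron_string = super()._deserialize(value, attr, data, **kwargs)
        try:
            return CronSchedule(cron_string)
        except ValueError:
            raise ValidationError(f"Invalid cron string: '{cron_string}'.")


class WorkflowSchema(Schema):
    schedule = CronScheduleField()


# Loading the schema yields a ready-to-use Prefect CronSchedule.
schedule = WorkflowSchema().load({"schedule": "0 0 * * *"})["schedule"]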
Example #9
@task
def frontfill():
    now = datetime.datetime.now()
    pm.execute_notebook(
        "etl/hn_etl_front_fill.ipynb",
        f"s3://python-portfolio-notebooks/hn_updates/frontfill{now.year}-{now.month}-{now.day}.ipynb",
    )


@task
def test_changes():

    now = datetime.datetime.now()
    pm.execute_notebook(
        "etl/hn_data_test.ipynb",
        f"s3://python-portfolio-notebooks/hn_updates/test{now.year}-{now.month}-{now.day}.ipynb",
    )


with Flow("ETL", schedule=CronSchedule("0 9 * * *")) as flow:

    frontfill = frontfill()
    backfill = backfill()
    test_changes = test_changes()

if __name__ == "__main__":
    flow.run()
Example #10
@task(
    retry_delay=timedelta(minutes=1),
    nout=2,
    trigger=triggers.all_finished,
)
def create_parquet(_success):
    ts = prefect.context.scheduled_start_time
    dt_str = pd.to_datetime(ts).strftime("%Y-%m-%dT%H")
    vintage_fn = FN_STR.format(dt_str) + ".parquet"
    fn = FN_STR.format("") + ".parquet"

    df = pd.read_csv(CSV_FN, parse_dates=["dt"])
    df.to_parquet(DATA_PATH / vintage_fn, index=False)
    df.to_parquet(DATA_PATH / fn, index=False)
    return vintage_fn, fn


@task
def get_gcs_cmd(fn):
    return f"gsutil acl ch -u AllUsers:R gs://can-scrape-outputs/final/{fn}"


shell = ShellTask()
with Flow("UpdateParquetFiles", CronSchedule("10 */2 * * *")) as f:
    connstr = EnvVarSecret("COVID_DB_CONN_URI")
    success = export_to_csv(connstr)
    vintage_fn, fn = create_parquet(success)
    shell(get_gcs_cmd(vintage_fn))
    shell(get_gcs_cmd(fn))

f.register(project_name="can-scrape")
Example #11
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import IntervalSchedule, CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(cron="*/10 * * * *",
                        start_date=pendulum.datetime(2020,
                                                     11,
                                                     25,
                                                     18,
                                                     40,
                                                     tz="America/Sao_Paulo"))


@task
def get_raw_data():
    url = "http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip"
    filebytes = BytesIO(requests.get(url).content)

    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
    return path


@task
Example #12
    info_keys=["stargazers_count", "subscribers_count"],
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)


@task
def process_stats(stats):
    data = {
        "Stars": stats["stargazers_count"],
        "Watchers": stats["subscribers_count"],
        "Date": pendulum.now("utc").isoformat(),
    }
    return data


airtable = WriteAirtableRow(
    base_key="XXXXXXX",
    table_name="Stars",
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)
daily_schedule = CronSchedule("*/1 * * * *")


with Flow("Collect Repo Stats", schedule=daily_schedule) as flow:
    data = process_stats(repo_stats)
    final = airtable(data)


flow.run()
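This snippet begins mid-call: the info_keys block at the top is the tail of the task that produces repo_stats. Judging from the fuller variant in Example #18 below, the missing opening is presumably something like the GetRepoInfo task from prefect.tasks.github; the class name and import path are an inference, not shown in the original:

import datetime
from prefect.tasks.github import GetRepoInfo

# Presumed opening of the truncated call above (repo value taken from Example #18).
repo_stats = GetRepoInfo(
    repo="PrefectHQ/prefect",
    info_keys=["stargazers_count", "subscribers_count"],
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)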
Example #13
    fleet_segments,
    infractions,
    init_species_groups,
    last_positions,
    ports,
    species,
    vessels,
)

################################ Define flow schedules ################################
control_anteriority.flow.schedule = IntervalSchedule(interval=timedelta(hours=1))
current_segments.flow.schedule = IntervalSchedule(interval=timedelta(minutes=10))
ers.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
fishing_gear_codes.flow.schedule = CronSchedule("0 3 * * *")
last_positions.flow.schedule = IntervalSchedule(interval=timedelta(minutes=1))
species.flow.schedule = CronSchedule("0 4 * * *")
vessels.flow.schedule = CronSchedule("0 2 * * *")

###################### List flows to register with prefect server #####################
flows_to_register = [
    controllers.flow,
    controls.flow,
    control_anteriority.flow,
    current_segments.flow,
    ers.flow,
    fishing_gear_codes.flow,
    fleet_segments.flow,
    infractions.flow,
    init_species_groups.flow,
Example #14
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests
import sqlalchemy
import pyodbc

schedule = CronSchedule(cron="*/10 * * * *",
                        start_date=pendulum.datetime(2020,
                                                     12,
                                                     1,
                                                     13,
                                                     45,
                                                     tz='America/Sao_Paulo'))


@task
def get_raw_data():
    url = "http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip",
    filebytes = BytesIO(requests.get(url).content)

    #extrair conteudo do zip
    myzip = zipfile.ZipeFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
    return path
Example #15
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(cron="*/30 * * * *",
                        start_date=pendulum.datetime(2021,
                                                     3,
                                                     12,
                                                     17,
                                                     00,
                                                     tz='America/Sao_Paulo'))


@task
def get_raw_data():
    url = 'http://download.inep.gov.br/microdados/microdados_enem_2019.zip'
    filebytes = BytesIO(requests.get(url).content)

    logger = prefect.context.get('logger')
    logger.info('Data obtained')

    # Extract the zip file contents
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './DADOS/'
Example #16
from prefect import task, Flow
from prefect.schedules import CronSchedule


@task
def extract():
    return [1, 2, 3, 50]


@task
def transform(x):
    return [i * 10 for i in x]


@task
def load(y):
    print("Received y: {}".format(y))


with Flow("ETL") as flow:
    e = extract()
    t = transform(e)
    l = load(t)
    schedule = CronSchedule("0 0 * * *")  # setup a cron scheduler

flow_state = flow.run()  # set the flow run to an object to track state

flow.visualize(
    flow_state=flow_state)  # visualize how the data moves throughout the DAG

Example #17
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests
import mysql
import pymysql
import sqlalchemy
from sqlalchemy import create_engine

schedule = CronSchedule(
    cron="*/10 * * * *",  # minute, hour, day of month, month, day of week
    start_date=pendulum.datetime(2020, 11, 26, 14, 25, tz='America/Sao_Paulo'))


@task
def get_raw_data():
    # Assign the download link to 'url'
    url = 'http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip'

    # Download the content
    filebytes = BytesIO(requests.get(url).content)

    # Extract the zip file contents
    myzip = zipfile.ZipFile(filebytes)
    myzip.extractall()
    path = './microdados_enade_2019/2019/3.DADOS/'
Example #18
    repo="PrefectHQ/prefect",
    info_keys=["stargazers_count", "subscribers_count"],
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)


@task
def process_stats(stats):
    data = {
        "Stars": stats["stargazers_count"],
        "Watchers": stats["subscribers_count"],
        "Date": pendulum.now("utc").isoformat(),
    }
    return data


airtable = WriteAirtableRow(
    base_key="XXXXXXX",
    table_name="Stars",
    max_retries=1,
    retry_delay=datetime.timedelta(minutes=1),
)
daily_schedule = CronSchedule("0 8 * * *")

with Flow("Collect Repo Stats", schedule=daily_schedule) as flow:
    data = process_stats(repo_stats)
    final = airtable(data)

flow.run()

Example #19
    'labels': ['ecs-agent', 'ooi', 'prod'],
    'run_task_kwargs': {
        'cluster': 'prefectECSCluster',
        'launchType': 'FARGATE',
    },
}

project_name = "ooi-harvest"
data_org = "ooi-data"
config_json = yaml.safe_load(CONFIG_PATH.open())
flow_run_name = "-".join([
    config_json['instrument'],
    config_json['stream']['method'],
    config_json['stream']['name'],
])
schedule = CronSchedule(config_json['workflow_config']['schedule'])
run_config = ECSRun(**RUN_OPTIONS)

parent_run_opts = dict(**copy.deepcopy(RUN_OPTIONS))
parent_run_opts.update({'cpu': '0.5 vcpu', 'memory': '2 GB'})
parent_run_config = ECSRun(**parent_run_opts)

with Flow(flow_run_name, schedule=schedule,
          run_config=parent_run_config) as parent_flow:
    flow_run = create_flow_run(
        flow_name="stream_harvest",
        run_name=flow_run_name,
        project_name=project_name,
        parameters={
            'config': config_json,
            'error_test': False,
Example #20
from prefect import Client
from prefect.schedules import CronSchedule
from reddit_daily import flow

c = Client()
s = CronSchedule("0 * * * *")

flow.schedule = s

flow.register(project_name="Dylan's Project")

Example #21
from datetime import datetime, timedelta
import pendulum
import prefect
from prefect import task, Flow
from prefect.schedules import CronSchedule
import pandas as pd
from io import BytesIO
import zipfile
import requests

schedule = CronSchedule(
    cron='*/10 * * * *',
    start_date=pendulum.datetime(2020, 12, 5, 14, tz="America/Sao_Paulo"))


@task
def get_raw_data():
    url = 'http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip'
    filebytes = BytesIO(requests.get(url).content)

    zipped = zipfile.ZipFile(filebytes)
    zipped.extractall()
    return './microdados_enade_2019/2019/3.DADOS/'


@task
def apply_filters(path):
    interested_cols = [
Example #22
import sys
import prefect
from prefect import task, Flow, Parameter
from prefect.schedules import CronSchedule

sys.path.append('../pyoilfundy')

from pyoilfundy import fundyproducts as p

daily_7_sched = CronSchedule('0 7 * * 1-5')


def register_products_dash_flow():
    with Flow('product_by_region', schedule=daily_7_sched) as f:
        commods = ['lpg', 'naphtha', 'gasoline', 'diesel', 'jet', 'fueloil']
        p.make_specified_product_dash.map(commods)

    f.register(project_name='pyoilfundy')
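As shown, register_products_dash_flow builds and registers the flow but nothing in this excerpt calls it; a minimal, assumed entry point would be:

# Hypothetical entry point; the original excerpt does not show how the function is invoked.
if __name__ == '__main__':
    register_products_dash_flow()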