    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task.set_upstream(latest_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            datetime.datetime(2016, 1, 1): 'success',
            datetime.datetime(2016, 1, 1, 12): 'success',
            datetime.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            datetime.datetime(2016, 1, 1): 'skipped',
            datetime.datetime(2016, 1, 1, 12): 'skipped',
            datetime.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task.set_upstream(latest_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        assert exec_date_to_latest_state == {
            datetime.datetime(2016, 1, 1): 'success',
            datetime.datetime(2016, 1, 1, 12): 'success',
            datetime.datetime(2016, 1, 2): 'success',
        }

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        assert exec_date_to_downstream_state == {
            datetime.datetime(2016, 1, 1): 'skipped',
            datetime.datetime(2016, 1, 1, 12): 'skipped',
            datetime.datetime(2016, 1, 2): 'success',
        }
    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        self.dag_file_processor._process_task_instances(self.dag, task_instances_list=latest_instances)

        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        self.dag_file_processor._process_task_instances(self.dag, task_instances_list=downstream_instances)

        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        self.dag_file_processor._process_task_instances(self.dag, task_instances_list=downstream_instances)
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
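
For context on what these tests assert: LatestOnlyOperator skips its downstream tasks for every scheduled run except the one whose schedule interval contains the current time, and it leaves externally triggered runs untouched. A simplified sketch of that decision, assuming an Airflow 1.x-style DAG object with a following_schedule method (this paraphrases the behaviour; it is not the operator's exact source):

import pendulum

def is_latest_run(dag, execution_date, externally_triggered=False):
    # Externally triggered runs are never skipped (see test_not_skipping_external below).
    if externally_triggered:
        return True
    # A run counts as "latest" when "now" falls between this run's following
    # schedule and the schedule after that one.
    left_window = dag.following_schedule(execution_date)
    right_window = dag.following_schedule(left_window)
    return left_window < pendulum.now("UTC") <= right_window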
Example #4
def load_dm():
    loo = LatestOnlyOperator(task_id="dm_latest_only", dag=dag)
    dm_tmp_begin = DummyOperator(task_id="dm_tmp_begin", dag=dag)
    dm_tmp_end_dm_dims_begin = DummyOperator(task_id="dm_tmp_end_dm_dims_begin", dag=dag)
    dm_dims_end_dm_facts_begin = DummyOperator(task_id="dm_dims_end_dm_facts", dag=dag)
    c.dds_sats_end_dm_begin >> loo >> dm_tmp_begin
    for table, sql in c.dm_tmp.items():
        po = PostgresOperator(
            dag=dag,
            task_id='dm_tmp_' + table + '_recreate',
            sql=sql
        )
        dm_tmp_begin >> po >> dm_tmp_end_dm_dims_begin
    for table, sql in c.dm_dims.items():
        po = PostgresOperator(
            dag=dag,
            task_id='dm_dim_' + table + '_recreate',
            sql=sql
        )
        dm_tmp_end_dm_dims_begin >> po >> dm_dims_end_dm_facts_begin
    for table, sql in c.dm_facts.items():
        po = PostgresOperator(
            dag=dag,
            task_id='dm_fact_' + table + '_recreate',
            sql=sql
        )
        dm_dims_end_dm_facts_begin >> po
    return
Example #5
def create_dag(report, default_args):
    dag = DAG(
        report.dag_id, schedule_interval=report.schedule, default_args=default_args
    )

    with dag:
        test_prefix = "test_"

        start = LatestOnlyOperator(task_id="start_dag")
        send_email = PythonOperator(
            task_id="call_email_function",
            python_callable=report_notify_email,
            trigger_rule="all_done",
            op_kwargs={
                "report": report,
                "email_template_location": SINGLE_EMAIL_TEMPLATE,
            },
            provide_context=True,
        )
        for test in report.tests:
            t1 = StatusSensor(
                task_id=test_prefix + test,
                test_dag_id=test.split(".")[0],
                test_task_id=test.split(".")[1],
            )
            start >> t1 >> send_email

    return dag
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('ats_hourly',
          default_args=default_args,
          schedule_interval='0 3,7,10,13,15,17,19,21-23 * * *',
          catchup=False)

python_executable = '~/venv/bin/python3.7'
python_script_path = '~/PycharmProjects/TwitterStats'

latest_only = LatestOnlyOperator(task_id='latest_only',
                                 dag=dag,
                                 trigger_rule=TriggerRule.ALL_DONE)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='words_trends',
                  bash_command='cd {};{} words.py trends'.format(
                      python_script_path, python_executable),
                  dag=dag,
                  trigger_rule=TriggerRule.ALL_DONE)

t2 = BashOperator(task_id='draft_trends',
                  bash_command='cd {};{} drafttrends.py'.format(
                      python_script_path, python_executable),
                  dag=dag,
                  trigger_rule=TriggerRule.ALL_DONE)
Example #7
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag

# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['streets']
start_date = general.start_date['streets']

#: Dag spec
dag = DAG(dag_id='sidewalk',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for sdif
sidewalk_latest_only = LatestOnlyOperator(task_id='sidewalk_latest_only',
                                          dag=dag)

#: Get sidewalk data from DB
get_sidewalk_data = PythonOperator(task_id='get_sidewalk_oci',
                                   python_callable=get_sidewalk_data,
                                   on_failure_callback=notify,
                                   on_retry_callback=notify,
                                   on_success_callback=notify,
                                   dag=dag)

#: Get sidewalks shapefile from Atlas
get_sw_shapefiles = PythonOperator(task_id='get_sidewalk_gis',
                                   python_callable=get_sidewalk_gis,
                                   on_failure_callback=notify,
                                   on_retry_callback=notify,
                                   on_success_callback=notify,
from dags.water_tests.indicator_bacteria_jobs import get_indicator_bacteria_tests
from dags.water_tests.indicator_bacteria_jobs import get_latest_bac_tests
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

args = general.args
conf = general.config
start_date = general.start_date['indicator_bacteria_tests']

dag = DAG(dag_id='indicator_bacteria_tests',
          default_args=args,
          start_date=start_date,
          schedule_interval=general.schedule['indicator_bacteria_tests'])

#: Latest Only Operator for traffic_counts
wtr_latest_only = LatestOnlyOperator(task_id='water_latest_only', dag=dag)

# TODO - teach me how to be yearly
# Pull out all indicator bac tests.
get_indicator_bac_tests = PythonOperator(
    task_id='get_indicator_bac_tests',
    python_callable=get_indicator_bacteria_tests,
    op_kwargs={
        'date_start': '01-JUN-2014',
        'date_end': (datetime.now() + timedelta(days=5)).strftime('%d-%b-%Y')
    },
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)
Example #9
def create_dag(
    *,
    dag_id: str,
    cdr_type: str,
    start_date: datetime,
    extract_sql: str,
    end_date: Optional[datetime] = None,
    retries: int = 10,
    retry_delay: timedelta = timedelta(days=1),
    schedule_interval: Union[str, Interval] = "@daily",
    indexes: Iterable[str] = ("msisdn_counterpart", "location_id", "datetime",
                              "tac"),
    data_present_poke_interval: int = 60,
    data_present_timeout: int = 60 * 60 * 24 * 7,
    flux_check_poke_interval: int = 60,
    flux_check_wait_interval: int = 60,
    flux_check_timeout: int = 60 * 60 * 24 * 7,
    source_table: Optional[str] = None,
    staging_view_sql: Optional[str] = None,
    cluster_field: Optional[str] = None,
    program: Optional[str] = None,
    filename: Optional[str] = None,
    fields: Optional[Dict[str, str]] = None,
    null: str = "",
    additional_macros: Dict[str, Union[str, Callable]] = dict(),
    header: bool = True,
    delimiter: str = ",",
    quote: str = '"',
    escape: str = '"',
    encoding: Optional[str] = None,
) -> "DAG":
    """
    Create an ETL DAG that will load data from files, or a table within the database.

    Parameters
    ----------
    dag_id : str
        Name of the dag
    cdr_type : {"calls", "sms", "mds", "topups"}
        Type of CDR data
    start_date : datetime
        First date the dag should run for
    extract_sql : str
        SQL template. May be an SQL string, or the name of a file in the dags folder. The SQL should output
        a table with fields matching the corresponding cdr type schema. Where the source data is missing a
        field, the field must be introduced using NULL::<field_type> as <field_name>.
    end_date : datetime or None
        Optionally specify the final day the day should run on
    retries : int, default 10
        Number of times to retry the dag if it fails
    retry_delay : timedelta, default timedelta(days=1)
        Delay between retries
    schedule_interval : str or Interval, default "@daily"
        Time interval between execution dates.
    indexes : iterable of str, default ("msisdn_counterpart", "location_id", "datetime", "tac")
        Fields to create indexes on.
    data_present_poke_interval : int, default 60
        Number of seconds to wait between runs for the data present check
    data_present_timeout : int, default 604800
        Maximum number of seconds to keep checking before failing
    flux_check_poke_interval : int, default 60
        Number of seconds to wait between runs for the data in flux check
    flux_check_wait_interval : int, default 60
        Number of seconds to monitor data when checking for flux
    flux_check_timeout : int, default 604800
        Maximum number of seconds to keep checking before failing
    source_table : str or None
        If extracting from a table within the database (e.g. when using a FDW to connect to another db),
        the schema qualified name of the table.
    staging_view_sql : str or None
        If extracting from a table within the database (e.g. when using a FDW to connect to another db), the sql template
        or name of the template which will be used to create a date limited view of the data.
    cluster_field : str or None
        Optionally require that the data tables be 'clustered' on a field, which improves the performance of queries
        which need to subset based on that field at the cost of a significant increase in ETL time.
    program : str or None
        When loading data from files, set to the name of a program to be used when reading them (e.g. zcat to load
        from compressed csv files).
    filename : str or None
        When loading data from files, the filename pattern to be used - may include Airflow macros.
    fields : dict or None
        When loading data from files, a mapping of field names to postgres types.
    null : str, default ""
        When loading data from files, optionally specify a null value character
    additional_macros : dict or None
        Optionally provide additional macros to be available in SQL templates.
    header : bool, default True
        Set to False when loading files if the files do not have a header row.
    delimiter : str, default ","
        When loading from files, you may specify the delimiter character
    quote : str, default '"'
        When loading from files, you may specify the quote character
    escape : str, default '"'
        When loading from files, you may specify the escape character
    encoding : str or None
        Optionally specify file encoding when loading from files.

    Returns
    -------
    DAG

    """

    from airflow import DAG
    from airflow.operators.latest_only_operator import LatestOnlyOperator
    from flowetl.operators.add_constraints_operator import AddConstraintsOperator
    from flowetl.operators.analyze_operator import AnalyzeOperator
    from flowetl.operators.attach_operator import AttachOperator
    from flowetl.operators.cluster_operator import ClusterOperator
    from flowetl.operators.create_foreign_staging_table_operator import (
        CreateForeignStagingTableOperator, )
    from flowetl.operators.create_indexes_operator import CreateIndexesOperator
    from flowetl.operators.create_staging_view_operator import CreateStagingViewOperator
    from flowetl.operators.extract_from_foreign_table_operator import (
        ExtractFromForeignTableOperator, )
    from flowetl.operators.extract_from_view_operator import ExtractFromViewOperator
    from flowetl.operators.update_etl_table_operator import UpdateETLTableOperator
    from flowetl.sensors.data_present_sensor import DataPresentSensor
    from flowetl.sensors.file_flux_sensor import FileFluxSensor
    from flowetl.sensors.table_flux_sensor import TableFluxSensor

    args = {
        "owner": "airflow",
        "retries": retries,
        "retry_delay": retry_delay,
        "postgres_conn_id": "flowdb",
        "conn_id": "flowdb",
        "start_date": start_date,
        "end_date": end_date,
    }

    macros = dict(**additional_macros)
    if source_table is not None:
        macros["source_table"] = source_table

    with DAG(
            dag_id=dag_id,
            schedule_interval=schedule_interval,
            default_args=args,
            user_defined_macros=macros,
            params=dict(cdr_type=cdr_type),
    ) as dag:
        if staging_view_sql is not None and source_table is not None:
            create_staging_view = CreateStagingViewOperator(
                task_id="create_staging_view",
                sql=staging_view_sql,
            )
            extract = ExtractFromViewOperator(task_id="extract",
                                              sql=extract_sql,
                                              pool="postgres_etl")
        elif filename is not None and fields is not None and len(fields) > 0:
            create_staging_view = CreateForeignStagingTableOperator(
                task_id="create_staging_view",
                program=program,
                filename=filename,
                fields=fields,
                null=null,
                header=header,
                delimiter=delimiter,
                quote=quote,
                escape=escape,
                encoding=encoding,
            )
            extract = ExtractFromForeignTableOperator(task_id="extract",
                                                      sql=extract_sql,
                                                      pool="postgres_etl")
        else:
            raise TypeError(
                "Either staging_view_sql and source_table, or filename and fields must be provided."
            )
        check_not_empty = DataPresentSensor(
            task_id="wait_for_data",
            mode="reschedule",
            poke_interval=data_present_poke_interval,
            timeout=data_present_timeout,
        )
        if filename is not None:
            check_not_in_flux = FileFluxSensor(
                task_id="check_not_in_flux",
                filename=filename,
                mode="reschedule",
                poke_interval=flux_check_poke_interval,
                flux_check_interval=flux_check_wait_interval,
                timeout=flux_check_timeout,
            )
        else:
            check_not_in_flux = TableFluxSensor(
                task_id="check_not_in_flux",
                mode="reschedule",
                poke_interval=flux_check_poke_interval,
                flux_check_interval=flux_check_wait_interval,
                timeout=flux_check_timeout,
            )

        add_constraints = AddConstraintsOperator(task_id="add_constraints",
                                                 pool="postgres_etl")
        add_indexes = CreateIndexesOperator(
            task_id="add_indexes",
            index_columns=indexes,
            pool="postgres_etl",
        )
        attach = AttachOperator(task_id="attach")
        analyze = AnalyzeOperator(
            task_id="analyze",
            target="{{ extract_table }}",
            pool="postgres_etl",
        )
        latest_only = LatestOnlyOperator(task_id="analyze_parent_only_for_new")
        analyze_parent = AnalyzeOperator(
            task_id="analyze_parent",
            target="{{ parent_table }}",
            pool="postgres_etl",
        )
        update_records = UpdateETLTableOperator(task_id="update_records")

        create_staging_view >> check_not_empty >> check_not_in_flux >> extract
        from_stage = extract

        if cluster_field is not None:
            cluster = ClusterOperator(task_id="cluster",
                                      cluster_field=cluster_field,
                                      pool="postgres_etl")
            extract >> cluster
            from_stage = cluster
        from_stage >> [
            add_constraints,
            add_indexes,
        ] >> analyze >> attach >> latest_only >> analyze_parent
        attach >> [update_records, *get_qa_checks()]
    globals()[dag_id] = dag
    return dag
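
A hypothetical invocation of create_dag for a file-based "calls" feed might look like the sketch below; the dag_id, filename pattern, field mapping, and SQL template name are placeholders, not values taken from any real deployment:

from datetime import datetime

# Illustrative only: every literal here is a placeholder.
calls_dag = create_dag(
    dag_id="etl_calls",
    cdr_type="calls",
    start_date=datetime(2020, 1, 1),
    extract_sql="extract_calls.sql",
    program="zcat",
    filename="/files/calls_{{ ds_nodash }}.csv.gz",
    fields={"msisdn": "TEXT", "datetime": "TIMESTAMPTZ", "location_id": "TEXT"},
)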
from dags.netfile.netfile2_jobs import *

# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['campaign_fin']
start_date = general.start_date['campaign_fin']
cur_yr = general.get_year()

#: Dag spec
dag = DAG(dag_id='campaign_fin_reports',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

campaign_fin_latest_only = LatestOnlyOperator(
    task_id='campaign_fin_latest_only', dag=dag)

#: Get 460A transactions
schedule_460A = PythonOperator(task_id='get_transactions_a',
                               python_callable=get_transactions_a,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
                               dag=dag)

#: Get 460B1 transactions
schedule_460B1 = PythonOperator(task_id='get_transactions_b',
                                python_callable=get_transactions_b,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
Example #11
    def test_run(self):
        task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
# This might need some refactoring (filenaming dates)

flist = {
    'full': 'treas_parking_payments_{}_datasd_v1.csv'.format(cur_yr),
    'by_month': 'treas_meters_{}_pole_by_month_datasd_v1.csv'.format(cur_yr),
    'by_day': 'treas_meters_{}_pole_by_mo_day_datasd_v1.csv'.format(cur_yr)
}

dag = DAG(
    dag_id='parking_meters',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)

#: Latest Only Operator for parking meters
parking_meters_latest_only = LatestOnlyOperator(
    task_id='parking_meters_latest_only', dag=dag)


#: Downloads all parking files from FTP
get_parking_files = BashOperator(
    task_id='get_parking_files',
    bash_command=ftp_download_wget(),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Joins downloaded files from ftp to production
build_prod_file = PythonOperator(
    task_id='build_prod_file',
    python_callable=build_prod_file,
Example #13
args = general.args
conf = general.config
schedule = general.schedule['budget']
start_date = general.start_date['budget']
budget_fy = general.get_FY_short() + 1


dag = DAG(
    dag_id='budget',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)


#: Latest Only Operator for budget
budget_latest_only = LatestOnlyOperator(
    task_id='budget_latest_only', dag=dag)

get_accounts = PythonOperator(
    task_id='get_chart_of_accounts',
    python_callable=get_accounts_chart,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

get_capital_ptd = PythonOperator(
    task_id='get_capital_ptd',
    python_callable=get_capital_ptd,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['documentum_hr_30']
start_date = general.start_date['documentum_hr_30']

#: Dag spec
dag = DAG(dag_id='documentum_hourly_30',
          catchup=False,
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)
prod_data = conf['prod_data_dir']
schedule_mode = 'schedule_hourly_30'

documentum_docs_latest_only = LatestOnlyOperator(
    task_id='documentum_others_docs_latest_only', dag=dag)

#: Get documentum tables
get_doc_tables = PythonOperator(task_id='get_documentum_tables',
                                python_callable=get_documentum,
                                op_kwargs={'mode': schedule_mode},
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

div_doc_table = PythonOperator(task_id='divide_doc_table',
                               python_callable=latest_res_ords,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
#get_stale_cmd = "python /data-portal-monitoring/late_updated_datasets.py"
get_stale_cmd = BASEPYTHON + BASEDIR + "late_updated_datasets.py"
t4 = BashOperator(task_id='stale_delayed_datasets',
                  bash_command=get_stale_cmd,
                  dag=dag)

#digest = SubDagOperator(
#    subdag=dag2,
#    task_id= 'data_monitoring_workflow_dag.digest_dag',
#    dag=dag,

#)

#dag >> t1 #>> t2 >> t3 >> t4

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)
t1.set_upstream(latest_only)
t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t1)
#t1 >> digest

# run the digest every 12 hours
dag2 = DAG(
    dag_id='data_monitoring_late_updated_digest_dag',
    default_args=WORKFLOW_DEFAULT_ARGS,
    start_date=WORKFLOW_START_DATE,
    schedule_interval='30 */12 * * *',
)

#stale_delayed_datasets_digest_cmd = "python2 /Users/j9/Desktop/data-portal-monitoring/digest_late_updated_datasets.py"
from trident.util import general
from trident.util.notifications import notify
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

args = general.args
conf = general.config
schedule = general.schedule['traffic_counts']
start_date = general.start_date['traffic_counts']

dag = DAG(dag_id='traffic_counts',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for traffic_counts
tc_latest_only = LatestOnlyOperator(task_id='traffic_counts_latest_only',
                                    dag=dag)

#: Downloads traffic counts xlsx from share
get_traffic_counts = PythonOperator(task_id='get_traffic_counts',
                                    python_callable=get_traffic_counts,
                                    on_failure_callback=notify,
                                    on_retry_callback=notify,
                                    on_success_callback=notify,
                                    dag=dag)

#: Cleans the downloaded XLSX file, converts it to CSV data.
clean_traffic_counts = PythonOperator(task_id='clean_traffic_counts',
                                      python_callable=clean_traffic_counts,
                                      on_failure_callback=notify,
                                      on_retry_callback=notify,
                                      on_success_callback=notify,
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag

conf = general.config
args = general.args
schedule = general.schedule['dsd_approvals']
start_date = general.start_date['dsd_approvals']
year = general.get_year()

#: Dag spec for dsd permits
dag = DAG(dag_id='dsd_permits',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for dsd permits.
dsd_permits_latest_only = LatestOnlyOperator(
    task_id='dsd_permits_latest_only', dag=dag)

#: Get permits reports
get_permits_files = BashOperator(
    task_id='get_permits_files',
    bash_command=get_permits_files(),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Clean permits reports
clean_data = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    on_failure_callback=notify,
from dags.public_art.public_art_jobs import *
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['public_art']
start_date = general.start_date['public_art']

#: Dag spec
dag = DAG(dag_id='public_art',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

public_art_latest_only = LatestOnlyOperator(task_id='public_art_latest_only',
                                            dag=dag)

#: Get public art from NetX, process, output prod file
get_public_art = PythonOperator(task_id='get_public_art',
                                python_callable=get_public_art,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

process_public_art = PythonOperator(task_id='process_public_art',
                                    python_callable=process_public_art,
                                    on_failure_callback=notify,
                                    on_retry_callback=notify,
                                    on_success_callback=notify,
                                    dag=dag)
from dags.tsw_integration.tsw_integration_jobs import *

# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['tsw_integration']
start_date = general.start_date['tsw_integration']

#: Dag spec
dag = DAG(dag_id='tsw_integration',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

violations_latest_only = LatestOnlyOperator(task_id='violations_latest_only',
                                            dag=dag)

# VPM Extraction Support Tasks

#: Download VPM dump from FTP
get_vpm_violations = BashOperator(task_id='get_vpm_violations',
                                  bash_command=get_vpm_violations_wget(),
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Download VPM dump from FTP
#get_vpm_dump = BashOperator(
#    task_id='get_vpm_dump',
#    bash_command=ftp_download_wget(),
Example #21
        main_task = PythonOperator(
            task_id=dag_id,
            python_callable=wrapper(operator, dag_id),
            op_args=[pipeline['name'], pipeline['params'], pipeline],
            dag=dag)
        if description and (match := depends_on.match(description)):
            parent_dag_id = match.group(1)
            t0 = ExternalTaskSensor(task_id=dag_id + '__trigger',
                                    external_dag_id=parent_dag_id,
                                    external_task_id=parent_dag_id,
                                    mode='reschedule',
                                    dag=dag)
            t0 >> main_task
        elif schedule is not None:
            t0 = LatestOnlyOperator(task_id=dag_id + '__latest_only', dag=dag)
            t0 >> main_task
        globals()[dag_id] = dag
    except Exception as e:
        logging.error(
            f'Failed to create a DAG with id {dag_id}, schedule {schedule} because {e}'
        )

task_id = '_clean_scheduler_logs'
dag_id = task_id + '_dag'
schedule = '0 * * * *'
args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime.now(),
    'is_paused_upon_creation': False,
from trident.util.seaboard_updates import *

from dags.streets.streets_jobs import *

# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['streets']
start_date = general.start_date['streets']

#: Dag spec
dag = DAG(dag_id='streets', default_args=args, start_date=start_date, schedule_interval=schedule)


#: Latest Only Operator for imcat
streets_latest_only = LatestOnlyOperator(task_id='streets_latest_only', dag=dag)

#: Get streets data from DB
get_streets_data = PythonOperator(
    task_id='get_streets_paving_data',
    python_callable=get_streets_paving_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process data for public
process_data_sdif = PythonOperator(
    task_id='process_sdif',
    python_callable=process_paving_data,
    op_kwargs={'mode': 'sdif'},
Example #23
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=15),
}

DASHBOARD_DAG_ID = 'dashboard_aggregation'

dashboard_dag = DAG(DASHBOARD_DAG_ID,
                    default_args=default_args,
                    schedule_interval='0 18 * * *')

latest_only = LatestOnlyOperator(task_id='latest_only',
                                 dag=dashboard_dag,
                                 depends_on_past=True)

prev_month = SubDagOperator(subdag=monthly_subdag(
    DASHBOARD_DAG_ID,
    'prev_month',
    dashboard_dag.default_args,
    dashboard_dag.schedule_interval,
    interval=-1),
                            task_id='prev_month',
                            dag=dashboard_dag)

current_month = SubDagOperator(subdag=monthly_subdag(
    DASHBOARD_DAG_ID,
    'current_month',
    dashboard_dag.default_args,
from trident.util.notifications import notify
from dags.inventory.inv_jobs import *

conf = general.config

args = general.args

schedule = general.schedule['inventory']
start_date = general.start_date['inventory']

dag = DAG(dag_id='inventory',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

inv_latest_only = LatestOnlyOperator(task_id='inventory_latest_only', dag=dag)

#: Inventory Doc To CSV
inventory_to_csv = PythonOperator(task_id='inventory_to_csv',
                                  python_callable=inventory_to_csv,
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Upload Inventory CSV to S3
upload_inventory = S3FileTransferOperator(
    task_id='upload_inventory',
    source_base_path=conf['prod_data_dir'],
    source_key='inventory_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
Example #25
            continue

        try:
            response = requests.get(QUEUE_URL.format(queue_name),
                                    auth=(QUEUE_USERNAME, QUEUE_PASSWORD))
            stats = json.loads(response.text)
            size = stats['messages_ready'] + stats['messages_unacknowledged']
            queue_sizes[queue_name] = size
        except Exception:
            logger.exception('No tasks found for %s', queue_name)
            queue_sizes[queue_name] = 0

    return queue_sizes


latest = LatestOnlyOperator(task_id='latest_only', queue='manager', dag=dag)

queue_sizes_task = PythonOperator(task_id=QUEUE_SIZES_TASK_ID,
                                  python_callable=get_queue_sizes,
                                  queue="manager",
                                  dag=dag)

rescale_task = BashOperator(task_id=RESCALE_TASK_ID,
                            bash_command=templated_resize_command,
                            queue="manager",
                            params={'task_id': QUEUE_SIZES_TASK_ID},
                            dag=dag)

latest.set_downstream(queue_sizes_task)
queue_sizes_task.set_downstream(rescale_task)
Example #26
    def test_not_skipping_external(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
            external_trigger=True,
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
            external_trigger=True,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
from trident.util import general
from trident.util.notifications import notify
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

args = general.args
conf = general.config
schedule = general.schedule
start_date = general.start_date['pd_col']

dag = DAG(dag_id='pd_col',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['pd_col'])

#: Latest Only Operator for pd_col
pd_col_latest_only = LatestOnlyOperator(task_id='pd_col_latest_only', dag=dag)

#: Get collisions data from FTP and save to temp folder
get_collisions_data = PythonOperator(task_id='get_collisions_data',
                                     python_callable=get_collisions_data,
                                     on_failure_callback=notify,
                                     on_retry_callback=notify,
                                     on_success_callback=notify,
                                     dag=dag)

#: Process collisions data and save result to prod folder
process_collisions_data = PythonOperator(
    task_id='process_collisions_data',
    python_callable=process_collisions_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
Example #28
    def test_skipping_non_latest(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)
        downstream_task3 = DummyOperator(
            task_id='downstream_3',
            trigger_rule=TriggerRule.NONE_FAILED,
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)
        downstream_task3.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="scheduled__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING,
        )

        self.dag.create_dagrun(
            run_id="scheduled__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING,
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): None,
            timezone.datetime(2016, 1, 1, 12): None,
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_3')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
import glob

args = general.args
conf = general.config
schedule = general.schedule
start_date = general.start_date['claims_stat']
email_recips = conf['mail_notify_claims']

#: Dag definition
dag = DAG(dag_id='claims_stat',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['claims_stat'])

#: Latest Only Operator for claims
claims_stat_latest_only = LatestOnlyOperator(task_id='claims_stat_latest_only',
                                             dag=dag)

#: Pull claims data from oracle
get_claims_data = PythonOperator(task_id='get_claims_data',
                                 python_callable=get_claims_data,
                                 on_failure_callback=notify,
                                 on_retry_callback=notify,
                                 on_success_callback=notify,
                                 dag=dag)

#: Upload clean and geocode claims data
clean_geocode = PythonOperator(task_id='clean_geocode_claims',
                               python_callable=clean_geocode_claims,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
"""
Example LatestOnlyOperator and TriggerRule interactions
"""
import datetime as dt

import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.utils.trigger_rule import TriggerRule

dag = DAG(
    dag_id='latest_only_with_trigger',
    schedule_interval=dt.timedelta(hours=4),
    start_date=airflow.utils.dates.days_ago(2),
)

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)
task1 = DummyOperator(task_id='task1', dag=dag)
task2 = DummyOperator(task_id='task2', dag=dag)
task3 = DummyOperator(task_id='task3', dag=dag)
task4 = DummyOperator(task_id='task4',
                      dag=dag,
                      trigger_rule=TriggerRule.ALL_DONE)

latest_only >> task1 >> [task3, task4]
task2 >> [task3, task4]
    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(
            run_id="manual__1",
            start_date=timezone.utcnow(),
            execution_date=DEFAULT_DATE,
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__2",
            start_date=timezone.utcnow(),
            execution_date=timezone.datetime(2016, 1, 1, 12),
            state=State.RUNNING
        )

        self.dag.create_dagrun(
            run_id="manual__3",
            start_date=timezone.utcnow(),
            execution_date=END_DATE,
            state=State.RUNNING
        )

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
Example #32
def create_new_dags(dag_id, database):
    def f_print_log():
        return '{} start processing tables in database: {}'.format(
            dag_id, database)

    def f_check_table_exists(table_name):
        connect = PostgresHook(postgres_conn_id=database)
        query = """
            select count(1) 
              from information_schema.tables 
             where table_schema not like  %s
               and table_name = %s
            """
        res = connect.get_first(query, parameters=('', table_name))
        if res[0] == 0:
            return 'create_table'
        else:
            return 'table_exists'

    def f_create_table(table_name):
        connect = PostgresHook(postgres_conn_id=database)
        query = """
            create table {}( 
              id integer not null,
              "user" varchar(50) not null,
              timestamp timestamp not null
             )""".format(table_name)
        connect.run(query)

    def f_insert_row(table_name, **context):
        connect = PostgresHook(postgres_conn_id=database)
        user = context['ti'].xcom_pull(task_ids='get_current_user',
                                       key='return_value')
        # do not do this in production, sql injection is possible
        # (see the safer sketch after this function)
        query = """
            insert into {}
            values(%s, %s, %s)
            """.format(table_name)
        connect.run(query,
                    parameters=(uuid.uuid4().int % 123456789, user,
                                datetime.now()))

    # for every table replicate records
    for key in config:
        table_name = config[key].get('table_name')
        name = '{}_table_{}'.format(dag_id, table_name)
        with DAG(name,
                 schedule_interval=config[key].get('schedule_interval'),
                 start_date=config[key].get('start_date')) as dag:
            # ignore the previous tasks, no backfilling
            latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)
            # logging the dag
            print_the_context = PythonOperator(task_id='print_the_context',
                                               python_callable=f_print_log,
                                               dag=dag)
            # write the current username to xcom
            get_current_user = BashOperator(task_id='get_current_user',
                                            bash_command='whoami',
                                            xcom_push=True,
                                            dag=dag)
            # check table exists
            check_table_exists = BranchPythonOperator(
                task_id='check_table_exists',
                python_callable=f_check_table_exists,
                op_args=[table_name],
                dag=dag)

            # create table
            create_table = PythonOperator(task_id='create_table',
                                          python_callable=f_create_table,
                                          op_args=[table_name],
                                          dag=dag)
            # skip table generation
            table_exists = DummyOperator(task_id='table_exists', dag=dag)

            # insert to a table
            insert_new_rows = PythonOperator(
                task_id='insert_new_rows',
                python_callable=f_insert_row,
                op_kwargs={'table_name': table_name},
                provide_context=True,
                dag=dag,
                trigger_rule='none_failed')
            # query a table
            query_the_table = PostgreSQLCountRows(task_id='query_the_table',
                                                  table_name=table_name,
                                                  connection_id=database,
                                                  dag=dag)

            latest_only >> print_the_context >> get_current_user >> check_table_exists >> (
                create_table,
                table_exists) >> insert_new_rows >> query_the_table

            yield name, dag
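
The inline comment in f_insert_row about SQL injection is worth heeding: table_name is interpolated straight into the statement. A safer sketch, assuming psycopg2 is the driver behind PostgresHook (as it is in stock Airflow), quotes the identifier instead of formatting it into the SQL string:

import uuid
from datetime import datetime

from psycopg2 import sql

def f_insert_row_safe(hook, table_name, user):
    # `hook` is assumed to be a PostgresHook; psycopg2's sql module builds the
    # statement with a properly quoted identifier instead of str.format().
    query = sql.SQL("insert into {} values (%s, %s, %s)").format(
        sql.Identifier(table_name))
    conn = hook.get_conn()
    with conn.cursor() as cur:
        cur.execute(query, (uuid.uuid4().int % 123456789, user, datetime.now()))
    conn.commit()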
Example #33
args = general.args
conf = general.config
schedule = general.schedule['gis_tree_canopy']
start_date = general.start_date['gis_tree_canopy']
folder = 'trees'
layer = 'tree_canopy'
datasd_name = 'tree_canopy_datasd'
path_to_file = conf['prod_data_dir'] + '/' + datasd_name

dag = DAG(dag_id='gis_{layer}'.format(layer=layer),
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for sdif
treecan_latest_only = LatestOnlyOperator(task_id='tree_canopy_latest_only',
                                         dag=dag)

#: Get tree canopy shapefile from Atlas
get_shapefiles = PythonOperator(task_id='get_tree_canopy_gis',
                                python_callable=sde_to_shp,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

#: Convert shp to geojson
shp_to_geojson = BashOperator(task_id='tree_canopy_to_geojson',
                              bash_command=shp_to_geojson(),
                              on_failure_callback=notify,
                              on_retry_callback=notify,
                              on_success_callback=notify,