Example #1
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import PythonOperator
from airflow.operators import FileDownloadOperator
from airflow.operators import DatumLoadOperator
from airflow.operators import CleanupOperator
from airflow.operators import SlackNotificationOperator

# ============================================================
# Defaults - these arguments apply to all operators

default_args = {
    'owner': 'airflow',
    # 'depends_on_past': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2017, 1, 1, 0, 0, 0),
    'on_failure_callback': SlackNotificationOperator.failed(),
}

pipeline_tax = DAG('etl_tax_v1', default_args=default_args)

# ------------------------------------------------------------
# Extract - copy files to the staging area

def mkdir():
    import tempfile
    return tempfile.mkdtemp()

mk_staging = PythonOperator(
    task_id='staging',
    dag=pipeline_tax,
    python_callable=mkdir,
)

Example #2
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import BashOperator
from airflow.operators import SlackNotificationOperator

# ============================================================
# Defaults - these arguments apply to all operators

default_args = {
    'owner': 'airflow',
    'on_failure_callback': SlackNotificationOperator.failed(),
}

dag = DAG('s3_sftp_sync',
          start_date=datetime.now() - timedelta(days=1),
          schedule_interval='@hourly',
          concurrency=1,
          max_active_runs=1,
          default_args=default_args)

# Uses https://github.com/CityOfPhiladelphia/s3-sftp-sync
sync = BashOperator(
    task_id='sync',
    dag=dag,
    bash_command=(
        'source <(eastern_state load_environment '
        '"$EASTERN_STATE_BUCKET" "$EASTERN_STATE_NAME" "$EASTERN_STATE_ENV") '
        '&& s3_sftp_sync'
    )
)
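
The whole job here is a single BashOperator. The source <(...) construct presumably sources shell variable assignments that eastern_state writes to stdout for the environment named by the three $EASTERN_STATE_* variables, so s3_sftp_sync starts with its configuration already exported. Running hourly with concurrency=1 and max_active_runs=1 keeps overlapping sync runs from racing each other.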

Example #3

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import TheELOperator  # custom plugin operator
from airflow.operators import SlackNotificationOperator


def carto_geodb2_dag_factory(geodb2_schema,
                             table_name,
                             schema_file,
                             geometry_support=None,
                             geodb2_table_name=None,  # defaults to same as table_name
                             final_carto_table_name=None,  # overrides the final Carto table - useful for testing, e.g. test_table
                             schedule_interval='0 7 * * *',  # defaults to 7am UTC (2am EST)
                             retries=0,
                             to_srid=None,
                             from_srid=None):
    dag_id = 'etl_carto_geodb2_{}'.format(table_name)

    default_args = {
        'owner': 'airflow',
        'on_failure_callback': SlackNotificationOperator.failed(),
        'retries': retries
    }

    dag = DAG(dag_id,
        start_date=datetime.now() - timedelta(days=1),
        schedule_interval=schedule_interval,
        default_args=default_args,
        max_active_runs=1
    )

    data_file = 's3://"$S3_STAGING_BUCKET"/' + dag_id + '/{{run_id.split(".")[0].lower()}}/' + dag_id + '.csv'

    extract_from_geodb2 = TheELOperator(
        task_id='extract_{}'.format(table_name),
        dag=dag,
        el_command='read',
        db_schema=geodb2_schema,
        table_name=geodb2_table_name or table_name,
        geometry_support=geometry_support,
        connection_string='"$GEODB2_CONN_STRING"',
        output_file=data_file,
        to_srid=to_srid,
        from_srid=from_srid
    )

    postgis_geometry_support = None
    if geometry_support is not None:
        postgis_geometry_support = 'postgis'

    create_temp_carto_table = TheELOperator(
        task_id='create_temp_table_' + table_name,
        dag=dag,
        el_command='create_table',
        db_schema='phl',
        table_name=table_name + '_{{run_id.split(".")[0].lower()}}',
        table_schema_path=schema_file,
        geometry_support=postgis_geometry_support,
        connection_string='"$CARTO_CONN_STRING"'
    )

    load_to_temp_carto_table = TheELOperator(
        task_id='load_' + table_name,
        dag=dag,
        el_command='write',
        db_schema='phl',
        table_name=table_name + '_{{run_id.split(".")[0].lower()}}',
        skip_headers=True,
        table_schema_path=schema_file,
        geometry_support=postgis_geometry_support,
        connection_string='"$CARTO_CONN_STRING"',
        input_file=data_file
    )

    swap_and_real_tables = TheELOperator(
        task_id='swap_' + table_name,
        dag=dag,
        el_command='swap_table',
        db_schema='phl',
        new_table_name=table_name + '_{{run_id.split(".")[0].lower()}}',
        old_table_name=final_carto_table_name or table_name,
        connection_string='"$CARTO_CONN_STRING"'
    )

    extract_from_geodb2 >> create_temp_carto_table >> load_to_temp_carto_table >> swap_and_real_tables

    globals()[dag_id] = dag  # Airflow discovers DAGs by scanning module-level globals for DAG objects
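
Each call to the factory builds one DAG and registers it under its dag_id at import time, so a single module can publish many tables. A minimal sketch of an invocation - the schema, table, and file names below are hypothetical placeholders, not values from the source:

# Hypothetical example: publish one GeoDB2 table to Carto.
carto_geodb2_dag_factory(
    geodb2_schema='gis_streets',            # placeholder source schema
    table_name='street_centerlines',        # placeholder table; also names the DAG
    schema_file='schemas/street_centerlines.json',  # placeholder table-schema path
)

Passing final_carto_table_name (e.g. 'test_street_centerlines') redirects the final swap to a scratch table, which is useful while testing against production data.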