def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(task_id=f"create_{table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    #
    # Data quality check: verify that the staged table contains rows
    #
    check_task = HasRowsOperator(task_id=f"check_{table}_has_rows",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    # Task ordering: create the table, copy the data, then run the row check
    create_task >> copy_task
    copy_task >> check_task

    return dag
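HasRowsOperator is a custom operator whose definition never appears in these snippets. A minimal sketch of the implementation pattern it suggests (an assumption, not the original class): use PostgresHook to count the rows and fail the task when the table is empty.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super(HasRowsOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        # Fail loudly if the query returned nothing or the table is empty
        if not records or not records[0] or records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {self.table} has no rows")
        self.log.info(f"Data quality check passed: {self.table} has {records[0][0]} rows")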
Example #2
def get_s3_to_redshift_dag(parent_dag_name,
                           task_id,
                           redshift_conn_id,
                           aws_credentials_id,
                           staging_table,
                           create_sql_stmt,
                           s3_bucket,
                           s3_key,
                           create_table_option=True,
                           redshift_copy_params=[],
                           *args,
                           **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table_task = PythonOperator(
        task_id=f"create_{staging_table}_table",
        dag=dag,
        python_callable=create_table,
        provide_context=True,
        params={
            'create_table_option': create_table_option,
            'redshift_conn_id': redshift_conn_id,
            'create_sql_stmt': create_sql_stmt,
            'table': staging_table
        })

    stage_table_from_s3_task = S3ToRedshiftOperator(
        dag=dag,
        task_id=f"load_{staging_table}_from_s3_to_redshift",
        redshift_conn_id=redshift_conn_id,
        aws_conn_id=aws_credentials_id,
        table=staging_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        overwrite=True,
        copy_params=redshift_copy_params)

    verify_staged_table_task = PostgresHasRowsOperator(
        dag=dag,
        task_id=f"verify_{staging_table}_count",
        table=staging_table,
        connection_id=redshift_conn_id)

    create_table_task >> stage_table_from_s3_task >> verify_staged_table_task

    return dag
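The `create_table` callable passed to the PythonOperator above is not shown in this snippet. A hedged sketch of what it might look like, assuming it reads its settings from the task params supplied above and issues the DDL through PostgresHook:

from airflow.hooks.postgres_hook import PostgresHook


def create_table(**context):
    """Hypothetical python_callable: run the CREATE TABLE statement unless
    create_table_option was switched off."""
    params = context['params']
    if not params.get('create_table_option', True):
        return  # table creation explicitly disabled for this run

    redshift = PostgresHook(postgres_conn_id=params['redshift_conn_id'])
    redshift.run(params['create_sql_stmt'])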
Example #3
def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs
):
    dag = DAG(
        dag_id=f"{parent_dag_name}.{task_id}",  # subdags' dag_id must follow this naming pattern
        **kwargs
    )

    # Create the target table in Redshift
    create_table_task = PostgresOperator(
        task_id=f"create_{table}_table",
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt,
        dag=dag
    )

    # Copy the data from S3 into the table
    copy_table_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        dag=dag
    )

    # Data quality check: verify the table has data
    check_table_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        redshift_conn_id=redshift_conn_id,
        table=table,
        dag=dag
    )

    # Ordering of tasks within this subdag
    create_table_task >> copy_table_task
    copy_table_task >> check_table_task

    return dag
Example #4
    def create_task(table):
        """Returns an operator for copying the table into Redshift"""

        return S3ToRedshiftOperator(
            dag=dag,
            task_id=f"copy_{table.get('table_name', None)}_to_redshift",
            redshift_conn_id=redshift_conn_id,
            redshift_schema=redshift_schema,
            table=table.get('table_name', None),
            s3_conn_id=s3_conn_id,
            s3_bucket=s3_bucket,
            s3_key=table.get('s3_key', None),
            load_type=load_type,
            copy_params=table.get('copy_params', None),
            origin_schema=table.get('origin_schema', None),
            primary_key=table.get('primary_key', None),
            foreign_key=table.get('foreign_key', {}),
            schema_location=schema_location)
def get_s3_to_redshift(parent_dag_name, task_id, tables_definition,
                       redshift_conn_id, redshift_schema, s3_conn_id,
                       s3_bucket, load_type, schema_location, *args, **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    for table in tables_definition:
        S3ToRedshiftOperator(
            dag=dag,
            task_id=f"copy_{table.get('table_name',None)}_to_redshift",
            redshift_conn_id=redshift_conn_id,
            redshift_schema=redshift_schema,
            table=f"staging_{table.get('table_name',None)}",
            s3_conn_id=s3_conn_id,
            s3_bucket=s3_bucket,
            s3_key=table.get('s3_key', None),
            load_type=load_type,
            copy_params=table.get('copy_params', None),
            origin_schema=table.get('origin_schema', None),
            schema_location=schema_location)

    return dag
Example #6
            S3_KEY = 'github/{0}/{1}.json'.format(org['name'], endpoint['name'])
            TI_PREFIX = 'github_{0}_from_{1}'.format(endpoint['name'], org['name'])
            GITHUB_TASK_ID = '{0}_to_s3'.format(TI_PREFIX)
            REDSHIFT_TASK_ID = '{0}_to_redshift'.format(TI_PREFIX)

            github = GithubToS3Operator(task_id=GITHUB_TASK_ID,
                                        github_conn_id=org['github_conn_id'],
                                        github_org=org['name'],
                                        github_repo='all',
                                        github_object=endpoint['name'],
                                        payload=endpoint['payload'],
                                        s3_conn_id=S3_CONN_ID,
                                        s3_bucket=S3_BUCKET,
                                        s3_key=S3_KEY)

            redshift = S3ToRedshiftOperator(task_id=REDSHIFT_TASK_ID,
                                            s3_conn_id=S3_CONN_ID,
                                            s3_bucket=S3_BUCKET,
                                            s3_key=S3_KEY,
                                            origin_schema=ORIGIN_SCHEMA,
                                            schema_location=SCHEMA_LOCATION,
                                            load_type=LOAD_TYPE,
                                            copy_params=COPY_PARAMS,
                                            redshift_schema=REDSHIFT_SCHEMA,
                                            table='{0}_{1}'.format(org['name'],
                                                                   endpoint['name']),
                                            redshift_conn_id=REDSHIFT_CONN_ID,
                                            primary_key='id')

            kick_off_dag >> github >> redshift
    'catchup': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

stage_events_to_redshift = S3ToRedshiftOperator(
    task_id='stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
)

stage_songs_to_redshift = S3ToRedshiftOperator(
    task_id='staging_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data",
)
Example #8
                                dag=dag,
                                provide_context=True)
load_sas_file = PythonOperator(task_id='clean_and_load_i94',
                               python_callable=load_sas_data,
                               dag=dag,
                               provide_context=True)

##
##clean_data_check = PythonOperator(task_id='clean_data_query', python_callable = clean_data, dag=dag, provide_context=True)

load_s3_to_redshift = S3ToRedshiftOperator(
    task_id='s3_to_redshift',
    dag=dag,
    schema='public',
    table='us_city_demography',
    redshift_conn_id='pipeline_redshift',
    aws_conn_id='aws_credentials',
    copy_options=('csv', 'Delimiter', "';'", 'IGNOREHEADER 1'),
    s3_bucket='udacitycapstonedata',
    s3_key='us-cities-demographics.csv',
)

run_quality_checks = \
    DataQualityOperator(task_id='Run_data_quality_checks', dag=dag,
                        tables=['us_city_demography', 'immigration'],
                        redshift_conn_id='pipeline_redshift')

## analyze_data = PythonOperator(task_id='analyze_data_from_redshift', python_callable = analyze_query, dag=dag, provide_context=True)
## hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag, provide_context=True)

dummy_operator >> create_redshift
Example #9
    schedule_interval="@monthly",
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# Data quality check using the HasRowsOperator
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)
Example #10
for table in tables:
    S3_KEY = 'salesforce/{}/{}.json'.format('{{ execution_date }}',
                                            table['name'].lower())

    salesforce_to_s3 = SalesforceToS3Operator(task_id='{0}_to_S3'.format(table['name']),
                                              sf_conn_id=SF_CONN_ID,
                                              obj=table,
                                              output=S3_KEY,
                                              fmt='ndjson',
                                              s3_conn_id=S3_CONN_ID,
                                              s3_bucket=S3_BUCKET,
                                              record_time_added=True,
                                              coerce_to_timestamp=True,
                                              dag=dag)

    s3_to_redshift = S3ToRedshiftOperator(task_id='{0}_to_Redshift'.format(table['name']),
                                          redshift_conn_id=REDSHIFT_CONN_ID,
                                          redshift_schema=REDSHIFT_SCHEMA_NAME,
                                          table=table,
                                          s3_conn_id=S3_CONN_ID,
                                          s3_bucket=S3_BUCKET,
                                          s3_key=S3_KEY,
                                          origin_schema=ORIGIN_SCHEMA,
                                          copy_params=COPY_PARAMS,
                                          schema_location=SCHEMA_LOCATION,
                                          load_type=table['load_type'],
                                          dag=dag)

    kick_off_dag >> salesforce_to_s3 >> s3_to_redshift
Example #11
from datetime import datetime
from airflow import DAG
from airflow.operators import S3ToRedshiftOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow import configuration

args = {'owner': 'scopeworker', 'provide_context': True}

dag = DAG('s3_to_redshift_dag',
          description='REDSHIFT DAG',
          schedule_interval='*/1 * * * *',
          start_date=datetime(2017, 3, 20),
          catchup=False,
          default_args=args)

redshift_operator = S3ToRedshiftOperator(
    task_id="s3_to_redshift",
    redshift_conn_id=configuration.get("postgresql", "postgresql_conn_id"),
    s3_bucket="scopeworkerproduction",
    aws_conn_id=configuration.get("s3", "s3_conn_id"),
    s3_access_key_id=configuration.get("s3", "s3_access_key_id"),
    s3_secret_access_key=configuration.get("s3", "s3_secret_access_key"),
    delimiter='|',
    region="us-west-2",
    dag=dag)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

dummy_operator >> redshift_operator
                start_date=datetime.datetime(2020, 6, 21),
                max_active_runs=1)

# create trips table
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=customDag)

# load data from S3 to Redshift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(task_id='checkDataQuality',
                                   redshift_conn_id='redshift',
                                   table='trips',
                                   dag=customDag)

# create fact table
createFactTable = FactsCalculatorOperator(task_id='createFactTable',
                                          redshift_conn_id='redshift',
def create_dag(dag_id,
               schedule,
               marketo_conn_id,
               redshift_conn_id,
               redshift_schema,
               s3_conn_id,
               s3_bucket,
               default_args,
               catchup=False):

    dag = DAG(dag_id,
              schedule_interval=schedule,
              default_args=default_args,
              catchup=catchup)

    if 'backfill' in dag_id:
        endpoints = ['leads']
    else:
        endpoints = [
            'activities', 'campaigns', 'leads', 'programs', 'lead_lists'
        ]

    COPY_PARAMS = [
        "COMPUPDATE OFF", "STATUPDATE OFF", "JSON 'auto'", "TIMEFORMAT 'auto'",
        "TRUNCATECOLUMNS", "region as 'us-east-1'"
    ]

    with dag:
        d = DummyOperator(task_id='kick_off_dag')

        l = RateLimitOperator(task_id='check_rate_limit',
                              service='marketo',
                              service_conn_id=marketo_conn_id,
                              threshold=RATE_LIMIT_THRESHOLD,
                              threshold_type=RATE_LIMIT_THRESHOLD_TYPE)

        d >> l

        for endpoint in endpoints:

            MARKETO_SCHEMA = schema[endpoint]
            TABLE_NAME = 'mkto_{0}'.format(endpoint)

            S3_KEY = 'marketo/{0}/{1}_{2}.json'.format(redshift_schema,
                                                       endpoint,
                                                       "{{ ts_nodash }}")

            MARKETO_TASK_ID = 'get_{0}_marketo_data'.format(endpoint)
            REDSHIFT_TASK_ID = 'marketo_{0}_to_redshift'.format(endpoint)

            START_AT = "{{ execution_date.isoformat() }}"
            END_AT = "{{ next_execution_date.isoformat() }}"

            m = MarketoToS3Operator(task_id=MARKETO_TASK_ID,
                                    marketo_conn_id=marketo_conn_id,
                                    endpoint=endpoint,
                                    s3_conn_id=s3_conn_id,
                                    s3_bucket=s3_bucket,
                                    s3_key=S3_KEY,
                                    output_format='json',
                                    start_at=START_AT,
                                    end_at=END_AT)

            l >> m

            if endpoint != 'leads':
                r = S3ToRedshiftOperator(task_id=REDSHIFT_TASK_ID,
                                         s3_conn_id=s3_conn_id,
                                         s3_bucket=s3_bucket,
                                         s3_key=S3_KEY,
                                         load_type='rebuild',
                                         load_format='JSON',
                                         schema_location='local',
                                         origin_schema=MARKETO_SCHEMA,
                                         redshift_schema=redshift_schema,
                                         table=TABLE_NAME,
                                         copy_params=COPY_PARAMS,
                                         redshift_conn_id=redshift_conn_id)
                m >> r
            else:
                rl = S3ToRedshiftOperator(task_id=REDSHIFT_TASK_ID,
                                          s3_conn_id=s3_conn_id,
                                          s3_bucket=s3_bucket,
                                          s3_key=S3_KEY,
                                          load_type='upsert',
                                          load_format='JSON',
                                          schema_location='local',
                                          origin_schema=MARKETO_SCHEMA,
                                          redshift_schema=redshift_schema,
                                          table=TABLE_NAME,
                                          primary_key='id',
                                          copy_params=COPY_PARAMS,
                                          incremental_key='updated_at',
                                          redshift_conn_id=redshift_conn_id)

                m >> rl

    return dag
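None of these snippets show how the factory is invoked. A common registration pattern, sketched here with placeholder connection ids, schema, and schedules that are not taken from the original source, is to call `create_dag` once per configuration and publish the result in the module globals so the scheduler discovers it:

from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2018, 1, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# One regular DAG and one backfill DAG (the factory special-cases dag_ids
# containing the word 'backfill').
for dag_id, schedule in [('marketo_to_redshift', '@hourly'),
                         ('marketo_to_redshift_backfill', '@once')]:
    globals()[dag_id] = create_dag(dag_id=dag_id,
                                   schedule=schedule,
                                   marketo_conn_id='marketo',
                                   redshift_conn_id='redshift',
                                   redshift_schema='marketing',
                                   s3_conn_id='s3',
                                   s3_bucket='my-data-lake',
                                   default_args=default_args)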
def create_dag(dag_id,
               schedule,
               hubspot_conn_id,
               redshift_conn_id,
               redshift_schema,
               default_args,
               catchup=False,
               max_active_runs=3):

    try:
        h = HttpHook(method='GET', http_conn_id=hubspot_conn_id)
        pw = h.get_connection(conn_id=hubspot_conn_id).password
        response = h.run('oauth/v1/access-tokens/{0}'.format(pw))
        scopes = response.json()['scopes']

        dag = DAG(dag_id,
                  default_args=default_args,
                  schedule_interval=schedule,
                  catchup=catchup,
                  max_active_runs=max_active_runs)

        with dag:
            kick_off_dag = DummyOperator(task_id='kick_off_dag')
            kick_off_dag

            tables_to_build = []

            for endpoint in endpoints:
                if endpoint['scope'] in scopes:
                    # For backfill DAGs, only build endpoints that take
                    # Hubspot arguments (e.g. a 'startTimestamp')
                    if 'backfill' in dag_id:
                        if endpoint['hubspot_args']:
                            tables_to_build.append(endpoint)
                    else:
                        tables_to_build.append(endpoint)

            for table in tables_to_build:
                HUBSPOT_ARGS = table.get('hubspot_args', {})
                TABLE_NAME = table.get('name', '')
                LOAD_TYPE = table.get('load_type', '')

                PRIMARY_KEY = None
                INCREMENTAL_KEY = None

                if 'primary_key' in table.keys():
                    PRIMARY_KEY = table['primary_key']

                if 'incremental_key' in table.keys():
                    INCREMENTAL_KEY = table['incremental_key']

                S3_KEY = ('hubspot/{0}/{1}_{2}.json'.format(
                    redshift_schema, TABLE_NAME, "{{ ts_nodash }}"))

                split_key = path.splitext(S3_KEY)
                LOAD_KEY = '{0}_core'.format(split_key[0])

                h = HubspotToS3Operator(
                    task_id='hubspot_{0}_data_to_s3'.format(TABLE_NAME),
                    hubspot_conn_id=hubspot_conn_id,
                    hubspot_object=TABLE_NAME,
                    hubspot_args=HUBSPOT_ARGS,
                    s3_conn_id=S3_CONN_ID,
                    s3_bucket=S3_BUCKET,
                    s3_key=S3_KEY)

                kick_off_dag >> h

                if table['name'] == 'timeline':
                    pass
                else:
                    r = S3ToRedshiftOperator(
                        task_id='hubspot_{0}_to_redshift'.format(TABLE_NAME),
                        s3_conn_id=S3_CONN_ID,
                        s3_bucket=S3_BUCKET,
                        s3_key=LOAD_KEY,
                        origin_schema=getattr(hubspot_schema, TABLE_NAME),
                        origin_datatype='json',
                        copy_params=COPY_PARAMS,
                        load_type=LOAD_TYPE,
                        primary_key=PRIMARY_KEY,
                        incremental_key=INCREMENTAL_KEY,
                        schema_location='local',
                        redshift_schema=redshift_schema,
                        table=TABLE_NAME,
                        redshift_conn_id=redshift_conn_id)

                    h >> r

                if table['subtables']:
                    for subtable in table['subtables']:

                        SUBTABLE_LOAD_KEY = '{0}_{1}'.format(
                            split_key[0], subtable)
                        SUBTABLE_NAME = '{0}_{1}'.format(TABLE_NAME, subtable)
                        if SUBTABLE_NAME == 'timeline':
                            SUBTABLE_NAME = TABLE_NAME

                        s = S3ToRedshiftOperator(
                            task_id='hubspot_{0}_{1}_to_redshift'.format(
                                TABLE_NAME, subtable),
                            s3_conn_id=S3_CONN_ID,
                            s3_bucket=S3_BUCKET,
                            s3_key=SUBTABLE_LOAD_KEY,
                            origin_schema=getattr(
                                hubspot_schema,
                                '{0}_{1}'.format(TABLE_NAME, subtable)),
                            origin_datatype='json',
                            load_type=LOAD_TYPE,
                            schema_location='local',
                            copy_params=COPY_PARAMS,
                            redshift_schema=redshift_schema,
                            table=SUBTABLE_NAME,
                            redshift_conn_id=REDSHIFT_CONN_ID)
                        h >> s

        return dag
    except:
        pass
Example #15
#       2. Performs a data quality check on the Trips table in RedShift
#       3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#           a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#              skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

#
# Load trips data from S3 to Redshift using the s3_key
#       "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id='copy_trips_task',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv')

#
# Data quality check on the Trips table
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    table='trips',
    redshift_conn_id='redshift',
)

#
Example #16
    init_tables = PostgresOperator(dag=dag,
                                   task_id="init_tables",
                                   postgres_conn_id="redshift_songs",
                                   sql=dag_sql.init_tables)

    verify_staged_songs = PostgresHasRowsOperator(
        task_id="verify_imported_records",
        table="staging_songs",
        connection_id="redshift_songs")

    stage_songs_from_s3 = S3ToRedshiftOperator(
        task_id="stage-songs-to-redshift",
        redshift_conn_id="redshift_songs",
        aws_conn_id="s3_songs",
        table="staging_songs",
        s3_bucket=input_bucket,
        s3_key=song_data,
        overwrite=True,
        copy_params=stage_songs_params)

    extract_artists = PostgresOperator(task_id="load_artists",
                                       postgres_conn_id="redshift_songs",
                                       sql=dag_sql.load_artists)

    extract_songs = PostgresOperator(task_id="load_songs",
                                     postgres_conn_id="redshift_songs",
                                     sql=dag_sql.load_songs)
    init_tables >> stage_songs_from_s3
    stage_songs_from_s3 >> verify_staged_songs
    verify_staged_songs >> [extract_artists, extract_songs]
        type = workflow.get('id', None)
        name = workflow.get('name', None)
        fields = workflow.get('fields', None)

        S3_KEY = '{type}_{time}.csv'.format(type=workflow['id'], time=TIME)

        imap_to_s3 = ImapToS3Operator(
            task_id='{}_to_s3'.format(type),
            imap_conn_id=IMAP_CONN_ID,
            imap_email=IMAP_EMAIL,
            imap_subject=name,
            s3_conn_id=S3_CONN_ID,
            s3_bucket=S3_BUCKET,
            s3_key=S3_KEY,
        )

        s3_to_redshift = S3ToRedshiftOperator(
            task_id='{}_to_redshift'.format(type),
            s3_conn_id=S3_CONN_ID,
            s3_bucket=S3_BUCKET,
            s3_key=S3_KEY,
            redshift_conn_id=REDSHIFT_CONN_ID,
            redshift_schema=REDSHIFT_SCHEMA,
            table=type,
            origin_schema=fields,
            schema_location='local',
        )

    kick_off_dag >> imap_to_s3 >> s3_to_redshift
# create Tasks by instantiating Operator Classes
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=bulkLoadDag)

# IMPORTANT:
# As "s3_key" parameter has been marked as "Templatable" (in its Class
# "template_fields" variable) it'll be able to fetch JINJA template
# variables like {{ds}}, {{execution_date}} and so on.
loadTripsData = S3ToRedshiftOperator(
    task_id="loadTripsData",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="trips",
    truncate=False,
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    delimiter=",",
    ignore_headers=1,
    dag=bulkLoadDag)
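For illustration only (assuming the custom operator renders `s3_key` with `str.format(**context)`, which is what the placeholder style above suggests), a run with an execution date of 2019-02-01 would resolve the key as follows:

import datetime

execution_date = datetime.datetime(2019, 2, 1)
print("data-pipelines/divvy/partitioned/{execution_date.year}/"
      "{execution_date.month}/divvy_trips.csv".format(execution_date=execution_date))
# -> data-pipelines/divvy/partitioned/2019/2/divvy_trips.csv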

checkTripsData = HasRowsOperator(task_id='checkTripsData',
                                 redshift_conn_id='redshift',
                                 table='trips',
                                 dag=bulkLoadDag)

createStationsTable = PostgresOperator(
    task_id="createStationsTable",
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
          default_args=default_args,
          description='Extract, load, and transform covid and '
                      'Google mobility data into Redshift',
          schedule_interval='@once',
          catchup=True)

# task definition to create tables in redshift
create_tables = CreateTablesInRedshiftOperator(
    task_id="create_redshift_tables", dag=dag, redshift_conn_id="redshift")

# task definition to copy location lookup data into staging table
copy_location_lookup_data_task = S3ToRedshiftOperator(
    task_id="copy_location_lookup_data_from_s3_to_redshift",
    dag=dag,
    table="location_lookup_staging",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="btrivedi-udacityproject",
    s3_key=f"location_lookup/location_lookup_11-24-2020.psv.gz",
    provide_context=True,
)

# task definition to copy covid data into staging table
copy_covid_data_task = S3ToRedshiftOperator(
    task_id="copy_covid_data_from_s3_to_redshift",
    dag=dag,
    table="covid_data_staging",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="btrivedi-udacityproject",
    s3_key=f"covid/covid",
    file_format="json",
def redshift_table_ddl_plus_bulk_load(parent_dag_id, parent_task_id,
                                      redshift_conn_id, aws_credentials_id,
                                      table, create_table_statement, s3_bucket,
                                      s3_key, *args, **kwargs):
    """
        This Python Factory Function returns an Airflow DAG Class instance to
    be used as a SubDag within the (outer) DAG that calls it.

    The DAG in turn executes three Task within a Redshift database instance:
        - execute CREATE TABLE (DDL) statement through PostgresOperator;
        - execute COPY (bulk load) to pull data from S3 into the newly created
    table through custom S3ToRedshiftOperator
        - perform data quality check through custom HasRowsOperator

    Args:
        parent_dag_id (str): 

    """

    # instantiate DAG Class
    subDagInstance = DAG(
        #----------------------------------------------------------------------
        #
        #   ATTENTION! "parent_dag_id.parent_task_id" IS MANDATORY
        #
        #   It's an Airflow Convention that subdags must have their "dag_id"
        # attribute set to the combination of:
        #
        #   1. "dag_id" of the outer DAG Instance calling the SubDagOperator
        # which in turn calls the Python Factory Function;
        #
        #   2. "task_id" assigned to the SubDagOperator mentioned in item number 1
        #
        #   3. "." a dot must separate  the "dag_id" and "task_id" mentioned in
        # both items 1 and 2, respectively;
        #
        #----------------------------------------------------------------------
        dag_id=f"{parent_dag_id}.{parent_task_id}"
        # make sure keyword arguments are also received
        ,
        **kwargs)

    # NOTICE how Task instances are created in a SubDag just as they would in
    # a "normal" DAG.
    createTable = PostgresOperator(task_id=f'create_{table}_table',
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_table_statement,
                                   dag=subDagInstance)

    bulkLoadTable = S3ToRedshiftOperator(task_id=f'bulk_load_{table}_table',
                                         redshift_conn_id=redshift_conn_id,
                                         aws_credentials_id=aws_credentials_id,
                                         table=table,
                                         s3_bucket=s3_bucket,
                                         s3_key=s3_key,
                                         dag=subDagInstance)

    checkDataQuality = HasRowsOperator(task_id=f'check_{table}_table',
                                       redshift_conn_id=redshift_conn_id,
                                       table=table,
                                       dag=subDagInstance)

    # Define Task dependencies
    createTable >> bulkLoadTable >> checkDataQuality

    return subDagInstance
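To make the naming convention spelled out above concrete, here is a hedged sketch of the parent-DAG side (the DAG name, connections, bucket, key, and DDL are placeholders): the SubDagOperator's `task_id` must be exactly the string passed as `parent_task_id`, so the inner dag_id becomes "<parent dag_id>.<task_id>".

import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

outerDag = DAG('bulk_load_pipeline',
               start_date=datetime.datetime(2020, 6, 21),
               schedule_interval='@monthly')

loadTripsSubDag = SubDagOperator(
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id='bulk_load_pipeline',
        parent_task_id='trips_ddl_and_bulk_load',
        redshift_conn_id='redshift',
        aws_credentials_id='aws_credentials',
        table='trips',
        create_table_statement='CREATE TABLE IF NOT EXISTS trips (trip_id INT);',
        s3_bucket='udacity-dend',
        s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
        start_date=datetime.datetime(2020, 6, 21)),
    task_id='trips_ddl_and_bulk_load',  # must match parent_task_id above
    dag=outerDag)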
Example #21
methods = {'PlayerCareer': 'regular_season_totals', 'PlayerGameLogs': 'info'}

with dag:
    kick_off_dag = DummyOperator(task_id='basketball_analysis_sideproj')
    for player in players:
        for method in methods.keys():
            to_s3 = NbaToS3Operator(
                task_id='{0}_{1}_to_s3'.format(player, method),
                player_name=player,
                endpoint='player',
                method=method,
                id=players[player],
                stats=methods[method],
                s3_conn_id='astronomer-s3',
                s3_bucket='astronomer-workflows-dev',
                s3_key='{0}_{1}.json'.format(player, method),
            )
            s3_to_redshift = S3ToRedshiftOperator(
                task_id='{0}_{1}_to_redshift'.format(player, method),
                redshift_conn_id='astronomer-redshift-dev',
                redshift_schema='viraj_testing',
                table='{player}_{stats}'.format(player=player,
                                                stats=methods[method]),
                s3_conn_id='astronomer-s3',
                s3_bucket='astronomer-workflows-dev',
                s3_key='{0}_{1}.json'.format(player, method),
                origin_schema='{0}_{1}_schema.json'.format(player, method),
                load_type='rebuild')

            kick_off_dag >> to_s3 >> s3_to_redshift
        dag=dag)

    # Insert the breakdown fields (primary keys) right after the primary
    # keys which are present in every workflow
    output_table_fields = list(fields)
    output_table_fields = (output_table_fields[:4] + breakdown['fields'] +
                           output_table_fields[4:])

    primary_key = [
        'ad_id', 'adset_id', 'campaign_id', 'account_id', 'date_start'
    ]

    primary_key.extend(breakdown_fields)

    s3_to_redshift = S3ToRedshiftOperator(
        task_id='s3_{}_to_redshift'.format(breakdown['name']),
        s3_conn_id=S3_CONN_ID,
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
        redshift_conn_id=REDSHIFT_CONN_ID,
        redshift_schema=REDSHIFT_SCHEMA,
        copy_params=COPY_PARAMS,
        table=breakdown['name'],
        origin_schema=output_table_fields,
        schema_location='local',
        primary_key=primary_key,
        load_type='upsert',
        dag=dag)

    start >> facebook_ads >> s3_to_redshift
Example #23
#       2. Performs a data quality check on the Trips table in RedShift
#       3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#           a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#              skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

#
# The following code will load trips data from S3 to RedShift. Use the s3_key
#       "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

#
#  Data quality check on the Trips table
#
check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

#
# We use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column is
#  `tripduration` and the groupby_column is `bikeid`
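The actual FactsCalculatorOperator call is cut off in this snippet. A hedged sketch of what it could look like, assuming the skeleton in plugins/operators/facts_calculator.py takes the connection id, origin/destination tables, and the fact and groupby columns named in the comment above (all parameter names here are assumptions):

calculate_facts = FactsCalculatorOperator(
    task_id='calculate_facts_trips',
    dag=dag,
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='trips_facts',
    fact_column='tripduration',
    groupby_column='bikeid')

copy_trips_task >> check_trips >> calculate_facts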
Example #24
create_mode_dim_table = PostgresOperator(task_id="create_mode_dim_table",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql_statements.CREATE_MODE_SQL)

# Create Visa Dimension Table
create_visa_dim_table = PostgresOperator(task_id="create_visa_dim_table",
                                         dag=dag,
                                         postgres_conn_id="redshift",
                                         sql=sql_statements.CREATE_VISA_SQL)

# load immigration data to immigration_fact table
copy_immigration_task = S3ToRedshiftOperator(
    task_id="load_immigration_from_s3_to_redshift",
    dag=dag,
    table="immigration_fact",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=bucket,
    s3_key="immigration-data/cleanjob.csv")

# Load happiness data to happiness table
copy_happiness_task = S3ToRedshiftOperator(
    task_id="load_happiness_from_s3_to_redshift",
    dag=dag,
    table="happiness",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=bucket,
    s3_key="happiness-data/2016.csv")

# Load country data to Country table
            task_id='get_google_analytics_data',
            google_analytics_conn_id=GOOGLE_ANALYTICS_CONN_ID,
            view_id=view_id,
            since=SINCE,
            until=UNTIL,
            sampling_level=SAMPLING_LEVEL,
            dimensions=DIMENSIONS,
            metrics=METRICS,
            page_size=PAGE_SIZE,
            include_empty_rows=INCLUDE_EMPTY_ROWS,
            s3_conn_id=S3_CONN_ID,
            s3_bucket=S3_BUCKET,
            s3_key=S3_KEY)

        redshift = S3ToRedshiftOperator(
            task_id='sink_to_redshift',
            redshift_conn_id=REDSHIFT_CONN_ID,
            redshift_schema=REDSHIFT_SCHEMA,
            table='report_{}'.format(view_id),
            s3_conn_id=S3_CONN_ID,
            s3_bucket=S3_BUCKET,
            s3_key=S3_KEY,
            origin_schema=google_analytics_reporting_schema,
            schema_location='local',
            copy_params=COPY_PARAMS,
            load_type='upsert',
            primary_key=PRIMARY_KEY,
            incremental_key=INCREMENTAL_KEY)

        d >> g >> redshift
                                  s3_key=S3_KEY,
                                  replace=True)

        flatten_object = PythonOperator(
            task_id='flatten_{0}'.format(collection['name']),
            python_callable=flatten_py,
            templates_dict={
                's3_key': S3_KEY,
                's3_conn': S3_CONN,
                's3_bucket': S3_BUCKET,
                'collection_name': collection['name'],
                'flattened_key': FLATTENED_KEY,
                'origin_schema': collection['schema']
            },
            provide_context=True)

        redshift = S3ToRedshiftOperator(
            task_id='{0}_to_redshift'.format(collection['name']),
            s3_conn_id=S3_CONN,
            s3_bucket=S3_BUCKET,
            s3_key=FLATTENED_KEY,
            redshift_conn_id=REDSHIFT_CONN_ID,
            redshift_schema=REDSHIFT_SCHEMA,
            origin_schema=collection['schema'],
            redshift_table=collection['name'],
            primary_key=collection.get('primary_key', None),
            incremental_key=collection.get('incremental_key', None),
            load_type=collection['load_type'])

        kick_off_dag >> mongo >> flatten_object >> redshift