def stg_subdag(parent_dag_name, task_id, aws_credentials_id, postgres_conn_id,
               table, create_sql_stmt, s3_prefix, s3_bucket, s3_key,
               s3_jsonpath_file, sw_delete_data, partition_year,
               partition_month, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table = PostgresOperator(task_id=f"create_table_{table}",
                                    dag=dag,
                                    postgres_conn_id=postgres_conn_id,
                                    sql=create_sql_stmt)

    load_table = StageToRedshiftOperator(task_id=f"load_table_{table}",
                                         dag=dag,
                                         aws_credentials_id=aws_credentials_id,
                                         postgres_conn_id=postgres_conn_id,
                                         table=table,
                                         s3_prefix=s3_prefix,
                                         s3_bucket=s3_bucket,
                                         s3_key=s3_key,
                                         s3_jsonpath_file=s3_jsonpath_file,
                                         sw_delete_data=sw_delete_data,
                                         partition_year=partition_year,
                                         partition_month=partition_month)

    has_rows_table = HasRowsOperator(task_id=f"has_rows_table_{table}",
                                     dag=dag,
                                     postgres_conn_id=postgres_conn_id,
                                     table=table)

    create_table >> load_table >> has_rows_table

    return dag
def perform_data_quality_checks(parent_dag_name, task_id, redshift_conn_id,
                                table, default_args, check_sql_stmt, *args,
                                **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}",
              default_args=default_args,
              schedule_interval=None,
              **kwargs)

    check_has_rows_task = HasRowsOperator(
        task_id=f"check_has_rows_for_{table}",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id)

    check_data_quality_task = DataQualityOperator(
        task_id=f"perform_data_quality_checks_for_{table}",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        query=check_sql_stmt)

    check_has_rows_task >> check_data_quality_task

    return dag
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(task_id=f"create_{table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    #
    check_task = HasRowsOperator(task_id=f"check_{table}_has_rows",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    create_task >> copy_task
    copy_task >> check_task
    #
    # TODO: Use DAG ordering to place the check task
    #

    return dag
Example #4
def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs
):
    dag = DAG(
        dag_id=f"{parent_dag_name}.{task_id}",  # subdags' dag_id must follow this naming pattern
        **kwargs
    )

    # TODO: Move the create table code here
    create_table_task = PostgresOperator(
        task_id=f"create_{table}_table",
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt,
        dag=dag
    )

    # TODO: Move the S3ToRedshiftOperator code here
    copy_table_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        dag=dag
    )

    # TODO: Move the check table code here
    check_table_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        redshift_conn_id=redshift_conn_id,
        table=table,
        dag=dag
    )

    # TODO: Define ordering of tasks within this subdag
    create_table_task >> copy_table_task
    copy_table_task >> check_table_task

    return dag
Example #5
def get_s3_to_redshift_dag(parent_dag_name,
                           task_id,
                           redshift_conn_id,
                           aws_credentials_id,
                           staging_table,
                           create_sql_stmt,
                           s3_bucket,
                           s3_key,
                           create_table_option=True,
                           redshift_copy_params=[],
                           *args,
                           **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table_task = PythonOperator(task_id=f"create_{staging_table}_table",
                                       dag=dag,
                                       python_callable=create_table,
                                       provide_context=True,
                                       params={
                                           'create_table_option':
                                           create_table_option,
                                           'redshift_conn_id':
                                           redshift_conn_id,
                                           'create_sql_stmt': create_sql_stmt,
                                           'table': staging_table
                                       })

    stage_table_from_s3_task = StageToRedshiftOperator(
        dag=dag,
        task_id=f"load_{staging_table}_from_s3_to_redshift",
        redshift_conn_id=redshift_conn_id,
        aws_conn_id=aws_credentials_id,
        table=staging_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        overwrite=True,
        copy_params=redshift_copy_params)

    verify_staged_table_task = HasRowsOperator(
        dag=dag,
        task_id=f"verify_{staging_table}_count",
        table=staging_table,
        connection_id=redshift_conn_id)

    create_table_task >> stage_table_from_s3_task >> verify_staged_table_task

    return dag
def dim_subdag(parent_dag_name, task_id, postgres_conn_id, table,
               insert_sql_stmt, sw_delete_dimensions, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_table = LoadDimensionOperator(
        task_id=f"load_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table,
        sql=insert_sql_stmt,
        sw_delete_dimensions=sw_delete_dimensions,
    )

    has_rows_table = HasRowsOperator(task_id=f"has_rows_table_{table}",
                                     dag=dag,
                                     postgres_conn_id=postgres_conn_id,
                                     table=table)

    load_table >> has_rows_table

    return dag
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key=
    "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = HasRowsOperator(task_id='check_trips_data',
                              dag=dag,
                              redshift_conn_id='redshift',
                              table='trips')

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
def redshift_table_ddl_plus_bulk_load(parent_dag_id, parent_task_id,
                                      redshift_conn_id, aws_credentials_id,
                                      table, create_table_statement, s3_bucket,
                                      s3_key, *args, **kwargs):
    """
        This Python Factory Function returns an Airflow DAG Class instance to
    be used as a SubDag within the (outer) DAG that calls it.

    The DAG in turn executes three Task within a Redshift database instance:
        - execute CREATE TABLE (DDL) statement through PostgresOperator;
        - execute COPY (bulk load) to pull data from S3 into the newly created
    table through custom S3ToRedshiftOperator
        - perform data quality check through custom HasRowsOperator

    Args:
        parent_dag_id (str): 

    """

    # Instantiate the DAG class.
    subDagInstance = DAG(
        # ---------------------------------------------------------------------
        #
        #   ATTENTION! "parent_dag_id.parent_task_id" IS MANDATORY
        #
        #   By Airflow convention, a subdag must have its "dag_id" attribute
        # set to the combination of:
        #
        #   1. the "dag_id" of the outer DAG instance calling the
        # SubDagOperator, which in turn calls this Python factory function;
        #
        #   2. the "task_id" assigned to the SubDagOperator mentioned in item 1;
        #
        #   3. a dot (".") separating the "dag_id" and "task_id" mentioned in
        # items 1 and 2, respectively.
        #
        # ---------------------------------------------------------------------
        dag_id=f"{parent_dag_id}.{parent_task_id}",
        # Make sure keyword arguments are also forwarded.
        **kwargs)

    # NOTICE how Task instances are created in a SubDag just as they would in
    # a "normal" DAG.
    createTable = PostgresOperator(task_id=f'create_{table}_table',
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_table_statement,
                                   dag=subDagInstance)

    bulkLoadTable = S3ToRedshiftOperator(task_id=f'bulk_load_{table}_table',
                                         redshift_conn_id=redshift_conn_id,
                                         aws_credentials_id=aws_credentials_id,
                                         table=table,
                                         s3_bucket=s3_bucket,
                                         s3_key=s3_key,
                                         dag=subDagInstance)

    checkDataQuality = HasRowsOperator(task_id=f'check_{table}_table',
                                       redshift_conn_id=redshift_conn_id,
                                       table=table,
                                       dag=subDagInstance)

    # Define Task dependencies
    createTable >> bulkLoadTable >> checkDataQuality

    return subDagInstance
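
The naming convention spelled out in the comments above is easiest to see from the caller's side. Below is a minimal sketch (not taken from any of these examples) of how an outer DAG might wire this factory into a SubDagOperator using the Airflow 1.x import path; the outer dag_id, schedule, bucket, key, and placeholder CREATE TABLE statement are all assumptions.

import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

# Placeholder DDL; in these exercises it would normally come from a
# sql_statements module.
CREATE_TRIPS_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS trips (
    trip_id      INTEGER,
    bikeid       INTEGER,
    tripduration DECIMAL(16, 2)
);
"""

outer_dag = DAG("example_outer_dag",  # assumed outer dag_id
                start_date=datetime.datetime.utcnow(),
                schedule_interval="@daily")

trips_subdag_task_id = "trips_ddl_plus_bulk_load"

trips_subdag_task = SubDagOperator(
    task_id=trips_subdag_task_id,
    dag=outer_dag,
    # The factory receives the outer dag_id and this task_id, so the inner
    # dag_id becomes "example_outer_dag.trips_ddl_plus_bulk_load".
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id=outer_dag.dag_id,
        parent_task_id=trips_subdag_task_id,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        create_table_statement=CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        # Forwarded as **kwargs so the inner DAG stays in sync with the parent.
        start_date=outer_dag.start_date,
        schedule_interval=outer_dag.schedule_interval,
    ),
)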
# variables like {{ds}}, {{execution_date}} and so on.
loadTripsData = S3ToRedshiftOperator(
    task_id="loadTripsData",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="trips",
    truncate=False,
    s3_bucket="udacity-dend",
    s3_key=
    "data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    delimiter=",",
    ignore_headers=1,
    dag=bulkLoadDag)

checkTripsData = HasRowsOperator(task_id='checkTripsData',
                                 redshift_conn_id='redshift',
                                 table='trips',
                                 dag=bulkLoadDag)

createStationsTable = PostgresOperator(
    task_id="createStationsTable",
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
    dag=bulkLoadDag)

# REMEMBER:
# As the "s3_key" parameter has been marked as templatable (listed in the
# class-level "template_fields" attribute), it can use Jinja template
# variables like {{ds}}, {{execution_date}} and so on; a minimal sketch of
# this mechanism follows the example.
loadStationsTable = S3ToRedshiftOperator(
    task_id="loadStationsTable",
    redshift_conn_id="redshift",
Example #10
#           a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#              skeleton defined in plugins/operators/facts_calculator.py
#              (a sketch of one possible implementation follows this example)
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/2018/10/divvy_trips.csv")

check_trips = HasRowsOperator(task_id='check_trips_data',
                              dag=dag,
                              redshift_conn_id='redshift',
                              table='trips')

#
# TODO: Use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column should
#       be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts = FactsCalculatorOperator(task_id='calculate_facts',
                                          dag=dag,
                                          redshift_conn_id='redshift',
                                          origin_table='trips',
                                          destination_table='fact_trips',
                                          fact_column='tripduration',
                                          groupby_column='bikeid')

copy_trips_task >> check_trips >> calculate_facts
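
For reference, here is one possible sketch of the FactsCalculatorOperator skeleton mentioned in the NOTE above (plugins/operators/facts_calculator.py). Only the constructor parameters come from the calls in these examples; the aggregate SQL and the use of PostgresHook are assumptions about one reasonable implementation.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FactsCalculatorOperator(BaseOperator):
    facts_sql_template = """
    DROP TABLE IF EXISTS {destination_table};
    CREATE TABLE {destination_table} AS
    SELECT {groupby_column},
           MAX({fact_column}) AS max_{fact_column},
           MIN({fact_column}) AS min_{fact_column},
           AVG({fact_column}) AS average_{fact_column}
    FROM {origin_table}
    GROUP BY {groupby_column};
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", origin_table="",
                 destination_table="", fact_column="", groupby_column="",
                 *args, **kwargs):
        super(FactsCalculatorOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.origin_table = origin_table
        self.destination_table = destination_table
        self.fact_column = fact_column
        self.groupby_column = groupby_column

    def execute(self, context):
        # Build the aggregate query and run it against Redshift through the
        # Postgres hook (Redshift speaks the Postgres wire protocol).
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        facts_sql = FactsCalculatorOperator.facts_sql_template.format(
            origin_table=self.origin_table,
            destination_table=self.destination_table,
            fact_column=self.fact_column,
            groupby_column=self.groupby_column)
        redshift.run(facts_sql)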
Example #11
#
# TODO: Replace this data quality check with the HasRowsOperator
#
# check_trips = PythonOperator(
#     task_id='check_trips_data',
#     dag=dag,
#     python_callable=check_greater_than_zero,
#     provide_context=True,
#     params={
#         'table': 'trips',
#     }
# )

check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id="redshift",
    table='trips'
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
                                    dag=customDag)

# load data from S3 to Redshift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(task_id='checkDataQuality',
                                   redshift_conn_id='redshift',
                                   table='trips',
                                   dag=customDag)

# create fact table
createFactTable = FactsCalculatorOperator(task_id='createFactTable',
                                          redshift_conn_id='redshift',
                                          origin_table='trips',
                                          destination_table='trip_facts',
                                          fact_column='tripduration',
                                          groupby_column='bikeid')

# set task dependencies
createTripsTable >> loadTripData >> checkDataQuality >> createFactTable
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    aws_credentials_id=my_aws_credentials,
    table="trips",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

#
# TODO: Perform a data quality check on the Trips table
#
check_trips = HasRowsOperator(task_id="check_hasrows_trips",
                              dag=dag,
                              redshift_conn_id=my_redshift_conn,
                              table="trips")

#
# TODO: Use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column should
#       be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts_trips",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    origin_table="trips",
    destination_table="table_tripduration",
    fact_column="tripduration",
    groupby_column="bikeid")
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="/data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv",
    table="trips")

#
# TODO: Perform a data quality check on the Trips table
#
check_trips = HasRowsOperator(task_id="check_trips",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips",
                              provide_context=True)

#
# TODO: Use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column should
#       be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts = FactsCalculatorOperator(task_id="calculate_facts",
                                          dag=dag,
                                          redshift_conn_id="redshift",
                                          origin_table="trips",
                                          destination_table="trip_calc",
                                          fact_column="tripduration",
                                          groupby_column="bikeid")
Example #15
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend/data-pipelines",
    s3_key=
    "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = HasRowsOperator(task_id='check_trips',
                              table='trips',
                              redshift_conn_id='redshift',
                              dag=dag)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend/data-pipelines",
Example #16
)

copy_trips_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    dag=dag,
    redshift_conn_id="redshift",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    table="trips",
    task_id="load_trips_from_s3_to_redshift",
    s3_bucket="udacity-dend",
    s3_prefix="data-pipelines"
)

check_trips = HasRowsOperator(
    task_id="check_trips_table",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    dag=dag,
    redshift_conn_id="redshift",
    s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
Example #17
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

#
# TODO: Perform a data quality check on the Trips table
#
check_trips = HasRowsOperator(task_id='check_trips_data',
                              dag=dag,
                              table='trips',
                              redshift_conn_id='redshift')

#
# TODO: Use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column should
#       be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts = FactsCalculatorOperator(task_id="calculate_facts",
                                          dag=dag,
                                          redshift_conn_id="redshift",
                                          origin_table="trips",
                                          destination_table="fact_table",
                                          fact_column="tripduration",
                                          groupby_column="bikeid")

#
# TODO: Define task ordering for the DAG tasks you defined
Example #18
                                      aws_credentials_id="aws_credentials",
                                      s3_bucket=bucket,
                                      s3_key="mode.csv")

# Load Visa data to Visa Dimension table
copy_visa_task = S3ToRedshiftOperator(task_id="load_visa_from_s3_to_redshift",
                                      dag=dag,
                                      table="visa_dim",
                                      redshift_conn_id="redshift",
                                      aws_credentials_id="aws_credentials",
                                      s3_bucket=bucket,
                                      s3_key="visa.csv")

# Check Rows count in immigration table
check_immigration = HasRowsOperator(task_id='check_immigration_data',
                                    dag=dag,
                                    table='immigration_fact',
                                    redshift_conn_id='redshift')

# Check Rows count in Happiness table
check_happiness = HasRowsOperator(task_id='check_happiness_data',
                                  dag=dag,
                                  table='happiness',
                                  redshift_conn_id='redshift')

# Check Rows count in Country table
check_country = HasRowsOperator(task_id='check_country_data',
                                dag=dag,
                                table='country',
                                redshift_conn_id='redshift')

# Check Rows count in Mode table
Example #19
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key=
    "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = HasRowsOperator(task_id='check_trips_data',
                              dag=dag,
                              table='trips',
                              redshift_conn_id='redshift')

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
Example #20
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)


# Replaced this data quality check with the HasRowsOperator

check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
Example #21
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

#
#  Data quality check on the Trips table
#
check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

#
# We use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column is
#  `tripduration` and the groupby_column is `bikeid`
#
calculate_facts = FactsCalculatorOperator(task_id="calculate_facts_trips",
                                          dag=dag,
                                          redshift_conn_id="redshift",
                                          origin_table="trips",
                                          destination_table="trips_facts",
                                          fact_column="tripduration",
                                          groupby_column="bikeid")

#
Example #22
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id='redshift',
    table='trips'
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
Example #23
#
# TODO: Replace this data quality check with the HasRowsOperator
#
# check_trips = PythonOperator(
#     task_id='check_trips_data',
#     dag=dag,
#     python_callable=check_greater_than_zero,
#     provide_context=True,
#     params={
#         'table': 'trips',
#     }
# )

check_trips = HasRowsOperator(redshift_conn_id="redshift",
                              table="trips",
                              task_id='check_trips_data',
                              dag=dag)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
Example #24
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id='redshift_param',
    table='trips'
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
Example #25
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="copy_trips_tasks",
    dag=dag,
    table="trips",
    redshift_conn_id='redshift',
    aws_credentials_id='aws_default',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv')

#
# TODO: Perform a data quality check on the Trips table
#
check_trips_task = HasRowsOperator(task_id="check_trips_has_rows",
                                   dag=dag,
                                   redshift_conn_id='redshift',
                                   table='trips')

#
# TODO: Use the FactsCalculatorOperator to create a Facts table in RedShift. The fact column should
#       be `tripduration` and the groupby_column should be `bikeid`
#
calculate_facts_task = FactsCalculatorOperator(task_id="calculate_facts",
                                               dag=dag,
                                               redshift_conn_id='redshift',
                                               origin_table='trips',
                                               destination_table='trip_facts',
                                               fact_column='tripduration',
                                               groupby_column='bikeid')

#
Example #26
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
#       3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#
args = {
    'start_date': datetime.datetime.utcnow(),
}
with DAG(dag_id='lesson3.exercise4', default_args=args) as dag:

    copy_trips_task = S3ToRedshiftOperator(
        task_id="copy_trips",
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
    )

    check_trips_task = HasRowsOperator(task_id="check_trips",
                                       redshift_conn_id="redshift",
                                       table="trips")

    calculate_facts_task = FactsCalculatorOperator(
        task_id="calculate_facts",
        redshift_conn_id="redshift",
        origin_table="trips",
        destination_table="facts",
        fact_column="tripduration",
        groupby_column="bikeid",
    )

    copy_trips_task >> check_trips_task >> calculate_facts_task