def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(task_id=f"create_{table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    # Done
    check_task = HasRowsOperator(task_id=f"check_{table}_has_rows",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    #
    # TODO: Use DAG ordering to place the check task
    # Done
    create_task >> copy_task >> check_task

    return dag
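
Note: HasRowsOperator (like S3ToRedshiftOperator below) is a custom operator from the exercise's plugins package, not part of stock Airflow. The following is only a minimal sketch of what such an operator might look like, assuming it counts rows through a PostgresHook and fails the task on an empty table; the class body is an assumption, not the original plugin code.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super(HasRowsOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        # Count rows in the target table; fail the task if the table is empty.
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {self.table} returned no rows")
        self.log.info(f"Data quality check on table {self.table} passed")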
Example #2
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(task_id=f"create_{table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    check_task = HasRowsOperator(task_id=f"check_{table}_data",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    create_task >> copy_task
    copy_task >> check_task
    return dag
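
A factory like get_s3_to_redshift_dag is normally attached to a parent DAG through a SubDagOperator, so that the subdag's id becomes f"{parent_dag_name}.{task_id}" exactly as the factory expects. The snippet below is a hedged sketch of that wiring; the parent DAG id, the task id, and the `sql` statements module are assumptions borrowed from the surrounding examples.

import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

parent_dag = DAG("lesson3.exercise3", start_date=datetime.datetime.utcnow())

trips_task_id = "trips_subdag"  # placeholder task id
trips_subdag_task = SubDagOperator(
    # The subdag's dag_id must be "<parent_dag_id>.<task_id>", which is what
    # get_s3_to_redshift_dag builds from its first two arguments.
    subdag=get_s3_to_redshift_dag(
        "lesson3.exercise3",
        trips_task_id,
        "redshift",
        "aws_credentials",
        "trips",
        sql.CREATE_TRIPS_TABLE_SQL,          # statements module as used in Example #3
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=datetime.datetime.utcnow(),  # forwarded to DAG(**kwargs)
    ),
    task_id=trips_task_id,
    dag=parent_dag,
)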
Example #3
    task_id=stations_task_id,
    dag=dag,
)

#
# TODO: Migrate Create and Copy tasks to the Subdag
#
create_trips_table = PostgresOperator(task_id="create_trips_table",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=sql.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_STATIONS_TABLE_SQL,
)
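
S3ToRedshiftOperator is also a custom plugin operator. A rough sketch of the usual implementation pattern is shown below, assuming it resolves AWS credentials with AwsHook and issues a Redshift COPY through PostgresHook; the COPY options and attribute names here are illustrative assumptions, not the original plugin source.

from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class S3ToRedshiftOperator(BaseOperator):
    template_fields = ("s3_key",)
    copy_sql = """
        COPY {table}
        FROM '{s3_path}'
        ACCESS_KEY_ID '{access_key}'
        SECRET_ACCESS_KEY '{secret_key}'
        IGNOREHEADER 1
        DELIMITER ','
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", aws_credentials_id="", table="",
                 s3_bucket="", s3_key="", *args, **kwargs):
        super(S3ToRedshiftOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key

    def execute(self, context):
        # Resolve AWS credentials and the Redshift connection from Airflow.
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # Render the templatable S3 key, build the full path, and run COPY.
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        redshift.run(S3ToRedshiftOperator.copy_sql.format(
            table=self.table,
            s3_path=s3_path,
            access_key=credentials.access_key,
            secret_key=credentials.secret_key,
        ))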
Example #4
#
# TODO: Create a DAG which performs the following functions:
#
#       1. Loads Trip data from S3 to RedShift
#       2. Performs a data quality check on the Trips table in RedShift
#       3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#           a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#              skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

copy_trips_task = S3ToRedshiftOperator(
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    task_id='copy_trips_data_from_s3_to_redshift',
    dag=dag)

check_trips_task = HasRowsOperator(redshift_conn_id='redshift',
                                   table='trips',
                                   task_id='check_trips_table_data',
                                   dag=dag)

calculate_facts_task = FactsCalculatorOperator(redshift_conn_id='redshift',
                                               origin_table='trips',
                                               destination_table='trips_facts',
                                               fact_column='tripduration',
                                               groupby_column='bikeid',
                                               task_id='calculate_facts_trips',
                                               dag=dag)
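
The TODO at the top of this example points at a FactsCalculatorOperator skeleton in plugins/operators/facts_calculator.py. That skeleton is not reproduced here; the sketch below is only one assumed way to fill it in, building a CREATE TABLE ... AS SELECT with MAX/MIN/AVG aggregates of fact_column grouped by groupby_column.

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FactsCalculatorOperator(BaseOperator):
    # Aggregates a fact column into a destination table, one row per group value.
    facts_sql_template = """
    DROP TABLE IF EXISTS {destination_table};
    CREATE TABLE {destination_table} AS
    SELECT
        {groupby_column},
        MAX({fact_column}) AS max_{fact_column},
        MIN({fact_column}) AS min_{fact_column},
        AVG({fact_column}) AS average_{fact_column}
    FROM {origin_table}
    GROUP BY {groupby_column};
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", origin_table="", destination_table="",
                 fact_column="", groupby_column="", *args, **kwargs):
        super(FactsCalculatorOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.origin_table = origin_table
        self.destination_table = destination_table
        self.fact_column = fact_column
        self.groupby_column = groupby_column

    def execute(self, context):
        # Run the aggregation SQL against Redshift via the Postgres hook.
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        redshift.run(FactsCalculatorOperator.facts_sql_template.format(
            origin_table=self.origin_table,
            destination_table=self.destination_table,
            fact_column=self.fact_column,
            groupby_column=self.groupby_column,
        ))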
Example #5
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key
    )

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    #

    create_task >> copy_task
    #
    # TODO: Use DAG ordering to place the check task
    #

    return dag