def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs):
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key
    )

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    #
    check_trips = HasRowsOperator(
        task_id="check_trips_data",
        dag=dag,
        redshift_conn_id="redshift",
        table="trips"
    )

    check_stations = HasRowsOperator(
        task_id="check_stations_data",
        dag=dag,
        redshift_conn_id="redshift",
        table="stations"
    )

    create_task >> copy_task

    #
    # TODO: Use DAG ordering to place the check task
    #
    check_trips >> check_stations

    return dag
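# HasRowsOperator, used throughout these snippets, is a custom plugin operator,
# not an Airflow built-in. Below is a minimal sketch of what such an operator
# could look like, assuming it only counts rows via a PostgresHook and fails the
# task on an empty table; the actual project implementation may differ.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super(HasRowsOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        # Count the rows in the target table and fail the task if it is empty
        redshift_hook = PostgresHook(self.redshift_conn_id)
        records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Data quality check failed: {self.table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(f"Data quality check failed: {self.table} contained 0 rows")
        self.log.info(f"Data quality check on table {self.table} passed with {num_records} records")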
def get_s3_to_redshift_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        table,
        create_sql_stmt,
        s3_bucket,
        s3_key,
        *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key
    )

    check_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table
    )

    create_task >> copy_task
    copy_task >> check_task

    return dag
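# A hedged sketch of how a factory like get_s3_to_redshift_dag could be wired
# into a parent DAG with SubDagOperator. The parent DAG name, task id, and the
# CREATE_TRIPS_TABLE_SQL / s3_key values below are illustrative assumptions,
# not values taken from the snippets above.
import datetime
from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

start_date = datetime.datetime(2018, 1, 1)
dag = DAG("example_parent_dag", start_date=start_date)

trips_task_id = "trips_subdag"
trips_subdag_task = SubDagOperator(
    subdag=get_s3_to_redshift_dag(
        "example_parent_dag",
        trips_task_id,
        "redshift",
        "aws_credentials",
        "trips",
        sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=start_date,
    ),
    task_id=trips_task_id,
    dag=dag,
)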
def load_dim_table_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        table,
        create_sql_stmt,
        select_stmt,
        append_rows,
        *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    insert_sql = """
        INSERT INTO {}
        {}
        ;
    """

    #create_dimtable_task = DummyOperator(task_id="create_{}_table".format(table), dag=dag)
    create_dimtable_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    #insert_to_table = DummyOperator(task_id="insert_into_{}".format(table), dag=dag)
    #insert_to_table = PostgresOperator(
    #    task_id=f"insert_into_{table}",
    #    dag=dag,
    #    postgres_conn_id=redshift_conn_id,
    #    sql=insert_sql.format(
    #        table,
    #        select_stmt
    #    )
    #)
    insert_to_table = LoadDimensionOperator(
        task_id=f"insert_into_{table}",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql_source=select_stmt,
        append_rows=append_rows
    )

    #check_task = DummyOperator(task_id="check_{}_data".format(table), dag=dag)
    check_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table
    )

    create_dimtable_task >> insert_to_table
    insert_to_table >> check_task

    return dag
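# LoadDimensionOperator is another project-specific plugin operator. A minimal
# sketch, assuming append_rows controls whether the dimension table is truncated
# before the insert; parameter handling in the real operator may differ.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class LoadDimensionOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", sql_source="",
                 append_rows=False, *args, **kwargs):
        super(LoadDimensionOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table
        self.sql_source = sql_source
        self.append_rows = append_rows

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        if not self.append_rows:
            # Truncate-and-load: clear the dimension table before inserting
            redshift.run(f"TRUNCATE TABLE {self.table}")
        redshift.run(f"INSERT INTO {self.table} {self.sql_source}")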
def get_s3_to_redshift_subdag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        aws_credentials_id,
        create_tbl,
        target_table,
        sql_row,
        s3_bucket,
        s3_key,
        custom,
        *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    templated_command = """
        echo "**************** {{ task.owner }}, {{ task.task_id }}"
        echo "**************** The execution date : {{ ds }}"
        echo "**************** {{ task_instance_key_str }} is running"
    """

    info_task = BashOperator(
        task_id=f"Info_about_{parent_dag_name}.{task_id}",
        dag=dag,
        depends_on_past=False,
        bash_command=templated_command,
    )

    copy_task = StageToRedshiftOperator(
        task_id=f"copy_{target_table}_from_s3_to_redshift",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        create_tbl=create_tbl,
        target_table=target_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        custom=custom
    )

    check_staging = HasRowsOperator(
        task_id=f"check_{target_table}_rows",
        dag=dag,
        sql_row=sql_row,
        redshift_conn_id=redshift_conn_id,
        target_table=target_table,
    )

    info_task >> copy_task
    copy_task >> check_staging

    return dag
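# StageToRedshiftOperator is also a project-specific plugin. A hedged sketch of
# the usual COPY-based approach, assuming AWS credentials come from an Airflow
# connection; the create_tbl / custom parameters used above suggest the real
# operator also creates the table and customizes the COPY options, omitted here.
from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class StageToRedshiftOperator(BaseOperator):
    copy_sql = """
        COPY {}
        FROM '{}'
        ACCESS_KEY_ID '{}'
        SECRET_ACCESS_KEY '{}'
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", aws_credentials_id="",
                 target_table="", s3_bucket="", s3_key="", *args, **kwargs):
        super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.target_table = target_table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key

    def execute(self, context):
        # Fetch AWS credentials and load the S3 object into Redshift via COPY
        credentials = AwsHook(self.aws_credentials_id).get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"
        redshift.run(StageToRedshiftOperator.copy_sql.format(
            self.target_table, s3_path,
            credentials.access_key, credentials.secret_key))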
ingest_fact_transaction_table = IngestOperator(
    task_id='ingest_fact_transaction_table',
    schema='public',
    table='fact_transaction',
    create_sql=CreateSQL.create_fact_transaction,
    insert_sql=(InsertSQL.insert_fact_transaction.format(
        tstamp='{{ ts_nodash }}')),
    has_rows=(
        f"{{{{ {xcom_template.format(id='extract_transaction_data')} }}}}"
    ),
)

data_check_dim_country = HasRowsOperator(
    task_id='data_check_dim_country',
    schema='public',
    table='dim_country',
    date_filter='{{ ds }}',
    has_rows=(
        f"{{{{ {xcom_template.format(id='extract_country_data')} }}}}"),
)

data_check_dim_date = HasRowsOperator(
    task_id='data_check_dim_date',
    schema='public',
    table='dim_date',
    date_filter='{{ ds }}',
    has_rows=(
        f"{{{{ {xcom_template.format(id='extract_transaction_data')} }}}}"
    ),
)

data_check_dim_fx_rate = HasRowsOperator(
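# xcom_template is referenced above but not shown. A plausible definition,
# assuming has_rows is a templated field that should resolve to the row count
# an upstream extract task pushed to XCom (illustrative, not from the source):
xcom_template = "task_instance.xcom_pull(task_ids='{id}')"

# With that definition, the f-strings above render to Jinja expressions such as
# "{{ task_instance.xcom_pull(task_ids='extract_country_data') }}", which Airflow
# evaluates at runtime before the data quality check executes.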
"stations", sql_statements.CREATE_STATIONS_TABLE_SQL, s3_bucket="udac-data-pipelines", s3_key="divvy/unpartitioned/divvy_stations_2017.csv", start_date=start_date, ), task_id=stations_task_id, dag=dag, ) # # TODO: Consolidate check_trips and check_stations into a single check in the subdag # as we did with the create and copy in the demo # check_trips = HasRowsOperator(task_id="check_trips_data", dag=dag, redshift_conn_id="redshift", table="trips") check_stations = HasRowsOperator(task_id="check_stations_data", dag=dag, redshift_conn_id="redshift", table="stations") location_traffic_task = PostgresOperator( task_id="calculate_location_traffic", dag=dag, postgres_conn_id="redshift", sql=sql_statements.LOCATION_TRAFFIC_SQL) # # TODO: Reorder the Graph once you have moved the checks
# 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#    a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#       skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

copy_trips_task = S3ToRedshiftOperator(
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    task_id='copy_trips_data_from_s3_to_redshift',
    dag=dag
)

check_trips_task = HasRowsOperator(
    redshift_conn_id='redshift',
    table='trips',
    task_id='check_trips_table_data',
    dag=dag
)

calculate_facts_task = FactsCalculatorOperator(
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='trips_facts',
    fact_column='tripduration',
    groupby_column='bikeid',
    task_id='calculate_facts_trips',
    dag=dag
)

copy_trips_task >> check_trips_task
check_trips_task >> calculate_facts_task
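# FactsCalculatorOperator comes from the plugins/operators/facts_calculator.py
# skeleton mentioned above. A hedged sketch of one way to fill it in, assuming
# the facts table is rebuilt from MAX/MIN/AVG aggregates per groupby column;
# the actual skeleton's field names may differ.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FactsCalculatorOperator(BaseOperator):
    facts_sql_template = """
        DROP TABLE IF EXISTS {destination_table};
        CREATE TABLE {destination_table} AS
        SELECT
            {groupby_column},
            MAX({fact_column}) AS max_{fact_column},
            MIN({fact_column}) AS min_{fact_column},
            AVG({fact_column}) AS average_{fact_column}
        FROM {origin_table}
        GROUP BY {groupby_column};
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", origin_table="", destination_table="",
                 fact_column="", groupby_column="", *args, **kwargs):
        super(FactsCalculatorOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.origin_table = origin_table
        self.destination_table = destination_table
        self.fact_column = fact_column
        self.groupby_column = groupby_column

    def execute(self, context):
        # Recreate the facts table from aggregates over the origin table
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        redshift.run(self.facts_sql_template.format(
            origin_table=self.origin_table,
            destination_table=self.destination_table,
            fact_column=self.fact_column,
            groupby_column=self.groupby_column))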
        sql_statements.CREATE_STATIONS_TABLE_SQL,
        s3_bucket="udac-data-pipelines",
        s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
        start_date=start_date,
    ),
    task_id=stations_task_id,
    dag=dag,
)

#
# TODO: Consolidate check_trips and check_stations into a single check in the subdag
#       as we did with the create and copy in the demo
#
check_trips = HasRowsOperator(
    task_id="check_trips_data",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

check_stations = HasRowsOperator(
    task_id="check_stations_data",
    dag=dag,
    redshift_conn_id="redshift",
    table="stations"
)

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL
)
        **kwargs
    )

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt
    )

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key
    )

    check_rows = HasRowsOperator(
        task_id=f"check_{table}_rows",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table
    )

    create_task >> copy_task >> check_rows

    return dag
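# The snippets above assume Airflow 1.x-style imports along these lines; the
# custom operators (S3ToRedshiftOperator, HasRowsOperator, LoadDimensionOperator,
# StageToRedshiftOperator, FactsCalculatorOperator, IngestOperator) are assumed
# to come from a project plugins package rather than from Airflow itself:
import datetime
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.subdag_operator import SubDagOperator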