def stg_subdag(parent_dag_name, task_id, aws_credentials_id, postgres_conn_id,
               table, create_sql_stmt, s3_prefix, s3_bucket, s3_key,
               s3_jsonpath_file, sw_delete_data, partition_year, partition_month,
               *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table = PostgresOperator(
        task_id=f"create_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        sql=create_sql_stmt)

    load_table = StageToRedshiftOperator(
        task_id=f"load_table_{table}",
        dag=dag,
        aws_credentials_id=aws_credentials_id,
        postgres_conn_id=postgres_conn_id,
        table=table,
        s3_prefix=s3_prefix,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        s3_jsonpath_file=s3_jsonpath_file,
        sw_delete_data=sw_delete_data,
        partition_year=partition_year,
        partition_month=partition_month)

    has_rows_table = HasRowsOperator(
        task_id=f"has_rows_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table)

    create_table >> load_table >> has_rows_table

    return dag
def perform_data_quality_checks(parent_dag_name, task_id, redshift_conn_id,
                                table, default_args, check_sql_stmt,
                                *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}",
              default_args=default_args,
              schedule_interval=None,
              **kwargs)

    check_has_rows_task = HasRowsOperator(
        task_id=f"check_has_rows_for_{table}",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id)

    check_data_quality_task = DataQualityOperator(
        task_id=f"perform_data_quality_checks_for_{table}",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        query=check_sql_stmt)

    check_has_rows_task >> check_data_quality_task

    return dag
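# The DataQualityOperator above receives an arbitrary check statement via its
# "query" parameter. A minimal hedged sketch of such an operator follows;
# parameter names mirror the call above, but the pass/fail contract (first
# cell of the first returned row must be truthy) is an assumption, not the
# actual plugin code.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class DataQualityOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", query="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table
        self.query = query

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        # run the supplied check statement against the target table
        records = redshift.get_records(self.query)
        # assumed contract: the check passes when the first cell is truthy
        if not records or not records[0] or not records[0][0]:
            raise ValueError(f"Data quality check failed for table {self.table}: {self.query}")
        self.log.info(f"Data quality check passed for table {self.table}")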
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    # HasRowsOperator task moved here from the outer DAG
    check_task = HasRowsOperator(
        task_id=f"check_{table}_has_rows",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table)

    # DAG ordering places the check task after the copy
    create_task >> copy_task
    copy_task >> check_task

    return dag
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(
        dag_id=f"{parent_dag_name}.{task_id}",  # subdags' dag_id must follow this naming pattern
        **kwargs
    )

    # Create-table code moved here from the outer DAG
    create_table_task = PostgresOperator(
        task_id=f"create_{table}_table",
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt,
        dag=dag
    )

    # S3ToRedshiftOperator code moved here
    copy_table_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        dag=dag
    )

    # Check-table code moved here
    check_table_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        redshift_conn_id=redshift_conn_id,
        table=table,
        dag=dag
    )

    # Ordering of tasks within this subdag
    create_table_task >> copy_table_task
    copy_table_task >> check_table_task

    return dag
def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, staging_table, create_sql_stmt,
                           s3_bucket, s3_key, create_table_option=True,
                           redshift_copy_params=None,  # avoid a mutable default argument
                           *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table_task = PythonOperator(
        task_id=f"create_{staging_table}_table",
        dag=dag,
        python_callable=create_table,
        provide_context=True,
        params={
            'create_table_option': create_table_option,
            'redshift_conn_id': redshift_conn_id,
            'create_sql_stmt': create_sql_stmt,
            'table': staging_table
        })

    stage_table_from_s3_task = StageToRedshiftOperator(
        dag=dag,
        task_id=f"load_{staging_table}_from_s3_to_redshift",
        redshift_conn_id=redshift_conn_id,
        aws_conn_id=aws_credentials_id,
        table=staging_table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        overwrite=True,
        copy_params=redshift_copy_params or [])

    verify_staged_table_task = HasRowsOperator(
        dag=dag,
        task_id=f"verify_{staging_table}_count",
        table=staging_table,
        connection_id=redshift_conn_id)

    create_table_task >> stage_table_from_s3_task >> verify_staged_table_task

    return dag
def dim_subdag(parent_dag_name, task_id, postgres_conn_id, table,
               insert_sql_stmt, sw_delete_dimensions, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_table = LoadDimensionOperator(
        task_id=f"load_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table,
        sql=insert_sql_stmt,
        sw_delete_dimensions=sw_delete_dimensions,
    )

    has_rows_table = HasRowsOperator(
        task_id=f"has_rows_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table)

    load_table >> has_rows_table

    return dag
dag=dag, table="trips", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="udac-data-pipelines", s3_key= "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv" ) # # TODO: Replace this data quality check with the HasRowsOperator # check_trips = HasRowsOperator(task_id='check_trips_data', dag=dag, python_callable=check_greater_than_zero, provide_context=True, params={ 'table': 'trips', }) create_stations_table = PostgresOperator( task_id="create_stations_table", dag=dag, postgres_conn_id="redshift", sql=sql_statements.CREATE_STATIONS_TABLE_SQL, ) copy_stations_task = S3ToRedshiftOperator( task_id="load_stations_from_s3_to_redshift", dag=dag, redshift_conn_id="redshift",
def redshift_table_ddl_plus_bulk_load(parent_dag_id, parent_task_id,
                                      redshift_conn_id, aws_credentials_id,
                                      table, create_table_statement,
                                      s3_bucket, s3_key, *args, **kwargs):
    """
    This Python factory function returns an Airflow DAG class instance to be
    used as a SubDag within the (outer) DAG that calls it. The DAG in turn
    executes three Tasks within a Redshift database instance:

    - execute a CREATE TABLE (DDL) statement through PostgresOperator;
    - execute a COPY (bulk load) to pull data from S3 into the newly created
      table through the custom S3ToRedshiftOperator;
    - perform a data quality check through the custom HasRowsOperator.

    Args:
        parent_dag_id (str):
    """
    # instantiate DAG class
    subDagInstance = DAG(
        #----------------------------------------------------------------------
        #
        # ATTENTION! "parent_dag_id.parent_task_id" IS MANDATORY
        #
        # It's an Airflow convention that subdags must have their "dag_id"
        # attribute set to the combination of:
        #
        # 1. the "dag_id" of the outer DAG instance calling the SubDagOperator,
        #    which in turn calls this Python factory function;
        #
        # 2. the "task_id" assigned to the SubDagOperator mentioned in item 1;
        #
        # 3. a dot (".") separating the "dag_id" and "task_id" mentioned in
        #    items 1 and 2, respectively.
        #
        #----------------------------------------------------------------------
        dag_id=f"{parent_dag_id}.{parent_task_id}",
        # make sure keyword arguments are also received
        **kwargs)

    # NOTICE how Task instances are created in a SubDag just as they would be
    # in a "normal" DAG.
    createTable = PostgresOperator(
        task_id=f'create_{table}_table',
        postgres_conn_id=redshift_conn_id,
        sql=create_table_statement,
        dag=subDagInstance)

    bulkLoadTable = S3ToRedshiftOperator(
        task_id=f'bulk_load_{table}_table',
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        table=table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        dag=subDagInstance)

    checkDataQuality = HasRowsOperator(
        task_id=f'check_{table}_table',
        redshift_conn_id=redshift_conn_id,
        table=table,
        dag=subDagInstance)

    # Define Task dependencies
    createTable >> bulkLoadTable >> checkDataQuality

    return subDagInstance
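# A hedged usage sketch of the naming convention described above: the outer
# DAG passes its own dag_id and the SubDagOperator's task_id into the factory,
# so the subdag's dag_id becomes "bulk_load_pipeline.trips_subdag". All names
# here (the outer DAG, connection ids, CREATE_TRIPS_TABLE_SQL) are
# illustrative assumptions, not the original exercise code.
import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

# hypothetical outer DAG
outer_dag = DAG("bulk_load_pipeline",
                start_date=datetime.datetime(2018, 1, 1))

trips_subdag_task = SubDagOperator(
    # the factory receives the SAME parent dag_id and task_id used here,
    # satisfying the "parent_dag_id.parent_task_id" convention
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id="bulk_load_pipeline",
        parent_task_id="trips_subdag",
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        create_table_statement=sql_statements.CREATE_TRIPS_TABLE_SQL,  # assumed statement
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
        start_date=outer_dag.start_date),
    task_id="trips_subdag",
    dag=outer_dag)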
# variables like {{ds}}, {{execution_date}} and so on.
loadTripsData = S3ToRedshiftOperator(
    task_id="loadTripsData",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="trips",
    truncate=False,
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    delimiter=",",
    ignore_headers=1,
    dag=bulkLoadDag)

checkTripsData = HasRowsOperator(
    task_id='checkTripsData',
    redshift_conn_id='redshift',
    table='trips',
    dag=bulkLoadDag)

createStationsTable = PostgresOperator(
    task_id="createStationsTable",
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
    dag=bulkLoadDag)

# REMEMBER:
# As the "s3_key" parameter is marked as templatable (in the class's
# "template_fields" variable), it can fetch Jinja template variables like
# {{ds}}, {{execution_date}} and so on.
loadStationsTable = S3ToRedshiftOperator(
    task_id="loadStationsTable",
    redshift_conn_id="redshift",
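# A hedged sketch of what "templatable" means in the REMEMBER note above. An
# operator lists attribute names in its class-level "template_fields" tuple
# and Airflow renders those attributes through Jinja before execute() runs;
# the snippets here also appear to use plain str.format() placeholders like
# {execution_date.year}, rendered by the operator itself from the task
# context. This class is illustrative, not the actual plugin code.
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class TemplatedS3Operator(BaseOperator):  # hypothetical name
    # attributes named here are Jinja-rendered per DAG run ({{ds}}, etc.)
    template_fields = ("s3_key",)

    @apply_defaults
    def __init__(self, s3_key="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.s3_key = s3_key

    def execute(self, context):
        # str.format-style placeholders can be rendered from the context too
        rendered_key = self.s3_key.format(**context)
        self.log.info(f"Rendered S3 key: {rendered_key}")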
# a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#    skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/2018/10/divvy_trips.csv")

check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id='redshift',
    table='trips')

# FactsCalculatorOperator creates a Facts table in Redshift; the fact column
# is `tripduration` and the groupby_column is `bikeid`
calculate_facts = FactsCalculatorOperator(
    task_id='calculate_facts',
    dag=dag,
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='fact_trips',
    fact_column='tripduration',
    groupby_column='bikeid')

copy_trips_task >> check_trips >> calculate_facts
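# Several snippets call FactsCalculatorOperator, whose skeleton lives in
# plugins/operators/facts_calculator.py. The following is a minimal hedged
# sketch of one way to complete it, assuming a CREATE TABLE AS ... GROUP BY
# aggregation; the SQL shape is an assumption, while the parameter names
# mirror the call sites above.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FactsCalculatorOperator(BaseOperator):
    # assumed SQL shape: aggregate one fact column grouped by one key
    facts_sql_template = """
        DROP TABLE IF EXISTS {destination_table};
        CREATE TABLE {destination_table} AS
        SELECT {groupby_column},
               MAX({fact_column}) AS max_{fact_column},
               MIN({fact_column}) AS min_{fact_column},
               AVG({fact_column}) AS avg_{fact_column}
        FROM {origin_table}
        GROUP BY {groupby_column};
    """

    @apply_defaults
    def __init__(self, redshift_conn_id="", origin_table="",
                 destination_table="", fact_column="", groupby_column="",
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.origin_table = origin_table
        self.destination_table = destination_table
        self.fact_column = fact_column
        self.groupby_column = groupby_column

    def execute(self, context):
        # build the aggregation SQL and run it against Redshift
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        redshift.run(self.facts_sql_template.format(
            origin_table=self.origin_table,
            destination_table=self.destination_table,
            fact_column=self.fact_column,
            groupby_column=self.groupby_column))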
#
# Data quality check replaced with the HasRowsOperator
# (original PythonOperator kept for reference)
#
# check_trips = PythonOperator(
#     task_id='check_trips_data',
#     dag=dag,
#     python_callable=check_greater_than_zero,
#     provide_context=True,
#     params={
#         'table': 'trips',
#     }
# )
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id="redshift",
    table='trips'
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
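# The HasRowsOperator used throughout these snippets is a custom plugin. As a
# hedged sketch, assuming it simply counts rows and fails the task on an
# empty (or missing) result, it might look like the following; the real
# plugin in plugins/operators may differ.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        # fail loudly if the query returned nothing or the table is empty
        if not records or not records[0]:
            raise ValueError(f"Data quality check failed: {self.table} returned no results")
        if records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {self.table} contained 0 rows")
        self.log.info(f"Data quality check on table {self.table} passed with {records[0][0]} records")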
    dag=customDag)

# load data from S3 to Redshift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(
    task_id='checkDataQuality',
    redshift_conn_id='redshift',
    table='trips',
    dag=customDag)

# create fact table
createFactTable = FactsCalculatorOperator(
    task_id='createFactTable',
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='trip_facts',
    fact_column='tripduration',
    groupby_column='bikeid',
    dag=customDag)

# set task dependencies
createTripsTable >> loadTripData >> checkDataQuality >> createFactTable
# and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    aws_credentials_id=my_aws_credentials,
    table="trips",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

# Data quality check on the Trips table
check_trips = HasRowsOperator(
    task_id="check_hasrows_trips",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    table="trips")

# FactsCalculatorOperator creates a Facts table in Redshift; the fact column
# is `tripduration` and the groupby_column is `bikeid`
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts_trips",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    origin_table="trips",
    destination_table="table_tripduration",
    fact_column="tripduration",
    groupby_column="bikeid")
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
    table="trips")

# Data quality check on the Trips table
check_trips = HasRowsOperator(
    task_id="check_trips",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips")

# FactsCalculatorOperator creates a Facts table in Redshift; the fact column
# is `tripduration` and the groupby_column is `bikeid`
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts",
    dag=dag,
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="trip_calc",
    fact_column="tripduration",
    groupby_column="bikeid")
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend/data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

# Data quality check replaced with the HasRowsOperator
check_trips = HasRowsOperator(
    task_id='check_trips',
    table='trips',
    redshift_conn_id='redshift',
    dag=dag)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend/data-pipelines",
)

copy_trips_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    dag=dag,
    redshift_conn_id="redshift",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
    table="trips",
    task_id="load_trips_from_s3_to_redshift",
    s3_bucket="udacity-dend",
    s3_prefix="data-pipelines"
)

check_trips = HasRowsOperator(
    task_id="check_trips_table",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    aws_credentials_id="aws_credentials",
    dag=dag,
    redshift_conn_id="redshift",
    s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

# Data quality check on the Trips table
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    table='trips',
    redshift_conn_id='redshift')

# FactsCalculatorOperator creates a Facts table in Redshift; the fact column
# is `tripduration` and the groupby_column is `bikeid`
calculate_facts = FactsCalculatorOperator(
    task_id='calculate_facts',
    dag=dag,
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="fact_table",
    fact_column="tripduration",
    groupby_column="bikeid")

# Task ordering for the DAG tasks defined above
copy_trips_task >> check_trips >> calculate_facts
aws_credentials_id="aws_credentials", s3_bucket=bucket, s3_key="mode.csv") # Load Visa data to Visa Dimension table copy_visa_task = S3ToRedshiftOperator(task_id="load_visa_from_s3_to_redshift", dag=dag, table="visa_dim", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket=bucket, s3_key="visa.csv") # Check Rows count in immigration table check_immigration = HasRowsOperator(task_id='check_immigration_data', dag=dag, table='immigration_fact', redshift_conn_id='redshift') # Check Rows count in Happiness table check_happiness = HasRowsOperator(task_id='check_happiness_data', dag=dag, table='happiness', redshift_conn_id='redshift') # Check Rows count in Country table check_country = HasRowsOperator(task_id='check_country_data', dag=dag, table='country', redshift_conn_id='redshift') # Check Rows count in Mode table
task_id="load_trips_from_s3_to_redshift", dag=dag, table="trips", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="udac-data-pipelines", s3_key= "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv" ) # # TODO: Replace this data quality check with the HasRowsOperator # check_trips = HasRowsOperator(task_id='check_trips_data', dag=dag, python_callable=check_greater_than_zero, provide_context=True, table='trips', redshift_conn_id='redshift') create_stations_table = PostgresOperator( task_id="create_stations_table", dag=dag, postgres_conn_id="redshift", sql=sql_statements.CREATE_STATIONS_TABLE_SQL, ) copy_stations_task = S3ToRedshiftOperator( task_id="load_stations_from_s3_to_redshift", dag=dag, redshift_conn_id="redshift", aws_credentials_id="aws_credentials",
task_id="load_trips_from_s3_to_redshift", dag=dag, table="trips", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="udacity-dend", s3_key="data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv" ) # Replaced this data quality check with the HasRowsOperator check_trips = HasRowsOperator( task_id='check_trips_data', dag=dag, provide_context=True, redshift_conn_id="redshift", table="trips" ) create_stations_table = PostgresOperator( task_id="create_stations_table", dag=dag, postgres_conn_id="redshift", sql=sql_statements.CREATE_STATIONS_TABLE_SQL, ) copy_stations_task = S3ToRedshiftOperator( task_id="load_stations_from_s3_to_redshift", dag=dag, redshift_conn_id="redshift",
# and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv")

# Data quality check on the Trips table
check_trips = HasRowsOperator(
    task_id="check_trips_data",
    dag=dag,
    redshift_conn_id="redshift",
    table="trips")

# We use the FactsCalculatorOperator to create a Facts table in Redshift.
# The fact column is `tripduration` and the groupby_column is `bikeid`.
calculate_facts = FactsCalculatorOperator(
    task_id="calculate_facts_trips",
    dag=dag,
    redshift_conn_id="redshift",
    origin_table="trips",
    destination_table="trips_facts",
    fact_column="tripduration",
    groupby_column="bikeid")

#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

# Data quality check replaced with the HasRowsOperator
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id='redshift',
    table='trips'
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
#
# Data quality check replaced with the HasRowsOperator
# (original PythonOperator kept for reference)
#
# check_trips = PythonOperator(
#     task_id='check_trips_data',
#     dag=dag,
#     python_callable=check_greater_than_zero,
#     provide_context=True,
#     params={
#         'table': 'trips',
#     }
# )
check_trips = HasRowsOperator(
    redshift_conn_id="redshift",
    table="trips",
    task_id='check_trips_data',
    dag=dag)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
task_id="load_trips_from_s3_to_redshift", dag=dag, table="trips", redshift_conn_id="redshift", aws_credentials_id="aws_credentials", s3_bucket="udac-data-pipelines", s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv" ) # # TODO: Replace this data quality check with the HasRowsOperator # check_trips = HasRowsOperator( task_id='check_trips_data', dag=dag, provide_context=True, redshift_conn_id = 'redshift_param', table= 'trips' ) create_stations_table = PostgresOperator( task_id="create_stations_table", dag=dag, postgres_conn_id="redshift", sql=sql_statements.CREATE_STATIONS_TABLE_SQL, ) copy_stations_task = S3ToRedshiftOperator( task_id="load_stations_from_s3_to_redshift", dag=dag, redshift_conn_id="redshift",
# and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="copy_trips_tasks",
    dag=dag,
    table="trips",
    redshift_conn_id='redshift',
    aws_credentials_id='aws_default',
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv')

# Data quality check on the Trips table
check_trips_task = HasRowsOperator(
    task_id="check_trips_has_rows",
    dag=dag,
    redshift_conn_id='redshift',
    table='trips')

# FactsCalculatorOperator creates a Facts table in Redshift; the fact column
# is `tripduration` and the groupby_column is `bikeid`
calculate_facts_task = FactsCalculatorOperator(
    task_id="calculate_facts",
    dag=dag,
    redshift_conn_id='redshift',
    origin_table='trips',
    destination_table='trip_facts',
    fact_column='tripduration',
    groupby_column='bikeid')

#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

# Data quality check replaced with the HasRowsOperator
check_trips = HasRowsOperator(
    task_id='check_trips_data',
    dag=dag,
    redshift_conn_id="redshift",
    table="trips"
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
# 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#
args = {
    'start_date': datetime.datetime.utcnow(),
}

with DAG(dag_id='lesson3.exercise4', default_args=args) as dag:
    copy_trips_task = S3ToRedshiftOperator(
        task_id="copy_trips",
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="trips",
        s3_bucket="udacity-dend",
        s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv",
    )

    check_trips_task = HasRowsOperator(
        task_id="check_trips",
        redshift_conn_id="redshift",
        table="trips")

    calculate_facts_task = FactsCalculatorOperator(
        task_id="calculate_facts",
        redshift_conn_id="redshift",
        origin_table="trips",
        destination_table="facts",
        fact_column="tripduration",
        groupby_column="bikeid",
    )

    copy_trips_task >> check_trips_task >> calculate_facts_task