def get_staging_to_dim(
        parent_dag_name, task_id, redshift_conn_id, dest_table, truncate,
        insert_sql_query, validate_sql_query, equals=None, at_least=1,
        source_table="", *args, **kwargs):
    """Return a sub-DAG that loads one dimension table and validates it.

    Two chained tasks: a LoadDimensionOperator running ``insert_sql_query``
    into ``dest_table``, then a DataQualityOperator running
    ``validate_sql_query`` against the freshly loaded table.
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_task = LoadDimensionOperator(
        task_id=f"load_{dest_table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        dest_table=dest_table,
        source_table=source_table,
        sql_query=insert_sql_query,
        truncate=truncate,
    )
    load_task.doc_md = """\
### Load Dimension Table
This task inserts staging data to \
dimension table.
"""

    quality_task = DataQualityOperator(
        task_id=f"check_{dest_table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=dest_table,
        sql_query=validate_sql_query,
        equals=equals,
        at_least=at_least,
    )
    quality_task.doc_md = """\
### Check Dimension Table Data
This task runs validation checks on \
the newly loaded dimension table.
"""

    # Validate only after the load has finished.
    load_task >> quality_task
    return dag
def load_dimension_subdag(parent_dag_name, task_id, redshift_conn_id,
                          *args, **kwargs):
    """Build the sub-DAG that loads every dimension table.

    :param parent_dag_name: parent dag name
    :param task_id: task id of the SubDagOperator in the parent DAG
    :param redshift_conn_id: Airflow connection id of the Redshift cluster
    :return: the configured sub-DAG
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # (display name, target table, insert statement) for each dimension;
    # the four loads run independently of each other.
    dimensions = [
        ("user", "users", SqlQueries.user_table_insert),
        ("song", "songs", SqlQueries.song_table_insert),
        ("artist", "artists", SqlQueries.artist_table_insert),
        ("time", "time", SqlQueries.time_table_insert),
    ]
    for name, target_table, insert_stmt in dimensions:
        LoadDimensionOperator(
            task_id=f"Load_{name}_dim_table",
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            table=target_table,
            sql_stmt=insert_stmt,
            mode="insert",
        )
    return dag
def get_staging_to_dim(
        parent_dag_name, task_id, redshift_conn_id, table, truncate,
        insert_sql_query, validate_sql_query, *args, **kwargs):
    """Create a sub-DAG that fills one dimension table and then checks it.

    A LoadDimensionOperator (runs ``insert_sql_query`` against ``table``)
    feeds into a DataQualityOperator (runs ``validate_sql_query``).
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql_query=insert_sql_query,
        truncate=truncate,
    )
    check = DataQualityOperator(
        task_id=f"check_{table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql_query=validate_sql_query,
    )
    load >> check
    return dag
def get_load_dimension_table_subdag(parent_dag_name, task_id, default_args,
                                    postgres_conn_id, sql_queries, tables,
                                    truncate_flags, *args, **kwargs):
    """Sub-DAG factory: one LoadDimensionOperator per dimension table.

    ``tables``, ``sql_queries`` and ``truncate_flags`` are parallel
    sequences; a mismatch in their lengths is a configuration error and
    raises ValueError.
    """
    dag = DAG(
        dag_id=f'{parent_dag_name}.{task_id}',
        default_args=default_args,
        **kwargs,
    )

    # All three inputs are zipped together below, so they must line up.
    if not (len(tables) == len(sql_queries) == len(truncate_flags)):
        logging.error(
            'Tables, SQL queries and truncate settings not of same length')
        raise ValueError(
            'Tables, SQL queries and truncate settings not of same length')

    for target_table, query, truncate in zip(tables, sql_queries,
                                             truncate_flags):
        LoadDimensionOperator(
            task_id=f'Load_{target_table}_dim_table',
            dag=dag,
            postgres_conn_id=postgres_conn_id,
            sql=query,
            table=target_table,
            truncate=truncate,
        )
    return dag
def get_dimension_tables_dag(parent_dag_name, task_id, redshift_conn_id,
                             table, columns, append=False, *args, **kwargs):
    """Sub-DAG that creates one dimension table on Redshift and loads it.

    Task chain: Begin_execution (dummy) -> create table -> load dimension
    table -> End_execution (dummy).
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    start = DummyOperator(task_id='Begin_execution', dag=dag)
    create = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=SQLQueries.create_load_test_queries[table]["CREATE"],
    )
    load = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        columns=columns,
        sql_stmt=SQLQueries.create_load_test_queries[table]["LOAD"],
        append=append,
    )
    end = DummyOperator(task_id='End_execution', dag=dag)

    start >> create >> load >> end
    return dag
def dimension_sub_dag(parent_dag_name, child_dag_name, start_date,
                      schedule_interval, redshift_conn_id, table_query_dict,
                      append):
    """Factory for the sub-DAG that loads the dimension tables.

    Parameters:
    -----------
    parent_dag_name (str): parent dag name
    child_dag_name (str): child dag name
    start_date (datetime): dag start time
    schedule_interval: dag schedule interval
    redshift_conn_id (str): RedShift Connection id
    table_query_dict (dict): Dictionary in the form {name: [table, insert_query]}
    append (bool): flag for truncating
    """
    dag = DAG('%s.%s' % (parent_dag_name, child_dag_name),
              schedule_interval=schedule_interval,
              start_date=start_date,
              )

    for name, (target_table, insert_query) in table_query_dict.items():
        # BUGFIX: the connection id was hard-coded to "redshift", silently
        # ignoring the redshift_conn_id argument.
        # Also dropped the setattr-on-this-function hack: operators register
        # themselves on ``dag``, so stashing them as function attributes
        # served no purpose.
        LoadDimensionOperator(
            task_id='Load_{}_dim_table'.format(name),
            redshift_conn_id=redshift_conn_id,
            table=target_table,
            append=append,
            sql=insert_query,
            dag=dag,
        )
    return dag
def load_dim_tables_subdag(parent_dag_name, child_dag_name, args):
    """Sub-DAG that loads the four dimension tables in parallel.

    :param parent_dag_name: name of the parent DAG
    :param child_dag_name: task id of the SubDagOperator in the parent DAG
    :param args: default_args forwarded to the sub-DAG
    :return: the configured sub-DAG
    """
    # BUGFIX: these were previously declared ``global``, leaking module-level
    # state for no reason — they are only used locally.
    dim_tables = ["artists", "songs", "time", "users"]
    dim_queries = SqlQueries()

    dag_subdag = DAG(
        f'{parent_dag_name}.{child_dag_name}',
        # NOTE(review): a moving start_date is an Airflow anti-pattern —
        # consider pinning a fixed date. Kept as-is to preserve behavior.
        start_date=datetime.today() - timedelta(days=2),
        default_args=args,
    )
    with dag_subdag:
        for dim_table in dim_tables:
            LoadDimensionOperator(
                task_id=f'Load_{dim_table}_dim_table',
                dag=dag_subdag,
                redshift_conn_id="redshift_conn_id",
                # BUGFIX: getattr also resolves class attributes, whereas
                # vars(instance) only sees instance attributes and would
                # raise KeyError for class-level query strings.
                load_dim_query=getattr(dim_queries,
                                       f'{dim_table}_table_insert'),
                table=dim_table,
                truncate=True,
            )
    return dag_subdag
def load_dim_tables_dag(
        parent_dag_name, task_id, redshift_conn_id, table, sql, columns,
        truncate_table=False, *args, **kwargs):
    """Sub-DAG that loads one dimension table and then quality-checks it."""
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_dim = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql=sql,
        truncate_table=truncate_table,
    )
    quality = DataQualityOperator(
        task_id=f"run_{table}_quality_checks",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        columns=columns,
    )
    load_dim >> quality
    return dag
def load_dimension_subdag(
        parent_dag_name, task_id, redshift_conn_id, sql_statement,
        delete_load, table_name, *args, **kwargs):
    """Sub-DAG that copies data from the staging tables into a dimension
    table via a single LoadDimensionOperator."""
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    LoadDimensionOperator(
        task_id=task_id,
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_statement,
        delete_load=delete_load,
        table_name=table_name,
    )
    return dag
def load_dim_tables(parent_dag_name, task_id, redshift_conn_id, target_table,
                    sql_create_stmt, sql_load_stmt, insert_mode,
                    *args, **kwargs):
    """Sub-DAG: create a dimension table, then load it.

    When ``insert_mode`` is 0 the load statement is rewritten to drop and
    recreate the table first (full refresh); any other mode loads as-is.
    """
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        start_date=datetime(2019, 1, 12),
        **kwargs,
    )

    if insert_mode == 0:
        # Full refresh: drop, recreate, then load in one compound statement.
        sql_load_stmt = (
            "DROP TABLE IF EXISTS " + target_table + "; "
            + sql_create_stmt + sql_load_stmt
        )

    create_task = PostgresOperator(
        task_id=f"create_{target_table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=sql_create_stmt,
    )
    load_task = LoadDimensionOperator(
        task_id=f"load_{target_table}_table",
        provide_context=True,
        sql_load_stmt=sql_load_stmt,
        redshift_conn_id=redshift_conn_id,
        target_table=target_table,
        dag=dag,
    )
    create_task >> load_task
    return dag
def dimension_SubDAG(parent_dag, task_id, conn_id, query, table_name,
                     *args, **kwargs):
    """Sub-DAG wrapping a single LoadDimensionOperator.

    The sub-DAG id follows Airflow's required "<parent>.<child>" pattern;
    ``task_id`` and ``dag`` are supplied here rather than by the caller.
    """
    dag = DAG(f"{parent_dag}.{task_id}", **kwargs)
    LoadDimensionOperator(
        task_id=task_id,
        conn_id=conn_id,
        query=query,
        table_name=table_name,
        dag=dag,
    )
    return dag
def load_dimension_table_dag(parent_dag_name, child_dag_name, args,
                             tables_and_queries, redshift_conn_id):
    """Sub-DAG with one load task per (table, insert query) pair.

    :param tables_and_queries: mapping of table name -> insert SQL
    :param redshift_conn_id: Airflow connection id of the Redshift cluster
    """
    subdag = DAG(dag_id='{0}.{1}'.format(parent_dag_name, child_dag_name),
                 default_args=args)
    with subdag:
        for table, query in tables_and_queries.items():
            LoadDimensionOperator(
                task_id=f'Load_{table}_dim_table',
                redshift_conn_id=redshift_conn_id,
                # BUGFIX: was ``table={table}``, which passed a one-element
                # set instead of the table-name string.
                table=table,
                query=query,
                dag=subdag,
            )
    return subdag
def load_dim_dag(parent_dag_name, task_id, redshift_conn_id,
                 aws_credentials_id, table, sql_query, *args, **kwargs):
    """Sub-DAG that runs a single dimension-table load."""
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        sql_query=sql_query,
    )
    return dag
def load_dimension_table_subdag(parent_dag_name, child_dag_name, args,
                                tables):
    """Sub-DAG with one truncate-and-load task per dimension table."""
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
    )
    with dag_subdag:
        for dim_table in tables:
            LoadDimensionOperator(
                task_id=f'load_{dim_table}_dimension_table',
                default_args=args,
                dag=dag_subdag,
                redshift_conn_id='redshift',
                table=dim_table,
                queries=SqlQueries,
                truncate=True,
            )
    return dag_subdag
def load_dimension_subdag(parent_dag_name, task_id, redshift_conn_id,
                          sql_statement, delete_load, table_name,
                          *args, **kwargs):
    """Sub-DAG containing a single dimension-table load task."""
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    LoadDimensionOperator(
        task_id=task_id,
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_statement,
        delete_load=delete_load,
        table_name=table_name,
    )
    return dag
def load_dim_table_dag(parent_dag_name, task_id, redshift_conn_id, table,
                       create_sql_stmt, select_stmt, append_rows,
                       *args, **kwargs):
    """Sub-DAG: create a dimension table, insert into it, then verify rows.

    Task chain: create -> insert -> has-rows check.
    Cleanup: removed the unused ``insert_sql`` template and the large
    commented-out PostgresOperator/DummyOperator alternatives.
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_dimtable_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt,
    )
    insert_to_table = LoadDimensionOperator(
        task_id=f"insert_into_{table}",
        dag=dag,
        # BUGFIX: the connection id was hard-coded to "redshift",
        # silently ignoring the redshift_conn_id argument.
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql_source=select_stmt,
        append_rows=append_rows,
    )
    check_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
    )

    create_dimtable_task >> insert_to_table >> check_task
    return dag
def get_load_dim_dag(
        parent_dag_name, task_id, conn_id, table, append, create_sql,
        load_sql, *args, **kwargs):
    """Sub-DAG that (re)creates a dimension table and loads it.

    In non-append mode the table is dropped before being created, giving a
    clean full reload; in append mode existing rows are kept.
    """
    # DAG parameters are inherited from the parent via **kwargs.
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    if append:
        action = 'Append data to'
        sql_drop_table = ""
    else:
        action = 'Populate data in'
        sql_drop_table = f"DROP TABLE IF EXISTS {table};"
    logging.info(f"{action} {table} dimension table")

    # The create statement template receives the (possibly empty) DROP
    # prefix and the table name.
    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=conn_id,
        sql=create_sql.format(sql_drop_table, table),
    )
    # Load the dimension table from the staging tables.
    load_task = PostgresOperator if False else LoadDimensionOperator
    load_task = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        conn_id=conn_id,
        table=table,
        append=append,
        sql=load_sql,
    )

    # The load must only run once the table exists.
    create_task >> load_task
    return dag
def get_dimension_load_dag(parent_dag_name, task_id, redshift_conn_id,
                           tables, *args, **kwargs):
    """Sub-DAG with one full-refresh load task per dimension table spec.

    Each entry of ``tables`` is a mapping carrying "table" and "sql" keys.
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)
    with dag:
        for spec in tables:
            # Local name avoids shadowing this function's task_id parameter.
            table_name = spec.get("table")
            LoadDimensionOperator(
                task_id=f"load_{table_name}_dimension_table",
                dag=dag,
                redshift_conn_id=redshift_conn_id,
                table=table_name,
                sql=spec.get("sql"),
                append_only=False,
            )
    return dag
def load_dimensional_tables_dag(parent_dag_name, task_id, redshift_conn_id,
                                aws_credentials_id, table, sql_query,
                                *args, **kwargs):
    """Returns a DAG which inserts data into a dimensional redshift table
    from staging tables.

    NOTE(review): ``aws_credentials_id`` is accepted but not forwarded to
    the operator; kept in the signature for caller compatibility.
    """
    # FIX: the description above used to be a stray no-op string expression
    # placed after the DAG construction; it is now a real docstring.
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_query,
    )
    return dag
def load_dim_table(parent_dag, task_id, default_args, postgres_conn_id,
                   sql_queries, tables, *args, **kwargs):
    """Sub-DAG with one load task per (table, query) pair.

    ``tables`` and ``sql_queries`` are parallel sequences zipped together.
    """
    dag = DAG(
        dag_id=f'{parent_dag}.{task_id}',
        default_args=default_args,
        **kwargs,
    )

    tasks = []
    for target_table, query in zip(tables, sql_queries):
        task = LoadDimensionOperator(
            task_id=f'Load_{target_table}_dim_table',
            dag=dag,
            postgres_conn_id=postgres_conn_id,
            sql=query,
            # BUGFIX: was ``table=table`` — an undefined name that raised
            # NameError as soon as the loop body ran.
            table=target_table,
        )
        tasks.append(task)
    return dag
def dim_subdag(parent_dag_name, task_id, postgres_conn_id, table,
               insert_sql_stmt, sw_delete_dimensions, *args, **kwargs):
    """Sub-DAG: load one dimension table, then verify it contains rows."""
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_task = LoadDimensionOperator(
        task_id=f"load_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table,
        sql=insert_sql_stmt,
        sw_delete_dimensions=sw_delete_dimensions,
    )
    rows_check = HasRowsOperator(
        task_id=f"has_rows_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table,
    )
    load_task >> rows_check
    return dag
def create_and_load_table_dag(parent_dag_name, task_id, redshift_conn_id,
                              create_sql, insert_sql, table, truncate,
                              *args, **kwargs):
    """Sub-DAG: create a dimension table, then load it from staging."""
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # Locals renamed from the misleading "users"-specific names the
    # original carried — this factory handles any table.
    create_table_task = CreateTableOperator(
        task_id=f'create_{table}_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql,
        table=table,
    )
    load_table_task = LoadDimensionOperator(
        task_id=f'Load_{table}_dim_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        query=insert_sql,
        truncate=truncate,
    )
    create_table_task >> load_table_task
    return dag
def load_dim_table_dag(
        parent_dag, task_id, redshift_conn_id, aws_credentials_id, table,
        sql_create="", drop_first=False, sql_insert="", truncate=True,
        *args, **kwargs):
    """Return a DAG that inserts into a dimension table.

    Optionally creates the table first (``sql_create``/``drop_first``) and
    truncates it before the insert (``truncate``).
    """
    dag = DAG(f"{parent_dag}.{task_id}", **kwargs)

    LoadDimensionOperator(
        task_id=f"Load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        sql_create=sql_create,
        drop_first=drop_first,
        sql_insert=sql_insert,
        truncate=truncate,
    )
    return dag