def get_staging_to_dim(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        dest_table,
        truncate,
        insert_sql_query,
        validate_sql_query,
        equals=None,
        at_least=1,
        source_table="",
        *args, **kwargs):
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    load_to_dimension_table = LoadDimensionOperator(
        task_id=f"load_{dest_table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        dest_table=dest_table,
        source_table=source_table,
        sql_query=insert_sql_query,
        truncate=truncate
    )
    load_to_dimension_table.doc_md = """\
    ### Load Dimension Table
    This task inserts staging data into \
    the dimension table.
    """

    run_quality_check = DataQualityOperator(
        task_id=f"check_{dest_table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=dest_table,
        sql_query=validate_sql_query,
        equals=equals,
        at_least=at_least,
    )
    run_quality_check.doc_md = """\
    ### Check Dimension Table Data
    This task runs validation checks on \
    the newly loaded dimension table.
    """

    # Connect tasks
    load_to_dimension_table >> run_quality_check

    return dag
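For context, a subdag factory like this is normally attached to a parent DAG through SubDagOperator, with the subdag's dag_id matching `{parent_dag_id}.{task_id}`. A minimal sketch of that wiring, assuming Airflow 1.x; the DAG name, connection id, and SQL below are placeholders, not part of the example above:

from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

default_args = {"owner": "etl", "start_date": datetime(2019, 1, 12)}

with DAG("sparkify_etl",  # placeholder parent DAG name
         default_args=default_args,
         schedule_interval="@daily") as parent_dag:

    load_users_dim = SubDagOperator(
        task_id="load_users_dim_subdag",
        # The subdag's dag_id must be "<parent_dag_id>.<task_id>".
        subdag=get_staging_to_dim(
            parent_dag_name="sparkify_etl",
            task_id="load_users_dim_subdag",
            redshift_conn_id="redshift",                        # placeholder conn id
            dest_table="users",
            truncate=True,
            insert_sql_query="INSERT INTO users SELECT ...",    # placeholder SQL
            validate_sql_query="SELECT COUNT(*) FROM users",
            default_args=default_args,
            schedule_interval="@daily",
        ),
    )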
def load_dimension_subdag(parent_dag_name, task_id, redshift_conn_id, *args,
                          **kwargs):
    """
    This function defines the subdag for loading the data into
    dimension tables.
    :args parent_dag_name: parent dag name
    :task_id: task id
    :redshift_conn_id: redshift cluster
    :return: subdag
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_user_dimension_table = LoadDimensionOperator(
        task_id='Load_user_dim_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="users",
        sql_stmt=SqlQueries.user_table_insert,
        mode="insert")

    load_song_dimension_table = LoadDimensionOperator(
        task_id='Load_song_dim_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="songs",
        sql_stmt=SqlQueries.song_table_insert,
        mode="insert")

    load_artist_dimension_table = LoadDimensionOperator(
        task_id='Load_artist_dim_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="artists",
        sql_stmt=SqlQueries.artist_table_insert,
        mode="insert")

    load_time_dimension_table = LoadDimensionOperator(
        task_id='Load_time_dim_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table="time",
        sql_stmt=SqlQueries.time_table_insert,
        mode="insert")

    # The four dimension loads are independent, so no ordering is set;
    # they run in parallel.
    load_user_dimension_table
    load_song_dimension_table
    load_artist_dimension_table
    load_time_dimension_table

    return dag
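Several of these examples reference a SqlQueries container (SqlQueries.user_table_insert and so on) that is project-specific and not shown. A minimal sketch of the shape they assume, with placeholder SELECTs:

class SqlQueries:
    # Placeholder statements; real projects keep the full SELECTs here.
    user_table_insert = "SELECT DISTINCT userid, firstname, lastname, gender, level FROM staging_events"
    song_table_insert = "SELECT DISTINCT song_id, title, artist_id, year, duration FROM staging_songs"
    artist_table_insert = "SELECT DISTINCT artist_id, artist_name, artist_location FROM staging_songs"
    time_table_insert = "SELECT start_time, EXTRACT(hour FROM start_time) AS hour FROM songplays"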
Example #3
def get_staging_to_dim(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        table,
        truncate,
        insert_sql_query,
        validate_sql_query,
        *args, **kwargs):
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    load_to_dimension_table = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql_query=insert_sql_query,
        truncate=truncate
    )

    run_quality_check = DataQualityOperator(
        task_id=f"check_{table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql_query=validate_sql_query,
    )

    load_to_dimension_table >> run_quality_check

    return dag
Example #4
def get_load_dimension_table_subdag(parent_dag_name, task_id, default_args,
                                    postgres_conn_id, sql_queries, tables,
                                    truncate_flags, *args, **kwargs):

    dag = DAG(
        dag_id=f'{parent_dag_name}.{task_id}',
        default_args=default_args,
        **kwargs,
    )

    if not (len(tables) == len(sql_queries) == len(truncate_flags)):
        logging.error(
            'tables, sql_queries and truncate_flags must have the same length')
        raise ValueError(
            'tables, sql_queries and truncate_flags must have the same length')

    tasks = []
    for table, query, truncate in zip(tables, sql_queries, truncate_flags):
        task = LoadDimensionOperator(
            task_id=f'Load_{table}_dim_table',
            dag=dag,
            postgres_conn_id=postgres_conn_id,
            sql=query,
            table=table,
            truncate=truncate,
        )
        tasks.append(task)

    return dag
def get_dimension_tables_dag(parent_dag_name,
                             task_id,
                             redshift_conn_id,
                             table,
                             columns,
                             append=False,
                             *args,
                             **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    begin_execution_task = DummyOperator(task_id='Begin_execution', dag=dag)

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=SQLQueries.create_load_test_queries[table]["CREATE"])

    load_dimension_table_task = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        columns=columns,
        sql_stmt=SQLQueries.create_load_test_queries[table]["LOAD"],
        append=append)

    end_execution_task = DummyOperator(task_id='End_execution', dag=dag)

    begin_execution_task >> create_task
    create_task >> load_dimension_table_task
    load_dimension_table_task >> end_execution_task

    return dag
Example #6
def dimension_sub_dag(parent_dag_name, child_dag_name, start_date, schedule_interval, redshift_conn_id, table_query_dict, append):
    """This function acts as a factory for creating DAGs that load dimension tables.

    Parameters:
    -----------
    parent_dag_name (str): parent DAG name
    child_dag_name (str): child DAG name
    start_date (datetime): DAG start date
    schedule_interval: DAG schedule interval
    redshift_conn_id (str): Redshift connection id
    table_query_dict (dict): dictionary in the form {name: [table, insert_query]}
    append (bool): if False, truncate the table before loading

    """
    dag = DAG(f"{parent_dag_name}.{child_dag_name}",
              schedule_interval=schedule_interval,
              start_date=start_date,
              )

    for index, value in table_query_dict.items():
        # Create one task per table with the LoadDimensionOperator and
        # attach it to the factory function as an attribute.
        setattr(dimension_sub_dag,
                'load_{}_dimension_table'.format(index),
                LoadDimensionOperator(
                    task_id='Load_{}_dim_table'.format(index),
                    redshift_conn_id=redshift_conn_id,
                    table=value[0],
                    append=append,
                    sql=value[1],
                    dag=dag
                )
               )

    return dag
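A hypothetical invocation showing the {name: [table, insert_query]} shape the docstring describes; every value below is a placeholder:

from datetime import datetime

dim_dag = dimension_sub_dag(
    parent_dag_name="sparkify_etl",
    child_dag_name="load_dimensions",
    start_date=datetime(2019, 1, 12),
    schedule_interval="@daily",
    redshift_conn_id="redshift",
    table_query_dict={
        # name: [target table, insert query]
        "user": ["users", SqlQueries.user_table_insert],
        "song": ["songs", SqlQueries.song_table_insert],
    },
    append=False,
)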
Example #7
def load_dim_tables_subdag(parent_dag_name,
                           child_dag_name,
                           args):

    dim_tables = ["artists", "songs", "time", "users"]
    dim_queries = SqlQueries()

    # Note: a moving start_date like this is generally discouraged in Airflow.
    dag_subdag = DAG(f'{parent_dag_name}.{child_dag_name}',
                     start_date=datetime.today() - timedelta(days=2),
                     default_args=args)

    with dag_subdag:
        for dim_table in dim_tables:
            t = LoadDimensionOperator(
                task_id=f'Load_{dim_table}_dim_table',
                dag=dag_subdag,
                redshift_conn_id="redshift_conn_id",
                load_dim_query=vars(dim_queries)[f'{dim_table}_table_insert'],
                table=dim_table,
                truncate=True
            )

    return dag_subdag
def load_dim_tables_dag(
        parent_dag_name,
        task_id,
        redshift_conn_id,
        table,
        sql,
        columns,
        truncate_table=False,
        *args, **kwargs):
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )

    load_dimension_table = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        sql=sql,
        truncate_table=truncate_table
    )

    run_quality_checks = DataQualityOperator(
        task_id=f"run_{table}_quality_checks",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        columns=columns
    )

    load_dimension_table >> run_quality_checks

    return dag
def load_dimension_subdag(
    parent_dag_name,
    task_id,
    redshift_conn_id,
    sql_statement,
    delete_load,
    table_name,
    *args, **kwargs):
    '''
        DAG that loads data from the staging tables into the dimension tables.
    '''
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs)

    load_dimension_table = LoadDimensionOperator(
        task_id=task_id,
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_statement,
        delete_load=delete_load,
        table_name=table_name,
    )

    # Single task; no dependencies to set.
    load_dimension_table

    return dag
Example #10
def load_dim_tables(parent_dag_name, task_id, redshift_conn_id, target_table,
                    sql_create_stmt, sql_load_stmt, insert_mode, *args,
                    **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}",
              start_date=datetime(2019, 1, 12),
              **kwargs)

    # Full-refresh mode: rebuild the table from scratch before loading.
    if insert_mode == 0:
        sql_load_stmt = (f"DROP TABLE IF EXISTS {target_table}; "
                         f"{sql_create_stmt} {sql_load_stmt}")

    create_task = PostgresOperator(task_id=f"create_{target_table}_table",
                                   dag=dag,
                                   postgres_conn_id=redshift_conn_id,
                                   sql=sql_create_stmt)

    load_task = LoadDimensionOperator(task_id=f"load_{target_table}_table",
                                      provide_context=True,
                                      sql_load_stmt=sql_load_stmt,
                                      redshift_conn_id=redshift_conn_id,
                                      target_table=target_table,
                                      dag=dag)

    create_task >> load_task

    return dag
Example #11
def dimension_SubDAG(parent_dag, task_id, conn_id, query, table_name, *args,
                     **kwargs):
    # dag_id follows the parent.child naming convention for subdags
    dag = DAG(f"{parent_dag}.{task_id}", **kwargs)
    # dimension table load task; `task_id` and `dag` are passed directly
    # rather than through a params list
    dimension_table = LoadDimensionOperator(task_id=task_id,
                                            conn_id=conn_id,
                                            query=query,
                                            table_name=table_name,
                                            dag=dag)

    return dag
Example #12
def load_dimension_table_dag(parent_dag_name, child_dag_name, args,
                             tables_and_queries, redshift_conn_id):

    subdag = DAG(dag_id='{0}.{1}'.format(parent_dag_name, child_dag_name),
                 default_args=args)
    with subdag:
        for table, query in tables_and_queries.items():
            load_dimension_table = LoadDimensionOperator(
                task_id=f'Load_{table}_dim_table',
                redshift_conn_id=redshift_conn_id,
                table=table,
                query=query,
                dag=subdag)

    return subdag
def load_dim_dag(parent_dag_name, task_id, redshift_conn_id,
                 aws_credentials_id, table, sql_query, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_dimension = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        sql_query=sql_query)

    load_dimension

    return dag
Example #14
def load_dimension_table_subdag(parent_dag_name, child_dag_name, args, tables):
    dag_subdag = DAG(
        dag_id=f'{parent_dag_name}.{child_dag_name}',
        default_args=args,
    )
    with dag_subdag:
        for table in tables:
            task = LoadDimensionOperator(
                task_id=f'load_{table}_dimension_table',
                dag=dag_subdag,
                redshift_conn_id='redshift',
                table=table,
                queries=SqlQueries,
                truncate=True)

    return dag_subdag
def load_dimension_subdag(parent_dag_name, task_id, redshift_conn_id,
                          sql_statement, delete_load, table_name, *args,
                          **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_dimension_table = LoadDimensionOperator(
        task_id=task_id,
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_statement,
        delete_load=delete_load,
        table_name=table_name)

    load_dimension_table

    return dag
def load_dim_table_dag(parent_dag_name, task_id, redshift_conn_id, table,
                       create_sql_stmt, select_stmt, append_rows, *args,
                       **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    insert_sql = """
        INSERT INTO {}
        {}
        ;
    """

    #create_dimtable_task = DummyOperator(task_id="create_{}_table".format(table),  dag=dag)
    create_dimtable_task = PostgresOperator(task_id=f"create_{table}_table",
                                            dag=dag,
                                            postgres_conn_id=redshift_conn_id,
                                            sql=create_sql_stmt)

    #insert_to_table = DummyOperator(task_id="insert_into_{}".format(table),  dag=dag)
    #insert_to_table = PostgresOperator(
    #    task_id=f"insert_into_{table}",
    #    dag=dag,
    #    postgres_conn_id=redshift_conn_id,
    #    sql=insert_sql.format(
    #        table,
    #        select_stmt
    #    )
    #)
    insert_to_table = LoadDimensionOperator(task_id=f"insert_into_{table}",
                                            dag=dag,
                                            redshift_conn_id="redshift",
                                            table=table,
                                            sql_source=select_stmt,
                                            append_rows=append_rows)

    #check_task = DummyOperator(task_id="check_{}_data".format(table),  dag=dag)
    check_task = HasRowsOperator(task_id=f"check_{table}_data",
                                 dag=dag,
                                 redshift_conn_id=redshift_conn_id,
                                 table=table)

    create_dimtable_task >> insert_to_table
    insert_to_table >> check_task

    return dag
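HasRowsOperator above is another project-specific operator; the examples do not even agree on its connection kwarg (redshift_conn_id here, postgres_conn_id in a later example), so treat this as one possible minimal implementation built on PostgresHook, assuming Airflow 1.x:

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):
    """Fail the task if the target table is empty."""

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if not records or not records[0] or records[0][0] < 1:
            raise ValueError(f"Data quality check failed: {self.table} returned no rows")
        self.log.info("Data quality check passed: %s has %s rows",
                      self.table, records[0][0])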
def get_load_dim_dag(
        parent_dag_name,
        task_id,
        conn_id,
        table,
        append,
        create_sql,
        load_sql,
        *args, **kwargs):
    
    # inherit DAG parameters    
    dag = DAG(
        f"{parent_dag_name}.{task_id}",
        **kwargs
    )
    
    action = 'Append data to' if append else 'Populate data in'
    logging.info(f"{action} {table} dimension table")

    # Drop Table if append mode is not enabled
    # Create Table on Postgres Redshift with connection id from airflow
    sql_drop_table = f"DROP TABLE IF EXISTS {table};" if not append else ""
    sql_create_table = create_sql.format(sql_drop_table, table) 
    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=conn_id,
        sql=sql_create_table
    )

    # Enable Load Dimension Operator to 
    # create dim tables from staging tables
    load_task = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        conn_id=conn_id,
        table=table,
        append=append,
        sql=load_sql
    )

    # ensure load task is executed after create task
    create_task >> load_task

    return dag
def get_dimension_load_dag(parent_dag_name, task_id, redshift_conn_id, tables,
                           *args, **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    with dag:
        for table in tables:
            task_id = f"load_{table.get('table')}_dimension_table"
            load_data = LoadDimensionOperator(
                task_id=task_id,
                dag=dag,
                redshift_conn_id=redshift_conn_id,
                table=table.get("table"),
                sql=table.get("sql"),
                append_only=False,
            )

    return dag
def load_dimensional_tables_dag(parent_dag_name, task_id, redshift_conn_id,
                                aws_credentials_id, table, sql_query, *args,
                                **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)
    """
        Returns a DAG inserts data into a dimensional redshift table from staging tables.
    """

    load_dimension_table = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_query)

    load_dimension_table

    return dag
Example #20
def load_dim_table(parent_dag, task_id, default_args, postgres_conn_id,
                   sql_queries, tables, *args, **kwargs):

    dag = DAG(
        dag_id=f'{parent_dag}.{task_id}',
        default_args=default_args,
        **kwargs,
    )

    tasks = []
    for target_table, query in zip(tables, sql_queries):
        task = LoadDimensionOperator(task_id=f'Load_{target_table}_dim_table',
                                     dag=dag,
                                     postgres_conn_id=postgres_conn_id,
                                     sql=query,
                                     table=target_table)
        tasks.append(task)

    return dag
def dim_subdag(parent_dag_name, task_id, postgres_conn_id, table,
               insert_sql_stmt, sw_delete_dimensions, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    load_table = LoadDimensionOperator(
        task_id=f"load_table_{table}",
        dag=dag,
        postgres_conn_id=postgres_conn_id,
        table=table,
        sql=insert_sql_stmt,
        sw_delete_dimensions=sw_delete_dimensions,
    )

    has_rows_table = HasRowsOperator(task_id=f"has_rows_table_{table}",
                                     dag=dag,
                                     postgres_conn_id=postgres_conn_id,
                                     table=table)

    load_table >> has_rows_table

    return dag
def create_and_load_table_dag(parent_dag_name, task_id, redshift_conn_id,
                              create_sql, insert_sql, table, truncate, *args,
                              **kwargs):

    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_table_task = CreateTableOperator(task_id=f'create_{table}_table',
                                            dag=dag,
                                            redshift_conn_id=redshift_conn_id,
                                            create_sql=create_sql,
                                            table=table)

    load_dimension_table = LoadDimensionOperator(
        task_id=f'Load_{table}_dim_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        query=insert_sql,
        truncate=truncate)

    create_table_task >> load_dimension_table

    return dag
Example #23
def load_dim_table_dag(
    parent_dag,
    task_id,
    redshift_conn_id,
    aws_credentials_id,
    table,
    sql_create="",
    drop_first=False,
    sql_insert="",
    truncate=True,
    *args, **kwargs):
    """
    Returns a dag that inserts into a dimension table.
    Optionally creates table first and truncates it
    """
   
    dag = DAG(f"{parent_dag}.{task_id}", **kwargs) 
    
    load_dim_table = LoadDimensionOperator(
        task_id=f"Load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        sql_create=sql_create,
        drop_first=drop_first,
        sql_insert=sql_insert,
        truncate=truncate
    )
    
    # Single task; no upstream or downstream dependencies to set
    load_dim_table
    
    return dag
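All of these factories depend on a custom LoadDimensionOperator whose constructor varies from repo to repo (sql_query vs sql_stmt, truncate vs append vs delete_load, and so on). A minimal sketch of one common truncate-then-insert shape, with kwarg names loosely following the first example; it is not the definitive implementation behind any snippet above:

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class LoadDimensionOperator(BaseOperator):
    """Insert staging data into a dimension table, optionally truncating first."""

    ui_color = "#80BD9E"

    @apply_defaults
    def __init__(self, redshift_conn_id="", dest_table="", sql_query="",
                 truncate=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.dest_table = dest_table
        self.sql_query = sql_query
        self.truncate = truncate

    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        if self.truncate:
            self.log.info("Truncating dimension table %s", self.dest_table)
            hook.run(f"TRUNCATE TABLE {self.dest_table}")
        self.log.info("Loading dimension table %s", self.dest_table)
        # sql_query is assumed to be a SELECT over the staging tables.
        hook.run(f"INSERT INTO {self.dest_table} {self.sql_query}")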