Code Example #1
def load_dimensional_tables_dag(parent_dag_name, task_id, redshift_conn_id,
                                aws_credentials_id, table, sql_query, *args,
                                **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)
    """
        Returns a DAG inserts data into a dimensional redshift table from staging tables.
    """

    load_dimension_table = LoadDimensionOperator(
        task_id=f"load_{table}_dim_table",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        sql_query=sql_query)

    return dag
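Every snippet on this page calls a project-specific LoadDimensionOperator whose implementation is not shown, and whose parameter names clearly vary from project to project. For orientation, here is a minimal sketch of one plausible shape of such an operator, assuming an Airflow 1.x-style PostgresHook connection to Redshift and an optional truncate-before-insert mode; all names are illustrative, not any single project's API.

# Minimal illustrative sketch of a LoadDimensionOperator; not any one
# project's actual implementation (parameter names differ across examples).
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class LoadDimensionOperator(BaseOperator):
    ui_color = '#80BD9E'

    @apply_defaults
    def __init__(self, redshift_conn_id='redshift', table='', sql_query='',
                 append_data=False, *args, **kwargs):
        super(LoadDimensionOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table
        self.sql_query = sql_query
        self.append_data = append_data

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        if not self.append_data:
            # Truncate-insert pattern: empty the dimension table first so
            # reruns do not duplicate rows.
            self.log.info('Truncating dimension table %s', self.table)
            redshift.run(f'TRUNCATE TABLE {self.table}')
        self.log.info('Loading dimension table %s', self.table)
        redshift.run(f'INSERT INTO {self.table} {self.sql_query}')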
Code Example #2
def load_dimension_tables_dag(parent_dag_name, task_id, redshift_conn_id,
                              table, append_data, insert_sql_stmt, *args,
                              **kwargs):
    """

        load_dimension_tables_dag is a custom subDAG, to make our code for our custom operator LoadDimensionOperator be reusable accross various DAGs.

        :param parent_dag_name: the name of the parent DAG
        :type parent_dag_name: string

        :param task_id: to give the subDag a unique id or name.
        :type task_id: string

        :param redshift_conn_id: Connection id of the Redshift connection to use
        :type redshift_conn_id: string    
            Default is 'redshift'

        :param table: Redshift dimension table name, where data will be inserted.
        :type table: string

        :param append_data: if True, we will Append data to the table.
        :type append_data: Boolean

        param insert_sql_stmt: Query representing data that will be inserted
        type sql: string

    """

    # A specific convention used to pass the subDAG to the parent DAG.
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    # Call our custom operator, passing the parameters that make this subDAG reusable.
    load_dimension_task = LoadDimensionOperator(
        task_id=f'Load_{table}_dim_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        append_data=append_data,
        sql=insert_sql_stmt)

    # Return the DAG so the subDAG is accessible to the parent, per the convention above.
    return dag
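The "convention" referred to in the comments is that a subDAG's dag_id must be f"{parent_dag_name}.{task_id}", and the parent DAG then mounts the returned DAG with a SubDagOperator whose task_id matches. A hedged usage sketch for the factory above (the parent DAG name, dates, and SQL source are placeholders, not from this project):

# Hypothetical parent-DAG wiring for load_dimension_tables_dag; all names
# and dates here are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

dag = DAG('sparkify_etl', start_date=datetime(2019, 1, 1),
          schedule_interval='@daily')

load_users_dim_subdag = SubDagOperator(
    task_id='load_users_dim_subdag',      # must equal the subDAG's task_id
    subdag=load_dimension_tables_dag(
        parent_dag_name='sparkify_etl',   # must equal the parent dag_id
        task_id='load_users_dim_subdag',
        redshift_conn_id='redshift',
        table='users',
        append_data=False,
        insert_sql_stmt=SqlQueries.user_table_insert,
        # Remaining kwargs are forwarded to the inner DAG constructor:
        start_date=datetime(2019, 1, 1),
        schedule_interval='@daily',
    ),
    dag=dag,
)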
Code Example #3
def load_dim_subdag(
    parent_dag_name: str,
    task_id: str,
    redshift_conn_id: str,
    sql_statement: str,
    do_truncate: bool,
    table_name: str,
    **kwargs,
):
    """
    Airflow subDAG wrapper; instantiates a LoadDimensionOperator.
    Subdag's name will be f'{parent_dag_name}.{task_id}'

    Subdag related keyword arguments:
    - parent_dag_name -- Parent DAG name
    - task_id         -- Task ID for the subdag to use

    Keyword arguments:
    redshift_conn_id  -- Airflow connection name for Redshift detail
    sql_statement     -- SQL statement to run
    do_truncate       -- Whether to truncate the table before running the
                         SQL statement
    table_name        -- Dimension table name

    All remaining keyword arguments are passed to the DAG constructor.
    """

    dag = DAG(f'{parent_dag_name}.{task_id}', **kwargs)

    load_dimension_table = LoadDimensionOperator(
        task_id=task_id,
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        sql_query=sql_statement,
        do_truncate=do_truncate,
        table_name=table_name,
    )

    return dag
Code Example #4
File: subdag.py Project: euweb/DataPipelines
def load_dimensions_subdag(parent_dag_name,
                           task_id,
                           redshift_conn_id,
                           dimension_tables_config,
                           args,
                           append=True,
                           **kwargs):
    """executes LoadDimensionOperator for every table defined in dimension_tables_config

    Args:
        parent_dag_name (str): name of the parent DAG
        task_id (str): task id of the parent DAG
        redshift_conn_id (str): name of the connection created in Airflow
        dimension_tables_config (dict): structure containing tables and their insert sql statements
        args: default_args
        append (bool, optional): if False, the tables are truncated before new rows are inserted. Defaults to True.

    Returns:
        DAG: DAG with LoadDimensionOperator for each table
    """
    dag = DAG(dag_id=f"{parent_dag_name}.{task_id}",
              default_args=args,
              start_date=days_ago(2),
              schedule_interval="@daily",
              max_active_runs=1)

    for table, sql in dimension_tables_config.items():
        LoadDimensionOperator(task_id=f'Load_{table}_dim_table',
                              dag=dag,
                              postgres_conn_id=redshift_conn_id,
                              table=table,
                              append=append,
                              sql=sql)

    return dag
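The dimension_tables_config argument maps each dimension table name to its insert statement. A hypothetical configuration and call, again with placeholder names (the dag and default_args objects are assumed to exist in the parent DAG file):

# Hypothetical inputs for load_dimensions_subdag; table names and queries
# are placeholders in the style of the Sparkify examples elsewhere on this page.
from airflow.operators.subdag_operator import SubDagOperator

dimension_tables_config = {
    'users': SqlQueries.user_table_insert,
    'songs': SqlQueries.song_table_insert,
    'artists': SqlQueries.artist_table_insert,
    'time': SqlQueries.time_table_insert,
}

load_dimension_tables = SubDagOperator(
    task_id='Load_dimension_tables',
    subdag=load_dimensions_subdag(
        parent_dag_name='sparkify_etl',
        task_id='Load_dimension_tables',
        redshift_conn_id='redshift',
        dimension_tables_config=dimension_tables_config,
        args=default_args,
        append=False,  # truncate each dimension table before inserting
    ),
    dag=dag,
)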
Code Example #5
    columns="""airline_name,link,title,author,author_country,review_date,review_content,aircraft,seat_layout,date_flown,
               cabin_flown,type_traveller,overall_rating,seat_legroom_rating,seat_recline_rating,seat_width_rating,
               aisle_space_rating,viewing_tv_rating,power_supply_rating,seat_storage_rating,recommended""",
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='skytrax-warehouse',
    s3_key='source-data/seat.csv',
    copy_extra="FORMAT AS CSV  REGION 'us-east-2' TRUNCATECOLUMNS EMPTYASNULL BLANKSASNULL ACCEPTANYDATE DATEFORMAT 'auto' IGNOREHEADER 1"
)

load_passengers_dimension_table = LoadDimensionOperator(
    task_id='Load_passengers_dim_table',
    dag=dag,
    append_only=False,
    table='passengers',
    redshift_conn_id='redshift',
    sql=SqlQueries.passengers_table_insert)

load_airports_dimension_table = LoadDimensionOperator(
    task_id='Load_airports_dim_table',
    dag=dag,
    append_only=False,
    table='airports',
    redshift_conn_id='redshift',
    sql=SqlQueries.airports_table_insert)

load_airlines_dimension_table = LoadDimensionOperator(
    task_id='Load_airlines_dim_table',
    dag=dag,
Code Example #6
load_weather_and_air_quality_data = LoadDataOperator(
    task_id="Load_weather_and_air_quality_data",
    dag=dag,
    postgres_conn_id="postgres",
    open_aq_conn="open_aq",
    open_weather_conn="open_weather",
    app_id=api_key,
    limit=60,
    date=datetime(2020, 7, 4, 0, 0),
)

load_time_dimension_table = LoadDimensionOperator(
    task_id="Load_time_dim_table",
    dag=dag,
    postgres_conn_id="postgres",
    table="time_dim_table",
    sql=SqlQueries.insert_time,
)

load_weather_dimension_table = LoadDimensionOperator(
    task_id="Load_weather_dim_table",
    dag=dag,
    postgres_conn_id="postgres",
    table="weather_dim_table",
    sql=SqlQueries.insert_weather,
    values=["main", "description"],
)

load_measures_fact_table = LoadFactOperator(
    task_id="Load_measures_fact_table",
Code Example #7
    s3_bucket="udacity-dend",
    s3_key="song_data",
    region="us-west-2",
    ignore_headers=0,
    data_format="json",
    jsonpaths="")

load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table',
                                        dag=dag,
                                        redshift_conn_id="redshift",
                                        sql=SqlQueries.songplay_table_insert)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="users",
    sql=SqlQueries.user_table_insert,
    update_strategy="overwrite")

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    redshift_conn_id="redshift",
    table="songs",
    sql=SqlQueries.song_table_insert,
    update_strategy="overwrite")

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
Code Example #8
                                                  AWS='aws_credentials',
                                                  table='staging_songs',
                                                  s3_bucket='udacity-dend',
                                                  s3_key='song_data')

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    conn_id='redshift',
    table="songplays",
    insert_sql=SqlQueries.songplay_table_insert)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    table="users",
    conn_id='redshift',
    insert_sql=SqlQueries.user_table_insert,
    truncate=False,
    primary_key="userid")

load_time_dimension_table = LoadDimensionOperator(
    task_id='Load_time_dim_table',
    dag=dag,
    conn_id='redshift',
    insert_sql=SqlQueries.time_table_insert,
    truncate=False,
    primary_key=None,
    table="time")

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
Code Example #9
                                       , redshift_conn_id=redshift_conn_id
                                       , dag=dag
                                       , task_id='Load_Candidate_Fact_Table'
                                       )

city_fact_table = LoadFactOperator(table='city_fact'
                                  , sql_query=SqlQueries.city_fact_insert
                                  , redshift_conn_id=redshift_conn_id
                                  , dag=dag
                                  , task_id='Load_City_Fact_Table'
                                  )

# Load from stage to dimension
candidate_dim_table = LoadDimensionOperator(table='candidate_dim'
                                           , sql_query=SqlQueries.candidate_dim_insert
                                           , redshift_conn_id=redshift_conn_id
                                           , dag=dag
                                           , task_id='Load_Candidate_Dimension_Table'
                                           )
student_dim_table = LoadDimensionOperator(table='student_dim'
                                         , sql_query=SqlQueries.student_dim_insert
                                         , redshift_conn_id=redshift_conn_id
                                         , dag=dag
                                         , task_id='Load_Student_Dimension_Table'
                                         )
special_dim_table = LoadDimensionOperator(table='special_dim'
                                         , sql_query=SqlQueries.special_dim_insert
                                         , redshift_conn_id=redshift_conn_id
                                         , dag=dag
                                         , task_id='Load_Special_Dimension_Table'
                                         )
city_dim_table = LoadDimensionOperator(table='city_dim'
Code Example #10
    table="staging_songs",
    json_option="auto",
    provide_context=True,
    dag=dag)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    redshift_conn_id="redshift",
    table="songplays",
    sql_query=SqlQueries.songplays_table_insert,
    dag=dag)

load_users_dimension_table = LoadDimensionOperator(
    task_id='Load_users_dim_table',
    redshift_conn_id="redshift",
    table="users",
    sql_query=SqlQueries.users_table_insert,
    mode='truncate',
    dag=dag)

load_songs_dimension_table = LoadDimensionOperator(
    task_id='Load_songs_dim_table',
    redshift_conn_id="redshift",
    table="songs",
    sql_query=SqlQueries.songs_table_insert,
    mode='truncate',
    dag=dag)

load_artists_dimension_table = LoadDimensionOperator(
    task_id='Load_artists_dim_table',
    redshift_conn_id="redshift",
Code Example #11
    s3_key='song_data',
    json_path_option='auto'    
)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    redshift_conn_id='redshift', 
    table='songplays',
    sql_query=SqlQueries.songplay_table_insert
)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    redshift_conn_id='redshift', 
    table='users',
    sql_query=SqlQueries.user_table_insert,
    insert_mode='truncate'
)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    redshift_conn_id='redshift', 
    table='songs',
    sql_query=SqlQueries.song_table_insert,
    insert_mode='truncate'
)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
Code Example #12
load_visitor_arrivals_mapped_staging_table = LoadFactOperator(
    task_id='Load_fact_visitor_arrivals_mapped_staging_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='staging_visitor_arrivals_mapped',
    select_query=SqlQueries.visitor_arrival_mapped_staging_table_insert,
    truncate_insert=True
)

staged_operator = DummyOperator(task_id='All_staged', dag=dag)

load_port_dimension_table = LoadDimensionOperator(
    task_id='Load_dim_port_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='dim_port',
    select_query=SqlQueries.port_table_insert,
    truncate_insert=True
)

load_us_city_dimension_table = LoadDimensionOperator(
    task_id='Load_dim_us_city_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='dim_us_city',
    select_query=SqlQueries.us_city_table_insert,
    truncate_insert=True
)

load_us_state_dimension_table = LoadDimensionOperator(
    task_id='Load_dim_us_state_table',
Code Example #13
        aws_credentials_id=AIRFLOW_AWS_CREDENTIALS_ID,
        target_table=target_songs_table,
        s3_bucket=S3_BUCKET,
        s3_key=S3_SONGS_KEY,
        default_args=default_args))

load_songplays_table_task = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    final_table=facts_songplays_table_name,
    dql_sql=SqlQueries.songplay_table_insert,
    dag=dag)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    final_table=dim_users_table_name,
    dql_sql=SqlQueries.user_table_insert,
    dag=dag)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    final_table=dim_songs_table_name,
    dql_sql=SqlQueries.song_table_insert,
    dag=dag)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id=AIRFLOW_REDSHIFT_CONN_ID,
    final_table=dim_artists_table_name,
    dql_sql=SqlQueries.artist_table_insert,
Code Example #14
    s3_key='song_data/A/A/A',
    table='staging_songs',
    copy_json_option='auto',
    dag=dag)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    redshift_conn_id='redshift',
    table='songplays',
    load_sql_stmt=SqlQueries.songplay_table_insert,
    dag=dag)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    redshift_conn_id='redshift',
    table='users',
    truncate_table=True,
    load_sql_stmt=SqlQueries.user_table_insert,
    dag=dag)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    redshift_conn_id='redshift',
    table='songs',
    truncate_table=True,
    load_sql_stmt=SqlQueries.song_table_insert,
    dag=dag)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id='redshift',
Code Example #15
    target_table=table_name_staging_game_match,
    s3_bucket=S3_BUCKET,
    s3_key=S3_TRANSFORMED_RAW_MATCH_DATA_KEY,
    dag=dag,
)
# load_summoner_dimension_table_task = LoadDimensionOperator(
#     task_id="Load_Summoner_Dimension_Table_Task",
#     redshift_conn_id=AWS_REDSHIFT_CONN_ID,
#     final_table="",
#     dql_sql=SqlDmls.summoner_table_insert,
#     dag=dag,
# )
load_champion_dimension_table_task = LoadDimensionOperator(
    task_id="Load_Champion_Dimension_Table_Task",
    redshift_conn_id=AWS_REDSHIFT_CONN_ID,
    final_table="",
    dql_sql=SqlDmls.champion_table_insert,
    # dag=dag,
)
load_item_dimension_table_task = LoadDimensionOperator(
    task_id="Load_Item_Dimension_Table_Task",
    redshift_conn_id=AWS_REDSHIFT_CONN_ID,
    final_table="",
    dql_sql=SqlDmls.item_table_insert,
    # dag=dag,
)
load_fact_match_table_task = LoadFactOperator(
    task_id="Load_Fact_Tables_Task",
    redshift_conn_id=AWS_REDSHIFT_CONN_ID,
    final_table=table_name_fact_game_match,
    dql_sql=SqlDmls.match_table_insert,
Code Example #16
load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    table='songplays',
    redshift_conn_id='redshift',
    fact_query=SqlQueries.songplay_table_insert,
    delect_or_append='append',  # either 'append' to append data or 'delete' to truncate the table before adding data
)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    table='users',
    redshift_conn_id='redshift',
    dimension_query=SqlQueries.user_table_insert,
    delect_or_append='delete',  # either 'append' to append data or 'delete' to truncate the table before adding data
)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    table='songs',
    redshift_conn_id='redshift',
    dimension_query=SqlQueries.song_table_insert,
    delect_or_append='delete'  # either 'append' to append data or 'delete' to truncate the table before adding data
)
Code Example #17
    insert_stmt=SqlQueries.final_immigration_table_insert)

run_quality_checks_final_immigration_data = DataQualityOperator(
    task_id="run_quality_checks_final_immigration_data",
    dag=dag,
    redshift_conn_id="redshift",
    dq_checks=[{
        'check_sql':
        "SELECT COUNT(*) FROM final_immigration WHERE cicid is null",
        'expected_result': 0
    }])

create_D_CITY_DEMO = LoadDimensionOperator(
    task_id="create_D_CITY_DEMO",
    dag=dag,
    redshift_conn_id="redshift",
    append_data=True,
    table="D_CITY_DEMO",
    create_stmt=SqlQueries.create_table_D_CITY_DEMO,
    insert_stmt=SqlQueries.D_CITY_DEMO_INSERT)

create_d_airport = LoadDimensionOperator(
    task_id="create_d_airport",
    dag=dag,
    redshift_conn_id="redshift",
    append_data=True,
    table="D_AIRPORT",
    create_stmt=SqlQueries.create_table_D_AIRPORT,
    insert_stmt=SqlQueries.D_AIRPORT_INSERT)

create_d_time = LoadDimensionOperator(
    task_id="create_d_time",
Code Example #18
File: etl.py Project: Sambeth/Airflow-Data-Pipeline
load_songplays_table = LoadFactOperator(
    task_id='load_songplays_fact_table',
    redshift_conn_id="redshift",
    table="songplays",
    data_source=SqlQueries.songplay_table_insert,
    dag=dag)

dim_tables_and_sources = [
    ("users", SqlQueries.user_table_insert),
    ("songs", SqlQueries.song_table_insert),
    ("artists", SqlQueries.artist_table_insert),
    ("time", SqlQueries.time_table_insert),
]

load_dimension_tables = LoadDimensionOperator(task_id='load_dim_tables',
                                              redshift_conn_id="redshift",
                                              tables=dim_tables_and_sources,
                                              dag=dag)

run_quality_checks = DataQualityOperator(task_id='run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         tables=dim_tables_and_sources,
                                         dag=dag)

end_operator = DummyOperator(task_id='stop_execution', dag=dag)

start_operator >> stage_events_to_redshift
start_operator >> stage_songs_to_redshift
stage_events_to_redshift >> load_songplays_table
stage_songs_to_redshift >> load_songplays_table
load_songplays_table >> load_dimension_tables
load_dimension_tables >> run_quality_checks
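Unlike the other examples, #18 drives a single LoadDimensionOperator with a list of (table, insert_sql) pairs, so its execute() presumably loops over them. A minimal sketch of what that variant might look like, with a hypothetical class name and the same illustrative assumptions as the sketch after Code Example #1:

# Illustrative only: one plausible shape for an operator accepting
# (table, insert_sql) pairs like dim_tables_and_sources above.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator


class MultiTableLoadDimensionOperator(BaseOperator):  # hypothetical name
    def __init__(self, redshift_conn_id, tables, *args, **kwargs):
        super(MultiTableLoadDimensionOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.tables = tables  # list of (table_name, insert_sql) pairs

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        for table, insert_sql in self.tables:
            self.log.info('Loading dimension table %s', table)
            redshift.run(f'INSERT INTO {table} {insert_sql}')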
Code Example #19
    table='staging_songs',
    data='s3://' + Variable.get('s3_bucket') + '/' + Variable.get('songdata'),
    region=Variable.get('region'),
    json_option='auto',
    redshift_conn_id='redshift_conn_id',
    aws_conn_id='aws_conn_id')

load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table',
                                        dag=dag,
                                        table='songplays',
                                        sql=SqlQueries.songplays_table_insert,
                                        redshift_conn_id='redshift_conn_id')

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    table='users',
    sql=SqlQueries.users_table_insert,
    redshift_conn_id='redshift_conn_id')

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    table='songs',
    sql=SqlQueries.songs_table_insert,
    redshift_conn_id='redshift_conn_id')

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    table='artists',
    sql=SqlQueries.artists_table_insert,
Code Example #20
    redshift_conn_id='redshift',
    s3_bucket='s3://udacity-dend/song_data',
    aws_credentials_id='aws_credentials',
    copy_options="'auto'",
    dag=dag)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    redshift_conn_id="redshift",
    destination_table="songplays",
    songplay_table_insert=SqlQueries.songplay_table_insert,
    dag=dag)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    redshift_conn_id="redshift",
    destination_table="users",
    sql_query=SqlQueries.user_table_insert,
    dag=dag)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    redshift_conn_id="redshift",
    destination_table="songs",
    sql_query=SqlQueries.song_table_insert,
    dag=dag)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    redshift_conn_id="redshift",
    destination_table="artists",
    sql_query=SqlQueries.artist_table_insert,
Code Example #21
    copy_json_option='auto',
    region="us-west-2",
    data_format="JSON")

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    redshift_conn_id='redshift',
    sql_query=SqlQueries.songplay_table_insert,
    table_name="songplays",
    append_only=False)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    sql_query=SqlQueries.user_table_insert,
    table_name="users",
    append_only=False)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    sql_query=SqlQueries.song_table_insert,
    table_name="songs",
    append_only=False)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
Code Example #22
    create_query=SqlQueries.create_staging_bikes)

wait_operator = DummyOperator(task_id='waiting_until_completion', dag=dag)

load_rides_facts_table = LoadFactOperator(
    redshift_conn_id="redshift",
    table="rides",
    create_query=SqlQueries.create_rides,
    insert_query=SqlQueries.rides_table_insert,
    task_id='Load_rides_facts_table',
    dag=dag)

load_stations_dimension_table = LoadDimensionOperator(
    redshift_conn_id="redshift",
    table="stations",
    create_query=SqlQueries.create_stations,
    insert_query=SqlQueries.stations_table_insert,
    task_id='Load_stations_dim_table',
    dag=dag)

load_weather_dimension_table = LoadDimensionOperator(
    redshift_conn_id="redshift",
    table="weather",
    create_query=SqlQueries.create_weather,
    insert_query=SqlQueries.weather_table_insert,
    task_id='Load_weather_dim_table',
    dag=dag)

run_quality_checks = DataQualityOperator(task_id='Run_data_quality_checks',
                                         redshift_conn_id="redshift",
                                         dag=dag)
Code Example #23
    s3_key="song_data",
    region="us-west-2",
    extra_params="json 'auto' compupdate off region 'us-west-2'",
    execution_date=start_date
)

load_songplays_table = LoadFactOperator(
    task_id=load_songplays_fact_table_task_id,
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    sql_source=SqlQueries.songplay_table_insert
)

load_user_dimension_table = LoadDimensionOperator(
    task_id=load_user_dimension_table_task_id,
    start_date=datetime(2018, 5, 1),
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="users",
    sql_source=SqlQueries.user_table_insert,
    dag=dag
)

load_song_dimension_table = LoadDimensionOperator(
    task_id=load_song_dimension_table_task_id,
    redshift_conn_id="redshift",
    table="songs",
    aws_credentials_id="aws_credentials",
    start_date=datetime(2018, 5, 1),
    sql_source=SqlQueries.song_table_insert,
Code Example #24
                                                          "songs_stage"
                                                      })

    load_songplays_table = LoadFactOperator(
        task_id='Load_songplays_fact_table',
        conn_id="redshift",
        sql=SqlQueries.songplay_table_insert,
        params={
            'table': 'songplays',
            'truncate': True
        })

    load_user_dimension_table = LoadDimensionOperator(
        task_id='Load_user_dim_table',
        conn_id="redshift",
        sql=SqlQueries.user_table_insert,
        params={
            'table': 'users',
            'truncate': True
        })

    load_song_dimension_table = LoadDimensionOperator(
        task_id='Load_song_dim_table',
        conn_id="redshift",
        sql=SqlQueries.song_table_insert,
        params={
            'table': 'songs',
            'truncate': True
        })

    load_artist_dimension_table = LoadDimensionOperator(
        task_id='Load_artist_dim_table',
Code Example #25
    task_id='Load_staging_songs_table',
    dag=dag,
    s3_bucket='udacity-dend',
    s3_prefix='song_data',
    table='staging_songs',
    copy_options="FORMAT AS JSON 'auto'")

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    table='songplays',
    select_sql=SqlQueries.insert_songplays_table)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_users_dim_table',
    dag=dag,
    table='users',
    select_sql=SqlQueries.insert_users_table,
    mode='truncate')

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_songs_dim_table',
    dag=dag,
    table='songs',
    select_sql=SqlQueries.insert_songs_table,
    mode='truncate')

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artists_dim_table',
    dag=dag,
    table='artists',
    select_sql=SqlQueries.insert_artists_table,
Code Example #26
    s3_key='song_data/A/A/A',
    file_format='CSV'
)    

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    redshift_conn_id='redshift',
    target_table = "songplays",
    sql=SqlQueries.songplay_table_insert
)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    target_table = "users",
    sql=SqlQueries.user_table_insert
)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    target_table = "songs",
    sql=SqlQueries.song_table_insert

)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
Code Example #27
    table='staging_songs',
    s3_bucket='udacity-dend',
    s3_key='song_data/A/A/A',
    json_path='auto',
    region="us-west-2")

load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table',
                                        dag=dag,
                                        redshift_conn_id='redshift',
                                        table='songplays',
                                        sql=SqlQueries.songplay_table_insert)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='users',
    sql=SqlQueries.user_table_insert,
    append=False)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    redshift_conn_id='redshift',
    table='songs',
    sql=SqlQueries.song_table_insert,
    append=False)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
Code Example #28
File: sparkify_dag.py Project: jomavera/dataPipeline
    sql_create=SqlQueries.staging_songs_table_create,
    sql_stage=SqlQueries.staging_songs_copy,
    json_path='auto')

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    table='songplays',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.songplay_table_create,
    sql_select=SqlQueries.songplay_table_insert)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    table='users',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.user_table_create,
    sql_select=SqlQueries.user_table_insert)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    table='songs',
    redshift_conn_id='redshift',
    sql_create=SqlQueries.song_table_create,
    sql_select=SqlQueries.song_table_insert)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
Code Example #29
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="ctsprojbucket/",
    s3_key="countries",
    fileformat="parquet",
    truncate_flag='Y')

Load_visitorsi94_fact = LoadFactOperator(
    task_id='Load_i94visitors_fact',
    dag=dag,
    redshift_conn_id="redshift",
    table_query=SqlQueries.visitors_fact_insert)

load_dates_dim = LoadDimensionOperator(task_id='Load_dates_dim',
                                       dag=dag,
                                       redshift_conn_id="redshift",
                                       table_query=SqlQueries.dates_dim_insert,
                                       table="dates_dim",
                                       truncate_flag='Y')

run_quality_check = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id="redshift",
    check_query="select count(1) from public.i94visitors_fact where reasonforvisit is null",
    expected_count=10000)

run_intg_check = IntegrityCheckOperator(
    task_id='Run_data_integrity_check',
    dag=dag,
    redshift_conn_id="redshift",
Code Example #30
    iam_role=Variable.get('iam_role'),
    json_format='auto',
    table_name='staging_songs'
)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    insert_query=SqlQueries.songplays_table_insert,
    table_name='songplays',
    redshift_conn_id='redshift'
)

load_user_dimension_table = LoadDimensionOperator(
    task_id='Load_user_dim_table',
    dag=dag,
    insert_query=SqlQueries.users_table_insert,
    table_name='users',
)

load_song_dimension_table = LoadDimensionOperator(
    task_id='Load_song_dim_table',
    dag=dag,
    insert_query=SqlQueries.songs_table_insert,
    table_name='songs',
)

load_artist_dimension_table = LoadDimensionOperator(
    task_id='Load_artist_dim_table',
    dag=dag,
    insert_query=SqlQueries.artists_table_insert,
    table_name='artists',