# =============================================================================
# Task definitions
# =============================================================================
dag = DAG('sparkify_ELT',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    s3_key="log_data",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    json_path="s3://udacity-dend/log_json_path.json",
    region="us-west-2",
    overwrite=True)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    s3_key="song_data",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_songs",
    s3_bucket="udacity-dend",
    json_path="auto",

# =============================================================================
# Example 2
# =============================================================================
          description='Inserts data into the fact and dimension tables in Redshift',
          schedule_interval='@hourly',
          default_args=default_args)

# Operators

start_operator = DummyOperator(
    task_id='start_insert_tables',
    dag=dag)

stage_events = StageToRedshiftOperator(
    task_id='staging_events',
    dag=dag,
    create_table_sql=create_tables.staging_events,
    s3_bucket='udacity-dend',
    s3_key='log_data',
    schema='public',
    table='staging_events',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    copy_options=["JSON 'auto ignorecase'"])

stage_songs = StageToRedshiftOperator(
    task_id='staging_songs',
    dag=dag,
    create_table_sql=create_tables.staging_songs,
    s3_bucket='udacity-dend',
    s3_key='song_data',
    schema='public',
    table='staging_songs',
    redshift_conn_id='redshift')

dag = DAG('udaci_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@daily')

start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id='redshift',
                                  sql="create_tables.sql")

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    json_path="s3://udacity-dend/log_json_path.json",
    file_type='json',
    redshift_conn_id='redshift',
    aws_conn_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    json_path="auto",
    file_type='json',
    redshift_conn_id='redshift',
    aws_conn_id="aws_credentials",
    s3_bucket="udacity-dend")

# =============================================================================
# Example 4
# =============================================================================
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.drop_staging_tables)

pres_staging_table_create = PostgresOperator(
    task_id="pres_staging_table_create",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.pres_staging_table_create)

pres_staging_table_populate = StageToRedshiftOperator(
    task_id="pres_staging_table_populate",
    dag=dag,
    provide_context=True,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="pres_staging_table",
    s3_bucket="prescribing-data",
    s3_key=
    "{{ execution_date.year }}_{{ ds[5:7] }}/T{{ execution_date.year }}{{ ds[5:7] }}PDPI_BNFT",
    header=True)

pres_fact_table_insert = PostgresOperator(
    task_id="pres_fact_table_insert",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.pres_fact_table_insert)

run_quality_checks = DataQualityOperator(
    task_id='Run_data_quality_checks',
    dag=dag)

#dag = DAG('udac_example_dag',
#default_args=default_args,
#description='Load and transform data in Redshift with Airflow',
#schedule_interval='0 * * * *'
#)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table_name='staging_events',
    redshift_conn_id='redshift',
    s3_bucket='udacity-dend',
    s3_key='log_data/2018/11/{ds}-events.json',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    },
    region='us-west-2',
    provide_context=True)

stage_songs_to_redshift = StageToRedshiftOperator(task_id='Stage_songs',
                                                  dag=dag,
                                                  table_name='staging_songs',
                                                  redshift_conn_id='redshift',
                                                  s3_bucket='udacity-dend',
                                                  s3_key='song_data/',
                                                  aws_credentials={
                                                      'key': AWS_KEY,
                                                      'secret': AWS_SECRET
                                                  })


def stage_dim_s3_to_redshift(
    parent_dag_name,
    child_dag_name,
    start_date,
    end_date,
    schedule_interval,
    redshift_conn_id,
    s3_data,
    create_sql,
    table,
    s3_bucket,
    s3_key,
    iam_role,
    region,
    file_format,
    *args, **kwargs):

    """
    Subdag used to create dimension table, copy data from s3 to Redshift dimension table and lastly perform a data quality check.

    Keyword Arguments:
    parent_dag_name -- Parent DAG name defined in `main_dag.py` dag object
    child_dag_name -- Child DAG name used to define subdag ID
    redshift_conn_id   -- Redshift connection ID (str)
    aws_credentials_id -- AWS connection ID (str)
    table -- Staging table name (str)
    create_sql -- Create staging table query (str)
    s3_bucket -- AWS S3 bucket name (str)
    s3_key -- AWS S3 bucket data directory/file (str)
    region -- Redshift cluster configured region (str)
    file_format -- File format for AWS S3 files  (currently only: 'JSON' or 'CSV') (str)
    """

    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    start_task = DummyOperator(task_id=f'{table}',  dag=dag)
    
    create_task = CreatedTableOperator(
        task_id=f'create_{table}_table',
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql.format(table),
        table=table,
        provide_context=True
    )

    copy_task = StageToRedshiftOperator(
        task_id=f'staging_{table}_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        iam_role=iam_role,
        s3_data=s3_data, 
        region=region,
        file_format=file_format,
        provide_context=True
    )

    check_task = DataQualityOperator(
        task_id=f'data_quality_check_{table}',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
        provide_context=True
    )

    start_task >> create_task
    create_task >> copy_task
    copy_task >> check_task

    return dag
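
# =============================================================================
# Illustrative usage sketch (not part of the original project): wiring the
# stage_dim_s3_to_redshift factory above into a parent DAG through
# SubDagOperator. All names, dates, paths, and SQL below are placeholder
# assumptions.
# =============================================================================
import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

PARENT_DAG_NAME = 'main_dag'
CHILD_DAG_NAME = 'stage_users_dim'
START_DATE = datetime.datetime(2019, 1, 1)

main_dag = DAG(PARENT_DAG_NAME,
               start_date=START_DATE,
               schedule_interval='@monthly')

# The SubDagOperator's task_id must match the child DAG name so that the
# subdag's dag_id resolves to "<parent>.<child>".
stage_users_dim = SubDagOperator(
    task_id=CHILD_DAG_NAME,
    dag=main_dag,
    subdag=stage_dim_s3_to_redshift(
        parent_dag_name=PARENT_DAG_NAME,
        child_dag_name=CHILD_DAG_NAME,
        start_date=START_DATE,
        end_date=None,
        schedule_interval='@monthly',
        redshift_conn_id='redshift',
        s3_data='users',
        create_sql='CREATE TABLE IF NOT EXISTS {} (user_id INT)',
        table='users',
        s3_bucket='my-bucket',
        s3_key='users/',
        iam_role='arn:aws:iam::123456789012:role/redshift_s3_read',
        region='us-west-2',
        file_format='CSV'))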

# =============================================================================
# Example 7
# =============================================================================
start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_redshift_tables = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id ="redshift"
)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context = True,
    table = "events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket = "udacity-dend",
    s3_key = "log_data",
    region="us-west-2",
    file_format="JSON",
    execution_date=start_date
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    provide_context=True,
    table="songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend")

    'catchup_by_default': False,
    'email_on_retry': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@once',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_categories_to_redshift = StageToRedshiftOperator(
    task_id='stage_categories',
    redshift_conn_id='redshift',
    aws_conn_id='aws_credentials',
    table='staging_category',
    s3_bucket='podcast-project',
    s3_key='categories.csv',
    dag=dag)

stage_podcast_to_redshift = StageToRedshiftOperator(
    task_id='stage_podcast',
    redshift_conn_id='redshift',
    table='staging_podcast',
    s3_bucket='podcast-project',
    s3_key='podcast.csv',
    dag=dag)

stage_reviews_to_redshift = StageToRedshiftOperator(
    task_id='stage_reviews',
    redshift_conn_id='redshift',
drop_all_tables = PostgresOperator(task_id="drop_all_tables",
                                   dag=dag,
                                   postgres_conn_id="my_redshift_conn",
                                   sql=sql_queries_cloud.drop_all_tables)

create_all_tables_if_not_exist = PostgresOperator(
    task_id="create_all_tables_if_not_exist",
    dag=dag,
    postgres_conn_id="my_redshift_conn",
    sql=sql_queries_cloud.create_all_tables)

pres_staging_table_populate = StageToRedshiftOperator(
    task_id="pres_staging_table_populate",
    dag=dag,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="pres_staging_table",
    s3_bucket="prescribing-data",
    s3_key="2019_12/T201912PDPI_BNFT",
    header=True)

gp_prac_staging_table_populate = StageToRedshiftOperator(
    task_id="gp_prac_staging_table_populate",
    dag=dag,
    redshift_conn_id="my_redshift_conn",
    aws_credentials_id="my_aws_conn",
    table="gp_pracs_staging_table",
    s3_bucket="prescribing-data",
    s3_key="2019_11/T201911ADDR_BNFT",
    header=False)

# =============================================================================
# Example 10
# =============================================================================
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
          #schedule_interval='@once'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    target_table="staging_events",
    sql_table_create=SqlQueries.staging_events_table_create,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    json_file="s3://udacity-dend/log_json_path.json",
    region="us-west-2")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    target_table="staging_songs",
    sql_table_create=SqlQueries.staging_songs_table_create,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data",
Esempio n. 11
0
}

dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data from S3 to Redshift with Airflow',
    schedule_interval='@hourly',
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    aws_credentials_id="aws_credentials",
    iam_role="Redshift_Read_S3",
    redshift_conn_id="redshift",
    s3_json_structure_path="s3://udacity-redshift/log_paths.json",
    s3_data_path="s3://udacity-dend/log_data",
    table='staging_logs')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    aws_credentials_id="aws_credentials",
    iam_role="Redshift_Read_S3",
    redshift_conn_id="redshift",
    s3_json_structure_path="s3://udacity-redshift/song_paths.json",
    s3_data_path="s3://udacity-dend/song_data",
    table='staging_songs')

# =============================================================================
# Example 12
# =============================================================================
create_staging_events_table = PostgresOperator(
    task_id="create_staging_events_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=create_tables.CREATE_staging_events_TABLE_SQL)

create_staging_songs_table = PostgresOperator(
    task_id="create_staging_songs_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=create_tables.CREATE_staging_songs_TABLE_SQL)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data/2018/11/2018-11-01-events.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data/A/B/C/TRABCEI128F424C983.json")

load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table',
                                        dag=dag)

# =============================================================================
# Example 13
# =============================================================================
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          schedule_interval='0 * * * *',
          description='Load and transform data in Redshift with Airflow')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='public.staging_events',
    s3_bucket='udacity-dend',
    s3_key="log_data",
    redshift_conn_id='redshift',
    aws_credentials_id="aws_credentials",
    json="s3://udacity-dend/log_json_path.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table='public.songs',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    json='s3://udacity-dend/song_data',
    s3_bucket='udacity-dend',
    s3_key='song_data')
dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval=None  #'0 * * * *'
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    s3_key="log_data/",
    #"log_data/{execution_date.year}/{execution_date.month}/{ds}-events.json",
    #"log_data/2018/11/2018-11-12-events.json",
    aws_region="us-west-2",
    json="s3://udacity-dend/log_json_path.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_songs",
    s3_bucket="udacity-dend",
    s3_key="song_data/",
    aws_region="us-west-2")

dag = DAG('udacity_music_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly',
         )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

#Stage_events
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_events',
    s3_src_bucket='udacity-dend',
    s3_src_pattern='log_data',
    jsonpaths='s3://udacity-dend/log_json_path.json',
    data_format='json'
)
#Stage_song
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    provide_context=True,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_song',
    s3_src_bucket='udacity-dend',
    s3_src_pattern='song_data/A/A/A')

# =============================================================================
# Example 16
# =============================================================================
#Dag Initialization
dag = DAG('airflow_de_project',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

# Calling StageToRedShift Custom Operator to load data into staging events table
stage_log_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    redshift_connection_id='redshift',
    table_name='staging_events',
    aws_credential_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='log-data/{execution_date.year}/{execution_date.month}',
    json_path="s3://udacity-dend/log_json_path.json",
    dag=dag
)

# Calling StageToRedShift Custom Operator to load data into songs staging table
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    redshift_connection_id='redshift',
    table_name='staging_songs',
    aws_credential_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='song-data/A/A',
    json_path="auto",
    dag=dag
)

# =============================================================================
# Example 17
# =============================================================================
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 sql='create_tables.sql',
                                 postgres_conn_id='redshift')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='log_data/',
    aws_region='us-west-2',
    jsonpath='log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table='staging_songs',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='song_data',
    aws_region='us-west-2')
}

dag = DAG('load_songs_and_events',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          max_active_runs=1,
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key=
    "log-data/{execution_date.year}/{execution_date.month}/{execution_date.year}-{execution_date.month}-{execution_date.day}-events.json",
    format_json=Variable.get('json_event_format',
                             default_var=default_json_event_format))

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="song_data/",
    format_json=Variable.get('json_song_format'))

# =============================================================================
# Example 19
# =============================================================================
    'kpi_item': 'utilisateur',
    'S3_key': 'Utilisateur'
}, {
    'kpi_item': 'enseigne',
    'S3_key': 'enseignes'
}, {
    'kpi_item': 'magasin',
    'S3_key': 'magasin'
}]

for kpi_item in kpi_items:
    stage_to_redshift = StageToRedshiftOperator(
        task_id=f"stage_{kpi_item['kpi_item']}_to_redshift",
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table=f"staging_{kpi_item['kpi_item']}",
        S3_bucket="darties",
        S3_key=kpi_item['S3_key'],
        delimiter=",",
        formatting="JSON 'auto'")
    stage_to_redshifts.append(stage_to_redshift)

### Build and load dimensions
milestone_1 = DummyOperator(task_id='milestone_1', dag=dag)

build_dimension_tables = []
dimension_items = ["temps", "famille_produit"]

for dimension_item in dimension_items:
    build_dimension_table = BuildDimensionOperator(
        task_id=f"build_{dimension_item}_dimension_table",
Esempio n. 20
0
    'sparkify-etl-dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    catchup=False,
    max_active_runs=1,
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    json_path='s3://udacity-dend/log_json_path.json',
    sql=SqlQueries.staging_table_copy,
    provide_context=True,
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn='redshift',
    aws_credentials='aws_credentials',
    table='staging_songs',
    s3_bucket='udacity-dend',
    s3_key='song_data',
    json_path='auto',
)

stage_immigration_to_redshift = StageParquetToRedshiftOperator(
    task_id='Stage_immigration',
    dag=dag,
    table='staging_immigration',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='dend-bucket-oregon-123',
    s3_key=
    'capstone_immigration/immigration_parquet',  # s3 does not support wildcard such as *
    iam_role_arn=AWS_IAM_ROLE_ARN)

stage_states_to_redshift = StageToRedshiftOperator(
    task_id='Stage_states',
    dag=dag,
    table='states',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='dend-bucket-oregon-123',
    s3_key='capstone_immigration/states')

stage_airport_code_to_redshift = StageToRedshiftOperator(
    task_id='Stage_airport_code',
    dag=dag,
    table='airport_code',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='dend-bucket-oregon-123',
    s3_key='capstone_immigration/airport_code')

stage_countries_to_redshift = StageToRedshiftOperator(
    task_id='Stage_countries',
    dag=dag)

dag = DAG('capstone_main_dag',
          default_args=default_args,
          start_date=datetime.datetime.now() - datetime.timedelta(days=1),
          description='Load and transform data in Redshift with Airflow',
          schedule_interval=None)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_immigr_to_redshift = StageToRedshiftOperator(
    task_id='Stage_immigr',
    dag=dag,
    table_name='staging_immigr',
    redshift_conn_id='redshift',
    s3_bucket='capstone-bucket-immigr',
    s3_key='staging_immigr.csv',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    },
    region='us-east-1',
    provide_context=True)

stage_demo_to_redshift = StageToRedshiftOperator(
    task_id='Stage_demo',
    dag=dag,
    table_name='staging_demo',
    redshift_conn_id='redshift',
    s3_bucket='capstone-bucket-demo',
    s3_key='demo',
    aws_credentials={
        'key': AWS_KEY,
        'secret': AWS_SECRET
    })

# =============================================================================
# Example 23
# =============================================================================
    'catchup': False
}

dag = DAG('dag_s3_to_redshift',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    target_table_name="staging_events",
    s3_data_path="s3://udacity-dend/log_data",
    json_schema="s3://udacity-dend/log_json_path.json",
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    target_table_name="staging_songs",
    s3_data_path="s3://udacity-dend/song_data",
    json_schema="auto",
)

# =============================================================================
# Example 24
# =============================================================================
                                       'data/transformed_citibike_data.csv',
                                       'bucket_name': 'ud-covid-citibike',
                                       'key': 'citibike'
                                   },
                                   dag=dag)

table_creation = PostgresOperator(task_id='tables_creation',
                                  dag=dag,
                                  postgres_conn_id='redshift',
                                  sql='/create_tables.sql')

s3_dates_to_redshift = StageToRedshiftOperator(
    task_id='s3_dates_to_redshift',
    dag=dag,
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket='ud-covid-citibike',
    s3_key='dates',
    table="dates",
    file_format='CSV')

s3_bike_to_redshift = StageToRedshiftOperator(
    task_id='s3_bike_to_redshift',
    dag=dag,
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket='ud-covid-citibike',
    s3_key='citibike',
    table="bike",
    file_format='CSV')

# =============================================================================
# Example 25
# =============================================================================
                      0)  # end time for debugging so run dag 10 times
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create tables in Redshift to store S3 data
create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 postgres_conn_id='redshift',
                                 sql='create_tables.sql')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key=
    'log_data',  # only load events for execution year, for full 'log_data/{{ execution_date.year }}', s3 does not support wildcard such as *
    json_format='s3://udacity-dend/log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table='staging_songs',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key=
    'song_data',  # load a small portion of song data with 'song_data/A/A/A'
    json_path='auto')

# =============================================================================
# Example 26
# =============================================================================
def stage_fact_s3_to_redshift(
    parent_dag_name,
    child_dag_name,
    start_date,
    end_date,
    schedule_interval,
    redshift_conn_id,
    degree_list,
    s3_data,
    create_sql,
    s3_bucket,
    s3_key,
    iam_role,
    region,
    file_format,
    extra_copy_parameters='',
    *args, **kwargs):

    """
    Subdag used to create staging table, copy data from s3 to staging table in redshift and lastly perform a data quality check.

    Keyword Arguments:
    parent_dag_name -- Parent DAG name defined in `main_dag.py` dag object
    child_dag_name -- Child DAG name used to define subdag ID
    start_date -- DAG start date
    end_date -- DAG end date
    schedule_interval -- (e.g. '@monthly', '@weekly', etc.)
    redshift_conn_id   -- Redshift connection ID (str)
    degree_list -- List of degree names (list)
    aws_credentials_id -- AWS connection ID (str)
    s3_bucket -- AWS S3 bucket name (str)
    s3_date -- S3 data name used to format staging table name
    create_sql -- SQL used to create staging table 
    s3_key -- AWS S3 bucket data directory/file (str)
    region -- Redshift cluster configured region (str)
    file_format -- File format for AWS S3 files  (currently only: 'JSON' or 'CSV') (str)
    """

    dag = DAG(
        dag_id=f"{parent_dag_name}.{child_dag_name}",
        start_date=start_date,
        end_date=end_date,
        schedule_interval=schedule_interval,
        **kwargs
    )

    for degree in degree_list:
        table = f'{degree}_{s3_data}'
        error_table = f'{table}_errors'

        start_task = DummyOperator(task_id=f'{degree}',  dag=dag)

        create_task = CreatedTableOperator(
            task_id=f'create_{table}_table',
            redshift_conn_id=redshift_conn_id,
            create_sql=create_sql.format(table),
            table=table,
            provide_context=True
        )

        copy_task = StageToRedshiftOperator(
            task_id=f'staging_{table}_table',
            dag=dag,
            table=table,
            redshift_conn_id=redshift_conn_id,
            s3_bucket=s3_bucket,
            s3_key=s3_key,
            iam_role=iam_role,
            s3_data=s3_data, 
            degree=degree,
            region=region,
            file_format=file_format,
            extra_copy_parameters=extra_copy_parameters,
            provide_context=True
            )

        #push count to xcom for stl count comparison
        count_check_task = DataQualityOperator(
            task_id=f'data_quality_check_{table}',
            dag=dag,
            redshift_conn_id=redshift_conn_id,
            table=table,
            provide_context=True
        )

        check_stl_branch = STLCheckOperator(
            task_id=f'stl_check_{table}',
            table=table,
            error_table=error_table,
            redshift_conn_id=redshift_conn_id
        )

        staging_success_task = PythonOperator(
            task_id=f'staging_success_check_{table}',
            python_callable=staging_success_check,
            op_kwargs={'redshift_conn_id': redshift_conn_id, 'table': table, 'error_table': error_table},
            dag=dag,
            provide_context=True
        )

        start_task >> create_task
        create_task >> copy_task
        copy_task >> [check_stl_branch, count_check_task]
        check_stl_branch >> staging_success_task

    return dag
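
# =============================================================================
# Illustrative sketch (not the project's code): the staging_success_check
# callable referenced by the PythonOperator above is not part of this excerpt.
# Assuming it compares staged rows against the recorded load errors, it could
# look roughly like this.
# =============================================================================
from airflow.hooks.postgres_hook import PostgresHook


def staging_success_check(redshift_conn_id, table, error_table, **context):
    """Fail the task if any COPY errors were recorded for the staging table."""
    redshift = PostgresHook(postgres_conn_id=redshift_conn_id)
    errors = redshift.get_first("SELECT COUNT(*) FROM {}".format(error_table))[0]
    rows = redshift.get_first("SELECT COUNT(*) FROM {}".format(table))[0]
    if errors > 0:
        raise ValueError("{} load errors recorded for {}".format(errors, table))
    print("Staging succeeded: {} rows loaded into {}".format(rows, table))
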
#Instantiate DAG
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

#1. Dummy Task - no functionality
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

#2. Copy log files to staging table in Redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region='us-west-2',
    json_path="s3://udacity-dend/log_json_path.json")

#3. Copy song files to staging table in Redshift
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_songs',
    s3_bucket="udacity-dend",
    s3_key="song_data",
    region='us-west-2')

dag = DAG('udac_sparkify_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval="@daily",
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=Variable.get('s3_bucket'),
    s3_key=Variable.get('s3_key_log_data'),
    log_json_path=Variable.get('s3_key_log_data_json_path'),
    depends_on_past=False,
    retries=3,
    retry_delay=timedelta(minutes=5),
    email_on_retry=False)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
    table="staging_songs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=Variable.get('s3_bucket'),
    s3_key=Variable.get('s3_key_song_data'))


"""
connecting to S3
connecting to redshift
running the StageToRedshiftOperator operator
"""
stage_events_to_redshift = StageToRedshiftOperator(
    task_id="stage_events_to_redshift",
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    json="s3://udacity-dend/log_json_path.json"
)

"""
connecting to S3
connecting to redshift
running the StageToRedshiftOperator operator
"""
stage_songs_to_redshift = StageToRedshiftOperator(
    task_id="stage_songs_to_redshift",
    dag=dag)

# =============================================================================
# Example 30
# =============================================================================
        )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_redshift_tables = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id="redshift"
)
logging.info('Starting staging to redshift')
stage_covid_to_redshift = StageToRedshiftOperator(
    task_id='Stage_covid',
    dag=dag,
    table="staging_covid",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_s3_connection",
    s3_bucket="udacity-data-lake",
    s3_key="covid19/staging",
    region="us-west-2",
    extra_params="delimiter ';'"
)

load_covid_cases_fact_table = LoadFactOperator(
    task_id='Load_covid_cases_fact_table',
    dag=dag,
    table='fact_covid_cases',
    redshift_conn_id="redshift",
    load_sql_stmt=SqlQueries.covid_cases_insert
)

load_dim_location_table = LoadDimensionOperator(