    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1
        )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = CreateTablesOperator(
    task_id='create_tables',
    dag=dag,
    redshift_conn_id='redshift'
)

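# Stage the JSON log data from s3://udacity-dend/log_data into the staging_events table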
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    aws_credentials_id='aws_credentials',
    redshift_conn_id='redshift',
    table_name='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    region='us-west-2',
    json_path='s3://udacity-dend/log_json_path.json',
    provide_context=True
)
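
# The excerpts on this page stop before the task dependencies are declared.
# A minimal sketch of how a DAG like the one above is usually wired
# (an assumption for illustration, not the original author's code):
start_operator >> create_tables >> stage_events_to_redshift
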
Example #2
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup':False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_tables = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    conn_id="redshift",
    create_query_list=create_table_queries_list
)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region="us-west-2",
    file_format='JSON',
    optional_path='s3://udacity-dend/log_json_path.json'
)
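
# For context, a hedged sketch of the COPY statement an operator such as
# StageToRedshiftOperator typically renders at run time; the template and
# placeholder names below are assumptions, not the repo's actual code.
COPY_SQL_TEMPLATE = """
    COPY {table}
    FROM 's3://{bucket}/{key}'
    ACCESS_KEY_ID '{access_key}'
    SECRET_ACCESS_KEY '{secret_key}'
    REGION '{region}'
    FORMAT AS JSON '{json_path}'
"""
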
Example #3
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
    'email_on_retry': False
}

dag_name = 'sparkify_dend_dag'
dag = DAG(dag_name,
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=3)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_redshift_tables = CreateTablesOperator(task_id='Create_tables',
                                              dag=dag,
                                              redshift_conn_id="redshift")

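# Note: execution_date is passed in explicitly from the DAG's start_date here,
# rather than being taken from the templated run context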
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    table="events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region="us-west-2",
    file_format="JSON",
    execution_date=start_date)
Example #4
}

dag = DAG('udac_sparkify_dag_1',
          default_args=default_args,
          description='Load and transform data from S3 to Redshift with Airflow',
          schedule_interval='@hourly',
#           schedule_interval='@monthly',
          max_active_runs=1
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

create_staging_event_table = CreateTablesOperator(
    task_id='Create_staging_event_table',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    sql=CreateTableStaments.CREATE_STAGING_EVENTS_TABLE_SQL
)

create_staging_song_table = CreateTablesOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    sql=CreateTableStaments.CREATE_STAGING_SONGS_TABLE_SQL
)

create_fact_songplays_table = CreateTablesOperator(
    task_id='Create_fact_songplays_table',
    dag=dag,
Example #5
    'start_date': datetime(2019, 7, 27),
    'depends_on_past': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          catchup=False)

start_operator = CreateTablesOperator(
    task_id='Begin_execution',
    dag=dag,
    redshift_conn_id='redshift',
    sql_file='/home/workspace/airflow/create_tables.sql')

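# Build the location of the JSONPaths file that tells COPY how to map the
# raw log fields onto the staging_events columns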
json_path = "s3://{}/{}".format('udacity-dend', 'log_json_path.json')
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend',
    s3_key='log_data/2018/11/',
    copy_extra="FORMAT AS JSON '{}' REGION 'us-west-2'".format(json_path))

stage_songs_to_redshift = StageToRedshiftOperator(
Example #6
    'retry_delay': timedelta(minutes=5),
    'catchup': False
}

dag = DAG('udac_capstone_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@monthly',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_staging_table = CreateTablesOperator(
    task_id='Create_staging_table',
    dag=dag,
    provide_context=True,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    queries=SqlQueries.create_staging_table_queries)

create_target_table = CreateTablesOperator(
    task_id='Create_target_table',
    dag=dag,
    provide_context=True,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    queries=SqlQueries.create_target_table_queries)

consolidate_operator_1 = DummyOperator(task_id='Consolidate_execution_1',
                                       dag=dag)
Example #7
    'email_on_retry': False
}

dag = DAG(
    'movie_recommendation_dag',
    default_args=default_args,
    description='Load and transform Movie Recommendation Data from S3 to Redshift with Airflow',
    schedule_interval="@monthly")

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

create_tables_task = CreateTablesOperator(task_id="Create_tables",
                                          dag=dag,
                                          redshift_conn_id="redshift")

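# Load the Parquet ratings output from the spark-out-data bucket into staging_ratings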
stage_ratings_task = StageToRedshiftOperator(task_id="Stage_ratings",
                                             dag=dag,
                                             table="staging_ratings",
                                             redshift_conn_id="redshift",
                                             aws_credentials="aws_credentials",
                                             s3_bucket="spark-out-data",
                                             s3_key="ratings",
                                             data_format="PARQUET")

stage_movies_task = StageToRedshiftOperator(task_id="Stage_movies",
                                            dag=dag,
                                            table="staging_movies",
                                            redshift_conn_id="redshift",
Example #8
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False,
    'catchup': False
}

dag = DAG('udacity-pipeline',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create the songplays fact table
create_fact_tables = CreateTablesOperator(
    task_id='Create_fact_tables',
    tables=['songplays'],
    redshift_conn_id='redshift',
    sql_queries=[SqlQueries.songplay_table_create],
    dag=dag)

# Create the dimension tables: artists, songs, time, and users
create_dimension_tables = CreateTablesOperator(
    task_id='Create_dimension_tables',
    tables=['artists', 'songs', 'time', 'users'],
    redshift_conn_id='redshift',
    sql_queries=[
        SqlQueries.artist_table_create, SqlQueries.song_table_create,
        SqlQueries.time_table_create, SqlQueries.user_table_create
    ],
    dag=dag)

# Create the two staging tables: staging events and staging songs
Example #9
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
        )

start_operator = DummyOperator(task_id='Begin_execution',  dag=dag)

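# Create every staging, fact, and dimension table; the dict maps each table
# name to its CREATE statement in SqlQueries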
create_tables = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    tables={"staging_events":SqlQueries.create_staging_events,
            "staging_songs":SqlQueries.create_staging_songs,
            "songplays":SqlQueries.create_songplays_table,
            "users":SqlQueries.create_users_table,
            "artists":SqlQueries.create_artist_table,
            "songs":SqlQueries.create_songs_table,
            "time":SqlQueries.create_time_table}
)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket=s3_bucket,
    s3_key=events_key,
Example #10
dag = DAG(
    'capstone_pipeline',
    default_args=default_arguments,
    description='Use Airflow to load New York car crash and weather data from S3 to Redshift, perform data quality checks and SQL queries',
    max_active_runs=3)
'''
Specify dummy operator at the beginning of the DAG
'''

start_operator = DummyOperator(task_id='begin_execution', dag=dag)
'''
Specify CreateTablesOperator to create tables on Redshift
'''

create_tables_in_redshift = CreateTablesOperator(
    redshift_conn_id='redshift', task_id='create_tables_in_redshift', dag=dag)
'''
Specify StageToRedshiftOperator to stage events and song data to Redshift
'''

bucket = 'mh-udacity-dend'
region = 'eu-central-1'

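# Stage the JSON crash data from S3 into the staging_crashes table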
stage_crash_data_to_redshift = StageToRedshiftOperator(
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_crashes',
    s3_bucket=bucket,
    s3_key='capstone_project/crash_data',
    region=region,
    file_format='JSON',
Example #11
    'depends_on_past': False,
    'retry_delay': timedelta(minutes=5),
    'retries': 3,
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_redshift = CreateTablesOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id="redshift",
    file_path="/home/workspace/airflow/create_tables.sql")
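
# A minimal sketch of what a file-based CreateTablesOperator like the one
# above might look like; this is an illustrative assumption (class and
# parameter names included), not the operator from the original repository.
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class FileBasedCreateTablesOperator(BaseOperator):
    """Run every semicolon-separated statement found in a SQL file."""

    @apply_defaults
    def __init__(self, redshift_conn_id="redshift", file_path="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.file_path = file_path

    def execute(self, context):
        # Running one statement at a time keeps failures easy to pinpoint
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        with open(self.file_path) as sql_file:
            statements = [s.strip() for s in sql_file.read().split(";") if s.strip()]
        for statement in statements:
            redshift.run(statement)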

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    table="staging_events",
    aws_credentials_id="aws_credentials",
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    log_json_file=log_json_file,
    provide_context=True)

stage_songs_to_redshift = StageToRedshiftOperator(
Example #12
# this ordering ensures that the `songplays` table is created after all other tables
initialize_tables = [
    SqlQueries.create_staging_events,
    SqlQueries.create_staging_songs,
    SqlQueries.create_table_users,
    SqlQueries.create_table_time,
    SqlQueries.create_table_artists,
    SqlQueries.create_table_songs,
    SqlQueries.create_table_songplays,
]

create_tables = CreateTablesOperator(
    task_id='create_tables',
    dag=dag,
    redshift_conn_id="redshift",
    queries_to_run=initialize_tables,  # list of CREATE TABLE statements to run
    table_names=[
        "staging_events", "staging_songs", "users", "time", "artists", "songs",
        "songplays"
    ])

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="public.staging_events",
    #create_sql_stmt=SqlQueries.create_staging_events,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region="us-west-2",
Example #13
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          max_active_runs=1)
#schedule_interval = "@hourly"
#schedule_interval = '0 0 * * *')

start_operator = DummyOperator(task_id='START_OPERATOR', dag=dag)

## Creating STG (staging) table tasks

create_listings_staging_table = CreateTablesOperator(
    task_id='Create_Listings_STG_Table',
    dag=dag,
    query=SqlQueries.create_staging_listings,
    table="STG_LISTINGS",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials")
create_listings_staging_table.set_upstream(start_operator)

create_calendars_staging_table = CreateTablesOperator(
    task_id='Create_Calendars_STG_Table',
    dag=dag,
    query=SqlQueries.create_staging_calendars,
    table="STG_CALENDARS",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials")
create_calendars_staging_table.set_upstream(start_operator)

create_reviews_staging_table = CreateTablesOperator(
                                           target_table="covidcases")

drop_table_masternode = DropTablesOperator(task_id='drop_table_masternode',
                                           dag=dag,
                                           target_table="masternode")

drop_table_hospital = DropTablesOperator(task_id='drop_table_hospital',
                                         dag=dag,
                                         target_table="hospital")

# create_tables
create_tables_operator = DummyOperator(task_id='create_tables', dag=dag)

create_tables_covidcases_stage = CreateTablesOperator(
    task_id='create_tables_covidcases_stage',
    dag=dag,
    sql=SqlQueries.table_create_covidcases_stage,
    table='table_create_covidcases_stage')

create_tables_masternode_stage = CreateTablesOperator(
    task_id='create_tables_masternode_stage',
    dag=dag,
    sql=SqlQueries.table_create_masternode_stage,
    table='table_create_masternode_stage')

create_tables_hospital_stage = CreateTablesOperator(
    task_id='create_tables_hospital_stage',
    dag=dag,
    sql=SqlQueries.table_create_hospital_stage,
    table='table_create_hospital_stage')