def create_and_load_table_dag(parent_dag_name, task_id, redshift_conn_id,
                              create_sql, insert_sql, table, truncate,
                              *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_users_table = CreateTableOperator(
        task_id=f'create_{table}_table',
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        create_sql=create_sql,
        table=table)

    load_user_dimension_table = LoadDimensionOperator(
        task_id=f'Load_{table}_dim_table',
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        query=insert_sql,
        truncate=truncate)

    create_users_table >> load_user_dimension_table

    return dag
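# The factory above names the child DAG "<parent>.<task_id>", which is the
# convention Airflow's SubDagOperator expects. A minimal sketch of wiring it
# into a parent DAG (Airflow 1.10.x import path; the parent DAG, table, and
# SQL strings below are illustrative assumptions, not from the original file):
from datetime import datetime

from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

parent_dag = DAG('parent_dag',
                 start_date=datetime(2018, 11, 1),
                 schedule_interval='@hourly')

load_users_subdag = SubDagOperator(
    task_id='load_users',
    subdag=create_and_load_table_dag(
        parent_dag_name='parent_dag',
        task_id='load_users',
        redshift_conn_id='redshift',
        create_sql='CREATE TABLE IF NOT EXISTS users (user_id INT)',  # placeholder DDL
        insert_sql='INSERT INTO users SELECT DISTINCT userid FROM staging_events',  # placeholder
        table='users',
        truncate=True,
        # extra kwargs are forwarded to the child DAG constructor
        start_date=datetime(2018, 11, 1),
        schedule_interval='@hourly'),
    dag=parent_dag)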
    'catchup': True,
    'end_date': datetime(2018, 11, 2)
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

### Start operator
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

### Create tables only if they do not exist
create_staging_events = CreateTableOperator(
    task_id='create_staging_events',
    dag=dag,
    redshift_conn_id='redshift',
    table_name='staging_events',
    sql_command=SqlQueries.create_staging_events)

create_staging_songs = CreateTableOperator(
    task_id='create_staging_songs',
    dag=dag,
    redshift_conn_id='redshift',
    table_name='staging_songs',
    sql_command=SqlQueries.create_staging_songs)

create_songplays = CreateTableOperator(
    task_id='create_songplays',
    dag=dag,
    redshift_conn_id='redshift',
    table_name='songplays',
    sql_command=SqlQueries.create_songplays)
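# The CreateTableOperator used throughout these excerpts is project code whose
# definition is not shown. A minimal sketch of one plausible implementation,
# matching the redshift_conn_id / table_name / sql_command keywords this
# snippet passes (Airflow 1.10.x APIs; everything below is an assumption, not
# the project's actual operator):
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class CreateTableOperator(BaseOperator):

    @apply_defaults
    def __init__(self, redshift_conn_id='', table_name='', sql_command='',
                 *args, **kwargs):
        super(CreateTableOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table_name = table_name
        self.sql_command = sql_command

    def execute(self, context):
        # Run the CREATE TABLE IF NOT EXISTS statement against Redshift.
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info('Creating table %s if it does not exist', self.table_name)
        redshift.run(self.sql_command)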
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False
}

dag = DAG('udac_sparkify_airflow',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_table = CreateTableOperator(
    task_id='Create_tables',
    redshift_conn_id='redshift',
    dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='event',
    s3_bucket=default_args['s3_bucket'],
    format='JSON',
    dag=dag)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='@hourly',
    max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Extract data from SAS and save it in CSV format.
# redshift_conn_id: conn_id used only when loading data.
generate_csv_operator = GenerateCsvOperator(
    task_id="Extract_sas",
    dag=dag,
    aws_credential_id="aws_credentials",
    s3_bucket=s3_bucket,
    s3_sas_key=s3_sas_key)

create_tables_operator = CreateTableOperator(
    task_id="Create_All_Tables",
    dag=dag,
    redshift_conn_id="redshift")

# Use a subDAG for these loads?
# Tables: immigrations, us_cities_demographics, airport, i94visas,
# i94port, i94mode, i94cit, i94addr.

# airport_codes_csv.csv
load_airport_operator = CopyInsertTableOperator(
    task_id="Load_airport_Tables",
    dag=dag,
    table_name="airport",
    aws_credential_id="aws_credentials",
    redshift_conn_id="redshift",
    s3_bucket=s3_bucket,
    s3_csv_key="airport_codes_csv.csv")

load_immigration_operator = CopyInsertTableOperator(
    task_id="Load_immigration_Tables",
dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    # https://medium.com/intage-analytics/airflow-trick-to-find-the-exact-start-date-via-cron-expression-23b5351007b
    schedule_interval='0 * * * *',  # the DAG runs hourly
    max_active_runs=1,
    catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = CreateTableOperator(
    task_id='create_tables',
    postgres_conn_id='redshift',
    provide_context=True,
    dag=dag,
    sql_statment1=CREATE_ALL_TABLES_SQL.CREATE_TABLE_artists,
    sql_statment2=CREATE_ALL_TABLES_SQL.CREATE_TABLE_songplays,
    sql_statment3=CREATE_ALL_TABLES_SQL.CREATE_TABLE_songs,
    sql_statment4=CREATE_ALL_TABLES_SQL.CREATE_TABLE_staging_events,
    sql_statment5=CREATE_ALL_TABLES_SQL.CREATE_TABLE_time)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    table='staging_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    s3_bucket='udacity-dend/',
    s3_key='log_data/{execution_date.year}/{execution_date.month}',
    json_path='s3://udacity-dend/log_json_path.json',
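# The s3_key above uses str.format-style placeholders that are typically
# filled from the task context inside the operator (an assumption about this
# project's StageToRedshiftOperator; the rendering itself is plain str.format):
from datetime import datetime

s3_key = 'log_data/{execution_date.year}/{execution_date.month}'
context = {'execution_date': datetime(2018, 11, 1)}
rendered_key = s3_key.format(**context)
print(rendered_key)  # log_data/2018/11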
    'email_on_retry': False
}

# Create the DAG udac_pipeline_airflow.
dag = DAG('udac_pipeline_airflow',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1)

# Dummy start task.
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create all tables: staging tables, fact table, and dimension tables.
create_tables = CreateTableOperator(
    task_id='Create_tables',
    dag=dag,
    redshift_conn_id="redshift")

# Load log data from S3 into the staging_events table in Redshift.
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    jsonpaths_file="s3://udacity-dend/log_json_path.json")

# Load song data from S3 into the staging_songs table in Redshift.
stage_songs_to_redshift = StageToRedshiftOperator(
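# Inside a StageToRedshiftOperator like the one above, the staging parameters
# are typically rendered into a Redshift COPY statement, roughly as follows.
# The template and the credential placeholders are assumptions; the project's
# actual operator is not shown in these excerpts.
COPY_SQL = """
    COPY {table}
    FROM 's3://{s3_bucket}/{s3_key}'
    ACCESS_KEY_ID '{access_key}'
    SECRET_ACCESS_KEY '{secret_key}'
    FORMAT AS JSON '{jsonpaths_file}'
"""

formatted_sql = COPY_SQL.format(
    table="staging_events",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    access_key="<access key from the aws_credentials connection>",  # placeholder
    secret_key="<secret key from the aws_credentials connection>",  # placeholder
    jsonpaths_file="s3://udacity-dend/log_json_path.json")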
    'catchup': True
}

dag_name = 'udac_airflow_dag'
dag = DAG(dag_name,
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_in_redshift = CreateTableOperator(
    task_id='create_tables_in_redshift',
    redshift_conn_id='redshift',
    dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    table_name="staging_events",
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    file_format="JSON",
    log_json_file=log_json_file,
    redshift_conn_id="redshift",
    aws_credential_id="aws_credentials",
    dag=dag,
    provide_context=True)
    catchup=False)

extract_sas_data_operator = ExtractionFromSASOperator(
    task_id='Extract_data_from_SAS_save_as_csv_in_s3bucket',
    dag=dag,
    s3_bucket='uda-capstone-data',
    s3_load_prefix='csv_data',
    s3_save_prefix='csv_data',
    file_name='I94_SAS_Labels_Descriptions.SAS')

create_immigration_table = CreateTableOperator(
    task_id='Create_immigration_table',
    dag=dag,
    table='immigration',
    create_sql_stmt=SqlQueries.immigrant_table_create,
    drop_sql_stmt=SqlQueries.drop_table)

load_immigration_table = CopyTableOperator(
    task_id='Load_immigration_table',
    dag=dag,
    table='immigration',
    schema='public',
    s3_bucket='uda-capstone-data',
    s3_load_prefix='sas_data',
    iam_role=Variable.get("IAM_ROLE"))

data_quality_check_on_immigration = CheckQualityOperator(
# The enclosing def is not part of the excerpt; this signature is inferred
# from the parameters the body uses.
def check_table_has_rows(cur, code, table, failvalue):
    cur.execute(code)
    result = cur.fetchall()[0][0]
    if result == failvalue:
        print(f"Data quality check failed. {table} has no rows.")
    else:
        print(f"Data quality check passed. {table} has rows.")

# Start task definition.
start = PythonOperator(task_id="start", dag=dag, python_callable=start)

# Definition of the tasks necessary for creating the dimension and fact tables.
create_dim_date = CreateTableOperator(
    task_id="create_dim_date",
    dag=dag,
    sql=sql.create_dim_date,
    table="v_dim_date",
    cur=cur)

create_dim_vehicletype = CreateTableOperator(
    task_id="create_dim_vehicletype",
    dag=dag,
    sql=sql.create_dim_vehicle_type,
    table="dim_vehicletype",
    cur=cur)

create_dim_driver = CreateTableOperator(
    task_id="create_dim_driver",
    dag=dag,
    sql=sql.create_dim_driver,
    table="dim_driver",
    cur=cur)
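# One plausible way to wire the quality check above as a task, reusing the
# inferred function name; the SQL, table, and task names here are
# illustrative assumptions:
check_dim_driver = PythonOperator(
    task_id='check_dim_driver',
    dag=dag,
    python_callable=check_table_has_rows,
    op_kwargs={
        'cur': cur,
        'code': 'SELECT COUNT(*) FROM dim_driver',
        'table': 'dim_driver',
        'failvalue': 0})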
    'start_date': datetime(2020, 5, 11),
    'end_date': datetime(2020, 12, 30),
}

# Create the DAG.
dag = DAG('sparkify_etl_dag',
          description='Performs ETL operations from S3 to Redshift',
          max_active_runs=3,
          start_date=datetime(2020, 6, 10, 0, 0, 0, 0))

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_task = CreateTableOperator(
    task_id='create_tables',
    dag=dag,
    redshift_conn_id='redshift',
    tables=[
        "artists",
        "songplays",
        "songs",
        "staging_events",
        "staging_songs",
        "time",
        "users"
    ])

stage_events_task = StageTablesToRedshiftOperator(
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    region='us-west-2',
    file_format="s3://udacity-dend/log_json_path.json",
    provide_context=True,
    execution_date=None,
    task_id='staging_events_data',
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False,
    'catchup': False
}

dag = DAG('sparkify_etl_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_in_redshift = CreateTableOperator(
    task_id="Create_tables",
    dag=dag,
    redshift_conn_id="redshift")

stage_events_to_redshift = StageToRedshiftOperator(
    task_id="Stage_events",
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    jsonpath="s3://udacity-dend/log_json_path.json")

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
connection_operator = DummyOperator(task_id='connection_operator', dag=dag)
finish_operator = DummyOperator(task_id='finish_execution', dag=dag)

fetch_api_bisq = FetchApiOperator(
    task_id="fetch_api_bisq",
    dag=dag,
    aws_con=aws_credentials,
    remote_provider="bisq",
    aws_bucket_name=s3_bucket)

fetch_api_paxful = FetchApiOperator(
    task_id="fetch_api_paxful",
    dag=dag,
    aws_con=aws_credentials,
    remote_provider="paxful",
    aws_bucket_name=s3_bucket)

create_table = CreateTableOperator(
    task_id="Create_table",
    dag=dag,
    conn_id="redshift",
    sql_query=SqlQueries.create_table)

stage_paxful_to_redshift = StageToRedshiftOperator(
    task_id='stage_paxful',
    dag=dag,
    table_name="staging_paxful",
    s3_bucket=s3_bucket,
    conn_id="redshift",
    remote_provider="paxful",
    aws_credential_id=aws_credentials,
    provide_context=True)

stage_bisq_to_redshift = StageToRedshiftOperator(
    task_id='stage_bisq',
    dag=dag,
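# FetchApiOperator is project code not shown in these excerpts. A minimal
# sketch of the kind of work its execute() plausibly does, pulling trade data
# from a provider's API and landing it in S3; the URL map, keys, and function
# name are illustrative assumptions, while the requests and boto3 calls are
# standard APIs:
import json

import boto3
import requests


def fetch_api_to_s3(remote_provider, aws_bucket_name):
    # Hypothetical endpoints; the real operator's URL map is unknown.
    urls = {'bisq': 'https://example.org/bisq/trades',
            'paxful': 'https://example.org/paxful/trades'}
    response = requests.get(urls[remote_provider])
    response.raise_for_status()
    # Write the raw JSON payload to the staging bucket.
    s3 = boto3.client('s3')
    s3.put_object(Bucket=aws_bucket_name,
                  Key=f'{remote_provider}/trades.json',
                  Body=json.dumps(response.json()))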
    description='Load and transform data in Redshift with Airflow',
    catchup=False,
    max_active_runs=1,
    schedule_interval='0 * * * *',
    start_date=datetime(2019, 9, 9),
)

dummy_operator = DummyOperator(task_id='started_pipeline', retries=3, dag=dag)

create_redshift = CreateRedshiftOperator(
    task_id='create_cluster',
    dag=dag,
    aws_credentials_id='aws_credentials')

create_table = CreateTableOperator(
    task_id='create_table',
    dag=dag,
    aws_credentials_id='aws_credentials',
    create_tables=SqlQueries.create_table_queries,
    drop_tables=SqlQueries.drop_table_queries)

load_csv_files = PythonOperator(
    task_id='load_csv_files',
    python_callable=load_csv_data,
    dag=dag,
    provide_context=True)

load_sas_file = PythonOperator(
    task_id='clean_and_load_i94',
    python_callable=load_sas_data,
    dag=dag,
    provide_context=True)

# clean_data_check = PythonOperator(task_id='clean_data_query',
#                                   python_callable=clean_data,
#                                   dag=dag, provide_context=True)
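# These excerpts stop before the dependency wiring, but based on the task
# names a plausible ordering for this last DAG would be (an assumption, not
# part of the original file):
dummy_operator >> create_redshift >> create_table
create_table >> [load_csv_files, load_sas_file]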