def get_s3_to_redshift_dag(parent_dag_name, task_id, redshift_conn_id,
                           aws_credentials_id, table, create_sql_stmt,
                           s3_bucket, s3_key, *args, **kwargs):
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    create_task = PostgresOperator(
        task_id=f"create_{table}_table",
        dag=dag,
        postgres_conn_id=redshift_conn_id,
        sql=create_sql_stmt)

    copy_task = S3ToRedshiftOperator(
        task_id=f"load_{table}_from_s3_to_redshift",
        dag=dag,
        table=table,
        redshift_conn_id=redshift_conn_id,
        aws_credentials_id=aws_credentials_id,
        s3_bucket=s3_bucket,
        s3_key=s3_key)

    #
    # TODO: Move the HasRowsOperator task here from the DAG
    #
    check_task = HasRowsOperator(
        task_id=f"check_{table}_has_rows",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table)

    create_task >> copy_task
    copy_task >> check_task
    #
    # TODO: Use DAG ordering to place the check task
    #

    return dag
    'owner': 'sandeep',
    'start_date': datetime.utcnow() - timedelta(hours=5),
    'depends_on_past': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=1),
    'catchup_by_default': False,
}

dag = DAG('create_tables_dag',
          default_args=default_args,
          description='Create tables in Redshift using Airflow',
          schedule_interval=None,
          max_active_runs=1
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_all_tables = PostgresOperator(
    task_id="create_all_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql=create_all_tables
)

end_operator = DummyOperator(task_id='Stop_execution', dag=dag)

# task dependencies
start_operator >> create_all_tables
create_all_tables >> end_operator
    'start_date': datetime(2020, 1, 1),
}

default_args = {
    'owner': 'udacity',
    'start_date': datetime.now()
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow'
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_task = PostgresOperator(
    task_id="create_tables",
    dag=dag,
    sql='create_tables.sql',
    postgres_conn_id="redshift"
)

# Taking a reference from Project 3 - Cloud Data Warehouses
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_conn_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    s3_json_path="s3://udacity-dend/log_json_path.json",
    table="staging_events",
    file_type="json"
from airflow.operators import PostgresOperator
from airflow.operators.sensors import TimeDeltaSensor
from helpers import SqlQueries

default_args = {
    'owner': 'gabriel',
    'start_date': datetime(2009, 12, 31, 23, 59, 59),
    'end_date': datetime(2012, 12, 31, 23, 59, 59),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=300),
    'catchup': True
}

dag = DAG('recreate_bi_tables',
          default_args=default_args,
          description='Recreate the business intelligence tables',
          schedule_interval=None,
          max_active_runs=1)

dummy_wait = TimeDeltaSensor(task_id='dummy_wait', dag=dag, delta=timedelta(seconds=1))

recreate_bi_tables = PostgresOperator(task_id="recreate_bi_tables_task",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=SqlQueries.recreate_bi_tables)

dummy_wait >> recreate_bi_tables
)

del_suc_task = BashOperator(
    task_id="delete_success_data",
    bash_command='rm -r ' + home_dir +
    '/git_{{ (execution_date - macros.timedelta(days=2)).strftime("%Y-%m-%d") }}.parquet/_SUCCESS',
)

upload_to_S3_task = PythonOperator(task_id='upload_parquet_to_S3',
                                   python_callable=upload_file_to_S3_with_hook,
                                   dag=dag,
                                   provide_context=True)

create_main_tables = PostgresOperator(task_id="create_main_tables",
                                      dag=dag,
                                      postgres_conn_id=config.get("REDSHIFT", "CONNECTION"),
                                      sql="create_main_tables.sql")

create_stage_tables = PostgresOperator(task_id="create_stage_tables",
                                       dag=dag,
                                       postgres_conn_id=config.get("REDSHIFT", "CONNECTION"),
                                       sql="create_stage_tables.sql")

stage_events_dict_to_redshift = StageToRedshiftOperator(
    task_id='staging_events_dict_copy',
    redshift_conn_id=config.get("REDSHIFT", "CONNECTION"),
    aws_credentials_id=config.get("AWS", "CREDENTIALS"),
    table='events_dict_staging',
    s3_bucket=config.get("S3", "BUCKET"),
    'retries': 3,
    'retry_delay': timedelta(minutes=1),
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('sustainify_fact_dag',
          default_args=default_args,
          description='Load and transform fact data in Redshift with Airflow',
          schedule_interval='@monthly')

start_operator = DummyOperator(task_id='Begin_Execution', dag=dag)

create_fact_table = PostgresOperator(
    task_id='Create_Fact_Table',
    dag=dag,
    sql=SqlQueries.create_fact_table.format(table_name=FACT_TABLE),
    postgres_conn_id='redshift',
)

create_staging_table = PostgresOperator(
    task_id='Create_Staging_Table',
    dag=dag,
    sql=SqlQueries.create_fact_table.format(table_name=STAGING_TABLE),
    postgres_conn_id='redshift',
)

drop_staging_table = PostgresOperator(
    task_id='Drop_Staging_Table',
    dag=dag,
    sql=SqlQueries.drop_fact_table.format(table_name=STAGING_TABLE),
    postgres_conn_id='redshift',
def make_task_group(dagname, name, path, pdag, trigger_rule):
    args = {
        'owner': 'deploy',
        'start_date': datetime.strptime(
            (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d'), '%Y-%m-%d')
    }
    dag = DAG(dag_id=dagname, default_args=args, schedule_interval=timedelta(hours=24))
    sdOp = SubDagOperator(task_id=name, subdag=dag, dag=pdag, trigger_rule=trigger_rule)

    start = DummyOperator(task_id=name + '_start', dag=dag)
    end = DummyOperator(task_id=name + '_end', dag=dag)

    task_dict = {}
    task_dict[name + '_start'] = start
    task_dict[name + '_end'] = end

    config_str = open(path, 'r').read()
    # a -> b means data flow from a to b, though not valid ini format, but i think it is better to understand
    config_arr = config_str.split('[dependency]')
    config_fp = StringIO.StringIO(config_arr[0])
    config = ConfigParser.RawConfigParser()
    config.readfp(config_fp)
    sections = config.sections()
    #print sections
    for section in sections:
        #print section
        if section != 'dependency':
            options = config.options(section)
            #print options
            if 'type' not in options or 'cmd' not in options:
                continue
            operator_type = config.get(section, 'type').strip()
            operator_cmd = config.get(section, 'cmd').strip()
            if operator_type == 'PostgresOperator':
                task = PostgresOperator(task_id=section.strip(),
                                        depends_on_past=False,
                                        postgres_conn_id='postgres_sha2dw03',
                                        sql=operator_cmd,
                                        dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            elif operator_type == 'BashOperator':
                task = BashOperator(task_id=section.strip(),
                                    depends_on_past=False,
                                    bash_command=operator_cmd,
                                    dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            else:
                print "Error: currently not support %s operator type" % operator_type

    if len(config_arr) == 1:
        return (start, end, dag, sdOp, task_dict)

    for line in config_arr[1].split('\n'):
        arr = line.split('->')
        if len(arr) != 2:
            continue
        left_side = arr[0].strip()
        right_side = arr[1].strip()
        if left_side in task_dict.keys() and right_side in task_dict.keys():
            if end in task_dict[left_side].downstream_list:
                task_dict[left_side].downstream_list.remove(end)
            task_dict[left_side].set_downstream(task_dict[right_side])
            if start in task_dict[right_side].upstream_list:
                task_dict[right_side].upstream_list.remove(start)
            if task_dict[right_side] in start.downstream_list:
                start.downstream_list.remove(task_dict[right_side])
            if task_dict[left_side] in end.upstream_list:
                end.upstream_list.remove(task_dict[left_side])
    return (start, end, dag, sdOp, task_dict)
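# A minimal usage sketch for make_task_group() above (the section names, SQL, script
# path and DAG names here are hypothetical, not from the original project). It shows
# the config format the function parses: each section becomes a task via its
# 'type'/'cmd' options, and lines after [dependency] use "a -> b" to mean data flows
# from a to b.
from datetime import datetime
from airflow import DAG

example_config = """
[load_users]
type = PostgresOperator
cmd = INSERT INTO users SELECT * FROM staging_users;

[report]
type = BashOperator
cmd = /opt/scripts/refresh_report.sh

[dependency]
load_users -> report
"""

with open('/tmp/example_group.cfg', 'w') as f:
    f.write(example_config)

parent_dag = DAG(dag_id='parent_dag',
                 start_date=datetime(2020, 1, 1),
                 schedule_interval='@daily')

# The subdag's dag_id (dagname) must follow the "<parent dag_id>.<task_id>" convention.
start, end, sub_dag, subdag_op, tasks = make_task_group(
    dagname='parent_dag.example_group',
    name='example_group',
    path='/tmp/example_group.cfg',
    pdag=parent_dag,
    trigger_rule='all_success')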
    'retries': 3,
    'retry_delay': 300,
    'email_on_retry': False
}

dag = DAG('s3_to_redshift_dag',
          default_args=default_args,
          description='Extract Load and Transform data from S3 to Redshift',
          schedule_interval='@hourly',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create)
    'retry_delay': timedelta(minutes=1),
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('sustainify_dimensions_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@once'
          )

start_operator = DummyOperator(task_id='Begin_Execution', dag=dag)

create_sql_tables = PostgresOperator(
    task_id='Create_Tables',
    dag=dag,
    sql=SqlQueries.create_tables,
    postgres_conn_id='redshift',
)

transport_to_redshift = StageToRedshiftOperator(
    task_id='Transport_to_Redshift',
    dag=dag,
    table='public.transport',
    redshift_conn_id='redshift',
    s3_bucket='xwoe-udacity',
    s3_key='deng_capstone/tables/transport.parquet',
    aws_credentials_id='aws_credentials',
    aws_iam=Variable.get('arn'),
    file_format="FORMAT AS PARQUET"
)
default_args = {
    'owner': 'Gaurav',
    'start_date': datetime(2019, 1, 12),
    'depends_on_past': False
}

with DAG(
        dag_id='sparkify_data_pipeline',
        default_args=default_args,
        schedule_interval='0 0 * * *',
        description='Load and transform data in Redshift with Airflow') as dag:

    start_operator = DummyOperator(task_id='Begin_execution')

    create_tables = PostgresOperator(task_id="create_tables",
                                     postgres_conn_id="redshift",
                                     sql="create_tables.sql")

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id='Stage_events',
        redshift_conn_id='redshift',
        aws_credentials_id='aws_credentials',
        table='staging_events',
        s3_bucket='udacity-dend',
        s3_key='log_data',
        copy_json_option='s3://udacity-dend/log_json_path.json',
        region='us-west-2')

    stage_songs_to_redshift = StageToRedshiftOperator(
        task_id='Stage_songs',
        redshift_conn_id='redshift',
)

from helpers import SqlQueries

dag = DAG(
    "reset_tables_dag",
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    schedule_interval=None,
    max_active_runs=1
)

drop_staging_events = PostgresOperator(
    task_id="staging_events_table_drop",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.staging_events_table_drop
)

drop_staging_songs = PostgresOperator(
    task_id="staging_songs_table_drop",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.staging_songs_table_drop
)

drop_user = PostgresOperator(
    task_id="user_table_drop",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.user_table_drop
import datetime

import sql_statements
from airflow import DAG
from airflow.operators import (PostgresOperator, S3ToRedshiftOperator,
                               HasRowsOperator, FactsCalculatorOperator)

customDag = DAG(dag_id='lesson3Exercise4',
                start_date=datetime.datetime(2020, 6, 21),
                max_active_runs=1)

# create trips table
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=customDag)

# load data from S3 to Redshift
loadTripData = S3ToRedshiftOperator(
    task_id='loadTripData',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='trips',
    truncate=False,
    s3_bucket='udacity-dend',
    s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
    delimiter=',',
    ignore_headers=1,
    dag=customDag)

# check data quality
checkDataQuality = HasRowsOperator(task_id='checkDataQuality',
dag = DAG(
    'sparkify_etl_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    max_active_runs=1,
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create destination SQL tables
# Dimension tables
create_artists_task = PostgresOperator(
    dag=dag,
    postgres_conn_id="redshift",
    sql=CreateTablesSqlQueries.create_artists,
    task_id="create_artists")

create_songplays_task = PostgresOperator(
    dag=dag,
    postgres_conn_id="redshift",
    sql=CreateTablesSqlQueries.create_songplays,
    task_id="create_songplays")

create_songs_task = PostgresOperator(dag=dag,
                                     postgres_conn_id="redshift",
                                     sql=CreateTablesSqlQueries.create_songs,
                                     task_id="create_songs")

create_time_task = PostgresOperator(dag=dag,
my_aws_credentials = "aws_credentials"

#
# TODO: Create a DAG which performs the following functions:
#
#       1. Loads Trip data from S3 to RedShift
#       2. Performs a data quality check on the Trips table in RedShift
#       3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
#          a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
#             skeleton defined in plugins/operators/facts_calculator.py
#
dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id=my_redshift_conn,
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

#
# TODO: Load trips data from S3 to RedShift. Use the s3_key
#       "data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
#       and the s3_bucket "udacity-dend"
#
copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id=my_redshift_conn,
    aws_credentials_id=my_aws_credentials,
    table="trips",
    s3_bucket="udacity-dend",
    'email_on_retry': False
}

dag = DAG(
    'udac_example_dag',
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
    schedule_interval='0 * * * *',
    catchup=False  # Perform scheduler catchup (or only run latest)? Defaults to True
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id="create_tables",
                                 dag=dag,
                                 postgres_conn_id=redshift_conn_id,
                                 sql=sql_statements.CREATE_TABLES)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id=redshift_conn_id,
    aws_credentials_id=aws_credentials_id,
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    table=log_stage_table,
    json_format=log_json_format,
)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
def redshift_table_ddl_plus_bulk_load(parent_dag_id, parent_task_id, redshift_conn_id,
                                      aws_credentials_id, table, create_table_statement,
                                      s3_bucket, s3_key, *args, **kwargs):
    """
    This Python Factory Function returns an Airflow DAG Class instance to be used
    as a SubDag within the (outer) DAG that calls it. The DAG in turn executes
    three Tasks within a Redshift database instance:

    - execute CREATE TABLE (DDL) statement through PostgresOperator;
    - execute COPY (bulk load) to pull data from S3 into the newly created table
      through custom S3ToRedshiftOperator;
    - perform data quality check through custom HasRowsOperator.

    Args:
        parent_dag_id (str):
    """
    # instantiate DAG Class
    subDagInstance = DAG(
        #----------------------------------------------------------------------
        #
        # ATTENTION! "parent_dag_id.parent_task_id" IS MANDATORY
        #
        # It's an Airflow Convention that subdags must have their "dag_id"
        # attribute set to the combination of:
        #
        # 1. "dag_id" of the outer DAG Instance calling the SubDagOperator
        #    which in turn calls the Python Factory Function;
        #
        # 2. "task_id" assigned to the SubDagOperator mentioned in item number 1;
        #
        # 3. "." a dot must separate the "dag_id" and "task_id" mentioned in
        #    both items 1 and 2, respectively;
        #
        #----------------------------------------------------------------------
        dag_id=f"{parent_dag_id}.{parent_task_id}",
        # make sure keyword arguments are also received
        **kwargs)

    # NOTICE how Task instances are created in a SubDag just as they would in
    # a "normal" DAG.
    createTable = PostgresOperator(task_id=f'create_{table}_table',
                                   postgres_conn_id=redshift_conn_id,
                                   sql=create_table_statement,
                                   dag=subDagInstance)

    bulkLoadTable = S3ToRedshiftOperator(task_id=f'bulk_load_{table}_table',
                                         redshift_conn_id=redshift_conn_id,
                                         aws_credentials_id=aws_credentials_id,
                                         table=table,
                                         s3_bucket=s3_bucket,
                                         s3_key=s3_key,
                                         dag=subDagInstance)

    checkDataQuality = HasRowsOperator(task_id=f'check_{table}_table',
                                       redshift_conn_id=redshift_conn_id,
                                       table=table,
                                       dag=subDagInstance)

    # Define Task dependencies
    createTable >> bulkLoadTable >> checkDataQuality

    return subDagInstance
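# A minimal usage sketch (not from the original project) showing how the factory above
# is typically wired into an outer DAG through SubDagOperator, so that the subdag's
# dag_id becomes "<outer dag_id>.<SubDagOperator task_id>" as the comment block explains.
# The DAG name, table, bucket, key and DDL string below are hypothetical.
import datetime
from airflow import DAG
from airflow.operators.subdag_operator import SubDagOperator

outer_dag = DAG('example_parent_dag',
                start_date=datetime.datetime(2020, 1, 1),
                schedule_interval='@daily')

load_trips_subdag = SubDagOperator(
    task_id='load_trips_subdag',                 # item 2 of the naming convention
    dag=outer_dag,                               # item 1 supplies the outer dag_id
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id='example_parent_dag',      # must match the outer dag_id
        parent_task_id='load_trips_subdag',      # must match this SubDagOperator task_id
        redshift_conn_id='redshift',
        aws_credentials_id='aws_credentials',
        table='trips',
        create_table_statement='CREATE TABLE IF NOT EXISTS trips (trip_id INTEGER);',
        s3_bucket='udacity-dend',
        s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
        start_date=datetime.datetime(2020, 1, 1)))  # forwarded to the subdag via **kwargs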
# Credential data and Redshift parameters for connection
connection_info = {
    'conn_id': "redshift",
    'aws_credentials_id': "aws_credentials",
    's3_bucket': "udacity-dend",
    'file_format': f"s3://udacity-dend/log_json_path.json"
}

# Operator that starts the pipeline
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Operator that creates tables
create_tables_task = PostgresOperator(
    task_id="Create_tables",
    dag=dag,
    postgres_conn_id="redshift",
    sql='create_tables.sql'
)

# Operator that moves log data from S3 server to Redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    conn_id=connection_info["conn_id"],
    aws_credentials_id=connection_info["aws_credentials_id"],
    table="staging_events",
    s3_bucket=connection_info["s3_bucket"],
    s3_key="log_data/2018/11/",
    file_format=connection_info["file_format"]
    dag=dag,
)

# Spark Job to Clean Immigration Data
data_cleaning = SSHOperator(
    task_id='data_cleaning_spark_job',
    dag=dag,
    ssh_conn_id='my_ssh_connection',
    command='/usr/bin/spark-submit --jars "spark-sas7bdat-2.1.0-s_2.11.jar,parso-2.0.10.jar" --master yarn clean_job.py',
)

# Create Immigration Fact Table
create_immigration_table = PostgresOperator(
    task_id="create_immigration_fact_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_IMMIGRATION_FACT_SQL)

# Create Happiness Table
create_happiness_table = PostgresOperator(
    task_id="create_happiness_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_HAPPINESS_SQL)

# Create Country Table
create_country_table = PostgresOperator(task_id="create_country_table",
                                        dag=dag,
                                        postgres_conn_id="redshift",
                                        sql=sql_statements.CREATE_COUNTRY_SQL)
# DAG definition
dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=1
          )

# start task definition
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# create table task
create_table = PostgresOperator(
    task_id="create_table",
    postgres_conn_id="redshift",
    sql=create_tables.drop_and_create_tables,
    dag=dag,
)

# Task stage events to redshift
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    copy_json_option='s3://udacity-dend/log_json_path.json',
    region='us-west-2',
    dag=dag
)
main_dag = DAG(
    'Project_5',
    description='Load and transform data in Redshift with Airflow',
    start_date=datetime(2018, 11, 1, 0, 0, 0, 0),
    end_date=datetime(2018, 11, 30, 0, 0, 0, 0),
    schedule_interval="@monthly",
    default_args=default_args,
    #catchup=False
)

start_operator = DummyOperator(task_id='Begin_Execution', dag=main_dag)

create_events_in_s3_task = PostgresOperator(
    task_id="Create_Events",
    dag=main_dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_events_table)

copy_events_to_s3_task = StageToRedshiftOperator(
    task_id="Stage_Events",
    dag=main_dag,
    table_name="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data/{execution_date.year}/{execution_date.month}/",
    s3_format="json",
    json_path="s3://udacity-dend/log_json_path.json")

create_songs_in_s3_task = PostgresOperator(
        }
    },
    'query': "SELECT * FROM table_four"
}]

with DAG(dag_id='adb_pipeline',
         default_args=args,
         start_date=datetime(2019, 1, 1),
         schedule_interval='30 4 * * *',
         catchup=False) as dag:

    t1 = DummyOperator(task_id='kick_off_dag')

    t2 = S3KeySensor(task_id='check_for_file',
                     bucket_key='globetelecom/copy_*',
                     poke_interval=45,
                     timeout=600,
                     wildcard_match=True,
                     bucket_name=BUCKET,
                     aws_conn_id=S3_CONN_ID)

    for job in job_info:
        spark = DatabricksRunNowOperator(task_id=job['job_id'],
                                         job_id=job['job_id'],
                                         json=job['config'])

        query = PostgresOperator(task_id='post_{0}_query'.format(job['job_id']),
                                 sql=job['query'],
                                 postgres_conn_id='prod_postgres')

        t1 >> t2 >> spark >> query
    task_id='Copy_immigration_dimensions',
    dag=dag,
    redshift_conn_id='redshift',
    iam_role=Variable.get('iam_role'),
    dimensions=[
        'country_codes', 'port_codes', 'entry_channel_codes', 'state_codes',
        'trip_reason_codes'
    ],
    truncate=True,
    input_s3_bucket='ascfraguas-udacity-deng-capstone',
    input_s3_key='staging/immigration-dimensions')

#### -------> RUN TEMPERATURES SUMMARY

run_temperatures_sumary = PostgresOperator(task_id="Run_temperatures_summary",
                                           dag=dag,
                                           postgres_conn_id="redshift",
                                           sql=SqlQueries.run_temps_summary)

#### -------> RUN DATA QUALITY CHECKS

run_table_quality_checks = RunQualityCheckOperator(
    task_id='Run_data_quality_checks',
    dag=dag,
    redshift_conn_id='redshift',
    test_tables={
        'immigration.us_entries', 'immigration.country_codes',
        'immigration.port_codes', 'immigration.entry_channel_codes',
        'immigration.state_codes', 'immigration.trip_reason_codes',
        'temperature.full_temperature_data', 'temperature.temp_summary'
    },
    dq_checks=[{
- demonstrate the process to create custom Airflow plugins
- generalize repetitive code and Task by leveraging custom Operators
----------------------------------------------------------------------------"""

bulkLoadDag = DAG(
    dag_id='lesson3Exercise1',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    # set a date beyond which this DAG won't run
    end_date=datetime.datetime(2018, 2, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1)

# create Tasks by instantiating Operator Classes
createTripsTable = PostgresOperator(task_id='createTripsTable',
                                    postgres_conn_id='redshift',
                                    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
                                    dag=bulkLoadDag)

# IMPORTANT:
# As "s3_key" parameter has been marked as "Templatable" (in its Class
# "template_fields" variable) it'll be able to fetch JINJA template
# variables like {{ds}}, {{execution_date}} and so on.
loadTripsData = S3ToRedshiftOperator(
    task_id="loadTripsData",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="trips",
    truncate=False,
    s3_bucket="udacity-dend",
    s3_key="data-pipelines/divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv",
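# Illustrative sketch only (not the project's actual plugin code): how a custom
# operator like the S3ToRedshiftOperator used above typically marks "s3_key" as
# templatable, which is what the IMPORTANT comment refers to. Listing a field name
# in template_fields tells Airflow to render that attribute with the task's
# template context before execute() runs. The class name here is hypothetical.
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class TemplatedS3KeyOperator(BaseOperator):
    # Airflow renders every attribute listed here against the task's template context
    template_fields = ("s3_key",)

    @apply_defaults
    def __init__(self, s3_key="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.s3_key = s3_key

    def execute(self, context):
        # by this point, template variables embedded in s3_key have been resolved
        self.log.info("Rendered S3 key: %s", self.s3_key)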
# if len(records) < 1 or len(records[0]) < 1:
#     raise ValueError(f"Data quality check failed. {table} returned no results")
# num_records = records[0][0]
# if num_records < 1:
#     raise ValueError(f"Data quality check failed. {table} contained 0 rows")
# logging.info(f"Data quality on table {table} check passed with {records[0][0]} records")

dag = DAG("lesson3.exercise1",
          start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
          end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
          schedule_interval="@monthly",
          max_active_runs=1)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
    'email_on_retry': False
}

dag = DAG('udacity_airflow_project',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          max_active_runs=3)

# start_operator = DummyOperator(task_id='Begin_execution', dag=dag)
start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  sql=['create_table.sql'],
                                  postgres_conn_id='redshift',
                                  autocommit=True)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    provide_context=True,
    aws_credentials_id="aws_credentials",
    redshift_conn_id='redshift',
    s3_bucket="udacity-dend",
    s3_key="log_data",
    table="public.staging_events",
    # create_stmt=SqlQueries.create_table_staging_events,
    json="s3://udacity-dend/log_json_path.json")
from helpers import SqlQueries

default_args = {'owner': 'Abderrazzak'}

drop_table_sql = "DROP TABLE IF EXISTS public.{};"

dag = DAG('create_dwh_spakify_tables',
          default_args=default_args,
          description='Creates Sparkify Redshift tables',
          start_date=datetime(2020, 6, 11),
          schedule_interval=None)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

drop_songplays = PostgresOperator(task_id="drop_songplays_table",
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql=drop_table_sql.format("songplays"))

create_songplays = PostgresOperator(
    task_id="create_songplays_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql="""
        CREATE TABLE IF NOT EXISTS songplays (
            songplay_id int not null IDENTITY(0,1) PRIMARY KEY,
            start_time timestamp NOT NULL,
            user_id int,
            song_id varchar NOT NULL,
            artist_id varchar NOT NULL,
            session_id int,
            location varchar,
            user_agent varchar
    #'end_date': datetime(2018, 11, 30)
}

dag = DAG('sparkify-airflow',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *'
          #,catchup=True
          #,max_active_runs=1
          )

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

## create all tables in redshift
create_tables = PostgresOperator(task_id='Create_tables',
                                 dag=dag,
                                 postgres_conn_id='redshift',
                                 sql='create_tables.sql')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id='redshift',
    aws_credentials_id='aws_credentials',
    table='staging_events',
    clear_table=False,
    s3_bucket='udacity-dend',
    s3_key='log-data/{execution_date.year}/{execution_date.month}/{ds}-events.json',
    #s3_key='log-data/{execution_date.year}/{execution_date.month}',
    json_path='log_json_path.json')
    'owner': 'udacity',
    'start_date': datetime(2019, 1, 12),
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup_by_default': False,
    'email_on_retry': False
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *')

start_operator = PostgresOperator(task_id='Begin_execution',
                                  dag=dag,
                                  postgres_conn_id="redshift",
                                  sql='create_tables.sql')

# If tables are already created
# start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data",
    region="us-west-2",
    extra_params="FORMAT AS JSON 's3://udacity-dend/log_json_path.json'")
    'start_date': datetime(2019, 8, 21),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(seconds=300),
    'catchup': False
}

dag = DAG('dice_com_jobs_dag',
          default_args=default_args,
          description='Load the jobs dataset and insert into Redshift',
          schedule_interval='@once'
          )

recreate_staging_dice_com_jobs_table = PostgresOperator(
    task_id="recreate_staging_dice_com_jobs_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.recreate_staging_dice_com_jobs_table
)

stage_dice_com_jobs = StageCsvToRedshiftOperator(
    task_id='stage_dice_com_jobs',
    dag=dag,
    table="staging_dice_com_jobs",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="social-wiki-datalake",
    s3_key="capstone/Dice_US_jobs.csv",
    extra_copy_parameters="DATEFORMAT AS 'MM/DD/YYYY' MAXERROR AS 6000"
)

check_staging_dice_com_jobs_table = DataQualityOperator(
import sql_statements

#
# TODO: Replace the data quality checks with the HasRowsOperator
#
dag = DAG(
    "lesson3.exercise1",
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval="@monthly",
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = S3ToRedshiftOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
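# A minimal sketch of what the HasRowsOperator referenced by the TODOs above could look
# like, assuming the row-count logic shown in the commented-out quality check earlier
# (fail the task when the target table returns no rows). This is illustrative, not the
# course's official plugin implementation.
import logging

from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HasRowsOperator(BaseOperator):

    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Data quality check failed. {self.table} returned no results")
        if records[0][0] < 1:
            raise ValueError(f"Data quality check failed. {self.table} contained 0 rows")
        logging.info(f"Data quality on table {self.table} check passed with {records[0][0]} records")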