def test_postgres_operator_test_multi(self):
    sql = [
        "TRUNCATE TABLE test_airflow",
        "INSERT INTO test_airflow VALUES ('X')",
    ]
    from airflow.operators.postgres_operator import PostgresOperator
    t = PostgresOperator(
        task_id='postgres_operator_test_multi', sql=sql, dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
def test_vacuum(self):
    """
    Verifies the VACUUM operation runs well with the PostgresOperator
    """
    from airflow.operators.postgres_operator import PostgresOperator

    sql = "VACUUM ANALYZE;"
    t = PostgresOperator(
        task_id='postgres_operator_test_vacuum',
        sql=sql,
        dag=self.dag,
        autocommit=True)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)
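# VACUUM cannot run inside a transaction block, which is why the operator above
# sets autocommit=True. A minimal sketch of issuing the same statement directly
# through PostgresHook (the connection id "postgres_default" and the import path
# are assumptions, not taken from the test above):
from airflow.hooks.postgres_hook import PostgresHook

PostgresHook(postgres_conn_id="postgres_default").run("VACUUM ANALYZE;", autocommit=True)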
def test_postgres_operator_test(self):
    sql = """
    CREATE TABLE IF NOT EXISTS test_airflow (
        dummy VARCHAR(50)
    );
    """
    from airflow.operators.postgres_operator import PostgresOperator
    t = PostgresOperator(task_id='basic_postgres', sql=sql, dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
          ignore_ti_state=True)

    autocommitTask = PostgresOperator(
        task_id='basic_postgres_with_autocommit',
        sql=sql,
        dag=self.dag,
        autocommit=True)
    autocommitTask.run(
        start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_overwrite_schema(self):
    """
    Verifies option to overwrite connection schema
    """
    from airflow.operators.postgres_operator import PostgresOperator

    sql = "SELECT 1;"
    t = PostgresOperator(
        task_id='postgres_operator_test_schema_overwrite',
        sql=sql,
        dag=self.dag,
        autocommit=True,
        database="foobar",
    )

    from psycopg2._psycopg import OperationalError
    try:
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE,
              ignore_ti_state=True)
    except OperationalError as e:
        assert 'database "foobar" does not exist' in str(e)
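# Note that the try/except above passes silently if no error is raised at all.
# A sketch of the same check written with pytest.raises instead (pytest is an
# assumption here; the test above is unittest-style):
import pytest
from psycopg2._psycopg import OperationalError

with pytest.raises(OperationalError, match='database "foobar" does not exist'):
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)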
    )
    for key, file in files_to_download.items()
]

# 6. Create TABLE
create_tables = [
    BashOperator(
        task_id=f"create_table_{key}",
        bash_command=f"psql {pg_params()} < {tmp_dir}/{dag_id}_{key}.sql",
    )
    for key, _ in files_to_download.items()
]

# 7. RE-define GEOM type (because ogr2ogr cannot set geom with .csv import)
redefine_geom = [
    PostgresOperator(
        task_id=f"re-define_geom_{key}",
        sql=SET_GEOM,
        params=dict(tablename=key),
    )
    for key, _ in files_to_download.items()
]

# 8. Rename COLUMNS based on Provenance
provenance_translation = ProvenanceRenameOperator(
    task_id="rename_columns",
    dataset_name=dag_id,
    rename_indexes=False,
    pg_schema="public",
)

# 9. Drop Existing TABLE
drop_tables = [
    PostgresOperator(
standardize_borough = standardize_task('standardize_borough')
standardize_address = standardize_task('standardize_address')


def yes_trigger(_, dag):
    return dag


trigger_facdb_3_geoprocessing = TriggerDagRunOperator(
    task_id='trigger_facdb_3_geoprocessing',
    trigger_dag_id='facdb_3_geoprocessing',
    python_callable=yes_trigger,
    dag=facdb_2_assembly)

## ORDER TASKS

facdb_2_assembly >> create

for task_file in os.listdir(
        "/home/airflow/airflow/dags/facdb_2_assembly/config"):
    config = PostgresOperator(task_id=task_file[:-4],
                              postgres_conn_id='facdb',
                              sql="/facdb_2_assembly/config/" + task_file,
                              dag=facdb_2_assembly)
    create >> config >> join_sourcedatainfo

(join_sourcedatainfo
 >> standardize_fixallcaps
 >> standardize_capacity
 >> standardize_oversightlevel
 >> standardize_agencytag
 >> standardize_trim
 >> standardize_factypes
 >> standardize_borough
 >> standardize_address
 >> create_bblbin_one2one
 >> create_uid
 >> trigger_facdb_3_geoprocessing)
def check_table():
    return "sense_the_csv"


check_for_table = BranchPythonOperator(
    task_id="Check_for_table",
    python_callable=check_table,
)

# TASK 2 (Case if the table is not already there.) -> Create table in postgres.
create_table = PostgresOperator(
    task_id="Create_Table_in_Postgres",
    sql="""CREATE TABLE sampletable(
               Date text,
               Open text,
               High text,
               Low text,
               Close text
           );
        """,
)

# TASK 2 (If the table is there, then this will be the second task) -> Wait for the file using the FileSensor
file_sensing_task = FileSensor(
    trigger_rule=TriggerRule.ONE_SUCCESS,
    task_id="sense_the_csv",
    filepath="sample1.csv",
    fs_conn_id="my_file_system",
    poke_interval=10,
)
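# The snippet above defines the branch, the create-table task and the sensor,
# but not their dependencies. A minimal sketch of the usual wiring for this
# branch-or-skip pattern (the dependency lines below are an assumption, not
# taken from the original file):
check_for_table >> [create_table, file_sensing_task]
create_table >> file_sensing_task  # ONE_SUCCESS lets the sensor proceed from either path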
    'retries': 3,
    'email_on_retry': False,
    'email_on_failure': False,
    'retry_delay': timedelta(minutes=5),
    'depends_on_past': False
}

with DAG('avocado_dag',
         default_args=default_args,
         description='Forecasting avocado prices',
         schedule_interval='*/10 * * * *',
         start_date=datetime(2020, 1, 1),
         catchup=False) as dag:

    creating_accuray_table = PostgresOperator(
        task_id='creating_accuray_table',
        sql='sql/CREATE_TABLE_ACCURACIES.sql',
        postgres_conn_id='postgres')

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    sanity_check = PythonOperator(task_id='sanity_check',
                                  python_callable=check_dataset,
                                  provide_context=True)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    n_estimators = [100, 150]
# - Set "Login" and "Password" # - Set "Port" to 5439 def load_data_to_redshift(*args, **kwargs): aws_hook = AwsHook("aws_credentials", client_type="s3") credentials = aws_hook.get_credentials() redshift_hook = PostgresHook("redshift") redshift_hook.run( ss.COPY_ALL_TRIPS_SQL.format(credentials.access_key, credentials.secret_key)) dag = DAG('aws_s3_to_redshift', start_date=datetime.datetime.now()) create_table = PostgresOperator(task_id="create_table", dag=dag, postgres_conn_id="redshift", sql=ss.CREATE_TRIPS_TABLE_SQL) copy_task = PythonOperator(task_id='load_from_s3_to_redshift', dag=dag, python_callable=load_data_to_redshift) location_traffic_task = PostgresOperator(task_id="calculate_location_traffic", dag=dag, postgres_conn_id="redshift", sql=ss.LOCATION_TRAFFIC_SQL) create_table >> copy_task copy_task >> location_traffic_task
    JOIN asamoilov.dm_payment_report_dim_billing_year dby ON s.billing_year_key = dby.billing_year_key
    JOIN asamoilov.dm_payment_report_dim_legal_type dlt ON s.legal_type_key = dlt.legal_type_key
    JOIN asamoilov.dm_payment_report_dim_district dd ON s.district_key = dd.district_key
    JOIN asamoilov.dm_payment_report_dim_billing_mode dbm ON s.billing_mode_key = dbm.billing_mode_key
    JOIN asamoilov.dm_payment_report_dim_registration_year dry ON s.registration_year_key = dry.registration_year_key;
"""

SQL_DROP_DM_PAYMENT_REPORT_TMP = """
DROP TABLE IF EXISTS asamoilov.dm_payment_report_tmp_{{ execution_date.year }};
"""

start_load = DummyOperator(task_id="start_load", dag=dag)

dm_payment_report_tmp = PostgresOperator(
    task_id="dm_payment_report_tmp",
    dag=dag,
    # postgres_conn_id="postgres_default",
    sql=SQL_DM_PAYMENT_REPORT_TMP)

dm_payment_report_dim_billing_year = PostgresOperator(
    task_id="dm_payment_report_dim_billing_year",
    dag=dag,
    # postgres_conn_id="postgres_default",
    sql=SQL_DM_PAYMENT_REPORT_DIM_BILLING_YEAR)

dm_payment_report_dim_legal_type = PostgresOperator(
    task_id="dm_payment_report_dim_legal_type",
    dag=dag,
    # postgres_conn_id="postgres_default",
    sql=SQL_DM_PAYMENT_REPORT_DIM_LEGAL_TYPE)
        credentials.secret_key,
    )
    redshift_hook.run(sql_stmt)


dag = DAG(
    'lesson2.exercise3',
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval='@monthly',
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_STATIONS_TABLE_SQL,
provenance_translation = ProvenanceRenameOperator(
    task_id="rename_columns",
    dataset_name=f"{dag_id}",
    prefix_table_name=f"{dag_id}_",
    postfix_table_name="_new",
    rename_indexes=False,
    pg_schema="public",
)

# 8. Revalidate invalid geometry records
#    the source has some invalid records
#    to do: inform the source maintainer
revalidate_geometry_records = [
    PostgresOperator(
        task_id=f"revalidate_geometry_{key}",
        sql=[
            f"UPDATE {dag_id}_{key}_new SET geometry = ST_CollectionExtract((st_makevalid(geometry)),3) WHERE 1=1 AND ST_IsValid(geometry) is false; COMMIT;",
        ],
    )
    for key in tables_to_create.keys()
]

# Prepare the checks and add them per source to a dictionary
for key in tables_to_create.keys():
    total_checks.clear()
    count_checks.clear()
    geo_checks.clear()

    count_checks.append(
        COUNT_CHECK.make_check(
            check_id=f"count_check_{key}",
task_id="slack_at_start", http_conn_id="slack", webhook_token=slack_webhook_token, message=f"Starting {dag_id} ({DATAPUNT_ENVIRONMENT})", username="******", ) check_table_exists = PostgresXcomOperator(task_id="check_table_exists", sql=SQL_EXISTS_CHECK, do_xcom_push=True) branch_task = BranchPythonOperator(task_id="branch_task", provide_context=True, python_callable=choose_branch) update_oplaadpalen = PostgresOperator( task_id="update_oplaadpalen", sql=f"{sql_path}/oplaadpalen_copy.sql") create_oplaadpalen = PostgresOperator( task_id="create_oplaadpalen", sql=f"{sql_path}/oplaadpalen_create.sql") # The trigger_rule is essential, otherwise the skipped path blocks progress import_allego = PythonOperator( task_id="import_allego", python_callable=import_oplaadpalen, trigger_rule="none_failed_or_skipped", op_args=[ PostgresHook(postgres_conn_id=dag.default_args["postgres_conn_id"] ).get_conn() ], )
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('transfer_data',
          catchup=False,
          default_args=default_args,
          schedule_interval=None,
          max_active_runs=1)

task_create_tables = PostgresOperator(
    task_id='task_create_tables',
    sql=SQL_PATH + 'create_transfer_and_aggregate_tables.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

task_create_views = PostgresOperator(task_id='task_create_views',
                                     sql=SQL_PATH + 'create_views.sql',
                                     postgres_conn_id='my_local_db',
                                     dag=dag)

task_load_transfer_table = PythonOperator(task_id='task_load_transfer_table',
                                          python_callable=load_transfer,
                                          provide_context=True,
                                          dag=dag)

task_transfer_to_aggregate_table = PostgresOperator(
    task_id='task_transfer_to_aggregate_table',
) as dag:

    task_1 = BashOperator(
        task_id='copy_csv_file',
        bash_command='cp "/mnt/c/Users/sivkumar/Documents/Project Refference/VNPT/Data/network data/DIM_PM_PROVINCE.csv" /home/siva/files/',
        dag=dag
    )

    task_2 = PostgresOperator(
        task_id='create_table',
        database='postgres',
        postgres_conn_id="postgres_localhost",
        sql="""
            DROP TABLE IF EXISTS prestage.dim_province;
            CREATE TABLE prestage.dim_province(
                province_id numeric(10),
                code varchar(10),
                province_name varchar(50),
                status numeric(5)
            );
        """,
        dag=dag
    )

    task_3 = PythonOperator(
        task_id='insert_into_postgres',
        provide_context=True,
        python_callable=load_to_dwh,
        dag=dag
    )
    # retrieve credentials with "get_credentials()" method
    AWSCredentials = awsHookInstance.get_credentials()

    # instantiate PostgresHook Class
    derivedRedshiftHook = PostgresHook("redshift")

    # retrieve Station data bulk load SQL statement
    bulkLoadStatement = sql_statements.COPY_STATIONS_SQL.format(
        AWSCredentials.access_key, AWSCredentials.secret_key)

    # execute SQL statement
    derivedRedshiftHook.run(bulkLoadStatement)


# create Tasks by instantiating Operator Classes
createTripsTable = PostgresOperator(
    task_id='createTripsTable',
    postgres_conn_id='redshift',
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
    dag=bulkLoadDag)

createStationsTable = PostgresOperator(
    task_id='createStationsTable',
    postgres_conn_id='redshift',
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
    dag=bulkLoadDag)

loadTripsData = PythonOperator(
    task_id='loadTripsData',
    python_callable=loadTripDataFromS3ToRedshift,
    dag=bulkLoadDag)
    )
    if route.post_process:
        hook = PostgresHook(postgres_conn_id=postgres_conn_id)
        hook.run(route.post_process)


with DAG(dag_id, default_args=default_args) as dag:
    count_checks = []
    colname_checks = []
    geo_checks = []
    renames = []

    drop_old_tables = PostgresOperator(
        task_id="drop_old_tables",
        sql=DROP_TMPL,
        params=dict(tablenames=TABLES_TO_DROP),
    )

    import_geojson = PythonOperator(
        task_id="import_geojson",
        python_callable=_load_geojson,
        op_args=[default_args.get("postgres_conn_id", "postgres_default")],
    )

    for route in ROUTES:
        count_checks.append(
            COUNT_CHECK.make_check(
                check_id=f"count_check_{route.name}",
                pass_value=3,
                params=dict(table_name=route.tmp_db_table_name),
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'email_on_retry': False,
    'catchup': False,
}

dag = DAG('data_pipeline_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id='create_tables',
                                 dag=dag,
                                 postgres_conn_id='redshift',
                                 sql='create_tables.sql')

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    aws_credentials_id='aws_credentials',
    redshift_conn_id='redshift',
    table='staging_events',
    s3_bucket='udacity-dend',
    s3_key='log_data',
    json_parameter='s3://udacity-dend/log_json_path.json')

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    dag=dag,
"email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=5) } dag = DAG("user_behaviour", default_args=default_args, schedule_interval="0 0 * * *", max_active_runs=1) end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline', dag=dag) pg_unload = PostgresOperator( dag=dag, task_id='pg_unload', sql=unload_user_purchase, postgres_conn_id='postgres_default', params={'temp_filtered_user_purchase': temp_filtered_user_purchase}, depends_on_past=True, wait_for_downstream=True) user_purchase_to_s3_stage = PythonOperator( dag=dag, task_id='user_purchase_to_s3_stage', python_callable=_local_to_s3, op_kwargs={ 'filename': temp_filtered_user_purchase, 'key': temp_filtered_user_purchase_key, }, ) # remove_local_user_purchase_file = PythonOperator(
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Youngest rider was born in {records[0][0]}")


dag = DAG("lesson3.exercise2", start_date=datetime.datetime.utcnow())

create_oldest_task = PostgresOperator(task_id="create_oldest",
                                      dag=dag,
                                      sql="""
        BEGIN;
        DROP TABLE IF EXISTS older_riders;
        CREATE TABLE older_riders AS (
            SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
        );
        COMMIT;
    """,
                                      postgres_conn_id="redshift")

log_oldest_task = PythonOperator(task_id="log_oldest",
                                 dag=dag,
                                 python_callable=log_oldest)

create_youngest_task = PostgresOperator(task_id="create_youngest",
                                        dag=dag,
                                        sql="""
        BEGIN;
        DROP TABLE IF EXISTS younger_riders;
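# log_oldest is referenced above but not shown (the function whose body opens
# this snippet logs the youngest rider). A minimal sketch of the companion
# callable, assuming the older_riders table created by create_oldest_task:
def log_oldest():
    redshift_hook = PostgresHook("redshift")
    records = redshift_hook.get_records("""
        SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1
    """)
    if len(records) > 0 and len(records[0]) > 0:
        logging.info(f"Oldest rider was born in {records[0][0]}")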
with open(movie_clean_emr_steps) as json_file:
    emr_steps = json.load(json_file)

last_step = len(emr_steps) - 1

with DAG(dag_id="user_behaviour",
         default_args=default_args,
         schedule_interval="0 0 * * *",
         max_active_runs=1) as dag:

    end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline')

    # this task loads the user purchase data we wanted into local csv file
    pg_unload = PostgresOperator(
        task_id='pg_unload',
        sql=unload_user_purchase,
        postgres_conn_id='postgres_default',
        params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
        depends_on_past=True,
        wait_for_downstream=True)

    # this task deploys the local file onto S3
    user_purchase_to_s3_stage = PythonOperator(
        task_id='user_purchase_to_s3_stage',
        python_callable=_local_to_s3,
        op_kwargs={
            'filename': temp_filtered_user_purchase,
            'key': temp_filtered_user_purchase_key,
        },
    )

    # after deployment, remove the local file
    remove_local_user_purchase_file = PythonOperator(
        task_id='remove_local_user_purchase_file',
task_id="update_SQL_data", python_callable=convert_biz_data, op_args=[ f"{dag_id}_{dag_id}_new", f"{tmp_dir}/{dag_id}.utf8.sql", f"{tmp_dir}/{file}", f"{tmp_dir}/{dag_id}_updated_data_insert.sql", ], ) for files in files_to_download.values() for file in files if "xlsx" in file ] # 8. CREATE target TABLE (the ogr2ogr output is not used to create the table) create_table = PostgresOperator( task_id=f"create_target_table", sql=CREATE_TABLE, params=dict(tablename=f"{dag_id}_{dag_id}_new"), ) # 9. Import data import_data = BashOperator( task_id="import_data", bash_command= f"psql {pg_params()} < {tmp_dir}/{dag_id}_updated_data_insert.sql", ) # 10. UPDATE target TABLE (add display field content) update_table = PostgresOperator( task_id=f"update_target_table", sql=UPDATE_TABLE, params=dict(tablename=f"{dag_id}_{dag_id}_new"),
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
    'depends_on_past': False,
    'schedule_interval': '@hourly'
}

dag = DAG('etl_task',
          default_args=default_args,
          description='Data Pipelines with Airflow')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables = PostgresOperator(task_id="create_tables",
                                 postgres_conn_id="redshift",
                                 sql="create_tables.sql",
                                 dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id="stage_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    s3_bucket='udacity-dend',
    s3_key="log_data/",
    extra_params="format as json 's3://udacity-dend/log_json_path.json'",
    dag=dag)

stage_songs_to_redshift = StageToRedshiftOperator(
    task_id='Stage_songs',
    redshift_conn_id="redshift",
    'start_date': datetime(2021, 4, 12),
    'retries': 2,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'amvisitors_dbschema_dag',
    default_args=default_args,
    description='Clean DB and prepare Lookup Tables in Redshift with Airflow',
    schedule_interval=None,
    catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

drop_db_schema = PostgresOperator(task_id="drop_db_schema",
                                  dag=dag,
                                  postgres_conn_id='redshift',
                                  sql=SqlQueries.db_schema_drop)

lt_mode_to_redshift = LoadLookupToRedshiftOperator(
    task_id='Load_Mode_Lookup_Tables',
    provide_context=False,
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="dim_mode",
    sql_init_command=SqlQueries.mode_table_create,
    copy_options="IGNOREHEADER 0 FORMAT AS JSON 'auto'",
    s3_path="s3://lzalewsk-capstone/lookup_data/_i94model.json",
    region="us-west-2")

lt_location_to_redshift = LoadLookupToRedshiftOperator(
importing_zones_from_S3 = PythonOperator(dag=dag,
                                         task_id='importing_zones_from_S3',
                                         provide_context=False,
                                         python_callable=dim_table)

# popular_destinations = PythonOperator(dag=dag,
#                                       task_id='popular_destinations',
#                                       provide_context=False,
#                                       python_callable=run_sql,
#                                       op_kwargs={'file_path': './sql/DML/popular_destinations_monthly.sql'})

postgres_conn_id = 'postgres_airflow'

create_stg_taxis_data = PostgresOperator(dag=dag,
                                         postgres_conn_id=postgres_conn_id,
                                         task_id='create_stg_taxis_data',
                                         sql='DDL/create_stg_taxis_data.sql')

staging_taxis_data = PostgresOperator(dag=dag,
                                      postgres_conn_id=postgres_conn_id,
                                      task_id='staging_taxis_data',
                                      sql='DML/staging_taxis_data.sql')

create_table_popular_destination_zones = PostgresOperator(
    dag=dag,
    postgres_conn_id=postgres_conn_id,
    task_id='create_table_popular_destination_zones',
    sql='DDL/create_table_popular_destination_zones.sql')

popular_destinations_zones_passengers = PostgresOperator(
    dag=dag,
def standardize_task(task_id):
    return PostgresOperator(
        task_id=task_id,
        postgres_conn_id='facdb',
        sql="/facdb_2_assembly/standardize/{0}.sql".format(task_id),
        dag=facdb_2_assembly)
Add IMDB ratings and metadata to shows
'''


def promote_staging(**kwargs):
    '''
    Replace the live schema with the staging schema
    '''


default_args = {'owner': 'airflow', 'retries': 0}

with DAG('syncing_movie_and_tv_data',
         default_args=default_args,
         start_date=datetime.now(),
         schedule_interval=timedelta(minutes=10)) as dag:

    # TODO: use alembic for this?
    create_my_ratings = PostgresOperator(task_id='create_my_ratings',
                                         postgres_conn_id='postgres_movies',
                                         sql='''
        create table if not exists my_ratings(imdb_url text unique primary key,
                                              stars integer not null);
        truncate table my_ratings;
    ''',
                                         dag=dag)

    extract_airtable_shows = PythonOperator(
        task_id='extract_airtable_shows',
        python_callable=extract_airtable_shows,
        dag=dag)
task_id="insert_into_table_markering", sql_files=[f"{sql_path}/cmsa_data_insert_markering.sql"], ) # 9. RENAME columns based on PROVENANCE provenance_translation = ProvenanceRenameOperator( task_id="provenance_rename", dataset_name=f"{dag_id}", prefix_table_name=f"{dag_id}_", postfix_table_name="_new", rename_indexes=False, pg_schema="public", ) # 10. Rename temp named tables to final names rename_tables = PostgresOperator(task_id="rename_tables", sql=SQL_TABLE_RENAMES) (slack_at_start >> mkdir >> download_geojson >> fetch_files >> proces_cmsa >> create_tables >> import_data >> fill_markering >> provenance_translation >> rename_tables) dag.doc_md = """ #### DAG summery This DAG containts crowd monitoring sensor data, the source is the CMSA (Crowd Monitoring Systeem Amsterdam) #### Mission Critical Classified as 2 (beschikbaarheid [range: 1,2,3]) #### On Failure Actions Fix issues and rerun dag on working days #### Point of Contact Inform the businessowner at [businessowner]@amsterdam.nl
    dag=dag,
)

#
# TODO: Consolidate check_trips and check_stations into a single check in the subdag
#       as we did with the create and copy in the demo
#
check_trips = HasRowsOperator(task_id="check_trips_data",
                              dag=dag,
                              redshift_conn_id="redshift",
                              table="trips")

check_stations = HasRowsOperator(task_id="check_stations_data",
                                 dag=dag,
                                 redshift_conn_id="redshift",
                                 table="stations")

location_traffic_task = PostgresOperator(
    task_id="calculate_location_traffic",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.LOCATION_TRAFFIC_SQL)

#
# TODO: Reorder the Graph once you have moved the checks
#
trips_subdag_task >> check_trips
stations_subdag_task >> check_stations
check_stations >> location_traffic_task
check_trips >> location_traffic_task
default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"} with DAG(dag_id="twitter_dag", schedule_interval="@daily", default_args=default_args, catchup=False) as dag: waiting_for_tweets = FileSensor(task_id="waiting_for_tweets", fs_conn_id="fs_tweet", filepath="data.csv", poke_interval=5) fetching_tweets = PythonOperator(task_id="fetching_tweets", python_callable=fetching_tweets.main) cleaning_tweets = PythonOperator(task_id="cleaning_tweets", python_callable=cleaning_tweets.main) storing_tweets = PostgresOperator( task_id='storing_tweets', postgres_conn_id="postgres_default", sql= '''CREATE TABLE IF NOT EXISTS tweets(Tweet varchar(250), Date varchar(50), Retweet_from varchar(50), T_User varchar(50));''' ) update_tweets = PostgresOperator( task_id='update_tweets', postgres_conn_id="postgres_default", sql= '''COPY tweets(Tweet, Date, Retweet_from, T_User) FROM '/tmp/data_cleaned.csv' DELIMITER ',' CSV HEADER;''' ) waiting_for_tweets >> fetching_tweets >> cleaning_tweets >> storing_tweets >> update_tweets
    'owner': 'udacity',
    'start_date': datetime(2020, 4, 14),
}

dag = DAG('udac_example_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='@daily')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Table Creation task
create_tables_task = PostgresOperator(
    task_id="create_tables",
    dag=dag,
    sql='create_tables.sql',
    postgres_conn_id="redshift"
)

# Load task: S3 to staging table - staging_events
stage_events_to_redshift = StageToRedshiftOperator(
    task_id='stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udacity-dend",
    s3_key="log_data"
)

# Load task: S3 to staging table - staging_songs
    'retry_delay': 300,
    'email_on_retry': False,
    'catchup': False
}

dag = DAG('Sparkify_Data_Pipelines_DAG',
          default_args=default_args,
          description='Extract, load and transform data in Redshift with Airflow',
          schedule_interval='@hourly')

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_staging_events_table = PostgresOperator(
    task_id='Create_staging_events_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_events_table_create
)

create_staging_songs_table = PostgresOperator(
    task_id='Create_staging_songs_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.staging_songs_table_create
)

create_songplays_table = PostgresOperator(
    task_id='Create_songplays_table',
    dag=dag,
    postgres_conn_id='redshift',
    sql=SqlQueries.songplays_table_create
    filename='tinydesk_api_data.json',
    aws_conn_id='tinydesk_aws',
    bucket_name=os.environ['S3_BUCKET'],
    dag=dag)

t11 = ScrapeYoutubeAPIOperator(
    task_id='get_tinydesk_at_home_video_data',
    playlist_id='PLy2PCKGkKRVYPm1tBwoX45ocAzuhVyvJX',
    youtube_api_key=os.environ['YOUTUBE_API_KEY'],
    filename='tinydesk_at_home_api_data.json',
    aws_conn_id='tinydesk_aws',
    bucket_name=os.environ['S3_BUCKET'],
    dag=dag)

t2 = PostgresOperator(task_id='empty_staging_table',
                      sql='DELETE FROM VIDEO_STAGING',
                      postgres_conn_id='tinydesk_postgres',
                      dag=dag)

t13 = PostgresOperator(task_id='empty_staging_table_at_home',
                       sql='DELETE FROM VIDEO_STAGING_AT_HOME',
                       postgres_conn_id='tinydesk_postgres',
                       dag=dag)

t3 = FileToPostgresOperator(task_id='load_to_staging',
                            filename='tinydesk_api_data.json',
                            postgres_conn_id='tinydesk_postgres',
                            table='video_staging',
                            aws_conn_id='tinydesk_aws',
                            bucket_name=os.environ['S3_BUCKET'],
                            dag=dag)
# Initialize subdag variables
parent_task_id = 'udac_example_dag'
start_date = datetime.utcnow()

dag = DAG(
    parent_task_id,
    default_args=default_args,
    description='Load and transform data in Redshift with Airflow',
)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create Staging tables
create_staging_events = PostgresOperator(
    task_id="create_staging_events",
    dag=dag,
    postgres_conn_id="redshift",
    sql=CreateTables.create_staging_events)

create_songs_table = PostgresOperator(task_id="create_staging_songs",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=CreateTables.create_staging_songs)

# Populate staging tables
stage_events_to_redshift = StageToRedshiftOperator(
    task_id="Stage_events",
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",