    WHERE time_received < '%(y)s-%(m)s-01 00:00:00';
    """ % {
        'y': DY, 'm': DM, 'd': DD,
        'yy': YDY, 'ym': YDM, 'yd': YDD
    },
    dag=dag)

# TODO: conditional here (see the sketch below)
# notify_on_failure = SlackAPIPostOperator(
#
# )
# all_succeeded = SlackAPIPostOperator(
#     channel='#analytics',
#     username="******",
#     icon_url="https://airflow.apache.org/_images/pin_large.png",
#     slack_conn_id="slack_fora_internal",
#     text="S3 data has loaded successfully for DAG run {{ ds }}. Carry on!"
# )

clear_partitions.set_upstream(create_partitions)
create_partitions.set_upstream(sensor)
load_logs_to_postgres.set_upstream(clear_partitions)
move_rows_to_partitions.set_upstream(load_logs_to_postgres)
attach_partitions.set_upstream(move_rows_to_partitions)
# all_succeeded.set_upstream(attach_partitions)
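# A minimal sketch of the success notification the TODO above asks for,
# assuming the `slack_fora_internal` connection from the commented-out draft
# is configured; the display name is a placeholder, and this is one plausible
# wiring, not the original author's finished implementation.
from airflow.operators.slack_operator import SlackAPIPostOperator

all_succeeded = SlackAPIPostOperator(
    task_id='all_succeeded',
    channel='#analytics',
    username='airflow',  # placeholder; the draft masks the real display name
    icon_url="https://airflow.apache.org/_images/pin_large.png",
    slack_conn_id="slack_fora_internal",
    text="S3 data has loaded successfully for DAG run {{ ds }}. Carry on!",
    # default trigger rule, spelled out: only post if attach_partitions
    # (and therefore the whole upstream chain) succeeded
    trigger_rule='all_success',
    dag=dag)
all_succeeded.set_upstream(attach_partitions)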
clean_stg_player_stats_ds = PostgresOperator(
    task_id='clean_stg_player_stats_ds',
    postgres_conn_id='hotstats',
    dag=dag,
    sql=Template("""
        DELETE FROM {{stg_table}} WHERE process_date = '{{ds}}';
        DELETE FROM {{stg_table_1}} WHERE process_date = '{{ds}}';
        DELETE FROM {{stg_table_2}} WHERE process_date = '{{ds}}';
        DELETE FROM {{stg_table_3}} WHERE process_date = '{{ds}}';
    """).render(stg_table=task_variables['STG_PLAYER_STATS_0'],
                stg_table_1=task_variables['STG_PLAYER_STATS_1'],
                stg_table_2=task_variables['STG_PLAYER_STATS_2'],
                stg_table_3=task_variables['STG_PLAYER_STATS_3'],
                ds="{{ ds }}"))

clean_stg_player_stats_ds.set_upstream(create_table)

insert_stg_player_stats_0 = PostgresOperator(
    task_id='insert_stg_player_stats_0',
    postgres_conn_id='hotstats',
    dag=dag,
    sql=Template("""
        INSERT INTO {{stg_table_0}} (heroName, replayId, team, metric, value, process_date)
        SELECT
            heroname,
            replayid,
            team,
            LOWER(key) AS key,
            CASE WHEN value ~ '^[0-9]+$' THEN value :: INTEGER ELSE 0 END AS value,
    dag=dag)

# Get daily language totals. UPSERT is CockroachDB's shorthand for
# INSERT ... ON CONFLICT DO UPDATE keyed on the table's primary key.
cockroachdb_daily_language_totals = PostgresOperator(
    task_id='cockroachdb_daily_language_totals',
    postgres_conn_id='cockroachdb',
    autocommit=True,
    sql="""
        UPSERT INTO daily_language_totals (
            language_name,
            commit_date,
            total_daily_usage
        )
        SELECT
            language_name,
            commit_date,
            CAST(SUM(usage_count) AS INT) AS total_daily_usage
        FROM commits
        WHERE commit_date = '{{ ds }}'
        GROUP BY commit_date, language_name;""",
    dag=dag)

# define the DAG edges
download.set_upstream(check_for_new_dump)
bsondump.set_upstream(download)
remove_staging_file.set_upstream(bsondump)
spark_parse_commits.set_upstream(remove_staging_file)
cockroachdb_daily_import_summary.set_upstream(spark_parse_commits)
cockroachdb_daily_language_totals.set_upstream(spark_parse_commits)
        redshift_schema='public',
        s3_conn_id='aws_credentials',
        s3_bucket='udac-dend-capstone-dz',
        load_type='truncate',
        schema_location='Local',
        start_date=start_date),
    task_id=subdag_id,
    dag=dag,
    executor=LocalExecutor())

copy_data_to_redshift.set_upstream(start_operator)

process_dim_category = PostgresOperator(dag=dag,
                                        task_id='process_dim_category',
                                        sql='/sql/categories.sql',
                                        postgres_conn_id='redshift')
process_dim_category.set_upstream(copy_data_to_redshift)

process_dim_cities = PostgresOperator(dag=dag,
                                      task_id='process_dim_cities',
                                      sql='/sql/cities.sql',
                                      postgres_conn_id='redshift')
process_dim_cities.set_upstream(copy_data_to_redshift)

process_dim_business = PostgresOperator(dag=dag,
                                        task_id='process_dim_business',
                                        sql='/sql/business.sql',
                                        postgres_conn_id='redshift')
process_dim_business.set_upstream([process_dim_category, process_dim_cities])

process_dim_users = PostgresOperator(dag=dag,
                                     task_id='process_dim_users',
    'depends_on_past': True,
    'start_date': datetime(2019, 2, 6),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('agg_bounty_snapshot',
          default_args=default_args,
          schedule_interval='@daily')

insert_cmd = """INSERT INTO agg_bounty_snapshot
    (bounty_id, title, description, bounty_stage, usd_price,
     user_id, deadline, created, ymd)
SELECT bounty_id, title, description, "bountyStage" AS bounty_stage, usd_price,
       user_id, deadline, created, '{{ ds }}' AS ymd
FROM bounties.std_bounties_bounty
"""

delete_cmd = "DELETE FROM agg_bounty_snapshot WHERE ymd = '{{ ds }}' "

# Delete-then-insert keeps the daily snapshot idempotent per execution date.
delete = PostgresOperator(task_id="delete",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=delete_cmd,
                          dag=dag)
insert = PostgresOperator(task_id="insert",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=insert_cmd,
                          dag=dag)
insert.set_upstream(delete)
        SUM(CASE WHEN DATE((match_date AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
                 BETWEEN ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC') - interval '180 day'
                     AND ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
                 THEN 1 ELSE 0 END) AS games_l180
        FROM {{agg_table}}
        WHERE metric = 'match_won'
        GROUP BY heroname, mapname, toonhandle, gameType, gameversion
        ) winrates
        ON CONFLICT ON CONSTRAINT stats_historical_winrates_toonhandle_mapname_pk DO UPDATE
        SET winrate_l30 = excluded.winrate_l30,
            winrate_l60 = excluded.winrate_l60,
            winrate_l90 = excluded.winrate_l90,
            winrate_l180 = excluded.winrate_l180,
            games_l30 = excluded.games_l30,
            games_l60 = excluded.games_l60,
            games_l90 = excluded.games_l90,
            games_l180 = excluded.games_l180,
            wins_l30 = excluded.wins_l30,
            wins_l60 = excluded.wins_l60,
            wins_l90 = excluded.wins_l90,
            wins_l180 = excluded.wins_l180;
    """).render(metrics_table=task_variables['METRICS_TABLE'],
                agg_table=task_variables['STG_PLAYER_STATS_AGG'],
                ds="{{ ds }}"))

insert_global_winrates.set_upstream(create_table)

end_task = DummyOperator(dag=dag, task_id='end_task')
end_task.set_upstream(insert_global_winrates)
                    dag=dag)

t3 = PythonOperator(task_id='modelling',
                    provide_context=True,
                    python_callable=model,
                    dag=dag)

t4 = PostgresOperator(
    task_id='create_table',
    postgres_conn_id="postgres_default",
    sql="CREATE TABLE IF NOT EXISTS labeling (order_id VARCHAR(50), user_id VARCHAR(50));",
    database="airflow",
    dag=dag)

t5 = PostgresOperator(task_id='truncate_table',
                      postgres_conn_id="postgres_default",
                      sql="TRUNCATE TABLE labeling;",
                      database="airflow",
                      dag=dag)

t6 = PythonOperator(task_id='load_to_postgres',
                    provide_context=True,
                    python_callable=load_to_postgres,
                    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
t5.set_upstream(t4)
t6.set_upstream(t3)
t6.set_upstream(t5)
    dag=dag)

playdemic_build_manifest_earnings = PythonOperator(
    task_id='playdemic_build_manifest_earnings',
    python_callable=build_manifest,
    provide_context=True,
    op_kwargs={'local_path': '/mnt/xxx-airflow/airflow/google/data/playdemic/earnings',
               's3_bucket': 'xxxxxxxxxxx',
               's3_prefix': 'xxxxxxxxxxx',
               'bucket': 'xxxxxxxxxxx',
               'yearmonth': (date.today() + relativedelta.relativedelta(months=-1)).strftime('%Y%m'),
               'bucket_subfolder': 'earnings',
               'manifest_file': 'manifest_earnings.txt',
               'file_filter': 'PlayApps_' + (date.today() + relativedelta.relativedelta(months=-1)).strftime('%Y%m')},
    dag=dag)

# Earnings flow: clear folders -> wait for end of month -> download last
# month's report -> build manifest -> upload to S3 -> delete stale rows ->
# load into Redshift.
rs_load_playdemic_earnings.set_upstream(rs_delete_earnings)
rs_delete_earnings.set_upstream(s3_upload_playdemic_earnings)
s3_upload_playdemic_earnings.set_upstream(playdemic_build_manifest_earnings)
playdemic_build_manifest_earnings.set_upstream(playdemic_download_lastmonth)
playdemic_download_lastmonth.set_upstream(check_end_of_month)

rs_load_xx_earnings.set_upstream(rs_delete_earnings)
rs_delete_earnings.set_upstream(s3_upload_xx_earnings)
s3_upload_xx_earnings.set_upstream(xx_build_manifest_earnings)
xx_build_manifest_earnings.set_upstream(xx_download_lastmonth)
xx_download_lastmonth.set_upstream(check_end_of_month)

check_end_of_month.set_upstream(clear_folders)
        SUM(CASE WHEN DATE((match_date AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
                 BETWEEN ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC') - interval '180 day'
                     AND ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
                 THEN 1 ELSE 0 END) AS games_l180
        FROM {{agg_table}}
        GROUP BY heroname, mapname, metric
        ) metrics
        ON CONFLICT ON CONSTRAINT stats_historical_metrics_heroname_mapname_metric_pk DO UPDATE
        SET mpg_l30 = excluded.mpg_l30,
            mpg_l60 = excluded.mpg_l60,
            mpg_l90 = excluded.mpg_l90,
            mpg_l180 = excluded.mpg_l180,
            games_l30 = excluded.games_l30,
            games_l60 = excluded.games_l60,
            games_l90 = excluded.games_l90,
            games_l180 = excluded.games_l180,
            metric_l30 = excluded.metric_l30,
            metric_l60 = excluded.metric_l60,
            metric_l90 = excluded.metric_l90,
            metric_l180 = excluded.metric_l180;
    """).render(metrics_table=task_variables['METRICS_TABLE'],
                agg_table=task_variables['STG_PLAYER_STATS_AGG'],
                ds="{{ ds }}"))

insert_global_stats.set_upstream(create_table)

end_task = DummyOperator(dag=dag, task_id='end_task')
end_task.set_upstream(insert_global_stats)
    'Airbnb-Data-Pipeline',
    default_args=default_args,
    description='Load and transform Airbnb data in Redshift with Airflow',
    schedule_interval='@hourly',
    max_active_runs=3,
    catchup=False)

start_operator = DummyOperator(task_id='START_OPERATOR', dag=dag)

## Creating staging tables
create_staging_listings_table = PostgresOperator(
    task_id="Create_STAGGING_Listings_Table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_listings)
create_staging_listings_table.set_upstream(start_operator)

create_staging_calendar_table = PostgresOperator(
    task_id="Create_STAGGING_Calendar_Table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_calendar)
create_staging_calendar_table.set_upstream(start_operator)

create_staging_reviews_table = PostgresOperator(
    task_id="Create_STAGGING_Reviews_Table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_reviews)
    t.set_upstream(dummy_op)
    t.set_downstream(create_temp_scores_table_op)

archive_trained_models_op = BashOperator(
    task_id='archive_trained_models',
    bash_command='scripts/bash/archive_trained_models.sh',
    dag=dag)

notify_processing_completion_op = SlackAPIPostOperator(
    task_id='notify_processing_completion',
    token=Variable.get('slack_token'),
    channel='#engineering-commits',
    username='******',
    icon_url=Variable.get('tia_slack_icon_url'),
    text='*user_work_experience_job_posting_similarity_scores* has been refreshed on {{ts}}',
    dag=dag)

create_temp_scores_table_op.set_downstream(copy_scores_to_temp_table_op)
copy_scores_to_temp_table_op.set_downstream(remove_scores_op)
copy_scores_to_temp_table_op.set_downstream(update_scores_op)
delete_temp_scores_table_op.set_upstream(remove_scores_op)
delete_temp_scores_table_op.set_upstream(update_scores_op)
delete_temp_scores_table_op.set_downstream(notify_processing_completion_op)
dummy_op.set_upstream(compute_title_feature_op)
dummy_op.set_upstream(compute_skill_feature_op)
dummy_op.set_upstream(compute_description_feature_op)
dummy_op.set_downstream(archive_trained_models_op)
task_id="move_rows_to_partitions", postgres_conn_id='postgres_data_warehouse', sql=insert_sql % { 'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH, 'lhdy': LHDY, 'lhdm': LHDM, 'lhdd': LHDD, 'lhdh': LHDH }, dag=dag) clear_partitions = PostgresOperator(task_id="clear_partitions", postgres_conn_id='postgres_data_warehouse', sql=delete_sql % { 'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH, 'lhdy': LHDY, 'lhdm': LHDM, 'lhdd': LHDD, 'lhdh': LHDH }, dag=dag) clear_partitions.set_upstream(sensor) move_rows_to_partitions.set_upstream(clear_partitions)
    catchup=False,
    default_args=default_args,
    schedule_interval=timedelta(days=1))

t1 = BashOperator(task_id='import_countries',
                  bash_command='python3 /code/manage.py import_countries',
                  dag=dag)

t2 = BashOperator(task_id='import_territories',
                  bash_command='python3 /code/manage.py import_territories',
                  dag=dag)

# TODO: Improve by incremental insert (see the sketch below)
sql = """
    DELETE FROM geography_countryterritory;
    INSERT INTO geography_countryterritory
        (country, citizen_names, start_date, end_date, name, official_name, territory, id)
    SELECT country, citizen_names, start_date, end_date, name, official_name, territory,
           (ROW_NUMBER() OVER (ORDER BY name)) AS id
    FROM (SELECT country, citizen_names, start_date, end_date, name, official_name,
                 NULL AS territory
          FROM geography_country
          UNION ALL
          SELECT NULL AS country, NULL AS citizen_names, start_date, end_date, name,
                 official_name, territory
          FROM geography_territory) tmp
    ORDER BY name;"""

t3 = PostgresOperator(task_id='merge_tables',
                      sql=sql,
                      postgres_conn_id='postgres_django',
                      dag=dag)
t3.set_upstream([t1, t2])
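# A hedged sketch of the incremental insert the TODO asks for. It assumes two
# things the current schema may not have: a unique constraint on `name` (for
# ON CONFLICT) and a sequence/identity default on `id` (so the ROW_NUMBER()
# id hack can be dropped); treat it as a direction, not a drop-in replacement.
incremental_sql = """
    INSERT INTO geography_countryterritory
        (country, citizen_names, start_date, end_date, name, official_name, territory)
    SELECT country, citizen_names, start_date, end_date, name, official_name, territory
    FROM (SELECT country, citizen_names, start_date, end_date, name, official_name,
                 NULL AS territory
          FROM geography_country
          UNION ALL
          SELECT NULL AS country, NULL AS citizen_names, start_date, end_date, name,
                 official_name, territory
          FROM geography_territory) tmp
    ON CONFLICT (name) DO UPDATE
    SET country       = EXCLUDED.country,
        citizen_names = EXCLUDED.citizen_names,
        start_date    = EXCLUDED.start_date,
        end_date      = EXCLUDED.end_date,
        official_name = EXCLUDED.official_name,
        territory     = EXCLUDED.territory;"""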
all_past_pageviews as (
    select user_id_uuid, user_id, max(date(time_received)) as ymd
    from staging.user_page_log
    where date(time_received) <= '%(YYDY)s-%(YYDM)s-%(YYDD)s'
    group by user_id_uuid, user_id)
select
    daily_pageviews.user_id_uuid,
    daily_pageviews.user_id,
    '%(YDY)s-%(YDM)s-%(YDD)s'::date as ymd,
    has_wallet,
    pageviews,
    coalesce(all_past_pageviews.ymd, '-Infinity'::date) as last_visit
from daily_pageviews
left join all_past_pageviews
    on daily_pageviews.user_id_uuid = all_past_pageviews.user_id_uuid
"""

delete_cmd = "DELETE FROM agg_visitor_previous_visit WHERE ymd = '%(YDY)s-%(YDM)s-%(YDD)s' "

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % MACROS,
    poke_interval=15 * 60,  # re-check every 15 minutes (value is in seconds)
    timeout=60 * 60 * 13,   # 13h; the S3 dag has a 12h timeout
    dag=dag)

delete = PostgresOperator(task_id="delete",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=delete_cmd % MACROS,
                          dag=dag)
insert = PostgresOperator(task_id="insert",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=insert_cmd % MACROS,
                          dag=dag)

delete.set_upstream(sensor)
insert.set_upstream(delete)
    )
    SELECT heroes.name AS banned_hero,
           map_name,
           game_type,
           game_version,
           COUNT(*) AS banned_games
    FROM (
        SELECT jsonb_array_elements(tgs.doc -> 'banned') AS banned_hero,
               r.doc->>'gameType'::varchar AS game_type,
               (r.doc->>'gameVersion') :: INTEGER AS game_version,
               r.doc->>'mapName' AS map_name
        FROM {{team_table}} tgs
        JOIN {{info_table}} r
          ON (tgs.replayid = r.replayid)
         AND tgs.doc ->> 'banned' IS NOT NULL
        ORDER BY r.updated_at) source_data
    JOIN heroes
      ON (LOWER(REGEXP_REPLACE(source_data.banned_hero :: VARCHAR, '"', '', 'g')) = heroes.attrib_name)
    GROUP BY heroes.name, game_type, game_version, map_name
    ON CONFLICT ON CONSTRAINT stats_hero_bans_pk DO UPDATE
    SET banned_games = excluded.banned_games
    """).render(metrics_table=task_variables['METRICS_TABLE'],
                team_table=task_variables['TEAMSTATS'],
                info_table=task_variables['REPLAYINFO']))

insert_banned_heroes.set_upstream(create_table)

end_task = DummyOperator(dag=dag, task_id='end_task')
end_task.set_upstream(insert_banned_heroes)