WHERE time_received < '%(y)s-%(m)s-01 00:00:00';
        """ % {
        'y': DY,
        'm': DM,
        'd': DD,
        'yy': YDY,
        'ym': YDM,
        'yd': YDD
    },
    dag=dag)

# TODO: conditional failure notification here (a sketch follows the dependency wiring below)
# notify_on_failure = SlackAPIPostOperator(
#
# )

# all_succeeded = SlackAPIPostOperator(
#     channel='#analytics',
#     username="******",
#     icon_url="https://airflow.apache.org/_images/pin_large.png",
#     slack_conn_id="slack_fora_internal",
#     text="S3 data has loaded successfully for DAG run {{ ds }}. Carry on!"
# )

clear_partitions.set_upstream(create_partitions)
create_partitions.set_upstream(sensor)
load_logs_to_postgres.set_upstream(clear_partitions)
move_rows_to_partitions.set_upstream(load_logs_to_postgres)
attach_partitions.set_upstream(move_rows_to_partitions)
# all_succeeded.set_upstream(attach_partitions)
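# A sketch of the TODO above, assuming the same Slack connection as the
# commented-out all_succeeded task: a notification task wired downstream of the
# final step with trigger_rule='one_failed' fires only when something upstream
# fails. Channel and message text here are placeholders, not from the source.
notify_on_failure = SlackAPIPostOperator(
    task_id='notify_on_failure',
    channel='#analytics',
    username='airflow',
    icon_url="https://airflow.apache.org/_images/pin_large.png",
    slack_conn_id="slack_fora_internal",
    text="S3 load failed for DAG run {{ ds }} -- please investigate.",
    trigger_rule='one_failed',
    dag=dag)
notify_on_failure.set_upstream(attach_partitions)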
Example #2
clean_stg_player_stats_ds = PostgresOperator(
    task_id='clean_stg_player_stats_ds',
    postgres_conn_id='hotstats',
    dag=dag,
    sql=Template("""
                    DELETE FROM {{stg_table}} WHERE process_date = '{{ds}}';
                    DELETE FROM {{stg_table_1}} WHERE process_date = '{{ds}}';
                    DELETE FROM {{stg_table_2}} WHERE process_date = '{{ds}}';
                    DELETE FROM {{stg_table_3}} WHERE process_date = '{{ds}}';
                """).render(stg_table=task_variables['STG_PLAYER_STATS_0'],
                            stg_table_1=task_variables['STG_PLAYER_STATS_1'],
                            stg_table_2=task_variables['STG_PLAYER_STATS_2'],
                            stg_table_3=task_variables['STG_PLAYER_STATS_3'],
                            ds="{{ ds }}"))

clean_stg_player_stats_ds.set_upstream(create_table)
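# Template(...).render(...) above bakes the staging table names into the SQL
# string up front while leaving "{{ ds }}" for Airflow's own Jinja pass. A
# sketch of the same cleanup done in a single templating pass via the operator's
# params argument (the task_id is hypothetical; task_variables is assumed to be
# the same mapping used above):
clean_stg_player_stats_ds_params = PostgresOperator(
    task_id='clean_stg_player_stats_ds_params',
    postgres_conn_id='hotstats',
    dag=dag,
    params={'stg_tables': [task_variables['STG_PLAYER_STATS_0'],
                           task_variables['STG_PLAYER_STATS_1'],
                           task_variables['STG_PLAYER_STATS_2'],
                           task_variables['STG_PLAYER_STATS_3']]},
    sql="""
        {% for stg_table in params.stg_tables %}
        DELETE FROM {{ stg_table }} WHERE process_date = '{{ ds }}';
        {% endfor %}
    """)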

insert_stg_player_stats_0 = PostgresOperator(
    task_id='insert_stg_player_stats_0',
    postgres_conn_id='hotstats',
    dag=dag,
    sql=Template("""
        INSERT INTO {{stg_table_0}} (heroName, replayId, team, metric, value, process_date)
        SELECT
            heroname,
            replayid,
            team,
            LOWER(key) AS key,
            CASE WHEN value ~ '^[0-9]+$'
                THEN value :: INTEGER
                ELSE 0 END AS value,
Example #3
    dag=dag)

# Get daily language totals
cockroachdb_daily_language_totals = PostgresOperator(
    task_id='cockroachdb_daily_language_totals',
    postgres_conn_id='cockroachdb',
    autocommit=True,
    sql="""
    UPSERT INTO daily_language_totals (
    	language_name,
    	commit_date,
    	total_daily_usage
    )
    SELECT language_name,
    		commit_date,
    		CAST(SUM(usage_count) AS INT) AS total_daily_usage
    	FROM commits
    	WHERE commit_date = '{{ ds }}'
    	GROUP BY commit_date,
    		language_name
    ;""",
    dag=dag)
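# UPSERT is CockroachDB-specific syntax; against a stock PostgreSQL connection
# the same daily rollup could be written with INSERT ... ON CONFLICT. A sketch,
# assuming daily_language_totals has a unique constraint on
# (language_name, commit_date):
postgres_daily_language_totals_sql = """
    INSERT INTO daily_language_totals (language_name, commit_date, total_daily_usage)
    SELECT language_name,
           commit_date,
           CAST(SUM(usage_count) AS INT) AS total_daily_usage
      FROM commits
     WHERE commit_date = '{{ ds }}'
     GROUP BY commit_date, language_name
    ON CONFLICT (language_name, commit_date)
    DO UPDATE SET total_daily_usage = EXCLUDED.total_daily_usage;
"""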

# define the DAG edges
download.set_upstream(check_for_new_dump)
bsondump.set_upstream(download)
remove_staging_file.set_upstream(bsondump)
spark_parse_commits.set_upstream(remove_staging_file)
cockroachdb_daily_import_summary.set_upstream(spark_parse_commits)
cockroachdb_daily_language_totals.set_upstream(spark_parse_commits)
Example #4
        redshift_schema='public',
        s3_conn_id='aws_credentials',
        s3_bucket='udac-dend-capstone-dz',
        load_type='truncate',
        schema_location='Local',
        start_date=start_date),
                                           task_id=subdag_id,
                                           dag=dag,
                                           executor=LocalExecutor())
    copy_data_to_redshift.set_upstream(start_operator)

process_dim_category = PostgresOperator(dag=dag,
                                        task_id='process_dim_category',
                                        sql='/sql/categories.sql',
                                        postgres_conn_id='redshift')
process_dim_category.set_upstream(copy_data_to_redshift)

process_dim_cities = PostgresOperator(dag=dag,
                                      task_id='process_dim_cities',
                                      sql='/sql/cities.sql',
                                      postgres_conn_id='redshift')
process_dim_cities.set_upstream(copy_data_to_redshift)

process_dim_business = PostgresOperator(dag=dag,
                                        task_id='process_dim_business',
                                        sql='/sql/business.sql',
                                        postgres_conn_id='redshift')
process_dim_business.set_upstream([process_dim_category, process_dim_cities])

process_dim_users = PostgresOperator(dag=dag,
                                     task_id='process_dim_users',
Example #5
    'depends_on_past': True,
    'start_date': datetime(2019, 2, 6),
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('agg_bounty_snapshot',
          default_args=default_args,
          schedule_interval='@daily')

insert_cmd = """INSERT INTO agg_bounty_snapshot (bounty_id, title, description, bounty_stage,
            usd_price, user_id, deadline, created, ymd)
            SELECT bounty_id, title, description, "bountyStage" as bounty_stage,
            usd_price, user_id, deadline, created, '{{ ds }}' as ymd from bounties.std_bounties_bounty """
delete_cmd = "DELETE FROM agg_bounty_snapshot WHERE ymd = '{{ ds }}' "

delete = PostgresOperator(task_id="delete",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=delete_cmd,
                          dag=dag)

insert = PostgresOperator(task_id="insert",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=insert_cmd,
                          dag=dag)

insert.set_upstream(delete)
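# The delete-then-insert pair keyed on '{{ ds }}' keeps reruns idempotent. A
# sketch of collapsing both steps into one statement, assuming
# agg_bounty_snapshot had a unique constraint on (bounty_id, ymd):
upsert_cmd = """INSERT INTO agg_bounty_snapshot (bounty_id, title, description, bounty_stage,
            usd_price, user_id, deadline, created, ymd)
            SELECT bounty_id, title, description, "bountyStage" as bounty_stage,
            usd_price, user_id, deadline, created, '{{ ds }}' as ymd from bounties.std_bounties_bounty
            ON CONFLICT (bounty_id, ymd) DO UPDATE SET
                title = EXCLUDED.title,
                description = EXCLUDED.description,
                bounty_stage = EXCLUDED.bounty_stage,
                usd_price = EXCLUDED.usd_price,
                user_id = EXCLUDED.user_id,
                deadline = EXCLUDED.deadline,
                created = EXCLUDED.created """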
Example #6
          SUM(CASE WHEN
            DATE((match_date AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
              BETWEEN ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC') - interval '180 day'
                AND
              ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
            THEN 1 ELSE 0
          END) AS games_l180
        FROM {{agg_table}} WHERE metric = 'match_won'
        GROUP BY heroname, mapname, toonhandle, gameType, gameversion ) winrates
        ON CONFLICT ON CONSTRAINT stats_historical_winrates_toonhandle_mapname_pk DO
        UPDATE SET
        winrate_l30 = excluded.winrate_l30,
        winrate_l60 = excluded.winrate_l60,
        winrate_l90 = excluded.winrate_l90,
        winrate_l180 = excluded.winrate_l180,
        games_l30 = excluded.games_l30,
        games_l60 = excluded.games_l60,
        games_l90 = excluded.games_l90,
        games_l180 = excluded.games_l180,
        wins_l30 = excluded.wins_l30,
        wins_l60 = excluded.wins_l60,
        wins_l90 = excluded.wins_l90,
        wins_l180 = excluded.wins_l180;
    """).render(metrics_table=task_variables['METRICS_TABLE'],
                agg_table=task_variables['STG_PLAYER_STATS_AGG'],
                ds="{{ ds }}"))
insert_global_winrates.set_upstream(create_table)

end_task = DummyOperator(dag=dag, task_id='end_task')
end_task.set_upstream(insert_global_winrates)
Example #7
                    dag=dag)

t3 = PythonOperator(task_id='modelling',
                    provide_context=True,
                    python_callable=model,
                    dag=dag)

t4 = PostgresOperator(
    task_id='create_table',
    postgres_conn_id="postgres_default",
    sql="create table if not exists labeling(order_id varchar(50), user_id varchar(50))",
    database="airflow",
    dag=dag)

t5 = PostgresOperator(task_id='truncate_table',
                      postgres_conn_id="postgres_default",
                      sql="truncate table labeling;",
                      database="airflow",
                      dag=dag)

t6 = PythonOperator(task_id='load_to_postgres',
                    provide_context=True,
                    python_callable=load_to_postgres,
                    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
t5.set_upstream(t4)
t6.set_upstream(t3)
t6.set_upstream(t5)
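# The same edges expressed with Airflow's bitshift composition, which reads in
# execution order (equivalent to the set_upstream calls above, so only one of
# the two forms should be kept):
#   t1 >> t2 >> t3 >> t6
#   t4 >> t5 >> t6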
Example #8
        dag=dag)

playdemic_build_manifest_earnings = PythonOperator(
        task_id='playdemic_build_manifest_earnings',
        python_callable=build_manifest,
        provide_context=True,
        op_kwargs={'local_path': '/mnt/xxx-airflow/airflow/google/data/playdemic/earnings'
                    ,'s3_bucket':'xxxxxxxxxxx'
                    ,'s3_prefix':'xxxxxxxxxxx'
                    ,'bucket': 'xxxxxxxxxxx'
                    ,'yearmonth':(date.today() + relativedelta.relativedelta(months=-1)).strftime('%Y%m')
                    ,'bucket_subfolder':'earnings'
                    ,'manifest_file': 'manifest_earnings.txt'
                    ,'file_filter': 'PlayApps_' + (date.today() + relativedelta.relativedelta(months=-1)).strftime('%Y%m')
                  },
        dag=dag)
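# The yearmonth and file_filter values above step back one calendar month with
# dateutil's relativedelta; the imports this excerpt assumes (not shown) are:
from datetime import date
from dateutil import relativedelta

# e.g. running on 2020-03-15 this evaluates to '202002'
previous_yearmonth = (date.today() + relativedelta.relativedelta(months=-1)).strftime('%Y%m')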

rs_load_playdemic_earnings.set_upstream(rs_delete_earnings)
rs_delete_earnings.set_upstream(s3_upload_playdemic_earnings)
s3_upload_playdemic_earnings.set_upstream(playdemic_build_manifest_earnings)
playdemic_build_manifest_earnings.set_upstream(playdemic_download_lastmonth)
playdemic_download_lastmonth.set_upstream(check_end_of_month)

rs_load_xx_earnings.set_upstream(rs_delete_earnings)
rs_delete_earnings.set_upstream(s3_upload_xx_earnings)
s3_upload_xx_earnings.set_upstream(xx_build_manifest_earnings)
xx_build_manifest_earnings.set_upstream(xx_download_lastmonth)
xx_download_lastmonth.set_upstream(check_end_of_month)

check_end_of_month.set_upstream(clear_folders)
Example #9
            SUM(CASE WHEN
              DATE((match_date AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
                BETWEEN ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC') - interval '180 day'
                  AND
                ((CURRENT_TIMESTAMP AT TIME ZONE 'UTC-07:00') AT TIME ZONE 'UTC')
              THEN 1 ELSE 0
            END) AS games_l180
          FROM {{agg_table}}
          GROUP BY heroname, mapname, metric
          ) metrics
          ON CONFLICT ON CONSTRAINT stats_historical_metrics_heroname_mapname_metric_pk DO
          UPDATE SET mpg_l30 = excluded.mpg_l30,
          mpg_l60 = excluded.mpg_l60,
          mpg_l90 = excluded.mpg_l90,
          mpg_l180 = excluded.mpg_l180,
          games_l30 = excluded.games_l30,
          games_l60 = excluded.games_l60,
          games_l90 = excluded.games_l90,
          games_l180 = excluded.games_l180,
          metric_l30 = excluded.metric_l30,
          metric_l60 = excluded.metric_l60,
          metric_l90 = excluded.metric_l90,
          metric_l180 = excluded.metric_l180;
    """).render(metrics_table=task_variables['METRICS_TABLE'],
                agg_table=task_variables['STG_PLAYER_STATS_AGG'],
                ds="{{ ds }}"))
insert_global_stats.set_upstream(create_table)

end_task = DummyOperator(dag=dag, task_id='end_task')
end_task.set_upstream(insert_global_stats)
Example #10
    'Airbnb-Data-Pipeline',
    default_args=default_args,
    description='Load and transform Airbnb data in Redshift with Airflow',
    schedule_interval='@hourly',
    max_active_runs=3,
    catchup=False)

start_operator = DummyOperator(task_id='START_OPERATOR', dag=dag)

create_staging_listings_table = PostgresOperator(
    task_id="Create_STAGGING_Listings_Table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_listings)

create_staging_listings_table.set_upstream(start_operator)
## Creating Staging Tables - DAGs

create_staging_calendar_table = PostgresOperator(
    task_id="Create_STAGGING_Calendar_Table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_calendar)

create_staging_calendar_table.set_upstream(start_operator)

create_staging_reviews_table = PostgresOperator(
    task_id="Create_STAGGING_Reviews_Table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=SqlQueries.create_staging_reviews)
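# SqlQueries itself is not shown in this excerpt; a hypothetical minimal shape
# for it, assuming it is just a namespace of CREATE TABLE statements (the column
# lists below are illustrative placeholders, not from the source):
class SqlQueries:
    create_staging_listings = """
        CREATE TABLE IF NOT EXISTS staging_listings (
            id    BIGINT,
            name  VARCHAR(512),
            price VARCHAR(32)
        );"""
    create_staging_calendar = """
        CREATE TABLE IF NOT EXISTS staging_calendar (
            listing_id BIGINT,
            date       DATE,
            available  VARCHAR(1)
        );"""
    create_staging_reviews = """
        CREATE TABLE IF NOT EXISTS staging_reviews (
            listing_id BIGINT,
            date       DATE,
            comments   VARCHAR(65535)
        );"""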
Example #11
    t.set_upstream(dummy_op)
    t.set_downstream(create_temp_scores_table_op)

archive_trained_models_op = BashOperator(
    task_id='archive_trained_models',
    bash_command='scripts/bash/archive_trained_models.sh',
    dag=dag
)

notify_processing_completion_op = SlackAPIPostOperator(
    task_id='notify_processing_completion',
    token=Variable.get('slack_token'),
    channel='#engineering-commits',
    username='******',
    icon_url=Variable.get('tia_slack_icon_url'),
    text='*user_work_experience_job_posting_similarity_scores* has been refreshed on {{ts}}',
    dag=dag
)

create_temp_scores_table_op.set_downstream(copy_scores_to_temp_table_op)
copy_scores_to_temp_table_op.set_downstream(remove_scores_op)
copy_scores_to_temp_table_op.set_downstream(update_scores_op)
delete_temp_scores_table_op.set_upstream(remove_scores_op)
delete_temp_scores_table_op.set_upstream(update_scores_op)
delete_temp_scores_table_op.set_downstream(notify_processing_completion_op)

dummy_op.set_upstream(compute_title_feature_op)
dummy_op.set_upstream(compute_skill_feature_op)
dummy_op.set_upstream(compute_description_feature_op)
dummy_op.set_downstream(archive_trained_models_op)
Example #12
    task_id="move_rows_to_partitions",
    postgres_conn_id='postgres_data_warehouse',
    sql=insert_sql % {
        'dy': DY,
        'dm': DM,
        'dd': DD,
        'dh': DH,
        'lhdy': LHDY,
        'lhdm': LHDM,
        'lhdd': LHDD,
        'lhdh': LHDH
    },
    dag=dag)

clear_partitions = PostgresOperator(task_id="clear_partitions",
                                    postgres_conn_id='postgres_data_warehouse',
                                    sql=delete_sql % {
                                        'dy': DY,
                                        'dm': DM,
                                        'dd': DD,
                                        'dh': DH,
                                        'lhdy': LHDY,
                                        'lhdm': LHDM,
                                        'lhdd': LHDD,
                                        'lhdh': LHDH
                                    },
                                    dag=dag)

clear_partitions.set_upstream(sensor)
move_rows_to_partitions.set_upstream(clear_partitions)
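# The same eight-key mapping is spelled out twice above; a sketch of hoisting it
# into one dict so the two operators stay in sync (names as in the excerpt):
PARTITION_PARAMS = {
    'dy': DY, 'dm': DM, 'dd': DD, 'dh': DH,
    'lhdy': LHDY, 'lhdm': LHDM, 'lhdd': LHDD, 'lhdh': LHDH,
}
# ...which lets both tasks be declared with
#   sql=insert_sql % PARTITION_PARAMS
#   sql=delete_sql % PARTITION_PARAMS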
Example #13
          catchup=False,
          default_args=default_args,
          schedule_interval=timedelta(days=1))

t1 = BashOperator(task_id='import_countries',
                  bash_command='python3 /code/manage.py import_countries',
                  dag=dag)

t2 = BashOperator(task_id='import_territories',
                  bash_command='python3 /code/manage.py import_territories',
                  dag=dag)

# TODO: Improve by incremental insert
sql = (
    "DELETE FROM geography_countryterritory; "
    "INSERT INTO geography_countryterritory "
    "(country, citizen_names, start_date, end_date, name, official_name, territory, id)"
    "SELECT country, citizen_names, start_date, end_date, name, official_name, territory,"
    "(ROW_NUMBER() OVER (ORDER BY name)) as id "
    "FROM (SELECT country, citizen_names, start_date, end_date, name, official_name, NULL as territory from "
    "geography_country UNION ALL "
    "SELECT NULL as country, NULL as citizen_names, start_date, end_date, name, official_name, territory FROM "
    "geography_territory) tmp order by name;")

t3 = PostgresOperator(task_id='merge_tables',
                      sql=sql,
                      postgres_conn_id='postgres_django',
                      dag=dag)

t3.set_upstream([t1, t2])
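# PostgresOperator's sql argument also accepts a list of statements, which runs
# them in order and keeps the DELETE and the INSERT separate in the task logs.
# A sketch that splits the string built above at its first statement boundary
# (merge_tables_as_list is a hypothetical task shown only for illustration, not
# meant to run alongside t3):
delete_stmt, insert_stmt = sql.split('; ', 1)
t3_as_list = PostgresOperator(task_id='merge_tables_as_list',
                              sql=[delete_stmt + ';', insert_stmt],
                              postgres_conn_id='postgres_django',
                              dag=dag)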
Example #14
all_past_pageviews as (select user_id_uuid, user_id, max(date(time_received)) as ymd from staging.user_page_log
                       where date(time_received) <= '%(YYDY)s-%(YYDM)s-%(YYDD)s'
                       group by user_id_uuid, user_id)
select daily_pageviews.user_id_uuid, daily_pageviews.user_id, '%(YDY)s-%(YDM)s-%(YDD)s'::date as ymd, has_wallet, pageviews,
coalesce(all_past_pageviews.ymd, '-Infinity'::date) as last_visit
from daily_pageviews left join all_past_pageviews on daily_pageviews.user_id_uuid = all_past_pageviews.user_id_uuid
"""

delete_cmd = "DELETE FROM agg_visitor_previous_visit WHERE ymd = '%(YDY)s-%(YDM)s-%(YDD)s' "

sensor = SqlSensor(
    task_id="sensor",
    conn_id='postgres_data_warehouse',
    sql=sense_sql % MACROS,
    poke_interval=15 * 60,  # poke every 15 minutes (value in seconds)
    timeout=60 * 60 * 13,  # 13h, S3 dag has 12h timeout
    dag=dag)
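# SqlSensor re-runs sense_sql every poke_interval seconds and succeeds only once
# the query returns a row whose first column is non-empty and non-zero, so the
# delete/insert below wait for the upstream S3 load to land. sense_sql is not
# shown in this excerpt; a hypothetical query that fits the pattern and the
# MACROS keys used above:
sense_sql_example = """
    SELECT COUNT(*)
    FROM staging.user_page_log
    WHERE date(time_received) = '%(YDY)s-%(YDM)s-%(YDD)s'
"""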

delete = PostgresOperator(task_id="delete",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=delete_cmd % MACROS,
                          dag=dag)

insert = PostgresOperator(task_id="insert",
                          postgres_conn_id='postgres_data_warehouse',
                          sql=insert_cmd % MACROS,
                          dag=dag)

delete.set_upstream(sensor)
insert.set_upstream(delete)
Example #15
        )

        SELECT heroes.name AS banned_hero, map_name, game_type, game_version, COUNT(*) AS banned_games
        FROM
          (
            SELECT
              jsonb_array_elements(tgs.doc -> 'banned') AS banned_hero,
              r.doc->>'gameType'::varchar AS game_type,
              (r.doc->>'gameVersion') :: INTEGER AS game_version,
              r.doc->>'mapName' AS map_name
            FROM {{team_table}} tgs
            JOIN {{info_table}} r ON (tgs.replayid = r.replayid)
                                   AND tgs.doc ->> 'banned' IS NOT NULL
            ORDER BY r.updated_at) source_data
            JOIN heroes ON (LOWER(REGEXP_REPLACE(source_data.banned_hero :: VARCHAR, '"', '', 'g')) = heroes.attrib_name)
        GROUP BY heroes.name, game_type, game_version, map_name
        ON CONFLICT ON CONSTRAINT stats_hero_bans_pk DO
        UPDATE SET
        banned_games = excluded.banned_games
    """).render(metrics_table=task_variables['METRICS_TABLE'],
                team_table=task_variables['TEAMSTATS'],
                info_table=task_variables['REPLAYINFO'])
)
insert_banned_heroes.set_upstream(create_table)

end_task = DummyOperator(
    dag=dag,
    task_id='end_task'
)
end_task.set_upstream(insert_banned_heroes)
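# The REGEXP_REPLACE(... '"', '', 'g') join condition above strips the double
# quotes that jsonb_array_elements() leaves around each element once it is cast
# to text. A sketch of the same projection using jsonb_array_elements_text(),
# which yields unquoted text directly and would let the later join be a plain
# LOWER(banned_hero) = heroes.attrib_name comparison:
banned_heroes_text_sql = Template("""
        SELECT
          jsonb_array_elements_text(tgs.doc -> 'banned') AS banned_hero,
          r.doc ->> 'gameType'               AS game_type,
          (r.doc ->> 'gameVersion')::INTEGER AS game_version,
          r.doc ->> 'mapName'                AS map_name
        FROM {{team_table}} tgs
        JOIN {{info_table}} r ON (tgs.replayid = r.replayid)
                               AND tgs.doc ->> 'banned' IS NOT NULL
    """).render(team_table=task_variables['TEAMSTATS'],
                info_table=task_variables['REPLAYINFO'])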