    ON a.url = b.url
    ''',
    destination_dataset_table='my-project.github_trends.hackernews_github_agg${{ yesterday_ds_nodash }}',
    dag=dag)

# Task 7
# airflow test bigquery_github_trends_v1 bq_check_hackernews_github_agg 2017-06-02
t7 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_github_agg',
    sql='''
    #legacySql
    SELECT partition_id
    FROM [my-project:github_trends.hackernews_github_agg$__PARTITIONS_SUMMARY__]
    WHERE partition_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

t3.set_upstream(t1)
t4.set_upstream(t3)
t5.set_upstream(t2)
t6.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t6)
query_tasks, tempAggtables_list = [], []
count = 1
for query in listOfQueries:
    current_time = datetime.today().strftime("%Y%m%d_%H_%M")
    tempTable = "tempAggtable{}".format(count)
    tablePointer_str = "{}.{}.{}".format(project_id, dataset_id, tempTable)
    tempAggtables_list.append(tablePointer_str)
    aggregationQuery_task = BigQueryOperator(
        task_id="queryJob_{}_{}".format(count, current_time),
        sql=query,
        # write each aggregation result to its own fully-qualified temp table
        destination_dataset_table=tablePointer_str,
        write_disposition="WRITE_TRUNCATE",
        create_disposition="CREATE_IF_NEEDED",
        allow_large_results=True,
        dag=dag)
    aggregationQuery_task.set_upstream(dummy_task)
    query_tasks.append(aggregationQuery_task)
    count += 1

'''
Exporting BigQuery aggregation query results from tables
[t8_query1Result_BQTable, ...] >> [t9_query1Result_GCS, ...]
'''
AggbigQueryToGCS_tasks, bigTableInputs = [], []
count = 1
aggQuery_tempFolder_gcs = "gs://{}/aggQuery".format(bucket_name)
for aggQuery, table in zip(query_tasks, tempAggtables_list):
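    # The loop body is truncated in this excerpt. A minimal sketch of what the
    # per-table export task might look like, assuming the contrib
    # BigQueryToCloudStorageOperator (airflow.contrib.operators.bigquery_to_gcs);
    # the task_id and CSV naming below are illustrative, not from the source:
    exportToGCS_task = BigQueryToCloudStorageOperator(
        task_id="export_{}_to_gcs".format(table.split(".")[-1]),
        source_project_dataset_table=table,
        destination_cloud_storage_uris=[
            "{}/{}_*.csv".format(aggQuery_tempFolder_gcs, table.split(".")[-1])],
        export_format="CSV",
        dag=dag)
    exportToGCS_task.set_upstream(aggQuery)
    AggbigQueryToGCS_tasks.append(exportToGCS_task)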
main_summary_experiments_bigquery_load.set_upstream(main_summary_experiments)
experiments_aggregates_import.set_upstream(main_summary_experiments)

search_dashboard.set_upstream(main_summary)
search_dashboard_bigquery_load.set_upstream(search_dashboard)
search_clients_daily.set_upstream(main_summary)
search_clients_daily_bigquery_load.set_upstream(search_clients_daily)

taar_dynamo.set_upstream(main_summary)
taar_similarity.set_upstream(clients_daily_v6)

clients_daily.set_upstream(main_summary)
clients_daily_v6.set_upstream(main_summary)
desktop_active_dau.set_upstream(clients_daily_v6)
clients_daily_v6_bigquery_load.set_upstream(clients_daily_v6)
clients_last_seen.set_upstream(clients_daily_v6_bigquery_load)
clients_last_seen_export.set_upstream(clients_last_seen)
exact_mau_by_dimensions.set_upstream(clients_last_seen)
exact_mau_by_dimensions_export.set_upstream(exact_mau_by_dimensions)

retention.set_upstream(main_summary)
retention_bigquery_load.set_upstream(retention)

client_count_daily_view.set_upstream(main_summary)
desktop_dau.set_upstream(client_count_daily_view)

main_summary_glue.set_upstream(main_summary)

taar_locale_job.set_upstream(clients_daily_v6)
taar_collaborative_recommender.set_upstream(clients_daily_v6)
    #standardSQL
    SELECT
        a.county_name,
        b.State_Name,
        a.state,
        a.deaths,
        a.deaths_per_100000,
        a.confirmed_cases_per_100000,
        a.confirmed_cases,
        b.Avg_HPSA_Score
    FROM `{0}.{1}.covid_aggs` as a
    LEFT JOIN `{0}.{1}.health_agg_county` as b
    ON a.county_name = b.County_Name
    AND a.state = b.State_Abbr
    ORDER BY a.confirmed_cases_per_100000 DESC
    '''.format(BQ_PROJECT, BQ_DATASET, "{{ yesterday_ds }}"),
    destination_dataset_table='{0}.{1}.Covid19_final_table${2}'.format(
        BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}'),
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    use_legacy_sql=False,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag)

# Setting up Dependencies
t2.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
    bigquery_conn_id='bigquery',
    use_legacy_sql=False,
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bql='''
    #standardSQL
    SELECT
        date,
        repo,
        SUM(IF(type='WatchEvent', 1, NULL)) AS stars,
        SUM(IF(type='ForkEvent', 1, NULL)) AS forks
    FROM (
        SELECT
            FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date,
            actor.id as actor_id,
            repo.name as repo,
            type
        FROM `githubarchive.day.{{ yesterday_ds_nodash }}`
        WHERE type IN ('WatchEvent','ForkEvent')
    )
    GROUP BY date, repo
    ''',
    destination_dataset_table='gagan-sb.github_trends.github_daily_metrics${{ yesterday_ds_nodash }}',
    dag=dag)

t1.set_upstream(t0)
# add documentation for what this task does - this will be displayed in the Airflow UI
bq_task_1.doc_md = """\
Append a "Hello World!" message string to the table [airflow.<lob>_test_task1]
"""

# define the second task, in our case another big query operator
bq_task_2 = BigQueryOperator(
    dag=dag,  # need to tell airflow that this task belongs to the dag we defined above
    task_id='my_bq_task_2_' + lob,  # task ids must be unique within the dag
    bql='my_qry_2.sql',  # the actual sql command we want to run on bigquery is in this file in the same folder. it is also templated
    params={"lob": lob},  # the sql file above has a template in it for a 'lob' parameter - this is how we pass it in
    destination_dataset_table='airflow.' + lob + '_test_task2',  # we also in this example want our target table to be lob and task specific
    write_disposition='WRITE_TRUNCATE',  # drop and recreate this table each time; you could use other options here
    bigquery_conn_id='my_gcp_connection'  # this is the airflow connection to gcp we defined in the front end. More info here: https://github.com/alexvanboxel/airflow-gcp-examples
)

# add documentation for what this task does - this will be displayed in the Airflow UI
bq_task_2.doc_md = """\
Append a "Goodbye World!" message string to the table [airflow.<lob>_test_task2]
"""

# set dependencies so, for example, 'bq_task_2' won't start until 'bq_task_1' has completed successfully
bq_task_2.set_upstream(bq_task_1)
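# The my_qry_2.sql file itself is not shown in this excerpt. A minimal sketch of what
# such a templated file might contain -- the column list below is illustrative, not
# taken from the source; the {{ params.lob }} placeholder is rendered by Airflow's
# Jinja templating from the params dict passed to the operator above:
#
#   SELECT
#     '{{ params.lob }}' AS lob,
#     'Goodbye World!' AS message,
#     CURRENT_TIMESTAMP() AS created_at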
        ARRAY_AGG(price ORDER BY created_at DESC LIMIT 1)[SAFE_OFFSET(0)] close
    FROM `composer-236006.crypto.ticks`
    WHERE symbol = 'BTC'
    GROUP BY symbol, timestamp
    ORDER BY symbol, timestamp DESC
    ''',
    destination_dataset_table='composer-236006.crypto.ohlc1d',
    dag=dag)

'''
BTC record count PythonOperator
'''
btc_rc_operator = PythonOperator(
    task_id='btc_rc_operator',
    python_callable=btc_rc,
    dag=dag)

crypto_pull_rates_operator.set_upstream(start_operator)

btc_five_minutes_operator.set_upstream(crypto_pull_rates_operator)
btc_fifteen_minutes_operator.set_upstream(crypto_pull_rates_operator)
btc_thirty_minutes_operator.set_upstream(crypto_pull_rates_operator)
btc_one_hour_operator.set_upstream(crypto_pull_rates_operator)
btc_four_hour_operator.set_upstream(crypto_pull_rates_operator)
btc_daily_operator.set_upstream(crypto_pull_rates_operator)

btc_rc_operator.set_upstream(btc_five_minutes_operator)
btc_rc_operator.set_upstream(btc_fifteen_minutes_operator)
btc_rc_operator.set_upstream(btc_thirty_minutes_operator)
btc_rc_operator.set_upstream(btc_one_hour_operator)
btc_rc_operator.set_upstream(btc_four_hour_operator)
btc_rc_operator.set_upstream(btc_daily_operator)

end_operator.set_upstream(btc_rc_operator)
    task_id='gcs_to_bq_eu',
    bucket='some-bucket',
    schema_fields=schema_fields,
    skip_leading_rows=1,
    source_objects=['eu-referendum-result-data.csv'],
    destination_project_dataset_table='airflow_referendum.result_data',
    dag=dag)

# Task 2: Check admin areas table exists
# airflow test -sd=airflow_dags demo bq_dest_table_lookup_admin 2017-03-06
t2 = BigQueryCheckOperator(
    task_id='bq_dest_table_lookup_admin',
    sql='SELECT table_id FROM [some-project:airflow_referendum.__TABLES__] WHERE table_id = "admin_areas"',
    dag=dag)

# Task 3: Join DCM table with lookup files
# airflow test -sd=airflow_dags demo bq_dest_populate 2017-03-06
t3 = BigQueryOperator(
    task_id='bq_dest_populate',
    bql='demo/bigquery_sql/geo.sql',
    destination_dataset_table='airflow_referendum.geo_results{}'.format(current_date),
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
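# The schema_fields list referenced by the gcs_to_bq_eu load task is defined outside
# this excerpt. A minimal sketch of its expected shape, assuming the standard
# BigQuery schema-dict format accepted by GoogleCloudStorageToBigQueryOperator;
# the column names below are illustrative, not taken from the actual referendum CSV:
#
# schema_fields = [
#     {'name': 'area_code', 'type': 'STRING', 'mode': 'REQUIRED'},
#     {'name': 'area', 'type': 'STRING', 'mode': 'NULLABLE'},
#     {'name': 'electorate', 'type': 'INTEGER', 'mode': 'NULLABLE'},
#     {'name': 'remain_votes', 'type': 'INTEGER', 'mode': 'NULLABLE'},
#     {'name': 'leave_votes', 'type': 'INTEGER', 'mode': 'NULLABLE'},
# ]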