Example #1
    ON a.url = b.url
    ''',
    destination_dataset_table=
    'my-project.github_trends.hackernews_github_agg${{ yesterday_ds_nodash }}',
    dag=dag)

# Task 7
# airflow test bigquery_github_trends_v1 bq_check_hackernews_github_agg 2017-06-02

t7 = BigQueryCheckOperator(task_id='bq_check_hackernews_github_agg',
                           sql='''
    #legacySql
    SELECT
    partition_id
    FROM
    [my-project:github_trends.hackernews_github_agg$__PARTITIONS_SUMMARY__]
    WHERE partition_id = "{{ yesterday_ds_nodash }}"
    ''',
                           dag=dag)
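# Note: {{ yesterday_ds_nodash }} is a built-in Airflow template macro (the execution
# date minus one day, formatted YYYYMMDD). For the test run shown above (execution
# date 2017-06-02) it renders to '20170601', i.e. the daily partition targeted by the
# destination table above and verified here by the check query.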

t3.set_upstream(t1)

t4.set_upstream(t3)

t5.set_upstream(t2)

t6.set_upstream(t4)
t6.set_upstream(t5)

t7.set_upstream(t6)
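For reference, the same dependency graph can be written with Airflow's bitshift composition; the following is equivalent to the set_upstream calls above:

t1 >> t3 >> t4 >> t6
t2 >> t5 >> t6
t6 >> t7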
Example #2
query_tasks, tempAggtables_list = [], []
count = 1
for query in listOfQueries:
    current_time = datetime.today().strftime("%Y%m%d_%H_%M")
    tempTable = "tempAggtable{}".format(count)
    tablePointer_str = "{}.{}.{}".format(project_id, dataset_id, tempTable)
    tempAggtables_list.append(tablePointer_str)
    # Each aggregation query writes its results to a dedicated temporary table.
    aggregationQuery_task = BigQueryOperator(
        task_id="queryJob_{}_{}".format(count, current_time),
        sql=query,
        destination_dataset_table=tablePointer_str,
        write_disposition="WRITE_TRUNCATE",
        create_disposition="CREATE_IF_NEEDED",
        allow_large_results=True,
        dag=dag)
    aggregationQuery_task.set_upstream(dummy_task)
    query_tasks.append(aggregationQuery_task)
    count += 1


'''
Exporting BigQuery aggregation query results from the temporary tables to GCS

[t8_query1Result_BQTable,...] >> [t9_query1Result_GCS,...]

'''

AggbigQueryToGCS_tasks, bigTableInputs = [], []
count = 1
aggQuery_tempFolder_gcs = "gs://{}/aggQuery".format(bucket_name)
for aggQuery, table in zip(query_tasks, tempAggtables_list):
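    # The body of this loop is not included in the snippet. A plausible completion
    # (an assumption, not the original code) would export each temporary aggregation
    # table to the GCS staging folder with BigQueryToCloudStorageOperator from
    # airflow.contrib.operators.bigquery_to_gcs:
    exportToGCS_task = BigQueryToCloudStorageOperator(
        task_id="exportJob_{}".format(count),
        source_project_dataset_table=table,
        destination_cloud_storage_uris=[
            "{}/{}_*.csv".format(aggQuery_tempFolder_gcs, table.split(".")[-1])
        ],
        export_format="CSV",
        dag=dag)
    exportToGCS_task.set_upstream(aggQuery)
    AggbigQueryToGCS_tasks.append(exportToGCS_task)
    bigTableInputs.append(table)
    count += 1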
Example #3
main_summary_experiments_bigquery_load.set_upstream(main_summary_experiments)

experiments_aggregates_import.set_upstream(main_summary_experiments)
search_dashboard.set_upstream(main_summary)
search_dashboard_bigquery_load.set_upstream(search_dashboard)
search_clients_daily.set_upstream(main_summary)
search_clients_daily_bigquery_load.set_upstream(search_clients_daily)

taar_dynamo.set_upstream(main_summary)
taar_similarity.set_upstream(clients_daily_v6)

clients_daily.set_upstream(main_summary)
clients_daily_v6.set_upstream(main_summary)
desktop_active_dau.set_upstream(clients_daily_v6)
clients_daily_v6_bigquery_load.set_upstream(clients_daily_v6)
clients_last_seen.set_upstream(clients_daily_v6_bigquery_load)
clients_last_seen_export.set_upstream(clients_last_seen)
exact_mau_by_dimensions.set_upstream(clients_last_seen)
exact_mau_by_dimensions_export.set_upstream(exact_mau_by_dimensions)

retention.set_upstream(main_summary)
retention_bigquery_load.set_upstream(retention)

client_count_daily_view.set_upstream(main_summary)
desktop_dau.set_upstream(client_count_daily_view)

main_summary_glue.set_upstream(main_summary)

taar_locale_job.set_upstream(clients_daily_v6)
taar_collaborative_recommender.set_upstream(clients_daily_v6)
Example #4
    #standardSQL
    SELECT
        a.county_name,
        b.State_Name,
        a.state,
        a.deaths,
        a.deaths_per_100000,
        a.confirmed_cases_per_100000,
        a.confirmed_cases,
        b.Avg_HPSA_Score
    FROM
        `{0}.{1}.covid_aggs` as a
    LEFT JOIN `{0}.{1}.health_agg_county` as b
        ON a.county_name = b.County_Name
        AND a.state = b.State_Abbr
    ORDER BY
        a.confirmed_cases_per_100000 DESC
    '''.format(BQ_PROJECT, BQ_DATASET, "{{ yesterday_ds }}"),
    destination_dataset_table='{0}.{1}.Covid19_final_table${2}'.format(
        BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}'),
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    use_legacy_sql=False,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag)

# Setting up Dependencies
t2.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
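# The snippet stops at the dependency wiring. Mirroring the partition check from
# Example #1, a hedged sketch of an additional verification task (the task id, the
# variable name t5, and its placement downstream of t4 are assumptions):
t5 = BigQueryCheckOperator(
    task_id='bq_check_covid19_final_table',
    sql='''
    #legacySQL
    SELECT partition_id
    FROM [{0}:{1}.Covid19_final_table$__PARTITIONS_SUMMARY__]
    WHERE partition_id = "{2}"
    '''.format(BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}'),
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag)
t5.set_upstream(t4)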
Example #5
    bigquery_conn_id='bigquery',
    use_legacy_sql=False,
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bql='''
    #standardSQL
    SELECT
      date,
      repo,
      SUM(IF(type='WatchEvent', 1, NULL)) AS stars,
      SUM(IF(type='ForkEvent',  1, NULL)) AS forks
    FROM (
      SELECT
        FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date,
        actor.id as actor_id,
        repo.name as repo,
        type
      FROM
        `githubarchive.day.{{ yesterday_ds_nodash }}`
      WHERE type IN ('WatchEvent','ForkEvent')
    )
    GROUP BY
      date,
      repo
    ''',
    destination_dataset_table=
    'gagan-sb.github_trends.github_daily_metrics${{ yesterday_ds_nodash }}',
    dag=dag)

t1.set_upstream(t0)
Example #6
    # add documentation for what this task does - this will be displayed in the Airflow UI
    bq_task_1.doc_md = """\
    Append a "Hello World!" message string to the table [airflow.<lob>_test_task1]
    """

    # define the second task, in our case another big query operator
    bq_task_2 = BigQueryOperator(
        dag=dag,  # need to tell airflow that this task belongs to the dag we defined above
        task_id='my_bq_task_2_' + lob,  # task ids must be unique within the dag
        bql='my_qry_2.sql',  # the actual sql we want to run on bigquery lives in this file in the same folder; it is also templated
        params={"lob": lob},  # the sql file above has a template in it for a 'lob' parameter - this is how we pass it in
        destination_dataset_table='airflow.' + lob + '_test_task2',  # in this example we also want our target table to be lob and task specific
        write_disposition='WRITE_TRUNCATE',  # drop and recreate this table each time; you could use other options here
        bigquery_conn_id='my_gcp_connection'  # this is the airflow connection to gcp we defined in the front end. More info here: https://github.com/alexvanboxel/airflow-gcp-examples
    )
    # add documentation for what this task does - this will be displayed in the Airflow UI
    bq_task_2.doc_md = """\
    Append a "Goodbye World!" message string to the table [airflow.<lob>_test_task2]
    """

    # set dependencies so that, for example, 'bq_task_2' won't start until 'bq_task_1' has completed successfully
    bq_task_2.set_upstream(bq_task_1)
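The templated file my_qry_2.sql is not included in the snippet. Purely as an illustration (the real file's contents are unknown), a templated query of the shape the comments describe might look like:

-- hypothetical my_qry_2.sql; {{ params.lob }} is rendered from the params dict passed to the operator
SELECT 'Goodbye World! - {{ params.lob }}' AS msg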
Example #7
        ARRAY_AGG(price ORDER BY created_at DESC LIMIT 1)[SAFE_OFFSET(0)] close
    FROM `composer-236006.crypto.ticks`
    WHERE symbol ='BTC'
    GROUP BY symbol, timestamp
    ORDER BY symbol, timestamp DESC
    ''',
    destination_dataset_table='composer-236006.crypto.ohlc1d',
    dag=dag)
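# Only the tail of the daily OHLC query is visible above. The
# ARRAY_AGG(price ORDER BY created_at ... LIMIT 1)[SAFE_OFFSET(0)] idiom selects the
# first/last tick in each group; a hedged reconstruction of the full SELECT (any
# column or alias not shown above is an assumption) might read:
#
#   SELECT
#     symbol,
#     TIMESTAMP_TRUNC(created_at, DAY) AS timestamp,
#     ARRAY_AGG(price ORDER BY created_at ASC LIMIT 1)[SAFE_OFFSET(0)] AS open,
#     MAX(price) AS high,
#     MIN(price) AS low,
#     ARRAY_AGG(price ORDER BY created_at DESC LIMIT 1)[SAFE_OFFSET(0)] AS close
#   FROM `composer-236006.crypto.ticks`
#   WHERE symbol = 'BTC'
#   GROUP BY symbol, timestamp
#   ORDER BY symbol, timestamp DESC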
''' BTC record count PythonOperator '''
btc_rc_operator = PythonOperator(task_id='btc_rc_operator',
                                 python_callable=btc_rc,
                                 dag=dag)

crypto_pull_rates_operator.set_upstream(start_operator)

btc_five_minutes_operator.set_upstream(crypto_pull_rates_operator)
btc_fifteen_minutes_operator.set_upstream(crypto_pull_rates_operator)
btc_thirty_minutes_operator.set_upstream(crypto_pull_rates_operator)
btc_one_hour_operator.set_upstream(crypto_pull_rates_operator)
btc_four_hour_operator.set_upstream(crypto_pull_rates_operator)
btc_daily_operator.set_upstream(crypto_pull_rates_operator)

btc_rc_operator.set_upstream(btc_five_minutes_operator)
btc_rc_operator.set_upstream(btc_fifteen_minutes_operator)
btc_rc_operator.set_upstream(btc_thirty_minutes_operator)
btc_rc_operator.set_upstream(btc_one_hour_operator)
btc_rc_operator.set_upstream(btc_four_hour_operator)
btc_rc_operator.set_upstream(btc_daily_operator)

end_operator.set_upstream(btc_rc_operator)
Example #8
    task_id='gcs_to_bq_eu',
    bucket='some-bucket',
    schema_fields=schema_fields,
    skip_leading_rows=1,
    source_objects=['eu-referendum-result-data.csv'],
    destination_project_dataset_table='airflow_referendum.result_data',
    dag=dag)

# Task 2: Check admin areas table exists
# airflow test -sd=airflow_dags demo bq_dest_table_lookup_admin 2017-03-06

t2 = BigQueryCheckOperator(
    task_id='bq_dest_table_lookup_admin',
    sql=
    'SELECT table_id FROM [some-project:airflow_referendum.__TABLES__] WHERE table_id = "admin_areas"',
    dag=dag)

# Task 3: Join DCM table with lookup files
# airflow test -sd=airflow_dags demo bq_dest_populate 2017-03-06

t3 = BigQueryOperator(
    task_id='bq_dest_populate',
    bql='demo/bigquery_sql/geo.sql',
    destination_dataset_table='airflow_referendum.geo_results{}'.format(
        current_date),
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
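The existence check in Task 2 uses the legacy-SQL __TABLES__ meta-table. An equivalent standard-SQL check (an alternative sketch, not part of the original example, and assuming an Airflow version whose BigQueryCheckOperator accepts use_legacy_sql) would query INFORMATION_SCHEMA.TABLES instead:

t2_std = BigQueryCheckOperator(
    task_id='bq_dest_table_lookup_admin_std',
    use_legacy_sql=False,
    sql="SELECT table_name "
        "FROM `some-project.airflow_referendum.INFORMATION_SCHEMA.TABLES` "
        "WHERE table_name = 'admin_areas'",
    dag=dag)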