def table_custom_check(jinja_query, days_to_retry):
    return BigQueryCheckOperator(
        task_id='custom_check',
        sql=jinja_query,
        use_legacy_sql=False,
        retries=2 * 24 * days_to_retry,  # One retry every 30 minutes for `days_to_retry` days.
        execution_timeout=timedelta(days=days_to_retry),  # Time out after `days_to_retry` days.
        retry_delay=timedelta(minutes=30),  # Delay between retries: 30 minutes.
        max_retry_delay=timedelta(minutes=30),  # Maximum delay between retries: 30 minutes.
        on_failure_callback=config_tools.failure_callback_gfw)
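# Minimal usage sketch (not part of the original source): the Jinja-templated
# query and the 3-day window are illustrative assumptions, and the operator is
# assumed to be attached to an existing DAG elsewhere.
existence_check = table_custom_check(
    jinja_query="SELECT COUNT(*) FROM `my_dataset.my_table` "
                "WHERE DATE(_PARTITIONTIME) = '{{ ds }}'",
    days_to_retry=3)  # 2 * 24 * 3 = 144 retries at 30-minute intervals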
def table_check(self, task_id, project, dataset, table, date, **retries_config):
    """
    Returns a BigQueryCheckOperator that checks the existence of a partitioned
    table for a specific date, retrying every 30 minutes for up to 3 days by default.

    :param task_id: The task identification.
    :type task_id: str
    :param project: The project where the dataset and table belong.
    :type project: str
    :param dataset: The dataset that has the table.
    :type dataset: str
    :param table: The partitioned table.
    :type table: str
    :param date: The date of the partition to be checked.
    :type date: str
    :param retries_config: The retries configuration to adapt to your needs.
        You can customize parameters such as retries, execution_timeout,
        retry_delay and max_retry_delay.
    :type retries_config: dict
    """
    retries = retries_config.get('retries')
    execution_timeout = retries_config.get('execution_timeout')
    retry_delay = retries_config.get('retry_delay')
    max_retry_delay = retries_config.get('max_retry_delay')
    return BigQueryCheckOperator(
        task_id=task_id,
        sql='SELECT COUNT(*) FROM [{}.{}${}]'.format(dataset, table, date),
        retries=retries if retries else 2 * 24 * 3,  # Default: retry every 30 minutes for 3 days.
        execution_timeout=execution_timeout if execution_timeout else timedelta(days=3),  # Default timeout of 3 days.
        retry_delay=retry_delay if retry_delay else timedelta(minutes=30),  # Default delay between retries: 30 minutes.
        max_retry_delay=max_retry_delay if max_retry_delay else timedelta(minutes=30),  # Default max delay: 30 minutes.
        on_failure_callback=config_tools.failure_callback_gfw)
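# Illustrative call (an assumption, not from the original code): `helper` stands
# for an instance of whatever class defines table_check above, and the keyword
# arguments show how **retries_config can shorten the default 3-day window.
daily_partition_check = helper.table_check(
    task_id='check_events_partition',
    project='my-project',   # hypothetical project
    dataset='analytics',    # hypothetical dataset
    table='events',         # hypothetical partitioned table
    date='{{ ds_nodash }}',
    retries=2 * 24,                        # retry for 1 day instead of 3
    execution_timeout=timedelta(days=1),
    retry_delay=timedelta(minutes=30))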
    'start_date': datetime(2020, 9, 23),
    'end_date': datetime(2020, 9, 30),
    'depends_on_past': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=10)
}

dag = DAG(dag_id="my-bq-task",
          default_args=default_args,
          schedule_interval='@daily')

filepath = "gs://backup-data-2020/" + 'out_' + datetime.now().strftime(
    "%d_%b_%Y:%H:%M:%S.%f") + '.csv'

t1 = BigQueryCheckOperator(
    bigquery_conn_id="my-bq-conn",
    task_id="check_for_table",
    dag=dag,
    use_legacy_sql=False,
    sql="""select count(*) from `tech-289406.sample.covid`""")

t2 = BigQueryToCloudStorageOperator(
    task_id="dataset_to_gcs",
    source_project_dataset_table="sample.covid",
    destination_cloud_storage_uris=[filepath],
    export_format="CSV",
    field_delimiter=',',
    bigquery_conn_id="my-bq-conn",
    dag=dag,
)

t1 >> t2
t1 = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_eu',
    bucket='some-bucket',
    schema_fields=schema_fields,
    skip_leading_rows=1,
    source_objects=['eu-referendum-result-data.csv'],
    destination_project_dataset_table='airflow_referendum.result_data',
    dag=dag)

# Task 2: Check admin areas table exists
# airflow test -sd=airflow_dags demo bq_dest_table_lookup_admin 2017-03-06
t2 = BigQueryCheckOperator(
    task_id='bq_dest_table_lookup_admin',
    sql='SELECT table_id FROM [some-project:airflow_referendum.__TABLES__] WHERE table_id = "admin_areas"',
    dag=dag)

# Task 3: Join DCM table with lookup files
# airflow test -sd=airflow_dags demo bq_dest_populate 2017-03-06
t3 = BigQueryOperator(
    task_id='bq_dest_populate',
    bql='demo/bigquery_sql/geo.sql',
    destination_dataset_table='airflow_referendum.geo_results{}'.format(current_date),
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

t2.set_upstream(t1)
def preprocess_tasks(model, dag, PROJECT_ID, BUCKET, DATA_DIR):
    # Constants
    # Specify your source BigQuery project, dataset, and table names
    SOURCE_BQ_PROJECT = "nyc-tlc"
    SOURCE_DATASET_TABLE_NAMES = "yellow.trips,green.trips_2014,green.trips_2015".split(",")

    # Specify your destination BigQuery dataset
    DESTINATION_DATASET = "taxifare"

    # BigQuery data query
    bql = """
    SELECT
        (tolls_amount + fare_amount) AS fare_amount,
        EXTRACT(DAYOFWEEK FROM pickup_datetime) * 1.0 AS dayofweek,
        EXTRACT(HOUR FROM pickup_datetime) * 1.0 AS hourofday,
        pickup_longitude AS pickuplon,
        pickup_latitude AS pickuplat,
        dropoff_longitude AS dropofflon,
        dropoff_latitude AS dropofflat,
        passenger_count * 1.0 AS passengers,
        CONCAT(CAST(pickup_datetime AS STRING),
               CAST(pickup_longitude AS STRING),
               CAST(pickup_latitude AS STRING),
               CAST(dropoff_latitude AS STRING),
               CAST(dropoff_longitude AS STRING)) AS key
    FROM
        `{0}.{1}`
    WHERE
        trip_distance > 0
        AND fare_amount >= 2.5
        AND pickup_longitude > -78
        AND pickup_longitude < -70
        AND dropoff_longitude > -78
        AND dropoff_longitude < -70
        AND pickup_latitude > 37
        AND pickup_latitude < 45
        AND dropoff_latitude > 37
        AND dropoff_latitude < 45
        AND passenger_count > 0
        AND rand() < 0.00001
    """

    bql = bql.format(SOURCE_BQ_PROJECT, model)

    bql_train = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) < 4".format(bql)
    bql_eval = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) = 4".format(bql)

    # Complete the BigQueryOperator task to truncate the table if it already exists before writing
    # Reference: https://airflow.apache.org/integration.html#bigqueryoperator
    bq_train_data_op = BigQueryOperator(
        task_id="bq_train_data_{}_task".format(model.replace(".", "_")),
        bql=bql_train,
        destination_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag
    )

    bq_eval_data_op = BigQueryOperator(
        task_id="bq_eval_data_{}_task".format(model.replace(".", "_")),
        bql=bql_eval,
        destination_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag
    )

    sql = """
    SELECT
        COUNT(*)
    FROM
        [{0}:{1}.{2}]
    """

    # Check to make sure that the data tables won't be empty
    bq_check_train_data_op = BigQueryCheckOperator(
        task_id="bq_check_train_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".", "_") + "_train_data"),
        dag=dag
    )

    bq_check_eval_data_op = BigQueryCheckOperator(
        task_id="bq_check_eval_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".", "_") + "_eval_data"),
        dag=dag
    )

    # BigQuery training data export to GCS
    bash_remove_old_data_op = BashOperator(
        task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
        bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi".format(BUCKET, model.replace(".", "_")),
        dag=dag
    )

    # Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
    bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_train_csv_{}_task".format(model.replace(".", "_")),
        source_project_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[DATA_DIR + "{}/train-*.csv".format(model.replace(".", "_"))],
        export_format="CSV",
        print_header=False,
        dag=dag
    )

    bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(".", "_")),
        source_project_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[DATA_DIR + "{}/eval-*.csv".format(model.replace(".", "_"))],
        export_format="CSV",
        print_header=False,
        dag=dag
    )

    return (bq_train_data_op, bq_eval_data_op,
            bq_check_train_data_op, bq_check_eval_data_op,
            bash_remove_old_data_op,
            bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op)
# Define DAG: Set ID and assign default args and schedule interval
dag = DAG('bigquery_github_trends',
          default_args=default_args,
          schedule_interval=schedule_interval)

## Task 1: check that the github archive data has a dated table created for that date
# To test this task, run this command:
# docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_githubarchive_day 2018-12-01
t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    sql='''
    #standardSQL
    SELECT
        table_id
    FROM
        `githubarchive.day.__TABLES_SUMMARY__`
    WHERE
        table_id = "{{ yesterday_ds_nodash }}"
    ''',
    use_legacy_sql=False,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag)

## Task 2: check that the hacker news table contains data for that date.
t2 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_full',
    sql='''
    #standardSQL
    SELECT
        FORMAT_TIMESTAMP("%Y%m%d", timestamp) AS date
    FROM
        `bigquery-public-data.hacker_news.full`
##########
# Authentication
client = bigquery.Client()
# client = bigquery.Client.from_service_account_json('/Users/jasmeetbhatia/Dropbox/Google/service_keys/jsb-demos-8fb1eb4ebaf6.json')

t3 = BigQueryCheckOperator(
    task_id='bq_check_table',
    sql='''
    #legacySql
    SELECT COUNT(pageviews)
    FROM [jsb-demos.ga_360_merch_store.merch_store_ga360_sessions]
    LIMIT 1
    ''',
    dag=dag,
    depends_on_past=True,
    ui_color='#bed578'
)

t4 = BigQueryCheckOperator(
    task_id='bq_data_validation',
    sql='''
    #legacySql
    SELECT
        STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
    FROM
        [bigquery-public-data:hacker_news.full]
    WHERE
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=3)
}

dag = DAG(dag_id='bq_data_v1',
          start_date=datetime(2020, 5, 31),
          default_args=dag_args,
          end_date=None,
          schedule_interval='0 9 * * *')

t1 = BigQueryCheckOperator(
    task_id='check_public_data_exists',
    sql="""
    select count(*) > 0
    from bigquery-public-data.covid19_italy.data_by_region
    where DATE(_PARTITIONTIME) = DATE_SUB('{{ ds }}', INTERVAL 1 DAY)
    """,
    use_legacy_sql=False)

t2 = BigQueryOperator(
    task_id='load_public_data',
    use_legacy_sql=False,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    time_partitioning={"type": 'DAY'},
    sql='''
    #standardSQL
    select
        date,
# ingested within the past 90 days.
check_sql = """
SELECT COUNT(*)
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE trip_start_timestamp >= TIMESTAMP('{{ macros.ds_add(ds, -90) }}')
"""

# BigQueryCheckOperator will fail if the result of the query is 0,
# i.e. if there is no fresh data.
bq_check_data_op = BigQueryCheckOperator(
    task_id="bq_check_data_task",
    use_legacy_sql=False,
    sql=check_sql,
)

CHECK_ERROR_MESSAGE = b64e(b'Error. Did not retrain on stale data.')

# Task will only trigger if all upstream tasks fail, in which case a
# message will be sent to the Pub/Sub topic specified above.
publish_if_failed_check_op = PubSubPublishOperator(
    task_id="publish_on_failed_check_task",
    project=PROJECT_ID,
    topic=TOPIC,
    messages=[{'data': CHECK_ERROR_MESSAGE.decode()}],
    trigger_rule=TriggerRule.ALL_FAILED
)
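# Minimal wiring sketch (an assumption, not shown in the original snippet): with
# trigger_rule=TriggerRule.ALL_FAILED on the publish task, the Pub/Sub alert only
# fires when the upstream freshness check fails. Task names reuse those above.
bq_check_data_op >> publish_if_failed_check_op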
    'start_date': datetime(2019, 12, 17),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(seconds=7)
}

dag = DAG(
    dag_id='BQ_TableToPartTable',
    default_args=default_args,
    schedule_interval='35 6 * * *',
    description='Overwrites and appends data in the _PARTITIONTIME table',
    catchup=False
)

t2 = BigQueryCheckOperator(
    task_id='data_validation',
    sql="""SELECT count(*) FROM `konic-progress-196909.Mig_Data.Orders` where date = '{{ ds }}'""",
    use_legacy_sql=False,
    dag=dag)

t3 = BigQueryOperator(
    task_id='delete_22_day_ago',
    use_legacy_sql=False,
    allow_large_results=True,
    sql="""delete FROM Mig_Data.PartOrders
           where DATE(_PARTITIONTIME) >= '{{ macros.ds_add(ds, -22) }}'
           and DATE(_PARTITIONTIME) <= '{{ ds }}'""",
    dag=dag)

t2 >> t3

for i in range(0, 23, 1):
    k = '{{{{ (execution_date - macros.timedelta(days={})).strftime("%Y-%m-%d") }}}}'.format(i)
    k_ = '{{{{ (execution_date - macros.timedelta(days={})).strftime("%Y%m%d") }}}}'.format(i)
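# Worked example (illustrative, not part of the original source): the doubled
# braces escape str.format(), so the Jinja expression survives the formatting step.
example_k = '{{{{ (execution_date - macros.timedelta(days={})).strftime("%Y-%m-%d") }}}}'.format(1)
# example_k is now the literal template string
#   {{ (execution_date - macros.timedelta(days=1)).strftime("%Y-%m-%d") }}
# which Airflow renders at run time, e.g. to '2021-03-14' for an execution_date
# of 2021-03-15.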
def py_call(name):
    print("Hello {0}. Congrats on Printing this.".format(name))

t2 = PythonOperator(task_id='Py_Test',
                    python_callable=py_call,
                    op_kwargs={'name': 'Ravi Kishore'},
                    dag=dag)

# Check to see if the Source Table exists
t3 = BigQueryCheckOperator(
    task_id='Check_Chicago_Crime_Table',
    sql='''select count(*)
           FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
           where date(trip_start_timestamp) = "{{yesterday_ds}}"''',
    bigquery_conn_id='my_gcp_conn',
    use_legacy_sql=False,
    dag=dag)

# Load data to Project Dataset
t4 = BigQueryOperator(
    task_id='BQLoadDestinationTable',
    sql='''select trip_start_timestamp, trip_end_timestamp, trip_seconds, trip_miles,
                  fare, tips, tolls, extras, trip_total
           FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
           where date(trip_start_timestamp) = "{{ yesterday_ds }}"''',
    bigquery_conn_id='my_gcp_conn',
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    destination_dataset_table=
    create_disposition='CREATE_IF_NEEDED',
    use_legacy_sql=False,
    task_id='analytics_award_saga',
    dag=dag)

drop_awards = BigQueryTableDeleteOperator(
    deletion_dataset_table=award_table,
    ignore_if_missing=True,
    task_id='deletion_dataset_table',
    dag=dag)

###########################
# Validation tasks
###########################
validate_non_empty_movie = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_movie',
    sql=SqlQueries.validate_non_empty_movie,
    use_legacy_sql=False)

validate_non_empty_person = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_person',
    sql=SqlQueries.validate_non_empty_person,
    use_legacy_sql=False)

validate_non_empty_movie_person = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_movie_person',
    sql=SqlQueries.validate_non_empty_movie_person,
    use_legacy_sql=False)
    msg = msg.format(conn_id=new_conn.conn_id)
    print(msg)

dag = DAG('bigquery_github_trends_v1',
          default_args=default_args,
          schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_full',
    bigquery_conn_id='bigquery',
    sql='''
    #legacySql
    SELECT
        STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
    FROM
        [bigquery-public-data:hacker_news.full]
    WHERE
        type = 'story'
        AND STRFTIME_UTC_USEC(timestamp, "%Y%m%d") = "{{ yesterday_ds_nodash }}"
    LIMIT 1
    ''',
    dag=dag)

t1.set_upstream(t0)
        session.commit()
    else:
        msg = '\n\tA connection with `conn_id`={conn_id} already exists\n'
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)

dag = DAG('bigquery_github_trends_v1',
          default_args=default_args,
          schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_github_agg',
    bigquery_conn_id='bigquery',
    sql='''
    #legacySql
    SELECT
        partition_id
    FROM
        [github_trends.hackernews_github_agg$__PARTITIONS_SUMMARY__]
    WHERE
        partition_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

t1.set_upstream(t0)
    'start_date': start_date,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# write dag
with DAG(dag_id='blog',
         default_args=default_args,
         schedule_interval=schedule_interval,
         catchup=False) as dag:

    t1 = BigQueryCheckOperator(task_id='check_bq_data_exists',
                               sql=queries.check_bq_data_exists,
                               use_legacy_sql=False)

    t2 = GoogleCloudStorageObjectSensor(task_id='check_gcs_file_exists',
                                        bucket=cfg.BUCKET,
                                        object=cfg.SOURCE_OBJECT)

    t3 = BigQueryOperator(task_id='write_weight_data_to_bq',
                          sql=queries.write_weight_data_to_bq,
                          destination_dataset_table=cfg.BQ_TABLE_WEIGHT,
                          create_disposition='CREATE_IF_NEEDED',
                          write_disposition='WRITE_TRUNCATE',
                          use_legacy_sql=False)

    t4 = GoogleCloudStorageToBigQueryOperator(
        task_id='write_route_data_to_bq',
dag = DAG(
    'bigquery_check',
    default_args=default_args,
    schedule_interval=schedule_interval
)

## For testing individual tasks run the below code in your terminal
## airflow test bigquery_github_trends bq_check_githubarchive_day 2020-12-01

## Task 1: check that the github archive data has a dated table created for that date
t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    sql='''
    #standardSQL
    SELECT
        table_id
    FROM
        `githubarchive.day.__TABLES_SUMMARY__`
    WHERE
        table_id = "{{ yesterday_ds_nodash }}"
    ''',
    use_legacy_sql=False,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag
)

## Task 2: check that the hacker news table contains data for that date.
t2 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_full',
    sql='''
    #standardSQL
    SELECT
        FORMAT_TIMESTAMP("%Y%m%d", timestamp) AS date
BQ_CONN = gcp_config["BQ_CONN"]
GCP_PROJECT = gcp_config["GCP_PROJECT"]
PROJECT_DATASET = gcp_config["PROJECT_DATASET"]

schedule_interval = "0 0 * * *"

dag = DAG('bigquery_github_trends',
          catchup=True,
          default_args=default_args,
          schedule_interval=schedule_interval)

# Task 1: Check if githubarchive contains data for the date.
# To test: docker-compose run --rm webserver airflow test bigquery_github_trends github_table_check 2020-01-01
t1 = BigQueryCheckOperator(
    task_id="github_table_check",
    sql="""
    SELECT *
    FROM `githubarchive.day.__TABLES_SUMMARY__`
    WHERE table_id = '{{ yesterday_ds_nodash }}'
    """,
    bigquery_conn_id=BQ_CONN,
    use_legacy_sql=False,
    dag=dag
)

# Task 2: Check if hackernews has data for the date.
# To test: docker-compose run --rm webserver airflow test bigquery_github_trends hackernews_full_table_check 2020-01-01
t2 = BigQueryCheckOperator(
    task_id="hackernews_full_table_check",
    sql="""
    SELECT timestamp
    FROM `bigquery-public-data.hacker_news.full`
    WHERE EXTRACT(date from timestamp) = '{{ yesterday_ds }}'
    AND type = 'story'
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=3)
}

dag = DAG(dag_id='covid_italy_daily',
          start_date=datetime(2020, 5, 31),
          default_args=dag_args,
          end_date=None,
          schedule_interval='0 9 * * *')

t1 = BigQueryCheckOperator(
    task_id='check_public_data_exists',
    sql="""
    select count(*) > 0
    from bigquery-public-data.covid19_italy.data_by_region
    where DATE(date) = '{{ ds }}'
    """,
    use_legacy_sql=False)

t2 = BigQueryOperator(
    task_id='load_public_data',
    use_legacy_sql=False,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    time_partitioning={"type": 'DAY'},
    sql='''
    #standardSQL
    select
        date,
load_immigration_data = GoogleCloudStorageToBigQueryOperator(
    task_id='load_immigration_data',
    bucket=gs_bucket,
    source_objects=['immigration_data/*.parquet'],
    destination_project_dataset_table=f'{project_id}:{staging_dataset}.immigration_data',
    source_format='parquet',
    write_disposition='WRITE_TRUNCATE',
    skip_leading_rows=1,
    autodetect=True)

# Check loaded data not null
check_us_cities_demo = BigQueryCheckOperator(
    task_id='check_us_cities_demo',
    use_legacy_sql=False,
    sql=f'SELECT count(*) FROM `{project_id}.{staging_dataset}.us_cities_demo`'
)

check_airports = BigQueryCheckOperator(
    task_id='check_airports',
    use_legacy_sql=False,
    sql=f'SELECT count(*) FROM `{project_id}.{staging_dataset}.airport_codes`')

check_weather = BigQueryCheckOperator(
    task_id='check_weather',
    use_legacy_sql=False,
    sql=f'SELECT count(*) FROM `{project_id}.{staging_dataset}.temperature_by_city`'
)
        session.commit()
    else:
        msg = '\n\tA connection with `conn_id`={conn_id} already exists\n'
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)

dag = DAG('bigquery_github_trends_v1',
          default_args=default_args,
          schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    bigquery_conn_id='bigquery',
    sql='''
    #legacySql
    SELECT
        table_id
    FROM
        [githubarchive:day.__TABLES__]
    WHERE
        table_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

t1.set_upstream(t0)
    gcs_schema_object='gs://testcovidlinh/googletrend_schema.json',
    # schema_fields=googletrend_schema,
    bigquery_conn_id=gcp_conn_id,
    google_cloud_storage_conn_id=gcp_conn_id)

# Loading data from GCS to BigQuery
gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    dag=dag,
    bucket='testcovidlinh',
    source_objects=source_objects,
    # schema_object="/tmp/covidStatSchema.json",
    schema_object='googletrend_schema.json',
    # schema_fields=googletrend_schema,
    source_format='CSV',
    destination_project_dataset_table=destination_project_dataset_table,
    write_disposition='WRITE_TRUNCATE',
    # autodetect=False,
    bigquery_conn_id=gcp_conn_id,
    google_cloud_storage_conn_id=gcp_conn_id)

# Check Count
check_count = BigQueryCheckOperator(
    task_id="check_count",
    bigquery_conn_id=gcp_conn_id,
    sql="SELECT COUNT(*) FROM " + destination_project_dataset_table,
    use_legacy_sql=False
    # location=location
)

start_operator >> googletrend >> CreateTable >> gcs_to_bigquery >> check_count
    'email_on_retry': False
}

# Step 3: Initiate DAG
dag = DAG('Airflow_20201025',
          default_args=default_args,
          schedule_interval='0 0 * * *')

# Step 4: Define Tasks
t1 = BashOperator(task_id='Bash_Test',
                  bash_command='date',
                  dag=dag)

def py_call(name):
    print("Hello {0}. Congrats on Printing this.".format(name))

t2 = PythonOperator(task_id='Py_Test',
                    python_callable=py_call,
                    op_kwargs={'name': 'Ravi Kishore'},
                    dag=dag)

t3 = BigQueryCheckOperator(
    task_id='Check_Chicago_Crime_Table',
    sql='''select * from `bigquery-public-data.chicago_crime.crime`
           where date(date) = "{{ yesterday_ds }}"''',
    bigquery_conn_id='google_cloud_default',
    use_legacy_sql=False,
    dag=dag)

# Step 5: Define Dependencies
t1 >> t2 >> t3
query = '''
SELECT
    CONCAT('https://stackoverflow.com/questions/',
           CAST(id as STRING)) as url,
    view_count
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE tags like '%google-bigquery%'
ORDER BY view_count DESC
LIMIT 10
'''

t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    # use_legacy_sql=False,
    sql=query,
    project_id='tt-cust-analytics',
    dag=dag)

# t2 = BigQueryCheckOperator(
#     task_id='bq_check_hackernews_full',
#     sql='''
#     #legacySql
#     SELECT
#         STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
#     FROM
#         [bigquery-public-data:hacker_news.full]
#     WHERE
#         type = 'story'
#         AND STRFTIME_UTC_USEC(timestamp, "%Y%m%d") = "{{ yesterday_ds_nodash }}"
#     LIMIT 1
        DESTINATION_DATASET, model.replace(".", "_")),
    write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
    use_legacy_sql=False,
    dag=dag)

sql = """
SELECT
    COUNT(*)
FROM
    [{0}:{1}.{2}]
"""

# Check to make sure that the data tables won't be empty
bq_check_train_data_op = BigQueryCheckOperator(
    task_id="bq_check_train_data_{}_task".format(model.replace(".", "_")),
    sql=sql.format(PROJECT_ID, DESTINATION_DATASET,
                   model.replace(".", "_") + "_train_data"),
    dag=dag)

bq_check_eval_data_op = BigQueryCheckOperator(
    task_id="bq_check_eval_data_{}_task".format(model.replace(".", "_")),
    sql=sql.format(PROJECT_ID, DESTINATION_DATASET,
                   model.replace(".", "_") + "_eval_data"),
    dag=dag)

# BigQuery training data export to GCS
bash_remove_old_data_op = BashOperator(
    task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
    bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi"
    .format(BUCKET, model.replace(".", "_")),
    schedule_interval=schedule_interval)

# Create new tables in cli before running DAG
# bq mk --time_partitioning_type=DAY my-project:github_trends.github_daily_metrics
# bq mk --time_partitioning_type=DAY my-project:github_trends.github_agg
# bq mk --time_partitioning_type=DAY my-project:github_trends.hackernews_agg
# bq mk --time_partitioning_type=DAY my-project:github_trends.hackernews_github_agg

## Task 1
# airflow test bigquery_github_trends_v1 bq_check_githubarchive_day 2017-06-02
t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    sql='''
    #legacySql
    SELECT
        table_id
    FROM
        [githubarchive:day.__TABLES__]
    WHERE
        table_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

## Task 2
# airflow test bigquery_github_trends_v1 bq_check_hackernews_full 2017-06-02
t2 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_full',
    sql='''
    #legacySql
    SELECT
        STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
    FROM
        [bigquery-public-data:hacker_news.full]
    WHERE
import os
import datetime

from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator

args = {
    'owner': 'matti',
    'start_date': datetime.datetime(day=20, month=11, year=2018),
}

template_path = '{}/{}'.format(os.environ['AIRFLOW_HOME'], 'dags/sql')

with DAG(dag_id='github_dag', default_args=args, template_searchpath=template_path) as dag:
    github_commits = BigQueryOperator(
        task_id='github_commits',
        sql='github_users_aggregate.sql',
        use_legacy_sql=False,
        write_disposition='WRITE_TRUNCATE',
        destination_dataset_table='github.commits${{ ds_nodash }}')

    test__check_commit_sums_match_with_raw_data = BigQueryCheckOperator(
        task_id='TEST_commit_sums_match',
        sql='github_users_check_commit_sums.sql',
        use_legacy_sql=False)

    github_commits >> test__check_commit_sums_match_with_raw_data
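# The two .sql files referenced above are resolved via template_searchpath and are
# not included in the source. As a hypothetical sketch only: BigQueryCheckOperator
# fails the task when the first value of the first result row is falsy (0, NULL,
# False), so a check file like github_users_check_commit_sums.sql would typically
# return a single boolean or non-zero count. The table and column names below are
# assumptions for illustration.
hypothetical_check_sql = """
SELECT
  (SELECT SUM(commit_count) FROM `github.commits`) =
  (SELECT COUNT(*) FROM `bigquery-public-data.github_repos.commits`) AS commit_sums_match
"""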