        ON a.url = b.url
    ''',
    destination_dataset_table='my-project.github_trends.hackernews_github_agg${{ yesterday_ds_nodash }}',
    dag=dag)

# Task 7
# airflow test bigquery_github_trends_v1 bq_check_hackernews_github_agg 2017-06-02
t7 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_github_agg',
    sql='''
    #legacySql
    SELECT
        partition_id
    FROM
        [my-project:github_trends.hackernews_github_agg$__PARTITIONS_SUMMARY__]
    WHERE
        partition_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

t3.set_upstream(t1)
t4.set_upstream(t3)
t5.set_upstream(t2)
t6.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t6)
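# On newer Airflow versions, the same dependency graph can be written with
# the bitshift operators instead of set_upstream. A minimal equivalent sketch
# of the wiring above:
t1 >> t3 >> t4
t2 >> t5
[t4, t5] >> t6 >> t7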
def preprocess_tasks(model, parent_dag_name, child_dag_name, default_args,
                     PROJECT_ID, BUCKET, DATA_DIR):
    # Create inner dag
    dag = DAG(
        "{0}.{1}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=None)

    # Constants
    # Specify your source BigQuery project, dataset, and table names
    SOURCE_BQ_PROJECT = "nyc-tlc"
    SOURCE_DATASET_TABLE_NAMES = "yellow.trips,green.trips_2014,green.trips_2015".split(",")

    # Specify your destination BigQuery dataset
    DESTINATION_DATASET = "taxifare"

    # BigQuery data query
    bql = """
    SELECT
        (tolls_amount + fare_amount) AS fare_amount,
        EXTRACT(DAYOFWEEK FROM pickup_datetime) * 1.0 AS dayofweek,
        EXTRACT(HOUR FROM pickup_datetime) * 1.0 AS hourofday,
        pickup_longitude AS pickuplon,
        pickup_latitude AS pickuplat,
        dropoff_longitude AS dropofflon,
        dropoff_latitude AS dropofflat,
        passenger_count * 1.0 AS passengers,
        CONCAT(
            CAST(pickup_datetime AS STRING),
            CAST(pickup_longitude AS STRING),
            CAST(pickup_latitude AS STRING),
            CAST(dropoff_latitude AS STRING),
            CAST(dropoff_longitude AS STRING)) AS key
    FROM
        `{0}.{1}`
    WHERE
        trip_distance > 0
        AND fare_amount >= 2.5
        AND pickup_longitude > -78
        AND pickup_longitude < -70
        AND dropoff_longitude > -78
        AND dropoff_longitude < -70
        AND pickup_latitude > 37
        AND pickup_latitude < 45
        AND dropoff_latitude > 37
        AND dropoff_latitude < 45
        AND passenger_count > 0
        AND RAND() < 0.00001
    """
    bql = bql.format(SOURCE_BQ_PROJECT, model)

    bql_train = "SELECT * EXCEPT (key) FROM ({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) < 4".format(bql)
    bql_eval = "SELECT * EXCEPT (key) FROM ({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) = 4".format(bql)

    # BigQueryOperator truncates the destination table if it already exists before writing
    # Reference: https://airflow.apache.org/integration.html#bigqueryoperator
    bq_train_data_op = BigQueryOperator(
        task_id="bq_train_data_{}_task".format(model.replace(".", "_")),
        bql=bql_train,
        destination_dataset_table="{}.{}_train_data".format(
            DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag)

    bq_eval_data_op = BigQueryOperator(
        task_id="bq_eval_data_{}_task".format(model.replace(".", "_")),
        bql=bql_eval,
        destination_dataset_table="{}.{}_eval_data".format(
            DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag)

    sql = """
    SELECT
        COUNT(*)
    FROM
        [{0}:{1}.{2}]
    """

    # Check to make sure that the data tables won't be empty
    bq_check_train_data_op = BigQueryCheckOperator(
        task_id="bq_check_train_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET,
                       model.replace(".", "_") + "_train_data"),
        dag=dag)

    bq_check_eval_data_op = BigQueryCheckOperator(
        task_id="bq_check_eval_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET,
                       model.replace(".", "_") + "_eval_data"),
        dag=dag)

    # BigQuery training data export to GCS
    bash_remove_old_data_op = BashOperator(
        task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
        bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi".format(
            BUCKET, model.replace(".", "_")),
        dag=dag)

    # Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
    bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_train_csv_{}_task".format(model.replace(".", "_")),
        source_project_dataset_table="{}.{}_train_data".format(
            DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[
            DATA_DIR + "{}/train-*.csv".format(model.replace(".", "_"))
        ],
        export_format="CSV",
        print_header=False,
        dag=dag)

    bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(".", "_")),
        source_project_dataset_table="{}.{}_eval_data".format(
            DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[
            DATA_DIR + "{}/eval-*.csv".format(model.replace(".", "_"))
        ],
        export_format="CSV",
        print_header=False,
        dag=dag)

    # Build dependency graph, set_upstream dependencies for all tasks
    bq_check_train_data_op.set_upstream(bq_train_data_op)
    bq_check_eval_data_op.set_upstream(bq_eval_data_op)
    bash_remove_old_data_op.set_upstream(
        [bq_check_train_data_op, bq_check_eval_data_op])
    bq_export_gcs_train_csv_op.set_upstream(bash_remove_old_data_op)
    bq_export_gcs_eval_csv_op.set_upstream(bash_remove_old_data_op)

    return dag
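# One way the inner dag returned by preprocess_tasks could be attached to a
# parent DAG is via SubDagOperator. A hedged sketch: parent_dag, default_args,
# PROJECT_ID, BUCKET, and DATA_DIR are assumed to be defined by the caller,
# and the model name is illustrative.
from airflow.operators.subdag_operator import SubDagOperator

preprocess_op = SubDagOperator(
    task_id="preprocess_tasks",  # must match child_dag_name
    subdag=preprocess_tasks(
        model="yellow.trips",  # illustrative model name
        parent_dag_name=parent_dag.dag_id,
        child_dag_name="preprocess_tasks",
        default_args=default_args,
        PROJECT_ID=PROJECT_ID,
        BUCKET=BUCKET,
        DATA_DIR=DATA_DIR),
    dag=parent_dag)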
        session.commit()
    else:
        msg = '\n\tA connection with `conn_id`={conn_id} already exists\n'
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)

dag = DAG(
    'bigquery_github_trends_v1',
    default_args=default_args,
    schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    bigquery_conn_id='bigquery',
    sql='''
    #legacySql
    SELECT
        table_id
    FROM
        [githubarchive:day.__TABLES__]
    WHERE
        table_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

t1.set_upstream(t0)
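# The snippet above shows only the tail of add_gcp_connection. A minimal
# sketch of what the full callable might look like; the connection fields are
# assumptions, and only the session.commit()/else tail comes from the original.
from airflow import settings
from airflow.models import Connection

def add_gcp_connection(**kwargs):
    new_conn = Connection(
        conn_id='bigquery',  # matches the bigquery_conn_id used by t1
        conn_type='google_cloud_platform')
    session = settings.Session()
    if not session.query(Connection).filter(
            Connection.conn_id == new_conn.conn_id).first():
        session.add(new_conn)
        session.commit()
    else:
        msg = '\n\tA connection with `conn_id`={conn_id} already exists\n'
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)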
version={"name": MODEL_VERSION}, operation="set_default", dag=dag) ml_engine_set_default_other_version_op = MLEngineVersionOperator( task_id="ml_engine_set_default_other_version_{}_task".format( model.replace(".", "_")), project_id=PROJECT_ID, model_name=MODEL_NAME + model.replace(".", "_"), version_name=OTHER_VERSION_NAME, version={"name": OTHER_VERSION_NAME}, operation="set_default", dag=dag) # Build dependency graph, set_upstream dependencies for all tasks bq_check_train_data_op.set_upstream(bq_train_data_op) bq_check_eval_data_op.set_upstream(bq_eval_data_op) bash_remove_old_data_op.set_upstream( [bq_check_train_data_op, bq_check_eval_data_op]) bq_export_gcs_train_csv_op.set_upstream([bash_remove_old_data_op]) bq_export_gcs_eval_csv_op.set_upstream([bash_remove_old_data_op]) ml_engine_training_op.set_upstream( [bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op]) bash_remove_old_saved_model_op.set_upstream(ml_engine_training_op) bash_copy_new_saved_model_op.set_upstream(bash_remove_old_saved_model_op) bash_ml_engine_models_list_op.set_upstream(ml_engine_training_op)
    ''',
    use_legacy_sql=False,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag
)

## Task 2: check that the hacker news table contains data for that date.
t2 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_full',
    sql='''
    #standardSQL
    SELECT
        FORMAT_TIMESTAMP("%Y%m%d", timestamp) AS date
    FROM
        `bigquery-public-data.hacker_news.full`
    WHERE
        type = 'story'
        AND FORMAT_TIMESTAMP("%Y%m%d", timestamp) = "{{ yesterday_ds_nodash }}"
    LIMIT 1
    ''',
    use_legacy_sql=False,
    bigquery_conn_id=BQ_CONN_ID,
    dag=dag
)

# Setting up Dependencies
t2.set_upstream(t1)
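# The {{ yesterday_ds_nodash }} macro is rendered by Airflow's Jinja engine
# before the query runs. A standalone sketch of that substitution for an
# execution date of 2017-06-02:
from jinja2 import Template

template = 'FORMAT_TIMESTAMP("%Y%m%d", timestamp) = "{{ yesterday_ds_nodash }}"'
print(Template(template).render(yesterday_ds_nodash="20170601"))
# FORMAT_TIMESTAMP("%Y%m%d", timestamp) = "20170601"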