def test_execute_no_force_rerun(self, mock_hook, mock_md5):
    job_id = "123456"
    hash_ = "hash"
    real_job_id = f"{job_id}_{hash_}"
    mock_md5.return_value.hexdigest.return_value = hash_
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }

    mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict("any")
    job = MagicMock(
        job_id=real_job_id,
        error_result=False,
        state="DONE",
        done=lambda: True,
    )
    mock_hook.return_value.get_job.return_value = job

    op = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
        reattach_states={"PENDING"},
    )

    # No force rerun
    with pytest.raises(AirflowException):
        op.execute({})
def test_execute_success(self, mock_hook, mock_md5):
    job_id = "123456"
    hash_ = "hash"
    real_job_id = f"{job_id}_{hash_}"
    mock_md5.return_value.hexdigest.return_value = hash_
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }
    mock_hook.return_value.insert_job.return_value = MagicMock(job_id=real_job_id, error_result=False)

    op = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )
    result = op.execute({})

    mock_hook.return_value.insert_job.assert_called_once_with(
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=real_job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )

    assert result == real_job_id
def test_execute(self, mock_hook):
    job_id = "123456"
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }
    mock_hook.return_value.insert_job.return_value = MagicMock(job_id=job_id)

    op = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )
    result = op.execute({})

    mock_hook.return_value.insert_job.assert_called_once_with(
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )

    assert result == job_id
def test_execute_idempotency(self, mock_hook, mock_sleep_generator):
    job_id = "123456"
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }

    class MockJob:
        _call_no = 0
        _done = False

        def __init__(self):
            pass

        def reload(self):
            if MockJob._call_no == 3:
                MockJob._done = True
            else:
                MockJob._call_no += 1

        def done(self):
            return MockJob._done

        @property
        def job_id(self):
            return job_id

    mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict("any")
    mock_sleep_generator.return_value = [0, 0, 0, 0, 0]
    mock_hook.return_value.get_job.return_value = MockJob()

    op = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )
    result = op.execute({})

    assert MockJob._call_no == 3
    mock_hook.return_value.get_job.assert_called_once_with(
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )
    assert result == job_id
def test_job_id_validity(self, mock_md5, test_dag_id, expected_job_id):
    hash_ = "hash"
    mock_md5.return_value.hexdigest.return_value = hash_
    context = {"execution_date": datetime(2020, 1, 23)}
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }
    with DAG(dag_id=test_dag_id, start_date=datetime(2020, 1, 23)):
        op = BigQueryInsertJobOperator(
            task_id="test_job_id",
            configuration=configuration,
            project_id=TEST_GCP_PROJECT_ID,
        )
    assert op._job_id(context) == expected_job_id
def test_execute_reattach(self, mock_hook, mock_md5):
    job_id = "123456"
    hash_ = "hash"
    real_job_id = f"{job_id}_{hash_}"
    mock_md5.return_value.hexdigest.return_value = hash_
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }

    mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict("any")
    job = MagicMock(
        job_id=real_job_id,
        error_result=False,
        state="PENDING",
        done=lambda: False,
    )
    mock_hook.return_value.get_job.return_value = job

    op = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
        reattach_states={"PENDING"},
    )
    result = op.execute({})

    mock_hook.return_value.get_job.assert_called_once_with(
        location=TEST_DATASET_LOCATION,
        job_id=real_job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )
    job.result.assert_called_once_with()

    assert result == real_job_id
def test_execute_failure(self, mock_hook, mock_md5):
    job_id = "123456"
    hash_ = "hash"
    real_job_id = f"{job_id}_{hash_}"
    mock_md5.return_value.hexdigest.return_value = hash_
    configuration = {
        "query": {
            "query": "SELECT * FROM any",
            "useLegacySql": False,
        }
    }
    mock_hook.return_value.insert_job.return_value = MagicMock(job_id=real_job_id, error_result=True)

    op = BigQueryInsertJobOperator(
        task_id="insert_query_job",
        configuration=configuration,
        location=TEST_DATASET_LOCATION,
        job_id=job_id,
        project_id=TEST_GCP_PROJECT_ID,
    )
    with pytest.raises(AirflowException):
        op.execute({})
def bq_insert():
    INSERT_ROWS_QUERY = (
        f"INSERT into {DATASET_NAME}.{TABLE_1} VALUES "
        "( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.10.0/xF_bgrd_ave.csv','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','823540771','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),"
        "( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.5/hs_depth.csv/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.6.5/hs_depth.csv','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','443893876','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),"
        "( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.6.5/xf_whitelist_cpras.txt','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','8400','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),"
        "( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.6.3/xe_gene_coordinates.bed','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','5803378','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),"
        "( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.10.0/xF_germline_db.csv','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','3225246','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE=');"
    )
    print(INSERT_ROWS_QUERY)

    # task_id=f"insert_query_{name}"
    bq_job = BigQueryInsertJobOperator(
        task_id='insert_job',
        configuration={
            "query": {
                "query": INSERT_ROWS_QUERY,
                "useLegacySql": False,
            }
        })
)
# [END howto_operator_facebook_ads_to_gcs]

load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket=GCS_BUCKET,
    source_objects=[GCS_OBJ_PATH],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    write_disposition='WRITE_TRUNCATE',
)

read_data_from_gcs_many_chunks = BigQueryInsertJobOperator(
    task_id="read_data_from_gcs_many_chunks",
    configuration={
        "query": {
            "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`",
            "useLegacySql": False,
        }
    },
)

delete_bucket = GCSDeleteBucketOperator(
    task_id="delete_bucket",
    bucket_name=GCS_BUCKET,
)

delete_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_dataset",
    project_id=GCP_PROJECT_ID,
    dataset_id=DATASET_NAME,
    delete_contents=True,
}, "externalDataConfiguration": { "sourceFormat": "NEWLINE_DELIMITED_JSON", "compression": "NONE", "csvOptions": {"skipLeadingRows": 1}, }, }, schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json", ) # [END howto_operator_create_external_table_multiple_types] read_data_from_gcs_multiple_types = BigQueryInsertJobOperator( task_id="read_data_from_gcs_multiple_types", configuration={ "query": { "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}." f"{safe_name(SOURCE_MULTIPLE_TYPES)}`", "useLegacySql": False, } }, ) # [START howto_operator_presto_to_gcs_many_chunks] presto_to_gcs_many_chunks = PrestoToGCSOperator( task_id="presto_to_gcs_many_chunks", sql=f"select * from {SOURCE_CUSTOMER_TABLE}", bucket=GCS_BUCKET, filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}.{{}}.json", schema_filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}-schema.json", approx_max_file_size_bytes=10_000_000, gzip=False, )
    schema_fields=SCHEMA,
    location=location,
)

create_dataset >> [create_table_1, create_table_2]

delete_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
)

# [START howto_operator_bigquery_insert_job]
insert_query_job = BigQueryInsertJobOperator(
    task_id="insert_query_job",
    configuration={
        "query": {
            "query": INSERT_ROWS_QUERY,
            "useLegacySql": False,
        }
    },
    location=location,
)
# [END howto_operator_bigquery_insert_job]

# [START howto_operator_bigquery_select_job]
select_query_job = BigQueryInsertJobOperator(
    task_id="select_query_job",
    configuration={
        "query": {
            "query": "{% include 'example_bigquery_query.sql' %}",
            "useLegacySql": False,
        }
    },
"field": "ds", }, ) # [START howto_sensor_bigquery_table] check_table_exists = BigQueryTableExistenceSensor( task_id="check_table_exists", project_id=PROJECT_ID, dataset_id=DATASET_NAME, table_id=TABLE_NAME) # [END howto_sensor_bigquery_table] execute_insert_query = BigQueryInsertJobOperator( task_id="execute_insert_query", configuration={ "query": { "query": INSERT_ROWS_QUERY, "useLegacySql": False, } }, ) # [START howto_sensor_bigquery_table_partition] check_table_partition_exists = BigQueryTablePartitionExistenceSensor( task_id="check_table_partition_exists", project_id=PROJECT_ID, dataset_id=DATASET_NAME, table_id=TABLE_NAME, partition_id=PARTITION_NAME, ) # [END howto_sensor_bigquery_table_partition]
create_warehouse_reviews_table = BigQueryCreateEmptyTableOperator(
    task_id='create_warehouse_reviews_table',
    dataset_id=os.environ['GCP_BQ_DATASET_WAREHOUSE'],
    table_id=GCP_BQ_TABLE_REVIEWS,
    project_id=GCP_PROJECT_ID,
    bigquery_conn_id='gr_bigquery_conn',
    schema_fields=warehouse.reviews_schema)

load_warehouse_data = DummyOperator(task_id='load_warehouse_data')

load_warehouse_users_data = BigQueryInsertJobOperator(
    task_id="load_warehouse_users_data",
    project_id=os.environ['GCP_PROJECT_ID'],
    job_id='users',
    configuration={
        "query": {
            "query": warehouse.load_users,
            "useLegacySql": False,
        }
    },
    gcp_conn_id='gr_bigquery_conn')

load_warehouse_authors_data = BigQueryInsertJobOperator(
    task_id="load_warehouse_authors_data",
    project_id=os.environ['GCP_PROJECT_ID'],
    job_id='authors',
    configuration={
        "query": {
            "query": warehouse.load_authors,
            "useLegacySql": False,
        }
    table_id=TABLE_2,
    schema_fields=SCHEMA,
    location=location,
)

create_dataset >> [create_table_1, create_table_2]

delete_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_dataset", dataset_id=DATASET, delete_contents=True)

# [START howto_operator_bigquery_insert_job]
insert_query_job = BigQueryInsertJobOperator(
    task_id="insert_query_job",
    configuration={
        "query": {
            "query": INSERT_ROWS_QUERY,
            "useLegacySql": False,
        }
    },
    location=location,
)
# [END howto_operator_bigquery_insert_job]

# [START howto_operator_bigquery_select_job]
select_query_job = BigQueryInsertJobOperator(
    task_id="select_query_job",
    configuration={
        "query": {
            "query": "{% include 'example_bigquery_query.sql' %}",
            "useLegacySql": False,
        }
    },
join_customer_purchase = BigQueryInsertJobOperator(
    task_id='bq_join_customer_purchase',
    gcp_conn_id='bigquery_default',
    project_id=TRF_PRJ,
    location=BQ_LOCATION,
    configuration={
        'jobType': 'QUERY',
        'query': {
            'query': """SELECT
                    c.id as customer_id,
                    p.id as purchase_id,
                    p.item as item,
                    p.price as price,
                    p.timestamp as timestamp
                FROM `{dwh_0_prj}.{dwh_0_dataset}.customers` c
                JOIN `{dwh_0_prj}.{dwh_0_dataset}.purchases` p ON c.id = p.customer_id
                """.format(
                dwh_0_prj=DWH_LAND_PRJ,
                dwh_0_dataset=DWH_LAND_BQ_DATASET,
            ),
            'destinationTable': {
                'projectId': DWH_CURATED_PRJ,
                'datasetId': DWH_CURATED_BQ_DATASET,
                'tableId': 'customer_purchase'
            },
            'writeDisposition': 'WRITE_TRUNCATE',
            "useLegacySql": False
        }
    },
    impersonation_chain=[TRF_SA_BQ])
    location=location,
)

create_dataset >> [create_table_1, create_table_2]

delete_dataset = BigQueryDeleteDatasetOperator(
    task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True)

# [START howto_operator_bigquery_insert_job]
insert_query_job = BigQueryInsertJobOperator(
    task_id="insert_query_job",
    configuration={
        "query": {
            "query": INSERT_ROWS_QUERY,
            "useLegacySql": False,
        }
    },
    location=location,
)
# [END howto_operator_bigquery_insert_job]

execute_insert_query = BigQueryExecuteQueryOperator(
    task_id="execute_insert_query", sql=INSERT_ROWS_QUERY, use_legacy_sql=False, location=location)

bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
    task_id="execute_multi_query",
    sql=[
##############################################################
# Check mandatory columns for null values                    #
# This check verifies that the number of null values in the  #
# title column is exactly 1                                   #
##############################################################
validateTitleNull = BigQueryValueCheckOperator(
    task_id="validateNullTitle",
    sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_NAME} WHERE title IS NULL",
    pass_value=1,
    use_legacy_sql=False
)

##############################################################
# Update rows to set tbd1 to the lowercase value of title    #
##############################################################
UPDATE_STATEMENT = f"""UPDATE `{DATASET_NAME}.{TABLE_NAME}`
SET tbd1 = LOWER(title)
WHERE TRUE"""

updateTableColumns = BigQueryInsertJobOperator(
    task_id="select_query_job",
    configuration={
        "query": {
            "query": UPDATE_STATEMENT,
            "useLegacySql": False,
        }
    }
)

loadGcsToBq >> validateTitleNull >> updateTableColumns
logs_to_parquet = MyDataprocSubmitJobOperator(
    task_id='logs_to_parquet',
    job={
        'placement': {
            'cluster_name': dataproc_config['cluster_name']
        },
        'pyspark_job': {
            'main_python_file_uri': f"gs://{bucket_config['jobs']}/jobs/UploadFilesToParquet.py",
            'properties': {
                'spark.master': 'yarn',
                'spark.deploy-mode': 'cluster',
                'spark.driver.cores': '1',
                'spark.driver.memory': '1g',
                'spark.executor.cores': '2',
                'spark.executor.memory': '4g',
                'spark.executor.instances': '4'
            },
            'args': [
                '--GCS_UPLOAD_URI', f"gs://{bucket_config['upload_data']}/logs/20210101/*.log",
                '--GCS_DATA_LAKE_URI', '/user/hdfs/raw_data/'
            ]
        }
    })

hdfs_to_bigquery = BigQueryInsertJobOperator(
    task_id='hdfs_to_bigquery',
    configuration=bigquery_load_storage_config)

logs_to_parquet >> hdfs_to_bigquery
"csvOptions": { "skipLeadingRows": 1 }, }, }, source_objects=[ f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}" f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata" ], ) # [END howto_operator_create_external_table_multiple_types] read_data_from_gcs_multiple_types = BigQueryInsertJobOperator( task_id="execute_query", configuration={ "query": { "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`", "useLegacySql": False, } }, ) # Firestore export_database_to_gcs >> create_dataset # BigQuery create_dataset >> create_external_table_multiple_types create_external_table_multiple_types >> read_data_from_gcs_multiple_types read_data_from_gcs_multiple_types >> delete_dataset
dataset_id="{{ var.json.env.stg }}", dag=dag, ) create_production_dataset = BigQueryCreateEmptyDatasetOperator( task_id="create_production_dataset", gcp_conn_id="my_gcp_connection", dataset_id="{{ var.json.env.production }}", dag=dag, ) create_fact_table = BigQueryInsertJobOperator( task_id="create_fact_table", configuration={ "query": { "query": "{% include 'create_fact_table.sql' %}", "use_legacy_sql": False, } }, dag=dag, ) create_geo_table = BigQueryInsertJobOperator( task_id="create_geo_table", configuration={ "query": { "query": "{% include 'create_geo_table.sql' %}", "use_legacy_sql": False, } }, dag=dag, )