def test_backfill(bq, gcs_partitioned_data, gcs_truncating_load_config,
                  gcs_bucket, dest_partitioned_table):
    """Load two partitions through the backfill CLI in LOCAL mode.

    This is an adaptation of test_load_job_partitioned but instead uses the
    backfill CLI code path to execute the cloud function's main method in
    parallel threads.

    Test loading separate partitions with WRITE_TRUNCATE. After both load
    jobs the count should equal the sum of the test data in both partitions
    despite having WRITE_TRUNCATE disposition because the destination table
    should target only a particular partition with a decorator.

    Raises:
        Whatever test_utils.check_blobs_exist raises when the load config or
        the test data blobs are missing.
    """
    test_utils.check_blobs_exist(
        gcs_truncating_load_config,
        "the test is not configured correctly the load.json is missing")
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")

    # The expected row count is the total number of lines across the local
    # copies of both partition files.
    expected_num_rows = 0
    for part in ("$2017041101", "$2017041102"):
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        # Use a context manager so the handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(test_data_file) as data_file:
            expected_num_rows += sum(1 for _ in data_file)

    args = backfill.parse_args([
        f"--gcs-path=gs://{gcs_bucket.name}",
        "--mode=LOCAL",
    ])
    backfill.main(args)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
def test_ordered_load_parquet_hive_partitioning(
        monkeypatch, gcs, bq, gcs_bucket,
        gcs_destination_parquet_config_hive_partitioned,
        gcs_external_hive_partitioned_parquet_config,
        gcs_split_path_partitioned_parquet_data,
        dest_hive_partitioned_table):
    """Test ordered loads of hive partitioned parquet data files.

    Set global env variable ORDER_PER_TABLE so that all loads are ordered.
    Test to make sure that parquet data files are loaded in order.

    Flow exercised by this test:
      1. Trigger the cloud function for every data blob (no action expected
         because the _HISTORYDONE start file does not exist yet).
      2. Upload _HISTORYDONE and trigger the cloud function for it.
      3. Verify the _BACKFILL file exists and trigger the function for it.
      4. Wait for 100 rows and check every distinct hive_part_column value
         is 9999.
    """
    monkeypatch.setenv("ORDER_PER_TABLE", "True")
    monkeypatch.setenv("START_BACKFILL_FILENAME", "_HISTORYDONE")
    # Must reload the constants file in order to pick up testing mock env vars
    importlib.reload(gcs_ocn_bq_ingest.common.constants)

    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    # Derive the table prefix from the first success-file blob found.
    table_prefix = ""
    for gcs_data in gcs_split_path_partitioned_parquet_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            break

    # Invoke cloud function for all data blobs and _SUCCESS blob.
    # Cloud function shouldn't take any action at this point because there is
    # no _HISTORYDONE file yet.
    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    # Upload _HISTORYDONE file which will cause cloud function to take action
    backfill_start_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/"
        f"{gcs_ocn_bq_ingest.common.constants.START_BACKFILL_FILENAME}")
    backfill_start_blob.upload_from_string("")
    # The two adjacent literals are concatenated by the parser, so the
    # separating space must be written explicitly.
    test_utils.check_blobs_exist([backfill_start_blob],
                                 "_HISTORYDONE file was "
                                 "not created.")
    test_utils.trigger_gcf_for_each_blob([backfill_start_blob])

    # Check to make sure _BACKFILL file has been created
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    test_utils.check_blobs_exist([backfill_blob],
                                 "_BACKFILL file was not created by method "
                                 "start_backfill_subscriber_if_not_running")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_hive_partitioned_table,
                                expected_num_rows)

    # Check to make sure the hive_part_column column values were correctly
    # inserted into the BigQuery destination table.
    for row in bq.query(
            f"SELECT DISTINCT hive_part_column "
            f"FROM `{dest_hive_partitioned_table.full_table_id.replace(':','.')}`"
    ).result():
        assert row.hive_part_column == 9999
def test_duplicate_success_notification(bq, gcs_data, dest_table):
    """tests behavior with two notifications for the same success file.

    The expected row count is the number of lines in the local
    nation/part-m-00001 test file.

    NOTE(review): this body issues a single trigger_gcf_for_each_blob pass
    over gcs_data; confirm where the duplicate notification is supposed to
    come from (fixture contents or the helper).
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        expected_num_rows = sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
def test_load_job(bq, gcs_data, dest_table):
    """tests basic single invocation with load job

    Triggers the cloud function for each blob in gcs_data and waits until
    dest_table holds as many rows as the local nation/part-m-00001 test file
    has lines.
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        expected_num_rows = sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
def test_gcf_event_schema(bq, gcs_data, dest_table):
    """tests compatibility to Cloud Functions Background Function posting the
    storage object schema
    https://cloud.google.com/storage/docs/json_api/v1/objects#resource
    directly based on object finalize.

    https://cloud.google.com/functions/docs/tutorials/storage#functions_tutorial_helloworld_storage-python

    The expected row count is the number of lines in the local
    nation/part-m-00001 test file.
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        expected_num_rows = sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
def test_look_for_config_in_parents(bq, gcs_data_under_sub_dirs,
                                    gcs_external_config, dest_table):
    """test discovery of configuration files for external query in parent
    _config paths.

    Both the config blobs and the data blobs must already exist. The cloud
    function is then triggered for each data blob and dest_table is expected
    to hold as many rows as the local nation/part-m-00001 file has lines.
    """
    test_utils.check_blobs_exist(gcs_external_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_data_under_sub_dirs,
                                 "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data_under_sub_dirs)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        expected_num_rows = sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
def test_partitioned_parquet(bq, gcs_split_path_partitioned_parquet_data,
                             gcs_destination_parquet_config,
                             dest_partitioned_table):
    """Exercise the basic load ingestion path for parquet files.

    Verifies the destination config and the parquet data blobs exist,
    triggers the cloud function for each data blob, then waits for
    dest_partitioned_table to contain 100 rows.
    """
    # Preconditions, checked in order: config first, then data.
    preconditions = (
        (gcs_destination_parquet_config, "config objects must exist"),
        (gcs_split_path_partitioned_parquet_data,
         "test data objects must exist"),
    )
    for blobs, missing_message in preconditions:
        test_utils.check_blobs_exist(blobs, missing_message)

    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, 100)
def test_load_job_appending_batches(bq, gcs_batched_data, dest_table):
    """
    tests two loading batches with the default load configuration.
    The total number of rows expected should be the number of rows
    in the test file multiplied by the number of batches because we
    should pick up the default WRITE_APPEND disposition.
    """
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        test_count = sum(1 for _ in data_file)
    expected_counts = 2 * test_count  # 2 batches * num of test rows
    test_utils.check_blobs_exist(gcs_batched_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_batched_data)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_counts)
def test_external_query_partitioned_parquet(
        bq, gcs_split_path_partitioned_parquet_data,
        gcs_external_partitioned_parquet_config, gcs_destination_config,
        dest_partitioned_table):
    """Exercise external query ingestion for partitioned parquet data.

    Covers the basic external query ingestion mechanics with
    bq_transform.sql and external.json. After triggering the cloud function
    for each data blob, dest_partitioned_table is expected to hold 100 rows.
    """
    # Destination config and external config are validated together.
    all_config_blobs = (gcs_destination_config +
                        gcs_external_partitioned_parquet_config)
    test_utils.check_blobs_exist(all_config_blobs,
                                 "config objects must exist")

    data_blobs = gcs_split_path_partitioned_parquet_data
    test_utils.check_blobs_exist(data_blobs, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(data_blobs)

    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, 100)
def test_external_query_pure(
    bq,
    gcs_data,
    gcs_external_config,
    dest_table,
):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json

    The expected row count is the number of lines in the local
    nation/part-m-00001 test file.
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.check_blobs_exist(gcs_external_config,
                                 "config objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        expected_num_rows = sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
def test_external_query_partitioned(bq, gcs_partitioned_data,
                                    gcs_external_partitioned_config,
                                    dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json

    The expected row count is the total number of lines across the local
    copies of the two nyc_311 partition files.
    """
    # Use the shared helper, as the other tests in this file do, rather than
    # a hand-rolled existence check.
    test_utils.check_blobs_exist(gcs_external_partitioned_config,
                                 "config objects must exist")

    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data)
    expected_num_rows = 0
    for part in ("$2017041101", "$2017041102"):
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        # Close each file deterministically rather than leaking the handle.
        with open(test_data_file) as data_file:
            expected_num_rows += sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
def test_look_for_destination_config_in_parents(
    bq,
    gcs_split_path_partitioned_data,
    gcs_destination_config,
    dest_partitioned_table,
):
    """test discovery of configuration files for destination in parent
    _config paths.

    The expected row count is the total number of lines across the local
    copies of the two nyc_311 partition files.
    """
    test_utils.check_blobs_exist(gcs_destination_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_data,
                                 "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_split_path_partitioned_data)
    expected_num_rows = 0
    for part in ("$2017041101", "$2017041102"):
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        # Close each file deterministically rather than leaking the handle.
        with open(test_data_file) as data_file:
            expected_num_rows += sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
def test_load_job_partitioned(bq, gcs_partitioned_data,
                              gcs_truncating_load_config,
                              dest_partitioned_table):
    """
    Test loading separate partitions with WRITE_TRUNCATE

    after both load jobs the count should equal the sum of the test data in
    both partitions despite having WRITE_TRUNCATE disposition because the
    destination table should target only a particular partition with a
    decorator.
    """
    test_utils.check_blobs_exist(gcs_truncating_load_config,
                                 "the load.json is missing")
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data)
    expected_num_rows = 0
    for part in ("$2017041101", "$2017041102"):
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        # Close each file deterministically rather than leaking the handle.
        with open(test_data_file) as data_file:
            expected_num_rows += sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
def test_load_job_truncating_batches(
    bq,
    gcs_batched_data,
    gcs_truncating_load_config,
    dest_table,
):
    """
    tests two successive batches with a load.json that dictates
    WRITE_TRUNCATE.

    after both load jobs the count should be the same as the number of lines
    in the test file because we should pick up the WRITE_TRUNCATE disposition.
    """
    test_utils.check_blobs_exist(
        gcs_truncating_load_config,
        "the test is not configured correctly the load.json is missing")
    test_utils.check_blobs_exist(gcs_batched_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_batched_data)

    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    # Close the file deterministically rather than leaking the handle.
    with open(test_data_file) as data_file:
        expected_num_rows = sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
def test_external_query_partitioned_with_destination_config(
        bq, gcs_partitioned_data, gcs_external_partitioned_config,
        gcs_destination_config, dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql, external.json, and destination config
    in load.json.

    The cloud function is triggered for the data blobs as well as for the
    external and destination config blobs. The expected row count is the
    total number of lines across the local copies of the two nyc_311
    partition files.
    """
    test_utils.check_blobs_exist(
        (gcs_external_partitioned_config + gcs_destination_config),
        "config objects must exist")
    test_utils.check_blobs_exist(gcs_partitioned_data, "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data +
                                         gcs_external_partitioned_config +
                                         gcs_destination_config)
    expected_num_rows = 0
    for part in ("$2017041101", "$2017041102"):
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        # Close each file deterministically rather than leaking the handle.
        with open(test_data_file) as data_file:
            expected_num_rows += sum(1 for _ in data_file)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
def test_ordered_load_parquet_wait_for_validation(
        monkeypatch, gcs, bq, gcs_bucket, gcs_destination_parquet_config,
        gcs_external_partitioned_parquet_config,
        gcs_split_path_partitioned_parquet_data, dest_partitioned_table):
    """Test ordered loads of parquet data files with a validation step
    between each load.

    Set global env variable ORDER_PER_TABLE so that all loads are ordered.
    Test to make sure that parquet data files are loaded in order.

    Flow exercised by this test:
      1. Trigger the cloud function for every data blob (no action expected
         because the _HISTORYDONE start file does not exist yet).
      2. Upload _HISTORYDONE and trigger the cloud function for it.
      3. Verify _BACKFILL exists, trigger it, and expect _bqlock to be absent
         and 50 rows to be loaded.
      4. Re-upload _BACKFILL to signal validation is done; expect 100 rows.
      5. Re-upload _BACKFILL once more; expect _BACKFILL to be removed.
    """
    monkeypatch.setenv("ORDER_PER_TABLE", "True")
    monkeypatch.setenv("START_BACKFILL_FILENAME", "_HISTORYDONE")
    monkeypatch.setenv("WAIT_FOR_VALIDATION", "True")
    # Must reload the constants file in order to pick up testing mock env vars
    importlib.reload(gcs_ocn_bq_ingest.common.constants)

    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    # Derive the table prefix from the first success-file blob found.
    table_prefix = ""
    for gcs_data in gcs_split_path_partitioned_parquet_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            break

    # Invoke cloud function for all data blobs and _SUCCESS blob.
    # Cloud function shouldn't take any action at this point because there is
    # no _HISTORYDONE file yet.
    # This must run BEFORE the _HISTORYDONE upload below, otherwise the
    # statement above is false and the test no longer exercises the
    # "notifications arrive before the start file" scenario.
    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    # Upload _HISTORYDONE file which will cause cloud function to take action
    backfill_start_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/"
        f"{gcs_ocn_bq_ingest.common.constants.START_BACKFILL_FILENAME}")
    backfill_start_blob.upload_from_string("")
    # The two adjacent literals are concatenated by the parser, so the
    # separating space must be written explicitly.
    test_utils.check_blobs_exist([backfill_start_blob],
                                 "_HISTORYDONE file was "
                                 "not created.")
    test_utils.trigger_gcf_for_each_blob([backfill_start_blob])

    # Check to make sure _BACKFILL file has been created
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    test_utils.check_blobs_exist([backfill_blob],
                                 "_BACKFILL file was not created by method "
                                 "start_backfill_subscriber_if_not_running")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Test to make sure that _bqlock is not present since cloud function should
    # remove the lock in between validations
    with pytest.raises(NotFound):
        test_utils.check_blobs_exist(
            [gcs_bucket.blob(f"{table_prefix}/_bqlock")])

    # Check that the first batch of data was loaded but only the first batch,
    # since the second batch is waiting on confirmation of validation.
    expected_num_rows = 50
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)

    # Upload _BACKFILL file to signal that validation has completed and
    # that the next item in the _backlog can be processed.
    backfill_blob.upload_from_string("")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Check that the second batch was loaded
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)

    # Upload _BACKFILL file to signal that validation has completed.
    # There won't be another chunk to load so this _BACKFILL file
    # should signal the cloud function to remove _BACKFILL file
    # and backlog directory.
    backfill_blob.upload_from_string("")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Test to make sure that _BACKFILL file is not present since cloud
    # function should remove the _BACKFILL file after final load/validation
    # is complete.
    with pytest.raises(NotFound):
        test_utils.check_blobs_exist(
            [gcs_bucket.blob(f"{table_prefix}/_BACKFILL")])