Example 1
def test_backfill(bq, gcs_partitioned_data, gcs_truncating_load_config,
                  gcs_bucket, dest_partitioned_table):
    """
    This is an adaptation of test_load_job_partitioned but instead uses the
    backfill CLI code path to execute the cloud function's main method in
    parallel threads.

    Test loading separate partitions with WRITE_TRUNCATE.

    After both load jobs, the count should equal the sum of the test data in
    both partitions despite the WRITE_TRUNCATE disposition, because each load
    should target only a particular partition via a decorator.
    """
    test_utils.check_blobs_exist(
        gcs_truncating_load_config,
        "the test is not configured correctly the load.json is missing")
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")

    expected_num_rows = 0
    for part in [
            "$2017041101",
            "$2017041102",
    ]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    args = backfill.parse_args([
        f"--gcs-path=gs://{gcs_bucket.name}",
        "--mode=LOCAL",
    ])
    backfill.main(args)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
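
The LOCAL mode referenced above runs the ingest entrypoint in the test process rather than in Cloud Functions. Below is a rough, hypothetical sketch of what such a driver could look like; the helper name, the thread-pool approach, and the minimal event payload are assumptions, not the actual backfill module.

import concurrent.futures

from google.cloud import storage


def run_entrypoint_locally(entrypoint, gcs_path: str, max_workers: int = 4):
    """Hypothetical: call `entrypoint(event, context)` for each _SUCCESS blob."""
    client = storage.Client()
    bucket_name, _, prefix = gcs_path[len("gs://"):].partition("/")
    success_blobs = [
        blob for blob in client.list_blobs(bucket_name, prefix=prefix or None)
        if blob.name.endswith("_SUCCESS")
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
        futures = [
            # Background Functions receive (event, context); a GCS finalize
            # event carries at least the bucket and object name.
            pool.submit(entrypoint, {
                "bucket": bucket_name,
                "name": blob.name
            }, None) for blob in success_blobs
        ]
        for fut in futures:
            fut.result()  # re-raise any exception from a worker thread
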
Example 2
def test_load_job(bq, gcs_data, dest_table):
    """tests basic single invocation with load job"""
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
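
test_utils.bq_wait_for_rows is part of the test harness and is not shown in this excerpt; a plausible implementation polls the destination table until the expected row count appears or a timeout elapses. A minimal sketch under that assumption (the timeout and polling interval are invented):

import time

from google.cloud import bigquery


def bq_wait_for_rows_sketch(bq: bigquery.Client, table: bigquery.Table,
                            expected_num_rows: int, timeout: float = 300.0):
    """Poll COUNT(*) on the destination table until it matches, else fail."""
    deadline = time.monotonic() + timeout
    query = ("SELECT COUNT(*) AS cnt FROM "
             f"`{table.project}.{table.dataset_id}.{table.table_id}`")
    num_rows = None
    while time.monotonic() < deadline:
        num_rows = list(bq.query(query).result())[0]["cnt"]
        if num_rows == expected_num_rows:
            return
        time.sleep(5)  # invented polling interval
    raise AssertionError(
        f"expected {expected_num_rows} rows, last saw {num_rows}")
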
Example 3
def test_duplicate_success_notification(bq, gcs_data, dest_table):
    """tests behavior with two notifications for the same success file."""
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
Example 4
def test_backlog_subscriber_in_order_with_new_batch_after_exit(
        bq, gcs, gcs_bucket, dest_dataset, dest_ordered_update_table,
        gcs_ordered_update_data, gcs_external_update_config, gcs_backlog):
    """Test basic functionality of backlog subscriber.
    Populate a backlog with 3 files that make updates where we can assert
    that these jobs were applied in order.

    To ensure that the subscriber cleans up properly after itself before exit,
    we will drop a 4th batch after the subscriber has exited and assert that it
    gets applied as expected.
    """
    test_utils.check_blobs_exist(gcs_external_update_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_ordered_update_data,
                                 "test data objects must exist")
    for blob in gcs_external_update_config:
        basename = os.path.basename(blob.name)
        # Only perform the following actions for the backfill config file
        if basename == gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME:
            _run_subscriber(gcs, bq, blob)
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, blob)
            backlog_blobs = gcs_bucket.list_blobs(
                prefix=f"{table_prefix}/_backlog/")
            assert backlog_blobs.num_results == 0, "backlog is not empty"
            bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock")
            assert not bqlock_blob.exists(), "_bqlock was not cleaned up"
            rows = bq.query("SELECT alpha_update FROM "
                            f"{dest_ordered_update_table.dataset_id}"
                            f".{dest_ordered_update_table.table_id}")
            expected_num_rows = 1
            num_rows = 0
            for row in rows:
                num_rows += 1
                assert row[
                    "alpha_update"] == "ABC", "backlog not applied in order"
            assert num_rows == expected_num_rows

            # Now we will test what happens when the publisher posts another
            # batch after the backlog subscriber has exited.
            backfill_blob = _post_a_new_batch(gcs_bucket,
                                              dest_ordered_update_table)
            assert backfill_blob is not None

            _run_subscriber(gcs, bq, backfill_blob)

            rows = bq.query("SELECT alpha_update FROM "
                            f"{dest_ordered_update_table.dataset_id}"
                            f".{dest_ordered_update_table.table_id}")
            expected_num_rows = 1
            num_rows = 0
            for row in rows:
                num_rows += 1
                assert row[
                    "alpha_update"] == "ABCD", "new incremental not applied"
            assert num_rows == expected_num_rows
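
_run_subscriber and _post_a_new_batch are module-level test helpers that are not part of this excerpt. As an illustration only, posting a new batch presumably means uploading another data object, its _SUCCESS marker, and a fresh _BACKFILL trigger; every object name below is a guess at the layout, not the helper's real code.

from google.cloud import bigquery, storage


def _post_a_new_batch_sketch(gcs_bucket: storage.Bucket,
                             dest_table: bigquery.Table) -> storage.Blob:
    """Illustrative only: drop one more incremental batch plus its markers."""
    prefix = f"{dest_table.dataset_id}/{dest_table.table_id}"
    # The data object and the _SUCCESS marker that completes the batch.
    gcs_bucket.blob(f"{prefix}/04/data.csv").upload_from_string("ABCD\n")
    gcs_bucket.blob(f"{prefix}/04/_SUCCESS").upload_from_string("")
    # Re-creating _BACKFILL asks the backlog subscriber to start again.
    backfill_blob = gcs_bucket.blob(f"{prefix}/_BACKFILL")
    backfill_blob.upload_from_string("")
    return backfill_blob
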
Example 5
def test_external_query_with_bad_statement(gcs_data,
                                           gcs_external_config_bad_statement):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json
    """
    test_utils.check_blobs_exist(gcs_external_config_bad_statement,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")

    with pytest.raises(gcs_ocn_bq_ingest.common.exceptions.BigQueryJobFailure):
        test_utils.trigger_gcf_for_each_blob(gcs_data)
Example 6
def test_look_for_config_in_parents(bq, gcs_data_under_sub_dirs,
                                    gcs_external_config, dest_table):
    """test discovery of configuration files for external query in parent
    _config paths.
    """
    test_utils.check_blobs_exist(gcs_external_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_data_under_sub_dirs,
                                 "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data_under_sub_dirs)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
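
The test name suggests that configuration is discovered by walking up from the data object's prefix and checking each ancestor's _config/ directory. A hedged sketch of that lookup pattern; the helper is hypothetical, only the GCS client calls are real:

from typing import Optional

from google.cloud import storage


def find_config_in_parents_sketch(bucket: storage.Bucket, data_obj_name: str,
                                  config_filename: str
                                  ) -> Optional[storage.Blob]:
    """Look for <ancestor>/_config/<config_filename>, nearest ancestor first."""
    parts = data_obj_name.split("/")[:-1]  # drop the object's basename
    while parts:
        candidate = bucket.blob("/".join(parts + ["_config", config_filename]))
        if candidate.exists():
            return candidate
        parts.pop()  # step one directory up and retry
    # Finally try the bucket root.
    candidate = bucket.blob(f"_config/{config_filename}")
    return candidate if candidate.exists() else None
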
Example 7
def test_partitioned_parquet(bq, gcs_split_path_partitioned_parquet_data,
                             gcs_destination_parquet_config,
                             dest_partitioned_table):
    """tests the basic load ingestion mechanics for parquet files
    """
    test_utils.check_blobs_exist(gcs_destination_parquet_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example 8
def test_gcf_event_schema(bq, gcs_data, dest_table):
    """tests compatibility to Cloud Functions Background Function posting the
    storage object schema
    https://cloud.google.com/storage/docs/json_api/v1/objects#resource
    directly based on object finalize.

    https://cloud.google.com/functions/docs/tutorials/storage#functions_tutorial_helloworld_storage-python
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
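
The event delivered to a storage-triggered Background Function is the Cloud Storage object resource itself. A minimal way for a test helper to fabricate such a payload for a blob; only a subset of the resource's fields is shown, and which fields the ingest code actually reads is not established here:

from google.cloud import storage


def make_finalize_event(blob: storage.Blob) -> dict:
    """Build a minimal object-finalize payload in the storage object schema."""
    return {
        "kind": "storage#object",
        "bucket": blob.bucket.name,
        "name": blob.name,
        "contentType": blob.content_type or "application/octet-stream",
        "size": str(blob.size or 0),  # the JSON API represents size as string
    }


# A Background Function entrypoint would then be invoked as
# entrypoint(make_finalize_event(blob), context).
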
Example 9
def test_load_job_appending_batches(bq, gcs_batched_data, dest_table):
    """
    tests two loading batches with the default load configuration.

    The total number of rows expected should be the number of rows
    in the test file multiplied by the number of batches because we
    should pick up the default WRITE_APPEND disposition.
    """
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    test_count = sum(1 for _ in open(test_data_file))
    expected_counts = 2 * test_count  # 2 batches * num of test rows
    test_utils.check_blobs_exist(gcs_batched_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_batched_data)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_counts)
Example 10
def test_external_query_partitioned_parquet(
        bq, gcs_split_path_partitioned_parquet_data,
        gcs_external_partitioned_parquet_config, gcs_destination_config,
        dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json
    """
    test_utils.check_blobs_exist(
        gcs_destination_config + gcs_external_partitioned_parquet_config,
        "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example 11
def test_external_query_pure(
    bq,
    gcs_data,
    gcs_external_config,
    dest_table,
):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.check_blobs_exist(gcs_external_config,
                                 "config objects must exist")

    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
Example 12
def test_backlog_publisher_with_existing_backfill_file(gcs, gcs_bucket,
                                                       dest_dataset,
                                                       dest_partitioned_table,
                                                       gcs_partitioned_data):
    """Test basic functionality of backlog_publisher when the backfill is
    already running. It should not repost this backfill file.
    """
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")
    table_prefix = "/".join(
        [dest_dataset.dataset_id, dest_partitioned_table.table_id])
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    backfill_blob.upload_from_string("")
    backfill_blob.reload()
    original_backfill_blob_generation = backfill_blob.generation
    table_prefix = ""
    # load each partition.
    for gcs_data in gcs_partitioned_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, gcs_data)

    # Use of queue to test that list responses are returned in expected order.
    expected_backlog_blobs = queue.Queue()
    expected_backlog_blobs.put("/".join([
        table_prefix, "_backlog", "$2017041101",
        gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME
    ]))
    expected_backlog_blobs.put("/".join([
        table_prefix, "_backlog", "$2017041102",
        gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME
    ]))

    for backlog_blob in gcs_bucket.list_blobs(
            prefix=f"{table_prefix}/_backlog"):
        assert backlog_blob.name == expected_backlog_blobs.get(block=False)

    backfill_blob.reload()
    assert backfill_blob.generation == original_backfill_blob_generation
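
The assertion on the blob generation pins down the behaviour: an in-flight _BACKFILL file must never be rewritten. One way to get that with the GCS API is a conditional create, shown here as an assumption about the approach rather than the publisher's actual code.

from google.api_core import exceptions
from google.cloud import storage


def create_backfill_if_absent(bucket: storage.Bucket,
                              table_prefix: str) -> bool:
    """Create <table_prefix>/_BACKFILL only when no live generation exists."""
    blob = bucket.blob(f"{table_prefix}/_BACKFILL")
    try:
        # if_generation_match=0 succeeds only if the object does not exist,
        # so an already-running backfill keeps its original generation.
        blob.upload_from_string("", if_generation_match=0)
        return True
    except exceptions.PreconditionFailed:
        return False
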
Example 13
def test_get_batches_for_gsurl_recursive(
    gcs,
    gcs_bucket,
    gcs_split_path_partitioned_parquet_data,
    gcs_external_partitioned_parquet_config,
):
    """tests that all blobs are recursively found for a given prefix
    """
    test_utils.check_blobs_exist(gcs_external_partitioned_parquet_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")
    batches = gcs_ocn_bq_ingest.common.utils.get_batches_for_gsurl(
        gcs, f"gs://{gcs_bucket.name}/", recursive=True)
    total_data_objects = 0
    for batch in batches:
        print(batch)
        total_data_objects += len(batch)
    assert total_data_objects == 4
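
A hedged sketch of what recursive batch discovery under a gs:// prefix might amount to: list every object below the prefix and group data files into one batch per parent directory. This is an illustration, not get_batches_for_gsurl itself, which may also apply size limits and other filters.

import collections
from typing import Dict, List

from google.cloud import storage


def batches_by_parent_prefix(gcs: storage.Client, bucket_name: str,
                             prefix: str = "") -> Dict[str, List[str]]:
    """Group data objects under a prefix into one batch per parent directory."""
    batches: Dict[str, List[str]] = collections.defaultdict(list)
    for blob in gcs.list_blobs(bucket_name, prefix=prefix):
        name = blob.name
        # Skip markers and config objects; keep only data files.
        if name.endswith(("_SUCCESS", "_BACKFILL")) or "/_config/" in name:
            continue
        parent = name.rsplit("/", 1)[0] if "/" in name else ""
        batches[parent].append(f"gs://{bucket_name}/{name}")
    return batches
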
Example 14
def test_look_for_destination_config_in_parents(
    bq,
    gcs_split_path_partitioned_data,
    gcs_destination_config,
    dest_partitioned_table,
):
    """test discovery of configuration files for destination in parent
    _config paths.
    """
    test_utils.check_blobs_exist(gcs_destination_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_data,
                                 "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_split_path_partitioned_data)
    expected_num_rows = 0
    for part in ["$2017041101", "$2017041102"]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example 15
def test_load_job_partitioned(bq, gcs_partitioned_data,
                              gcs_truncating_load_config,
                              dest_partitioned_table):
    """
    Test loading separate partitions with WRITE_TRUNCATE.

    After both load jobs, the count should equal the sum of the test data in
    both partitions despite the WRITE_TRUNCATE disposition, because each load
    should target only a particular partition via a decorator.
    """
    test_utils.check_blobs_exist(gcs_truncating_load_config,
                                 "the load.json is missing")
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data)
    expected_num_rows = 0
    for part in ["$2017041101", "$2017041102"]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
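
The docstring's claim rests on BigQuery partition decorators: a WRITE_TRUNCATE load addressed to table$2017041101 replaces only that hourly partition. A minimal sketch of such a load with the BigQuery client; the bucket, dataset, and URI are placeholders:

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
# The "$2017041101" decorator confines the truncation to that single hourly
# partition; data already loaded into other partitions is left untouched.
load_job = client.load_table_from_uri(
    "gs://example-bucket/nyc_311/2017041101/nyc_311.csv",  # placeholder URI
    "my_dataset.nyc_311$2017041101",
    job_config=job_config,
)
load_job.result()  # block until the load job completes
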
Example 16
def test_ordered_load_parquet_hive_partitioning(
        monkeypatch, gcs, bq, gcs_bucket,
        gcs_destination_parquet_config_hive_partitioned,
        gcs_external_hive_partitioned_parquet_config,
        gcs_split_path_partitioned_parquet_data, dest_hive_partitioned_table):
    """Test ordered loads of parquet data files

    Set global env variable ORDER_PER_TABLE so that all loads are ordered.
    Test to make sure that parquet data files are loaded in order.
    """
    monkeypatch.setenv("ORDER_PER_TABLE", "True")
    monkeypatch.setenv("START_BACKFILL_FILENAME", "_HISTORYDONE")
    # Must reload the constants file in order to pick up testing mock env vars
    importlib.reload(gcs_ocn_bq_ingest.common.constants)

    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    table_prefix = ""
    for gcs_data in gcs_split_path_partitioned_parquet_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            break

    # Invoke cloud function for all data blobs and _SUCCESS blob.
    # Cloud function shouldn't take any action at this point because there is
    # no _HISTORYDONE file yet.
    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    # Upload _HISTORYDONE file which will cause cloud function to take action
    backfill_start_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/"
        f"{gcs_ocn_bq_ingest.common.constants.START_BACKFILL_FILENAME}")
    backfill_start_blob.upload_from_string("")
    test_utils.check_blobs_exist([backfill_start_blob],
                                 "_HISTORYDONE file was not created.")
    test_utils.trigger_gcf_for_each_blob([backfill_start_blob])

    # Check to make sure the _BACKFILL file has been created
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    test_utils.check_blobs_exist([backfill_blob],
                                 "_BACKFILL file was not created by method "
                                 "start_backfill_subscriber_if_not_running")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_hive_partitioned_table,
                                expected_num_rows)
    # Check to make sure the hive_part_column column values were correctly
    # inserted into the BigQuery destination table.
    for row in bq.query(
            f"SELECT DISTINCT hive_part_column "
            f"FROM `{dest_hive_partitioned_table.full_table_id.replace(':','.')}`"
    ).result():
        assert row.hive_part_column == 9999
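
For reference, a load of hive-partitioned parquet like the one exercised above can be expressed directly with the BigQuery client's HivePartitioningOptions, so that a path segment such as hive_part_column=9999 becomes a column value. The bucket and table names below are placeholders.

from google.cloud import bigquery

client = bigquery.Client()
hive_options = bigquery.HivePartitioningOptions()
hive_options.mode = "AUTO"  # infer partition key types from the path layout
# Everything after this prefix (e.g. .../hive_part_column=9999/...) is parsed
# into partition key columns on the destination table.
hive_options.source_uri_prefix = "gs://example-bucket/my_dataset/my_table/"

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,
    hive_partitioning=hive_options,
)
load_job = client.load_table_from_uri(
    "gs://example-bucket/my_dataset/my_table/hive_part_column=9999/*.parquet",
    "my_dataset.my_table",
    job_config=job_config,
)
load_job.result()
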
Example 17
def test_external_query_partitioned_with_destination_config(
        bq, gcs_partitioned_data, gcs_external_partitioned_config,
        gcs_destination_config, dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql, external.json, and
    destination config in load.json.
    """
    test_utils.check_blobs_exist(
        (gcs_external_partitioned_config + gcs_destination_config),
        "config objects must exist")
    test_utils.check_blobs_exist(gcs_partitioned_data, "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data +
                                         gcs_external_partitioned_config +
                                         gcs_destination_config)
    expected_num_rows = 0
    for part in [
            "$2017041101",
            "$2017041102",
    ]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example 18
def test_load_job_truncating_batches(
    bq,
    gcs_batched_data,
    gcs_truncating_load_config,
    dest_table,
):
    """
    tests two successive batches with a load.json that dictates WRITE_TRUNCATE.

    After both load jobs, the count should equal the number of lines in the
    test file because the WRITE_TRUNCATE disposition should be picked up.
    """
    test_utils.check_blobs_exist(
        gcs_truncating_load_config,
        "the test is not configured correctly the load.json is missing")
    test_utils.check_blobs_exist(gcs_batched_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_batched_data)

    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
Example 19
def test_backlog_publisher(gcs, gcs_bucket, gcs_partitioned_data):
    """Test basic functionality of backlog_publisher
    Drop two success files.
    Assert that both success files are added to the backlog and that a
    backfill file is created.
    Assert that only one backfill file is created; it is not recreated.
    """
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")
    table_prefix = ""
    # load each partition.
    for gcs_data in gcs_partitioned_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            gcs_ocn_bq_ingest.common.ordering.backlog_publisher(gcs, gcs_data)

    expected_backlog_blobs = queue.Queue()
    expected_backlog_blobs.put("/".join([
        table_prefix, "_backlog", "$2017041101",
        gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME
    ]))
    expected_backlog_blobs.put("/".join([
        table_prefix, "_backlog", "$2017041102",
        gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME
    ]))

    for backlog_blob in gcs_bucket.list_blobs(
            prefix=f"{table_prefix}/_backlog"):
        assert backlog_blob.name == expected_backlog_blobs.get(block=False)

    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    assert backfill_blob.exists()
Example 20
def test_ordered_load_parquet_wait_for_validation(
        monkeypatch, gcs, bq, gcs_bucket, gcs_destination_parquet_config,
        gcs_external_partitioned_parquet_config,
        gcs_split_path_partitioned_parquet_data, dest_partitioned_table):
    """Test ordered loads of parquet data files with a validation step
    between each load.

    Set global env variable ORDER_PER_TABLE so that all loads are ordered.
    Test to make sure that parquet data files are loaded in order.
    """
    monkeypatch.setenv("ORDER_PER_TABLE", "True")
    monkeypatch.setenv("START_BACKFILL_FILENAME", "_HISTORYDONE")
    monkeypatch.setenv("WAIT_FOR_VALIDATION", "True")
    # Must reload the constants file in order to pick up testing mock env vars
    importlib.reload(gcs_ocn_bq_ingest.common.constants)

    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    table_prefix = ""
    for gcs_data in gcs_split_path_partitioned_parquet_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            break

    # Upload _HISTORYDONE file which will cause cloud function to take action
    backfill_start_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/"
        f"{gcs_ocn_bq_ingest.common.constants.START_BACKFILL_FILENAME}")
    backfill_start_blob.upload_from_string("")
    test_utils.check_blobs_exist([backfill_start_blob],
                                 "_HISTORYDONE file was not created.")
    test_utils.trigger_gcf_for_each_blob([backfill_start_blob])

    # Invoke cloud function for all data blobs and _SUCCESS blob. Actual loads
    # are deferred until the _BACKFILL file is handled below.
    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    # Check to make sure the _BACKFILL file has been created
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    test_utils.check_blobs_exist([backfill_blob],
                                 "_BACKFILL file was not created by method "
                                 "start_backfill_subscriber_if_not_running")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Test to make sure that _bqlock is not present since cloud function should
    # remove the lock in between validations
    with pytest.raises(NotFound):
        test_utils.check_blobs_exist(
            [gcs_bucket.blob(f"{table_prefix}/_bqlock")])

    # Check that the first batch of data was loaded but only the first batch,
    # since the second batch is waiting on confirmation of validation.
    expected_num_rows = 50
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)

    # Upload _BACKFILL file to signal that validation has completed and
    # that the next item in the _backlog can be processed.
    backfill_blob.upload_from_string("")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Check that the second batch was loaded
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)

    # Upload _BACKFILL file to signal that validation has completed.
    # There won't be another chunk to load so this _BACKFILL file
    # should signal the cloud function to remove _BACKFILL file
    # and backlog directory.
    backfill_blob.upload_from_string("")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Test to make sure that the _BACKFILL file is not present since the cloud
    # function should remove it after the final load/validation is complete.
    with pytest.raises(NotFound):
        test_utils.check_blobs_exist(
            [gcs_bucket.blob(f"{table_prefix}/_BACKFILL")])
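
The WAIT_FOR_VALIDATION flow above reduces to an external validator re-uploading the _BACKFILL object after each successful check. A condensed sketch of that operator-side handshake; the path layout is taken from the test, while the helper itself is illustrative:

from google.cloud import storage


def approve_next_batch(gcs: storage.Client, bucket_name: str,
                       table_prefix: str) -> None:
    """Signal that validation passed so the next _backlog item may be loaded."""
    bucket = gcs.bucket(bucket_name)
    # Re-creating _BACKFILL wakes the backlog subscriber. In production the
    # object-finalize notification invokes the cloud function; the test above
    # simulates that with trigger_gcf_for_each_blob.
    bucket.blob(f"{table_prefix}/_BACKFILL").upload_from_string("")
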
Example 21
def test_backlog_subscriber_in_order_with_new_batch_while_running(
        bq, gcs, gcs_bucket, dest_ordered_update_table: bigquery.Table,
        gcs_ordered_update_data: List[storage.Blob],
        gcs_external_update_config: List[storage.Blob],
        gcs_backlog: List[storage.Blob]):
    """Test functionality of backlog subscriber when new batches are added
    before the subscriber is done finishing the existing backlog.

    Populate a backlog with 3 files that make updates where we can assert
    that these jobs were applied in order.
    In another process populate a fourth batch, and call the publisher.
    """
    test_utils.check_blobs_exist(gcs_external_update_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_ordered_update_data,
                                 "test data objects must exist")
    # Cannot pickle clients to another process so we need to recreate some
    # objects without the client property.
    for blob in gcs_external_update_config:
        basename = os.path.basename(blob.name)
        # Only perform the following actions for the backfill config file
        if basename == gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME:
            backfill_blob = storage.Blob.from_string(
                f"gs://{blob.bucket.name}/"
                f"{blob.name}")
            bkt = storage.Bucket(None, gcs_bucket.name)
            claim_blob: storage.Blob = blob.bucket.blob(
                blob.name.replace(
                    basename, f"_claimed_{basename}_created_at_"
                    f"{blob.time_created.timestamp()}"))
            # Run subscriber w/ backlog and publisher w/ new batch in parallel.
            with multiprocessing.Pool(processes=3) as pool:
                res_subscriber = pool.apply_async(_run_subscriber,
                                                  (None, None, backfill_blob))
                # wait for existence of claim blob
                # to ensure subscriber is running.
                while not claim_blob.exists():
                    pass
                res_backlog_publisher = pool.apply_async(
                    _post_a_new_batch, (bkt, dest_ordered_update_table))
                res_backlog_publisher.wait()
                res_monitor = pool.apply_async(
                    gcs_ocn_bq_ingest.common.ordering.subscriber_monitor,
                    (None, bkt,
                     storage.Blob(
                         f"{dest_ordered_update_table.project}"
                         f".{dest_ordered_update_table.dataset_id}/"
                         f"{dest_ordered_update_table.table_id}/"
                         f"_backlog/04/_SUCCESS", bkt)))

                if res_monitor.get():
                    print(
                        "subscriber monitor had to retrigger subscriber loop")
                    backfill_blob.reload(client=gcs)
                    _run_subscriber(None, None, backfill_blob)

                res_subscriber.wait()

            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, blob)
            backlog_blobs = gcs_bucket.list_blobs(prefix=f"{table_prefix}/"
                                                  f"_backlog/")
            assert backlog_blobs.num_results == 0, "backlog is not empty"
            bqlock_blob: storage.Blob = gcs_bucket.blob("_bqlock")
            assert not bqlock_blob.exists(), "_bqlock was not cleaned up"
            rows = bq.query("SELECT alpha_update FROM "
                            f"{dest_ordered_update_table.dataset_id}"
                            f".{dest_ordered_update_table.table_id}")
            expected_num_rows = 1
            num_rows = 0
            for row in rows:
                num_rows += 1
                assert row[
                    "alpha_update"] == "ABCD", "backlog not applied in order"
            assert num_rows == expected_num_rows