Example #1
def test_backfill(bq, gcs_partitioned_data, gcs_truncating_load_config,
                  gcs_bucket, dest_partitioned_table):
    """
    An adaptation of test_load_job_partitioned that instead uses the backfill
    CLI code path to execute the cloud function's main method in parallel
    threads.

    Tests loading separate partitions with WRITE_TRUNCATE.

    After both load jobs the row count should equal the sum of the test data
    in both partitions, despite the WRITE_TRUNCATE disposition, because each
    load should target only a particular partition via a partition decorator.
    """
    test_utils.check_blobs_exist(
        gcs_truncating_load_config,
        "the test is not configured correctly the load.json is missing")
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")

    expected_num_rows = 0
    for part in [
            "$2017041101",
            "$2017041102",
    ]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    args = backfill.parse_args([
        f"--gcs-path=gs://{gcs_bucket.name}",
        "--mode=LOCAL",
    ])
    backfill.main(args)
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
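The per-partition WRITE_TRUNCATE behavior described in the docstring relies on BigQuery partition decorators: a load job whose destination table ID ends in "$YYYYMMDDHH" truncates only that partition. Below is a minimal standalone sketch of that idea; the project, dataset, bucket, and object names are placeholders, not anything from these fixtures, and it assumes the destination table already exists with hourly time partitioning.

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Each load names a single hourly partition via the "$2017041101" decorator,
# so WRITE_TRUNCATE replaces only that partition instead of the whole table.
for part in ["2017041101", "2017041102"]:
    client.load_table_from_uri(
        f"gs://my-bucket/nyc_311/{part}/nyc_311.csv",  # placeholder URI
        f"my-project.my_dataset.nyc_311${part}",       # placeholder table
        job_config=job_config,
    ).result()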
Example #2
def test_ordered_load_parquet_hive_partitioning(
        monkeypatch, gcs, bq, gcs_bucket,
        gcs_destination_parquet_config_hive_partitioned,
        gcs_external_hive_partitioned_parquet_config,
        gcs_split_path_partitioned_parquet_data, dest_hive_partitioned_table):
    """Test ordered loads of parquet data files

    Sets the ORDER_PER_TABLE env variable so that all loads are ordered, then
    verifies that the parquet data files are loaded in order.
    """
    monkeypatch.setenv("ORDER_PER_TABLE", "True")
    monkeypatch.setenv("START_BACKFILL_FILENAME", "_HISTORYDONE")
    # Must reload the constants file in order to pick up testing mock env vars
    importlib.reload(gcs_ocn_bq_ingest.common.constants)

    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    table_prefix = ""
    for gcs_data in gcs_split_path_partitioned_parquet_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            break

    # Invoke cloud function for all data blobs and _SUCCESS blob.
    # Cloud function shouldn't take any action at this point because there is
    # no _HISTORYDONE file yet.
    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    # Upload _HISTORYDONE file which will cause cloud function to take action
    backfill_start_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/"
        f"{gcs_ocn_bq_ingest.common.constants.START_BACKFILL_FILENAME}")
    backfill_start_blob.upload_from_string("")
    test_utils.check_blobs_exist([backfill_start_blob],
                                 "_HISTORYDONE file was not created.")
    test_utils.trigger_gcf_for_each_blob([backfill_start_blob])

    # Check to make sure _BACKFILL file has been created
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    test_utils.check_blobs_exist([backfill_blob],
                                 "_BACKFILL file was not created by method "
                                 "start_backfill_subscriber_if_not_running")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_hive_partitioned_table,
                                expected_num_rows)
    # Check to make sure the hive_part_column column values were correctly inserted
    # into the BigQuery destination table.
    for row in bq.query(
            f"SELECT DISTINCT hive_part_column "
            f"FROM `{dest_hive_partitioned_table.full_table_id.replace(':','.')}`"
    ).result():
        assert row.hive_part_column == 9999
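The assertion that every row has hive_part_column == 9999 depends on the parquet objects being laid out hive-style, with the partition key encoded in the object path; BigQuery then derives the column value from the path rather than from the file contents. Here is an illustrative external table definition in the shape of the BigQuery REST API fields; the bucket, prefix, and partition value are placeholders, not the fixtures' actual layout.

# Hypothetical external.json for parquet data laid out as
# gs://my-bucket/my_table/hive_part_column=9999/part-0.parquet
external_json = {
    "sourceFormat": "PARQUET",
    "sourceUris": ["gs://my-bucket/my_table/*"],
    "hivePartitioningOptions": {
        # AUTO infers the partition key name and type from the object path.
        "mode": "AUTO",
        "sourceUriPrefix": "gs://my-bucket/my_table/",
    },
}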
Example #3
def test_duplicate_success_notification(bq, gcs_data, dest_table):
    """tests behavior with two notifications for the same success file."""
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
Example #4
def test_load_job(bq, gcs_data, dest_table):
    """tests basic single invocation with load job"""
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
Example #5
def test_gcf_event_schema(bq, gcs_data, dest_table):
    """tests compatibility to Cloud Functions Background Function posting the
    storage object schema
    https://cloud.google.com/storage/docs/json_api/v1/objects#resource
    directly based on object finalize.

    https://cloud.google.com/functions/docs/tutorials/storage#functions_tutorial_helloworld_storage-python
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
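test_utils.trigger_gcf_for_each_blob presumably mimics this trigger by handing the function a storage object resource as the background-function event. The sketch below shows that calling convention under that assumption; the helper and entry-point names are illustrative, not the repo's actual code.

# Minimal "object finalize" event in the shape of the storage object resource
# (https://cloud.google.com/storage/docs/json_api/v1/objects#resource);
# only a few fields a function is likely to read are shown.
def fake_finalize_event(blob):
    return {
        "bucket": blob.bucket.name,
        "name": blob.name,
        "contentType": blob.content_type,
    }

# A background function takes (event, context), so a test harness can invoke
# the entry point directly with a dict like the one above, e.g.:
# my_entry_point(fake_finalize_event(blob), context=None)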
Example #6
def test_look_for_config_in_parents(bq, gcs_data_under_sub_dirs,
                                    gcs_external_config, dest_table):
    """test discovery of configuration files for external query in parent
    _config paths.
    """
    test_utils.check_blobs_exist(gcs_external_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_data_under_sub_dirs,
                                 "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_data_under_sub_dirs)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
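The parent-path discovery this docstring describes amounts to walking from the data object's prefix up toward the bucket root and taking the first _config/external.json found. A rough, pure-Python sketch of that idea follows; the repo's actual helper may differ in naming and details.

import posixpath

def candidate_config_paths(blob_name, config_suffix="_config/external.json"):
    """Yield config object names from the nearest parent prefix to the root."""
    prefix = posixpath.dirname(blob_name)
    while prefix:
        yield posixpath.join(prefix, config_suffix)
        prefix = posixpath.dirname(prefix)
    yield config_suffix  # finally, the bucket root

# For "nation/foo/bar/part-m-00001" this yields:
#   nation/foo/bar/_config/external.json
#   nation/foo/_config/external.json
#   nation/_config/external.json
#   _config/external.json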
Example #7
def test_partitioned_parquet(bq, gcs_split_path_partitioned_parquet_data,
                             gcs_destination_parquet_config,
                             dest_partitioned_table):
    """tests the basic load ingestion mechanics for parquet files
    """
    test_utils.check_blobs_exist(gcs_destination_parquet_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example #8
def test_load_job_appending_batches(bq, gcs_batched_data, dest_table):
    """
    tests loading two batches with the default load configuration.

    The total number of rows expected should be the number of rows
    in the test file multiplied by the number of batches because we
    should pick up the default WRITE_APPEND disposition.
    """
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    test_count = sum(1 for _ in open(test_data_file))
    expected_counts = 2 * test_count  # 2 batches * num of test rows
    test_utils.check_blobs_exist(gcs_batched_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_batched_data)
    test_utils.bq_wait_for_rows(bq, dest_table, expected_counts)
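The doubling works because load jobs default to WRITE_APPEND when no load.json overrides the disposition, so each batch adds to whatever the table already holds. A small sketch with placeholder URIs and table name:

from google.cloud import bigquery

client = bigquery.Client()
# write_disposition is left unset, so the BigQuery default (WRITE_APPEND)
# applies and the second batch is appended rather than replacing the first.
for uri in ["gs://my-bucket/nation/batch-1/*",
            "gs://my-bucket/nation/batch-2/*"]:
    client.load_table_from_uri(
        uri,
        "my-project.my_dataset.nation",
        job_config=bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.CSV),
    ).result()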
Example #9
def test_external_query_partitioned_parquet(
        bq, gcs_split_path_partitioned_parquet_data,
        gcs_external_partitioned_parquet_config, gcs_destination_config,
        dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json
    """
    test_utils.check_blobs_exist(
        gcs_destination_config + gcs_external_partitioned_parquet_config,
        "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example #10
def test_external_query_pure(
    bq,
    gcs_data,
    gcs_external_config,
    dest_table,
):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json
    """
    test_utils.check_blobs_exist(gcs_data, "test data objects must exist")
    test_utils.check_blobs_exist(gcs_external_config,
                                 "config objects must exist")

    test_utils.trigger_gcf_for_each_blob(gcs_data)
    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
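Mechanically, the bq_transform.sql / external.json pair corresponds to querying a temporary external table and writing the result to the destination. Below is a hedged sketch using the Python client's table_definitions; the temp_ext alias, the SQL, the delimiter, and the table names are illustrative assumptions rather than the repo's actual conventions.

from google.cloud import bigquery

client = bigquery.Client()

# external.json analogue: a temporary external table over the staged files.
ext = bigquery.ExternalConfig("CSV")
ext.source_uris = ["gs://my-bucket/nation/*"]  # placeholder URIs
ext.options.field_delimiter = "|"  # assumption about the test data format

# bq_transform.sql analogue: query the external table and write the result
# into the destination table.
job_config = bigquery.QueryJobConfig(
    table_definitions={"temp_ext": ext},
    destination=bigquery.TableReference.from_string(
        "my-project.my_dataset.nation"),
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)
client.query("SELECT * FROM temp_ext", job_config=job_config).result()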
Example #11
def test_external_query_partitioned(bq, gcs_partitioned_data,
                                    gcs_external_partitioned_config,
                                    dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql and external.json
    """
    test_utils.check_blobs_exist(gcs_external_partitioned_config,
                                 "config objects must exist")

    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data)
    expected_num_rows = 0
    for part in [
            "$2017041101",
            "$2017041102",
    ]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example #12
def test_look_for_destination_config_in_parents(
    bq,
    gcs_split_path_partitioned_data,
    gcs_destination_config,
    dest_partitioned_table,
):
    """test discovery of configuration files for destination in parent
    _config paths.
    """
    test_utils.check_blobs_exist(gcs_destination_config,
                                 "config objects must exist")
    test_utils.check_blobs_exist(gcs_split_path_partitioned_data,
                                 "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_split_path_partitioned_data)
    expected_num_rows = 0
    for part in ["$2017041101", "$2017041102"]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example #13
def test_load_job_partitioned(bq, gcs_partitioned_data,
                              gcs_truncating_load_config,
                              dest_partitioned_table):
    """
    Tests loading separate partitions with WRITE_TRUNCATE.

    After both load jobs the row count should equal the sum of the test data
    in both partitions, despite the WRITE_TRUNCATE disposition, because each
    load should target only a particular partition via a partition decorator.
    """
    test_utils.check_blobs_exist(gcs_truncating_load_config,
                                 "the load.json is missing")
    test_utils.check_blobs_exist(gcs_partitioned_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data)
    expected_num_rows = 0
    for part in ["$2017041101", "$2017041102"]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example #14
def test_load_job_truncating_batches(
    bq,
    gcs_batched_data,
    gcs_truncating_load_config,
    dest_table,
):
    """
    tests two successive batches with a load.json that dictates WRITE_TRUNCATE.

    After both load jobs the count should be the same as the number of lines
    in the test file because we should pick up the WRITE_TRUNCATE disposition.
    """
    test_utils.check_blobs_exist(
        gcs_truncating_load_config,
        "the test is not configured correctly the load.json is missing")
    test_utils.check_blobs_exist(gcs_batched_data,
                                 "test data objects must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_batched_data)

    test_data_file = os.path.join(TEST_DIR, "resources", "test-data", "nation",
                                  "part-m-00001")
    expected_num_rows = sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_table, expected_num_rows)
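For reference, the truncating behavior comes from a load.json whose fields override the default load job configuration. The example below uses the BigQuery REST JobConfigurationLoad field names; the exact keys and values the cloud function expects are an assumption here, not taken from the fixtures.

# Hypothetical load.json placed next to the data (or in a parent _config/
# prefix) to override the WRITE_APPEND default.
load_json = {
    "writeDisposition": "WRITE_TRUNCATE",
    "sourceFormat": "CSV",
    "fieldDelimiter": "|",
}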
Example #15
def test_external_query_partitioned_with_destination_config(
        bq, gcs_partitioned_data, gcs_external_partitioned_config,
        gcs_destination_config, dest_partitioned_table):
    """tests the basic external query ingestion mechanics
    with bq_transform.sql, external.json, and
    destination config in load.json.
    """
    test_utils.check_blobs_exist(
        (gcs_external_partitioned_config + gcs_destination_config),
        "config objects must exist")
    test_utils.check_blobs_exist(gcs_partitioned_data, "test data must exist")
    test_utils.trigger_gcf_for_each_blob(gcs_partitioned_data +
                                         gcs_external_partitioned_config +
                                         gcs_destination_config)
    expected_num_rows = 0
    for part in [
            "$2017041101",
            "$2017041102",
    ]:
        test_data_file = os.path.join(TEST_DIR, "resources", "test-data",
                                      "nyc_311", part, "nyc_311.csv")
        expected_num_rows += sum(1 for _ in open(test_data_file))
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)
Example #16
def test_ordered_load_parquet_wait_for_validation(
        monkeypatch, gcs, bq, gcs_bucket, gcs_destination_parquet_config,
        gcs_external_partitioned_parquet_config,
        gcs_split_path_partitioned_parquet_data, dest_partitioned_table):
    """Test ordered loads of parquet data files with a validation step
    between each load.

    Sets the ORDER_PER_TABLE env variable so that all loads are ordered and
    WAIT_FOR_VALIDATION so that each load waits for an explicit validation
    signal, then verifies that the parquet data files are loaded in order.
    """
    monkeypatch.setenv("ORDER_PER_TABLE", "True")
    monkeypatch.setenv("START_BACKFILL_FILENAME", "_HISTORYDONE")
    monkeypatch.setenv("WAIT_FOR_VALIDATION", "True")
    # Must reload the constants file in order to pick up testing mock env vars
    importlib.reload(gcs_ocn_bq_ingest.common.constants)

    test_utils.check_blobs_exist(gcs_split_path_partitioned_parquet_data,
                                 "test data objects must exist")

    table_prefix = ""
    for gcs_data in gcs_split_path_partitioned_parquet_data:
        if gcs_data.name.endswith(
                gcs_ocn_bq_ingest.common.constants.SUCCESS_FILENAME):
            table_prefix = gcs_ocn_bq_ingest.common.utils.get_table_prefix(
                gcs, gcs_data)
            break

    # Upload _HISTORYDONE file which will cause cloud function to take action
    backfill_start_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/"
        f"{gcs_ocn_bq_ingest.common.constants.START_BACKFILL_FILENAME}")
    backfill_start_blob.upload_from_string("")
    test_utils.check_blobs_exist([backfill_start_blob],
                                 "_HISTORYDONE file was not created.")
    test_utils.trigger_gcf_for_each_blob([backfill_start_blob])

    # Invoke cloud function for all data blobs and _SUCCESS blob. Because the
    # _HISTORYDONE file is already in place, this should cause the backfill
    # subscriber to create the _BACKFILL file.
    test_utils.trigger_gcf_for_each_blob(
        gcs_split_path_partitioned_parquet_data)

    # Check to make sure _BACKFILL file has been created
    backfill_blob: storage.Blob = gcs_bucket.blob(
        f"{table_prefix}/{gcs_ocn_bq_ingest.common.constants.BACKFILL_FILENAME}"
    )
    test_utils.check_blobs_exist([backfill_blob],
                                 "_BACKFILL file was not created by method "
                                 "start_backfill_subscriber_if_not_running")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Test to make sure that _bqlock is not present since cloud function should
    # remove the lock in between validations
    with pytest.raises(NotFound):
        test_utils.check_blobs_exist(
            [gcs_bucket.blob(f"{table_prefix}/_bqlock")])

    # Check that the first batch of data was loaded but only the first batch,
    # since the second batch is waiting on confirmation of validation.
    expected_num_rows = 50
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)

    # Upload _BACKFILL file to signal that validation has completed and
    # that the next item in the _backlog can be processed.
    backfill_blob.upload_from_string("")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Check that the second batch was loaded
    expected_num_rows = 100
    test_utils.bq_wait_for_rows(bq, dest_partitioned_table, expected_num_rows)

    # Upload _BACKFILL file to signal that validation has completed.
    # There won't be another chunk to load so this _BACKFILL file
    # should signal the cloud function to remove _BACKFILL file
    # and backlog directory.
    backfill_blob.upload_from_string("")
    test_utils.trigger_gcf_for_each_blob([backfill_blob])

    # Test to make sure the _BACKFILL file is not present, since the cloud
    # function should remove it after the final load/validation is complete.
    with pytest.raises(NotFound):
        test_utils.check_blobs_exist(
            [gcs_bucket.blob(f"{table_prefix}/_BACKFILL")])