Example #1
def test_get_id_range_for_partition_with_one_over():
    """Checks that the proper upper and lower bound are retrieved even when the range of IDs leaves only 1 item
    in the last partition. There was a bug here before."""
    min_id = 1
    max_id = 101
    partition_size = 20
    id_range_item_count = max_id - min_id + 1  # this many individual IDs should be processed for a continuous ID range
    assert id_range_item_count % partition_size == 1  # one over the partition size
    etl_config = {"partition_size": partition_size}
    ctrl = Controller(etl_config)
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = id_range_item_count  # assume records exist for each ID in range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(id_range_item_count /
                                             partition_size)
    partition_range = range(0, ctrl.config["partitions"])
    # First batch
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[0])
    assert lower_bound == min_id
    assert upper_bound == lower_bound + (partition_size - 1)
    # Second batch
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[1])
    assert lower_bound == min_id + partition_size
    assert upper_bound == lower_bound + (partition_size - 1)
    # Last batch should go all the way up to max_id
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[-1])
    assert lower_bound == (min_id +
                           (partition_size * partition_range[-1])) == 101
    assert upper_bound == max_id == 101
    id_set = set(range(min_id, max_id + 1))
    assert _remove_seen_ids(ctrl, id_set) == set()
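For reference, the bound arithmetic these assertions exercise can be sketched as follows. This is a hypothetical stand-in for Controller.get_id_range_for_partition, assuming fixed-width partitions clamped to max_id; the real implementation is not shown in these excerpts.

def get_id_range_for_partition_sketch(min_id, max_id, partition_size, partition_idx):
    """Hypothetical sketch: inclusive ID bounds for one fixed-width partition."""
    lower_bound = min_id + (partition_size * partition_idx)
    # The final partition may be short (here, a single ID), so clamp to max_id
    upper_bound = min(lower_bound + partition_size - 1, max_id)
    return lower_bound, upper_bound

# With min_id=1, max_id=101, partition_size=20, partition index 5 holds exactly one ID
assert get_id_range_for_partition_sketch(1, 101, 20, 5) == (101, 101)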
Example #2
def test_get_id_range_for_partition_with_evenly_divisible():
    """Check all is good when set of records fit evenly into partitions (each partition full)"""
    min_id = 1
    max_id = 100
    partition_size = 20
    id_range_item_count = max_id - min_id + 1  # this many individual IDs should be processed for a continuous ID range
    assert id_range_item_count % partition_size == 0  # evenly divisible
    etl_config = {"partition_size": partition_size}
    ctrl = Controller(etl_config)
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = id_range_item_count  # assume records exist for each ID in range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(id_range_item_count /
                                             partition_size)
    partition_range = range(0, ctrl.config["partitions"])
    # First batch
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[0])
    assert lower_bound == min_id
    assert upper_bound == lower_bound + (partition_size - 1)
    # Second batch
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[1])
    assert lower_bound == min_id + partition_size
    assert upper_bound == lower_bound + (partition_size - 1)
    # Last batch should go all the way up to max_id
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[-1])
    assert lower_bound == (max_id - partition_size +
                           1) == (min_id +
                                  (partition_size * partition_range[-1]))
    assert upper_bound == max_id
    id_set = set(range(min_id, max_id + 1))
    assert _remove_seen_ids(ctrl, id_set) == set()
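Using the same hypothetical get_id_range_for_partition_sketch from Example #1, the evenly divisible case yields a full final partition:

# ceil(100 / 20) == 5 partitions; the last one (index 4) spans 81..100
assert get_id_range_for_partition_sketch(1, 100, 20, 4) == (81, 100)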
Example #3
def test_get_id_range_for_partition_with_empty_partitions():
    """Checks that the proper upper and lower bound are retrieved even when the range of IDs is evenly divisible by
    the partition size. There was a bug here before."""
    min_id = 1
    max_id = 100
    partition_size = 20
    id_range_item_count = max_id - min_id + 1  # this many individual IDs should be processed for a continuous ID range
    record_ids = {1, 5, 7, 15, 19, 20, 41, 100}
    etl_config = {"partition_size": partition_size}
    ctrl = Controller(etl_config)
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = len(record_ids)
    ctrl.config["partitions"] = ctrl.determine_partitions()
    assert ctrl.config["partitions"] == ceil(id_range_item_count /
                                             partition_size)
    partition_range = range(0, ctrl.config["partitions"])
    # First batch
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[0])
    assert lower_bound == min_id
    assert upper_bound == lower_bound + (partition_size - 1)
    # Second batch
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[1])
    assert lower_bound == min_id + partition_size
    assert upper_bound == lower_bound + (partition_size - 1)
    # Last batch should go all the way up to max_id
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[-1])
    assert lower_bound == (min_id + (partition_size * partition_range[-1]))
    assert upper_bound == max_id
    assert _remove_seen_ids(ctrl, record_ids) == set()
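The closing coverage check can be sketched as well. A plausible reading of the _remove_seen_ids test helper (its actual body is not shown in these excerpts) is that it walks every partition and strips the IDs each one covers; an empty result means no record ID falls outside all partitions:

def _remove_seen_ids_sketch(ctrl, id_set):
    """Hypothetical sketch: drop every ID covered by some partition."""
    remaining = set(id_set)
    for partition_idx in range(ctrl.config["partitions"]):
        lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_idx)
        remaining -= set(range(lower_bound, upper_bound + 1))
    return remaining  # empty iff the partitions cover every input ID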
Example #4
def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index,
                               monkeypatch):
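    """Smoke-test that a Controller built from the award config runs its load steps and creates the award index."""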
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.utilities.execute_sql_statement",
        mock_execute_sql)
    elasticsearch_client = instantiate_elasticsearch_client()
    loader = Controller(award_config, elasticsearch_client)
    assert loader.__class__.__name__ == "Controller"
    loader.run_load_steps()
    assert elasticsearch_client.indices.exists(award_config["index_name"])
    elasticsearch_client.indices.delete(index=award_config["index_name"],
                                        ignore_unavailable=False)
Example #5
    def handle(self, *args, **options):
        elasticsearch_client = instantiate_elasticsearch_client()
        config = parse_cli_args(options, elasticsearch_client)

        start = perf_counter()
        logger.info(format_log(f"Starting script\n{'=' * 56}"))
        start_msg = "target index: {index_name} | Starting from: {starting_date}"
        logger.info(format_log(start_msg.format(**config)))

        ensure_view_exists(config["sql_view"], force=True)
        error_addition = ""
        loader = Controller(config)

        if config["is_incremental_load"]:
            toggle_refresh_off(elasticsearch_client,
                               config["index_name"])  # Turned back on at end.

        try:
            if config["process_deletes"]:
                loader.run_deletes()

            if not config["deletes_only"]:
                loader.prepare_for_etl()
                loader.dispatch_tasks()
        except Exception as e:
            logger.error(str(e))
            error_addition = "before encountering a problem during execution... "
            raise SystemExit(1)
        else:
            loader.complete_process()
            if config["drop_db_view"]:
                logger.info(
                    format_log(f"Dropping SQL view '{config['sql_view']}'"))
                drop_etl_view(config["sql_view"], True)
        finally:
            msg = f"Script duration was {perf_counter() - start:.2f}s {error_addition}|"
            headers = f"{'-' * (len(msg) - 2)} |"
            logger.info(format_log(headers))
            logger.info(format_log(msg))
            logger.info(format_log(headers))

        # Used to help the pipeline determine when the job passed but needs attention
        if config["raise_status_code_3"]:
            raise SystemExit(3)
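The exit-code convention used above (0 for success, 1 for failure, 3 for passed-but-needs-attention) could be consumed by a calling pipeline along these lines; this wrapper is a hypothetical sketch, not part of the codebase:

import subprocess

# Hypothetical wrapper: run the loader command and triage its exit code
result = subprocess.run(["python", "manage.py", "elasticsearch_loader"])
if result.returncode == 0:
    print("load succeeded")
elif result.returncode == 3:
    print("load succeeded but needs attention")  # SystemExit(3) above
else:
    print("load failed")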
Example #6
def test_create_and_load_new_award_index(award_data_fixture,
                                         elasticsearch_award_index,
                                         monkeypatch):
    """Test the ``elasticsearch_loader`` django management command to create a new awards index and load it
    with data from the DB
    """
    client = elasticsearch_award_index.client  # type: Elasticsearch

    # Ensure index is not yet created
    assert not client.indices.exists(elasticsearch_award_index.index_name)
    original_db_awards_count = Award.objects.count()

    # Inject ETL arg into config for this run, which loads a newly created index
    elasticsearch_award_index.etl_config["create_new_index"] = True
    es_etl_config = _process_es_etl_test_config(client,
                                                elasticsearch_award_index)

    # Must use the mock SQL function to share the test DB connection+transaction in the ETL code
    # Patch the module into which it is imported, not the module where it is defined
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement",
        mock_execute_sql)
    # Also override SQL function listed in config object with the mock one
    es_etl_config["execute_sql_func"] = mock_execute_sql
    loader = Controller(es_etl_config)
    assert loader.__class__.__name__ == "Controller"
    loader.prepare_for_etl()
    loader.dispatch_tasks()
    # Along with other things, this will refresh the index to surface the loaded docs
    set_final_index_config(client, elasticsearch_award_index.index_name)

    assert client.indices.exists(elasticsearch_award_index.index_name)
    es_award_docs = client.count(
        index=elasticsearch_award_index.index_name)["count"]
    assert es_award_docs == original_db_awards_count
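The patch-at-import-site comment above deserves a standalone illustration. `from defs import f` copies a binding into the importing module, so patching the defining module afterwards does not reach that copy. A minimal, self-contained demonstration with hypothetical module names:

import types

# Simulate two modules: `defs` defines f; `user` did `from defs import f`
defs = types.ModuleType("defs")
defs.f = lambda: "real"
user = types.ModuleType("user")
user.f = defs.f  # the binding `from defs import f` creates

defs.f = lambda: "patched"   # patching the defining module...
assert user.f() == "real"    # ...does not change user's copy
user.f = lambda: "patched"   # so patch the importing module instead
assert user.f() == "patched"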
Example #7
def test_get_id_range_for_partition_with_one_record():
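    """Checks that the bounds collapse to a single ID when the range holds exactly one record."""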
    min_id = 1
    max_id = 1
    id_range_item_count = max_id - min_id + 1  # this many individual IDs should be processed for a continuous ID range
    etl_config = {"partition_size": 10000}
    ctrl = Controller(etl_config)
    ctrl.min_id = min_id
    ctrl.max_id = max_id
    ctrl.record_count = id_range_item_count  # assume records exist for each ID in range
    ctrl.config["partitions"] = ctrl.determine_partitions()
    partition_range = range(0, ctrl.config["partitions"])
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[0])
    assert lower_bound == min_id
    assert upper_bound == max_id
    lower_bound, upper_bound = ctrl.get_id_range_for_partition(
        partition_range[-1])
    assert lower_bound == min_id
    assert upper_bound == max_id
    id_set = set(range(min_id, max_id + 1))
    assert _remove_seen_ids(ctrl, id_set) == set()
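Under the hypothetical get_id_range_for_partition_sketch from Example #1, this degenerate case is simply the clamp doing its job:

# One ID, one partition: ceil(1 / 10000) == 1, and the bounds collapse to (1, 1)
assert get_id_range_for_partition_sketch(1, 1, 10000, 0) == (1, 1)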
Example #8
def test_incremental_load_into_award_index(award_data_fixture,
                                           elasticsearch_award_index,
                                           monkeypatch):
    """Test the ``elasticsearch_loader`` django management command to incrementally load updated data into the awards ES
    index from the DB, overwriting the doc that was already there
    """
    original_db_awards_count = Award.objects.count()
    elasticsearch_award_index.update_index()
    client = elasticsearch_award_index.client  # type: Elasticsearch
    assert client.indices.exists(elasticsearch_award_index.index_name)
    es_award_docs = client.count(
        index=elasticsearch_award_index.index_name)["count"]
    assert es_award_docs == original_db_awards_count

    # Inject ETL arg into config for this run to suppress processing deletes and test the incremental load only
    elasticsearch_award_index.etl_config["process_deletes"] = False
    elasticsearch_award_index.etl_config["start_datetime"] = datetime.now(
        timezone.utc)
    es_etl_config = _process_es_etl_test_config(client,
                                                elasticsearch_award_index)

    # Now modify one of the DB objects
    awd = Award.objects.first()  # type: Award
    awd.total_obligation = 9999
    awd.save()

    # Must use the mock SQL function to share the test DB connection+transaction in the ETL code
    # Patch the module into which it is imported, not the module where it is defined
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement",
        mock_execute_sql)
    # Also override SQL function listed in config object with the mock one
    es_etl_config["execute_sql_func"] = mock_execute_sql
    ensure_view_exists(es_etl_config["sql_view"], force=True)
    loader = Controller(es_etl_config)
    assert loader.__class__.__name__ == "Controller"
    loader.prepare_for_etl()
    loader.dispatch_tasks()
    client.indices.refresh(elasticsearch_award_index.index_name)

    assert client.indices.exists(elasticsearch_award_index.index_name)
    es_award_docs = client.count(
        index=elasticsearch_award_index.index_name)["count"]
    assert es_award_docs == original_db_awards_count
    es_awards = client.search(index=elasticsearch_award_index.index_name)
    updated_award = next(a for a in es_awards["hits"]["hits"]
                         if a["_source"]["award_id"] == awd.id)
    assert int(updated_award["_source"]["total_obligation"]) == 9999
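The overwrite asserted here relies on Elasticsearch replacing a document that is indexed again under the same _id, which keeps the count stable while updating the source. A minimal standalone illustration (the index name and client endpoint are hypothetical; assumes the 7.x elasticsearch-py client):

from elasticsearch import Elasticsearch

client = Elasticsearch()  # hypothetical local cluster
# Indexing twice under the same _id leaves one document with the newer body
client.index(index="demo-awards", id=1, body={"total_obligation": 100})
client.index(index="demo-awards", id=1, body={"total_obligation": 9999})
client.indices.refresh(index="demo-awards")
assert client.count(index="demo-awards")["count"] == 1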