Code example #1
    def handle(self, *args, **options):
        elasticsearch_client = instantiate_elasticsearch_client()
        config = process_cli_parameters(options, elasticsearch_client)

        start = perf_counter()
        printf({"msg": f"Starting script\n{'=' * 56}"})
        start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
        printf({"msg": start_msg.format(**config)})

        if config["load_type"] == "transactions":
            ensure_view_exists(settings.ES_TRANSACTIONS_ETL_VIEW_NAME)
        elif config["load_type"] == "awards":
            ensure_view_exists(settings.ES_AWARDS_ETL_VIEW_NAME)

        loader = Rapidloader(config, elasticsearch_client)
        loader.run_load_steps()
        loader.complete_process()

        printf({"msg": "---------------------------------------------------------------"})
        printf({"msg": f"Script completed in {perf_counter() - start:.2f}s"})
        printf({"msg": "---------------------------------------------------------------"})
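
The command above depends on a `printf` helper that takes a dict with a "msg" key, plus the `config` mapping returned by `process_cli_parameters`. A minimal stand-in for both, with purely illustrative values, so the snippet can be exercised in isolation:

def printf(items: dict) -> None:
    # Bare-bones stand-in; the project's helper presumably adds timestamps/job metadata.
    print(items["msg"])

# Illustrative config covering only the keys handle() reads above.
config = {
    "index_name": "award-load-2020",
    "fiscal_years": [2019, 2020],
    "starting_date": "2020-01-01",
    "load_type": "awards",
}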
Code example #2
def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index, baby_sleeps):
    elasticsearch_client = instantiate_elasticsearch_client()
    loader = Rapidloader(config, elasticsearch_client)
    assert loader.__class__.__name__ == "Rapidloader"
    loader.run_load_steps()
    assert elasticsearch_client.indices.exists(config["index_name"])
    elasticsearch_client.indices.delete(index=config["index_name"], ignore_unavailable=False)
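
The test references a module-level `config`; a hypothetical minimal version showing the one key the assertions and cleanup touch (the real test module would define a fuller mapping for `Rapidloader`):

# Hypothetical; real values come from the test module's own fixtures.
config = {"index_name": "test-award-index"}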
Code example #3
    def handle(self, *args, **options):
        self.elasticsearch_client = instantiate_elasticsearch_client()
        self.config = process_cli_parameters(options, self.elasticsearch_client)

        start = perf_counter()
        printf({"msg": "Starting script\n{}".format("=" * 56)})
        start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
        printf({"msg": start_msg.format(**self.config)})
        ensure_transaction_etl_view_exists()

        self.run_load_steps()
        self.complete_process()

        printf({"msg": "---------------------------------------------------------------"})
        printf({"msg": "Script completed in {} seconds".format(perf_counter() - start)})
        printf({"msg": "---------------------------------------------------------------"})
Code example #4
def extract_transform_load(task: TaskSpec) -> None:
    if abort.is_set():
        logger.warning(format_log(f"Skipping partition #{task.partition_number} due to previous error", name=task.name))
        return

    start = perf_counter()
    msg = f"Started processing on partition #{task.partition_number}: {task.name}"
    logger.info(format_log(msg, name=task.name))

    client = instantiate_elasticsearch_client()
    try:
        records = task.transform_func(task, extract_records(task))
        if abort.is_set():
            f"Prematurely ending partition #{task.partition_number} due to error in another process"
            logger.warning(format_log(msg, name=task.name))
            return
        success, fail = load_data(task, records, client)
        with total_doc_success.get_lock():
            total_doc_success.value += success
        with total_doc_fail.get_lock():
            total_doc_fail.value += fail
    except Exception:
        if abort.is_set():
            msg = f"Partition #{task.partition_number} failed after an error was previously encountered"
            logger.warning(format_log(msg, name=task.name))
        else:
            logger.error(format_log(f"{task.name} failed!", name=task.name))
            abort.set()
    else:
        msg = f"Partition #{task.partition_number} was successfully processed in {perf_counter() - start:.2f}s"
        logger.info(format_log(msg, name=task.name))
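
`extract_transform_load` relies on module-level multiprocessing primitives. A sketch of how they could be declared, assuming the standard-library types whose methods (`is_set`, `set`, `get_lock`, `value`) match the calls above:

from multiprocessing import Event, Value

abort = Event()                    # set() by any worker on failure; others check is_set()
total_doc_success = Value("i", 0)  # shared int counters; get_lock() guards the increments
total_doc_fail = Value("i", 0)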
Code example #5
    def prepare_for_etl(self) -> None:
        if self.config["process_deletes"]:
            self.run_deletes()
        logger.info(format_log("Assessing data to process"))
        self.record_count, self.min_id, self.max_id = count_of_records_to_process(self.config)

        if self.record_count == 0:
            self.processes = []
            return

        self.config["partitions"] = self.determine_partitions()
        self.config["processes"] = min(self.config["processes"],
                                       self.config["partitions"])
        self.tasks = self.construct_tasks()

        logger.info(
            format_log(
                f"Created {len(self.tasks):,} task partitions"
                f" to process {self.record_count:,} total {self.config['data_type']} records"
                f" from ID {self.min_id} to {self.max_id}"
                f" with {self.config['processes']:,} parallel processes"))

        if self.config["create_new_index"]:
            # ensure template for index is present and the latest version
            call_command("es_configure", "--template-only", f"--load-type={self.config['data_type']}")
            create_index(self.config["index_name"], instantiate_elasticsearch_client())
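
`determine_partitions` is not shown. If, as the surrounding code suggests, partitions are fixed-size slices of the [min_id, max_id] span, it reduces to a ceiling division (a hypothetical sketch, including the `partition_size` config key it assumes):

import math

def determine_partitions(self) -> int:
    # Hypothetical: one partition per `partition_size` IDs across the full ID span.
    id_span = self.max_id - self.min_id + 1
    return math.ceil(id_span / self.config["partition_size"])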
Code example #6
def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index, baby_sleeps, monkeypatch):
    monkeypatch.setattr("usaspending_api.etl.es_etl_helpers.execute_sql_statement", mock_execute_sql)
    elasticsearch_client = instantiate_elasticsearch_client()
    loader = Rapidloader(config, elasticsearch_client)
    assert loader.__class__.__name__ == "Rapidloader"
    loader.run_load_steps()
    assert elasticsearch_client.indices.exists(config["index_name"])
    elasticsearch_client.indices.delete(index=config["index_name"], ignore_unavailable=False)
Code example #7
    def run_deletes(self) -> None:
        logger.info(format_log("Processing deletions"))
        client = instantiate_elasticsearch_client()
        if self.config["data_type"] == "award":
            deleted_awards(client, self.config)
        elif self.config["data_type"] == "transaction":
            deleted_transactions(client, self.config)
        else:
            raise RuntimeError(f"No delete function implemented for type {self.config['data_type']}")
Code example #8
def test_es_transaction_loader_class(award_data_fixture, elasticsearch_transaction_index, monkeypatch):
    monkeypatch.setattr(
        "usaspending_api.etl.elasticsearch_loader_helpers.utilities.execute_sql_statement", mock_execute_sql
    )
    elasticsearch_client = instantiate_elasticsearch_client()
    loader = Controller(transaction_config, elasticsearch_client)
    assert loader.__class__.__name__ == "Controller"
    loader.run_load_steps()
    assert elasticsearch_client.indices.exists(transaction_config["index_name"])
    elasticsearch_client.indices.delete(index=transaction_config["index_name"], ignore_unavailable=False)
Code example #9
    def complete_process(self) -> None:
        client = instantiate_elasticsearch_client()
        if self.config["create_new_index"]:
            set_final_index_config(client, self.config["index_name"])
            if self.config["skip_delete_index"]:
                logger.info(format_log("Skipping deletion of old indices"))
            else:
                logger.info(format_log("Closing old indices and adding aliases"))
                swap_aliases(client, self.config)

        if self.config["is_incremental_load"]:
            toggle_refresh_on(client, self.config["index_name"])
            logger.info(
                format_log(f"Storing datetime {self.config['processing_start_datetime']} for next incremental load")
            )
        update_last_load_date(self.config["stored_date_key"], self.config["processing_start_datetime"])
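
`toggle_refresh_on` is not shown either; restoring refresh after a bulk load is typically a one-line settings update. A sketch against the elasticsearch-py client, assuming refresh was earlier disabled with "-1":

def toggle_refresh_on(client, index_name: str) -> None:
    # Re-enable near-real-time refresh once bulk indexing is done.
    client.indices.put_settings(index=index_name, body={"refresh_interval": "1s"})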
Code example #10
    def handle(self, *args, **options):
        elasticsearch_client = instantiate_elasticsearch_client()
        config = parse_cli_args(options, elasticsearch_client)

        start = perf_counter()
        logger.info(format_log(f"Starting script\n{'=' * 56}"))
        start_msg = "target index: {index_name} | Starting from: {starting_date}"
        logger.info(format_log(start_msg.format(**config)))

        ensure_view_exists(config["sql_view"], force=True)
        error_addition = ""
        loader = Controller(config)

        if config["is_incremental_load"]:
            toggle_refresh_off(elasticsearch_client, config["index_name"])  # Turned back on at end.

        try:
            if config["process_deletes"]:
                loader.run_deletes()

            if not config["deletes_only"]:
                loader.prepare_for_etl()
                loader.dispatch_tasks()
        except Exception as e:
            logger.error(f"{str(e)}")
            error_addition = "before encountering a problem during execution.... "
            raise SystemExit(1)
        else:
            loader.complete_process()
            if config["drop_db_view"]:
                logger.info(format_log(f"Dropping SQL view '{config['sql_view']}'"))
                drop_etl_view(config["sql_view"], True)
        finally:
            msg = f"Script duration was {perf_counter() - start:.2f}s {error_addition}|"
            headers = f"{'-' * (len(msg) - 2)} |"
            logger.info(format_log(headers))
            logger.info(format_log(msg))
            logger.info(format_log(headers))

        # Used to help pipeline determine when job passed but needs attention
        if config["raise_status_code_3"]:
            raise SystemExit(3)
Code example #11
    def handle(self, *args, **options):

        # Initialize client to connect to Elasticsearch
        es_client = instantiate_elasticsearch_client()

        # Open connection to database
        with connection.cursor() as cursor:

            # Queries for Covid Awards not present in latest File C Submission
            cursor.execute(MISSING_COVID_AWARD_SQL)

            logger.info("Found {} Covid awards without entry in latest File C Submission".format(cursor.rowcount))

            rows = cursor.fetchmany(self.FETCH_COUNT)
            while len(rows) > 0:
                award_ids = [row[0] for row in rows]

                # Sets the outlays of these awards to zero in Elasticsearch
                self.set_elasticsearch_covid_outlays_to_zero(es_client, award_ids)
                rows = cursor.fetchmany(self.FETCH_COUNT)
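
`set_elasticsearch_covid_outlays_to_zero` is not included in the snippet. One plausible shape for it (illustrative only; the field and index names are assumptions) zeroes the outlay field on matching documents with a painless script via `update_by_query`:

def set_elasticsearch_covid_outlays_to_zero(self, es_client, award_ids):
    # Assumed field/index names; zero the outlay on every matched award document.
    body = {
        "script": {"source": "ctx._source['total_covid_outlay'] = 0", "lang": "painless"},
        "query": {"terms": {"award_id": award_ids}},
    }
    es_client.update_by_query(index="award-index", body=body)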