    def nightly_loader(self, start_date):

        logger.info("==== Starting FPDS nightly data load ====")

        if start_date:
            date = start_date
            date = datetime.strptime(date, "%Y-%m-%d").date()
        else:
            default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
            date = get_last_load_date("fpds", default=default_last_load_date).date()
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Processing data for FPDS starting from %s" % date)

        with timer("retrieval of new/modified FPDS data ID list", logger.info):
            ids_to_insert = self.get_fpds_transaction_ids(date=date)

        with timer("retrieval of deleted FPDS IDs", logger.info):
            ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

        self.perform_load(
            ids_to_delete,
            ids_to_insert,
        )

        # Update the date for the last time the data load was run
        update_last_load_date("fpds", processing_start_datetime)

        logger.info("FPDS NIGHTLY UPDATE COMPLETE")

    def obtain_last_date(self):
        dt = get_last_load_date(self.last_load_record, self.lookback_minutes)
        if not dt:
            raise SystemExit(
                "No datetime stored in the database, unable to use --since-last-load"
            )
        return dt
Example #3
def get_last_load_date():
    """
    Wraps the get_last_load_date helper, which is responsible for grabbing the
    last load date from the database.

    Without getting into too much detail, SUBMISSION_LOOKBACK_MINUTES is used
    to counter a very rare race condition where database commits are saved ever
    so slightly out of order with the updated_at timestamp.  It will be
    subtracted from the last_run_date to ensure submissions with similar
    updated_at times do not fall through the cracks.  An unfortunate side
    effect is that some submissions may be processed more than once which
    SHOULDN'T cause any problems but will add to the run time.  To minimize
    this, keep the value as small as possible while still preventing skips.  To
    be clear, the original fabs loader did this as well, just in a way that did
    not always prevent skips.
    """
    from usaspending_api.broker.helpers.last_load_date import get_last_load_date

    last_load_date = get_last_load_date("fabs", SUBMISSION_LOOKBACK_MINUTES)
    if last_load_date is None:
        external_data_type_id = lookups.EXTERNAL_DATA_TYPE_DICT["fabs"]
        raise RuntimeError(
            "Unable to find last_load_date in table {} for external_data_type_id={}. "
            "If this is expected and the goal is to reload all submissions, supply the "
            "--reload-all switch on the command line.".format(
                ExternalDataLoadDate.objects.model._meta.db_table,
                external_data_type_id))
    return last_load_date
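
To make the lookback concrete, here is a minimal sketch of the subtraction described above; the helper, the stored timestamp, and the 15-minute value are hypothetical stand-ins for what the real get_last_load_date reads from the database and settings.

from datetime import datetime, timedelta, timezone

SUBMISSION_LOOKBACK_MINUTES = 15  # hypothetical value; keep as small as possible

def lookback_adjusted(stored_last_run, lookback_minutes):
    # Subtract the lookback so commits whose updated_at landed slightly before
    # the previous run's cutoff are still picked up on the next run.
    return stored_last_run - timedelta(minutes=lookback_minutes)

stored_last_run = datetime(2021, 6, 1, 12, 0, tzinfo=timezone.utc)  # example only
print(lookback_adjusted(stored_last_run, SUBMISSION_LOOKBACK_MINUTES))  # 2021-06-01 11:45:00+00:00
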
Example #5
def parse_cli_args(options: dict, es_client) -> dict:
    passthrough_values = (
        "create_new_index",
        "drop_db_view",
        "index_name",
        "load_type",
        "partition_size",
        "process_deletes",
        "processes",
        "skip_counts",
        "skip_delete_index",
    )
    config = set_config(passthrough_values, options)

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit(
            "Fatal error: '--create-new-index' requires '--index-name'.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = config["initial_datetime"]
        check_new_index_name_is_ok(config["index_name"],
                                   config["required_index_name"])
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        #  `starting_date` needs to be present and a date before:
        #      - The earliest records in S3.
        #      - When all transaction records in the USAspending SQL database were updated.
        #   And keep it timezone-aware for S3
        config["starting_date"] = get_last_load_date(
            config["stored_date_key"], default=config["initial_datetime"])

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != config["initial_datetime"])

    if config["is_incremental_load"]:
        if config["index_name"]:
            logger.info(
                format_log(
                    f"Ignoring provided index name, using alias '{config['write_alias']}' for safety"
                ))
        config["index_name"] = config["write_alias"]
        if not es_client.cat.aliases(name=config["write_alias"]):
            logger.error(f"Write alias '{config['write_alias']}' is missing")
            raise SystemExit(1)
    else:
        if es_client.indices.exists(config["index_name"]):
            logger.error(
                f"Data load into existing index. Change index name or run an incremental load"
            )
            raise SystemExit(1)

    if config["starting_date"] < config["initial_datetime"]:
        logger.error(
            f"--start-datetime is too early. Set no earlier than {config['initial_datetime']}"
        )
        raise SystemExit(1)

    return config
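
The cascade above reduces to a simple rule: a run is incremental only when it is not building a new index and the resolved starting date differs from the initial datetime. A self-contained sketch of that rule with hypothetical dates:

from datetime import datetime, timezone

def is_incremental(create_new_index, starting_date, initial_datetime):
    # Mirrors the check above: new-index builds and full reloads are never incremental.
    return not create_new_index and starting_date != initial_datetime

initial = datetime(2007, 10, 1, tzinfo=timezone.utc)    # hypothetical earliest-data datetime
last_load = datetime(2021, 6, 1, tzinfo=timezone.utc)   # hypothetical stored last load date

print(is_incremental(False, last_load, initial))  # True: resume from the stored date
print(is_incremental(True, initial, initial))     # False: brand-new index, full load
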
Example #6
    def handle(self, *args, **options):
        script_start_time = datetime.now(timezone.utc)
        periods = retrieve_recent_periods()

        # Using `script_start_time` as a default, so no awards will be touched the first time this script
        # is run. The assumption is that awards are up to date at the time the script is deployed. After
        # this runs the first time, a date will be populated in the database.
        self.last_load_date = get_last_load_date("touch_last_period_awards",
                                                 default=script_start_time)

        logger.info(
            f"Using {self.last_load_date} to determine if awards should be touched."
        )

        total_records_updated = 0

        total_records_updated += self.touch_period_awards_if_behind(
            periods["this_month"])
        total_records_updated += self.touch_period_awards_if_behind(
            periods["this_quarter"])

        update_last_load_date("touch_last_period_awards", script_start_time)

        logger.info(
            f"Found {total_records_updated:,} award records to update in Elasticsearch"
        )

        # Return will be captured as stdout in Jenkins job
        return str(total_records_updated)
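
The comment above relies on get_last_load_date falling back to the supplied default when nothing has been stored yet. A toy, in-memory version of that fallback (the dict stands in for the database table) illustrates why the first run touches nothing:

from datetime import datetime, timezone

stored_dates = {}  # stand-in for the external data load date table

def get_last_load_date_stub(key, default=None):
    # Return the stored value if present, otherwise the caller-supplied default.
    return stored_dates.get(key, default)

script_start_time = datetime.now(timezone.utc)
last_load = get_last_load_date_stub("touch_last_period_awards", default=script_start_time)
# On the very first run last_load == script_start_time, so no period can be "behind" it.
print(last_load == script_start_time)  # True until a date is persisted
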
Example #7
    def handle(self, *args, **options):
        logger.info("==== Starting FPDS nightly data load ====")

        if options.get("date"):
            date = options.get("date")[0]
            date = datetime.strptime(date, "%Y-%m-%d").date()
        else:
            default_last_load_date = datetime.now(
                timezone.utc) - timedelta(days=1)
            date = get_last_load_date("fpds",
                                      default=default_last_load_date).date()
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Processing data for FPDS starting from %s" % date)

        with timer("retrieval of deleted FPDS IDs", logger.info):
            ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

        if len(ids_to_delete) > 0:
            with timer("deletion of all stale FPDS data", logger.info):
                self.delete_stale_fpds(ids_to_delete=ids_to_delete)
        else:
            logger.info("No FPDS records to delete at this juncture")

        with timer("retrieval of new/modified FPDS data ID list", logger.info):
            total_insert = self.get_fpds_transaction_ids(date=date)

        if len(total_insert) > 0:
            # Add FPDS records
            with timer("insertion of new FPDS data in batches", logger.info):
                self.insert_all_new_fpds(total_insert)

            # Update Awards based on changed FPDS records
            with timer(
                    "updating awards to reflect their latest associated transaction info",
                    logger.info):
                update_awards(tuple(AWARD_UPDATE_ID_LIST))

            # Update FPDS-specific Awards based on the info in child transactions
            with timer(
                    "updating contract-specific awards to reflect their latest transaction info",
                    logger.info):
                update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))

            # Update AwardCategories based on changed FPDS records
            with timer("updating award category variables", logger.info):
                update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

            # Check the linkages from file C to FPDS records and update any that are missing
            with timer("updating C->D linkages", logger.info):
                update_c_to_d_linkages("contract")
        else:
            logger.info("No FPDS records to insert or modify at this juncture")

        # Update the date for the last time the data load was run
        update_last_load_date("fpds", processing_start_datetime)

        logger.info("FPDS NIGHTLY UPDATE COMPLETE")
    def handle(self, *args, **options):
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Starting FABS data load script...")

        do_not_log_deletions = options["do_not_log_deletions"]

        # "Reload all" supersedes all other processing options.
        reload_all = options["reload_all"]
        if reload_all:
            submission_ids = None
            afa_ids = None
            start_datetime = None
            end_datetime = None
        else:
            submission_ids = tuple(options["submission_ids"]
                                   ) if options["submission_ids"] else None
            afa_ids = read_afa_ids_from_file(
                options['afa_id_file']) if options['afa_id_file'] else None
            start_datetime = options["start_datetime"]
            end_datetime = options["end_datetime"]

        # If no other processing options were provided, then this is an incremental load.
        is_incremental_load = not any((reload_all, submission_ids, afa_ids,
                                       start_datetime, end_datetime))

        if is_incremental_load:
            last_load_date = get_last_load_date()
            submission_ids = get_new_submission_ids(last_load_date)
            logger.info("Processing data for FABS starting from %s" %
                        last_load_date)

        if is_incremental_load and not submission_ids:
            logger.info("No new submissions. Exiting.")

        else:
            with timer("obtaining delete records", logger.info):
                ids_to_delete = get_fabs_records_to_delete(
                    submission_ids, afa_ids, start_datetime, end_datetime)

            with timer("retrieving/diff-ing FABS Data", logger.info):
                ids_to_upsert = get_fabs_transaction_ids(
                    submission_ids, afa_ids, start_datetime, end_datetime)

            update_award_ids = delete_fabs_transactions(
                ids_to_delete, do_not_log_deletions)
            upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

        logger.info("FABS UPDATE FINISHED!")
    def transform_cli_arguments(self, options):
        simple_args = ("provide_deleted", "reload_all", "snapshot",
                       "index_name", "directory", "fast")
        self.config = set_config(simple_args, options)

        self.config["fiscal_years"] = fiscal_years_for_processing(options)
        self.config["directory"] = self.config["directory"] + os.sep
        self.config["index_name"] = self.config["index_name"].lower()

        if self.config["reload_all"]:
            self.config["starting_date"] = DEFAULT_DATETIME
        elif options["start_datetime"]:
            self.config["starting_date"] = options["start_datetime"]
        else:
            # Due to the queries used for fetching postgres data,
            #  `starting_date` needs to be present and a date before:
            #      - The earliest records in S3.
            #      - When all transaction records in the USAspending SQL database were updated.
            #   And keep it timezone-aware for S3
            self.config["starting_date"] = get_last_load_date(
                "es_transactions", default=DEFAULT_DATETIME)

        self.config["mapping"], self.config["doc_type"], self.config[
            "max_query_size"] = mapping_data_for_processing()

        does_index_exist = ES.indices.exists(self.config["index_name"])
        self.config["is_incremental_load"] = self.config[
            "starting_date"] != DEFAULT_DATETIME

        if not os.path.isdir(self.config["directory"]):
            printf({"msg": "Provided directory does not exist"})
            raise SystemExit(1)
        elif self.config["starting_date"] < DEFAULT_DATETIME:
            printf({
                "msg":
                "`start-datetime` is too early. Set to after {}".format(
                    DEFAULT_DATETIME)
            })
            raise SystemExit(1)
        elif does_index_exist and not self.config["is_incremental_load"]:
            printf({
                "msg":
                "Full data load into existing index! Change destination index or load a subset of data"
            })
            raise SystemExit(1)
        elif not does_index_exist or self.config["reload_all"]:
            printf({
                "msg":
                "Skipping deletions for ths load, provide_deleted overwritten to False"
            })
            self.config["provide_deleted"] = False
Example #11
def get_incremental_load_start_datetime():
    """
    This function is designed to help prevent two issues we've discovered with the FABS nightly
    pipeline:

     #1 LAST_LOAD_LOOKBACK_MINUTES are subtracted from last load datetime to counter a very rare
        race condition where database commits are saved ever so slightly out of order when compared
        to the last updated timestamp due to commit duration which can cause transactions to be
        skipped.  The timestamp is set to when the commit begins, so if the commit starts before
        the load but doesn't finish until after the load, the transactions saved as part of that
        commit will never get picked up.  This has happened in the wild at least once and is not
        all that hard to imagine if you consider that large submissions can take many minutes to
        commit.  We do not currently have a good way to test this, but I've seen extremely large
        transactions take an hour to commit.  I do not believe submissions currently get this large
        but it's something to keep in mind.

     #2 We use the minimum of the last load date or the max transaction_fabs updated_at date
        to prevent FABS transactions submitted between when the source records are copied from
        Broker and when FABS transactions are processed from being skipped.

    An unfortunate side effect of the look back is that some submissions may be processed more than
    once.  This SHOULDN'T cause any problems since the FABS loader is designed to be able to reload
    transactions, but it could add to the run time.  To minimize reprocessing, keep the
    LAST_LOAD_LOOKBACK_MINUTES value as small as possible while still preventing skips.  To be
    clear, the original fabs loader did this as well, just in a way that did not always prevent
    skips (by always running since midnight - which had its own issues).
    """
    last_load_date = get_last_load_date("fabs", LAST_LOAD_LOOKBACK_MINUTES)
    if last_load_date is None:
        raise RuntimeError(
            f"Unable to find last_load_date in table {ExternalDataLoadDate.objects.model._meta.db_table} "
            f"for external_data_type_id={lookups.EXTERNAL_DATA_TYPE_DICT['fabs']}.  If this is expected and "
            f"the goal is to reload all submissions, supply the --reload-all switch on the command line."
        )
    max_updated_at = TransactionFABS.objects.aggregate(
        Max("updated_at"))["updated_at__max"]
    if max_updated_at is None:
        return last_load_date
    else:
        logger.info(
            f"Most recent update_date in `transaction_fabs` {max_updated_at}")

    # We add a little tiny bit of time to the max_updated_at to prevent us from always reprocessing
    # records since the SQL that grabs new records is using updated_at >=.  I realize this is a hack
    # but the pipeline is already running for too long so anything we can do to prevent elongating
    # it should be welcome.
    max_updated_at += timedelta(milliseconds=UPDATED_AT_MODIFIER_MS)

    return min((last_load_date, max_updated_at))
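
With hypothetical timestamps, the combined effect of the two safeguards in the docstring looks like this; the millisecond modifier keeps the updated_at >= query from endlessly reprocessing the newest row:

from datetime import datetime, timedelta, timezone

LAST_LOAD_LOOKBACK_MINUTES = 15  # hypothetical
UPDATED_AT_MODIFIER_MS = 1       # hypothetical

last_load_date = datetime(2021, 6, 1, 3, 0, tzinfo=timezone.utc) - timedelta(minutes=LAST_LOAD_LOOKBACK_MINUTES)
max_updated_at = datetime(2021, 6, 1, 2, 30, tzinfo=timezone.utc) + timedelta(milliseconds=UPDATED_AT_MODIFIER_MS)

# The earlier of the two wins, so rows copied from Broker but not yet processed
# when the previous load ran are not skipped.
print(min(last_load_date, max_updated_at))  # 2021-06-01 02:30:00.001000+00:00
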
Example #12
    def handle(self, *args, **options):
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Starting FABS data load script...")

        # "Reload all" supersedes all other processing options.
        reload_all = options["reload_all"]
        if reload_all:
            afa_ids = None
            start_datetime = None
            end_datetime = None
        else:
            afa_ids = read_afa_ids_from_file(
                options["afa_id_file"]) if options["afa_id_file"] else None
            start_datetime = options["start_datetime"]
            end_datetime = options["end_datetime"]

        # If no other processing options were provided, then this is an incremental load.
        is_incremental_load = not any(
            (reload_all, afa_ids, start_datetime, end_datetime))

        if is_incremental_load:
            start_datetime = get_last_load_date()
            logger.info("Processing data for FABS starting from %s" %
                        start_datetime)

        with timer("obtaining delete records", logger.info):
            delete_records = retrieve_deleted_fabs_transactions(
                start_datetime, end_datetime)
            ids_to_delete = [
                item for sublist in delete_records.values() for item in sublist
                if item
            ]

        with timer("retrieving/diff-ing FABS Data", logger.info):
            ids_to_upsert = get_fabs_transaction_ids(afa_ids, start_datetime,
                                                     end_datetime)

        update_award_ids = delete_fabs_transactions(ids_to_delete)
        upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

        logger.info("FABS UPDATE FINISHED!")
Example #14
    def handle(self, *args, **options):

        # Record script execution start time to update the FPDS last updated date in DB as appropriate
        update_time = datetime.now(timezone.utc)

        if options["reload_all"]:
            self.load_fpds_incrementally(None)

        elif options["date"]:
            self.load_fpds_incrementally(options["date"])

        elif options["ids"]:
            self.modified_award_ids.extend(
                load_fpds_transactions(options["ids"]))

        elif options["file"]:
            self.load_fpds_from_file(options["file"])

        elif options["since_last_load"]:
            last_load = get_last_load_date("fpds")
            if not last_load:
                raise ValueError(
                    "No last load date for FPDS stored in the database")
            self.load_fpds_incrementally(last_load)

        self.update_award_records(awards=self.modified_award_ids,
                                  skip_cd_linkage=False)

        logger.info(f"Script took {datetime.now(timezone.utc) - update_time}")

        if failed_ids:
            failed_id_str = ", ".join([str(id) for id in failed_ids])
            logger.error(
                f"The following detached_award_procurement_ids failed to load: [{failed_id_str}]"
            )
            raise SystemExit(1)

        if options["reload_all"] or options["since_last_load"]:
            # we wait until after the load finishes to update the load date because if this crashes we'll need to load again
            update_last_load_date("fpds", update_time)

        logger.info(f"Successfully Completed")
Example #15
def process_cli_parameters(options: dict, es_client) -> dict:
    default_datetime = datetime.strptime(
        f"{settings.API_SEARCH_MIN_DATE}+0000", "%Y-%m-%d%z")
    simple_args = (
        "skip_delete_index",
        "process_deletes",
        "create_new_index",
        "snapshot",
        "index_name",
        "directory",
        "skip_counts",
        "load_type",
    )
    config = set_config(simple_args, options)

    config["fiscal_years"] = fiscal_years_for_processing(options)
    config["directory"] = Path(config["directory"]).resolve()

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit(
            "Fatal error: --create-new-index requires --index-name.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = default_datetime
        check_new_index_name_is_ok(
            config["index_name"],
            settings.ES_AWARDS_NAME_SUFFIX if config["load_type"] == "awards"
            else settings.ES_TRANSACTIONS_NAME_SUFFIX,
        )
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        #  `starting_date` needs to be present and a date before:
        #      - The earliest records in S3.
        #      - When all transaction records in the USAspending SQL database were updated.
        #   And keep it timezone-aware for S3
        config["starting_date"] = get_last_load_date(
            f"es_{options['load_type']}", default=default_datetime)

    config["max_query_size"] = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW
    if options["load_type"] == "awards":
        config["max_query_size"] = settings.ES_AWARDS_MAX_RESULT_WINDOW

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != default_datetime)

    if config["is_incremental_load"]:
        write_alias = settings.ES_TRANSACTIONS_WRITE_ALIAS
        if config["load_type"] == "awards":
            write_alias = settings.ES_AWARDS_WRITE_ALIAS
        if config["index_name"]:
            printf({
                "msg":
                f"Ignoring provided index name, using alias '{write_alias}' for incremental load"
            })
        config["index_name"] = write_alias
        if not es_client.cat.aliases(name=write_alias):
            printf({
                "msg":
                f"Fatal error: write alias '{write_alias}' is missing"
            })
            raise SystemExit(1)
        # Force manual refresh for atomic transaction-like delete/re-add consistency during incremental load.
        # Turned back on at end.
        toggle_refresh_off(es_client, config["index_name"])
    else:
        if es_client.indices.exists(config["index_name"]):
            printf({
                "msg":
                "Fatal error: data load into existing index. Change index name or run an incremental load"
            })
            raise SystemExit(1)

    if not config["directory"].is_dir():
        printf({"msg": "Fatal error: provided directory does not exist"})
        raise SystemExit(1)
    elif config["starting_date"] < default_datetime:
        printf({
            "msg":
            f"Fatal error: --start-datetime is too early. Set no earlier than {default_datetime}"
        })
        raise SystemExit(1)
    elif not config["is_incremental_load"] and config["process_deletes"]:
        printf({
            "msg":
            "Skipping deletions for ths load, --deleted overwritten to False"
        })
        config["process_deletes"] = False

    config["ingest_wait"] = options["idle_wait_time"]

    return config
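
toggle_refresh_off above is another project helper; a plausible sketch using elasticsearch-py's index settings API is shown below, assuming es_client is an elasticsearch.Elasticsearch instance (the function bodies here are assumptions, not the project's code):

def toggle_refresh_off(es_client, index_name):
    # Disable automatic refresh so the incremental load's deletes and re-adds
    # become visible together once refresh is switched back on at the end.
    es_client.indices.put_settings(index=index_name, body={"index": {"refresh_interval": "-1"}})

def toggle_refresh_on(es_client, index_name):
    # Restore the default 1s refresh interval after the load completes.
    es_client.indices.put_settings(index=index_name, body={"index": {"refresh_interval": "1s"}})
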
def process_cli_parameters(options: dict, es_client) -> dict:
    default_datetime = datetime.strptime(
        "{}+0000".format(settings.API_SEARCH_MIN_DATE), "%Y-%m-%d%z")
    simple_args = (
        "process_deletes",
        "create_new_index",
        "snapshot",
        "index_name",
        "directory",
        "skip_counts",
        "skip_delete_index",
    )
    config = set_config(simple_args, options)

    config["fiscal_years"] = fiscal_years_for_processing(options)
    config["directory"] = Path(config["directory"]).resolve()

    if config["create_new_index"] and not config["index_name"]:
        raise SystemExit(
            "Fatal error: --create-new-index requires --index-name.")
    elif config["create_new_index"]:
        config["index_name"] = config["index_name"].lower()
        config["starting_date"] = default_datetime
        check_new_index_name_is_ok(config["index_name"])
    elif options["start_datetime"]:
        config["starting_date"] = options["start_datetime"]
    else:
        # Due to the queries used for fetching postgres data,
        #  `starting_date` needs to be present and a date before:
        #      - The earliest records in S3.
        #      - When all transaction records in the USAspending SQL database were updated.
        #   And keep it timezone-aware for S3
        config["starting_date"] = get_last_load_date("es_transactions",
                                                     default=default_datetime)

    config["max_query_size"] = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW

    config["is_incremental_load"] = not bool(config["create_new_index"]) and (
        config["starting_date"] != default_datetime)

    if config["is_incremental_load"]:
        if config["index_name"]:
            msg = "Ignoring provided index name, using alias '{}' for incremental load"
            printf({"msg": msg.format(settings.ES_TRANSACTIONS_WRITE_ALIAS)})
        config["index_name"] = settings.ES_TRANSACTIONS_WRITE_ALIAS
        if not es_client.cat.aliases(
                name=settings.ES_TRANSACTIONS_WRITE_ALIAS):
            printf({
                "msg":
                "Fatal error: write alias '{}' is missing".format(
                    settings.ES_TRANSACTIONS_WRITE_ALIAS)
            })
            raise SystemExit(1)
    else:
        if es_client.indices.exists(config["index_name"]):
            printf({
                "msg":
                "Fatal error: data load into existing index. Change index name or run an incremental load"
            })
            raise SystemExit(1)

    if not config["directory"].is_dir():
        printf({"msg": "Fatal error: provided directory does not exist"})
        raise SystemExit(1)
    elif config["starting_date"] < default_datetime:
        printf({
            "msg":
            "Fatal error: --start-datetime is too early. Set no earlier than {}"
            .format(default_datetime)
        })
        raise SystemExit(1)
    elif not config["is_incremental_load"] and config["process_deletes"]:
        printf({
            "msg":
            "Skipping deletions for ths load, --deleted overwritten to False"
        })
        config["process_deletes"] = False

    return config