def handle(self, *args, **options):
        """ Script execution of custom code starts in this method"""
        start = perf_counter()
        printf({"msg": "Starting script\n{}".format("=" * 56)})

        self.transform_cli_arguments(options)

        start_msg = "target index: {index_name} | FY(s): {fiscal_years} | Starting from: {starting_date}"
        printf({"msg": start_msg.format(**self.config)})

        self.controller()

        if self.config["is_incremental_load"]:
            printf({
                "msg":
                "Updating Last Load record with {}".format(
                    self.config["processing_start_datetime"])
            })
            update_last_load_date("es_transactions",
                                  self.config["processing_start_datetime"])
        printf({
            "msg":
            "---------------------------------------------------------------"
        })
        printf({
            "msg":
            "Script completed in {} seconds".format(perf_counter() - start)
        })
        printf({
            "msg":
            "---------------------------------------------------------------"
        })
Example 2
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            set_final_index_config(self.elasticsearch_client,
                                   self.config["index_name"])
            if self.config["skip_delete_index"]:
                logger.info(format_log("Skipping deletion of old indices"))
            else:
                logger.info(
                    format_log("Closing old indices and adding aliases"))
                swap_aliases(self.elasticsearch_client,
                             self.config["index_name"],
                             self.config["load_type"])

        if self.config["snapshot"]:
            logger.info(format_log("Taking snapshot"))
            take_snapshot(self.elasticsearch_client, self.config["index_name"],
                          settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            toggle_refresh_on(self.elasticsearch_client,
                              self.config["index_name"])
            logger.info(
                format_log(
                    f"Storing datetime {self.config['processing_start_datetime']} for next incremental load"
                ))
            update_last_load_date(f"es_{self.config['load_type']}",
                                  self.config["processing_start_datetime"])
    def nightly_loader(self, start_date):

        logger.info("==== Starting FPDS nightly data load ====")

        if start_date:
            date = start_date
            date = datetime.strptime(date, "%Y-%m-%d").date()
        else:
            default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
            date = get_last_load_date("fpds", default=default_last_load_date).date()
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Processing data for FPDS starting from %s" % date)

        with timer("retrieval of new/modified FPDS data ID list", logger.info):
            ids_to_insert = self.get_fpds_transaction_ids(date=date)

        with timer("retrieval of deleted FPDS IDs", logger.info):
            ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

        self.perform_load(
            ids_to_delete,
            ids_to_insert,
        )

        # Update the date for the last time the data load was run
        update_last_load_date("fpds", processing_start_datetime)

        logger.info("FPDS NIGHTLY UPDATE COMPLETE")
Example 4
    def handle(self, *args, **options):
        script_start_time = datetime.now(timezone.utc)
        periods = retrieve_recent_periods()

        # Using `script_start_time` as a default, so no awards will be touched the first time this script
        # is run. The assumption is that awards are up to date at the time the script is deployed. After
        # this runs the first time, a date will be populated in the database.
        self.last_load_date = get_last_load_date("touch_last_period_awards",
                                                 default=script_start_time)

        logger.info(
            f"Using {script_start_time} to determine if awards should be touched."
        )

        total_records_updated = 0

        total_records_updated += self.touch_period_awards_if_behind(
            periods["this_month"])
        total_records_updated += self.touch_period_awards_if_behind(
            periods["this_quarter"])

        update_last_load_date("touch_last_period_awards", script_start_time)

        logger.info(
            f"Found {total_records_updated:,} award records to update in Elasticsearch"
        )

        # Return will be captured as stdout in Jenkins job
        return str(total_records_updated)
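
As the comment above notes, passing script_start_time as the default means a first run, with no stored date, sees an up-to-date system and touches nothing. A small illustrative snippet of that fallback, assuming the get_last_load_date(key, default=...) signature used throughout these examples:

from datetime import datetime, timezone

now = datetime.now(timezone.utc)

# First run: nothing stored yet, so the default (the script's own start time)
# comes back and no periods appear to be behind.
last_load = get_last_load_date("touch_last_period_awards", default=now)

# Later runs: the datetime stored by update_last_load_date on the previous run
# comes back instead, so only periods revised since then are touched.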
Example 5
    def handle(self, *args, **options):
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Starting FABS data load script...")

        # "Reload all" supersedes all other processing options.
        reload_all = options["reload_all"]
        if reload_all:
            ids = None
            afa_ids = None
            start_datetime = None
            end_datetime = None
        else:
            ids = options["ids"]
            afa_ids = set(options["afa_ids"])
            if options["afa_id_file"]:
                afa_ids = tuple(
                    afa_ids | read_afa_ids_from_file(options["afa_id_file"]))
            start_datetime = options["start_datetime"]
            end_datetime = options["end_datetime"]

        # If no other processing options were provided, then this is an incremental load.
        is_incremental_load = not any(
            (reload_all, ids, afa_ids, start_datetime, end_datetime))

        if is_incremental_load:
            start_datetime = get_incremental_load_start_datetime()
            logger.info(
                f"Processing data for FABS starting from {start_datetime} (includes offset)"
            )

            # We only perform deletes with incremental loads.
            with timer("obtaining delete records", logger.info):
                delete_records = retrieve_deleted_fabs_transactions(
                    start_datetime, end_datetime)
                ids_to_delete = [
                    item for sublist in delete_records.values()
                    for item in sublist if item
                ]
                ids_to_delete = get_delete_pks_for_afa_keys(ids_to_delete)
            logger.info(f"{len(ids_to_delete):,} delete ids found in total")

        with timer("retrieving IDs of FABS to process", logger.info):
            ids_to_upsert = get_fabs_transaction_ids(ids, afa_ids,
                                                     start_datetime,
                                                     end_datetime)

        update_award_ids = delete_fabs_transactions(
            ids_to_delete) if is_incremental_load else []
        upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            logger.info(
                f"Storing {processing_start_datetime} for the next incremental run"
            )
            update_last_load_date("fabs", processing_start_datetime)

        logger.info("FABS UPDATE FINISHED!")
Example 6
    def handle(self, *args, **options):
        logger.info("==== Starting FPDS nightly data load ====")

        if options.get("date"):
            date = options.get("date")[0]
            date = datetime.strptime(date, "%Y-%m-%d").date()
        else:
            default_last_load_date = datetime.now(
                timezone.utc) - timedelta(days=1)
            date = get_last_load_date("fpds",
                                      default=default_last_load_date).date()
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Processing data for FPDS starting from %s" % date)

        with timer("retrieval of deleted FPDS IDs", logger.info):
            ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

        if len(ids_to_delete) > 0:
            with timer("deletion of all stale FPDS data", logger.info):
                self.delete_stale_fpds(ids_to_delete=ids_to_delete)
        else:
            logger.info("No FPDS records to delete at this juncture")

        with timer("retrieval of new/modified FPDS data ID list", logger.info):
            total_insert = self.get_fpds_transaction_ids(date=date)

        if len(total_insert) > 0:
            # Add FPDS records
            with timer("insertion of new FPDS data in batches", logger.info):
                self.insert_all_new_fpds(total_insert)

            # Update Awards based on changed FPDS records
            with timer(
                    "updating awards to reflect their latest associated transaction info",
                    logger.info):
                update_awards(tuple(AWARD_UPDATE_ID_LIST))

            # Update FPDS-specific Awards based on the info in child transactions
            with timer(
                    "updating contract-specific awards to reflect their latest transaction info",
                    logger.info):
                update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))

            # Update AwardCategories based on changed FPDS records
            with timer("updating award category variables", logger.info):
                update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

            # Check the linkages from file C to FPDS records and update any that are missing
            with timer("updating C->D linkages", logger.info):
                update_c_to_d_linkages("contract")
        else:
            logger.info("No FPDS records to insert or modify at this juncture")

        # Update the date for the last time the data load was run
        update_last_load_date("fpds", processing_start_datetime)

        logger.info("FPDS NIGHTLY UPDATE COMPLETE")
def load_executive_compensation(db_cursor, date, start_date):
    logger.info("Getting DUNS/Exec Comp data from broker based on the last pull date of %s..." % str(date))

    # Get first page
    db_cursor.execute(EXEC_COMP_QUERY, [date])
    exec_comp_query_dict = dictfetchall(db_cursor)

    total_rows = len(exec_comp_query_dict)
    logger.info('Updating Executive Compensation Data, {} rows coming from the Broker...'.format(total_rows))

    start_time = datetime.now(timezone.utc)

    for index, row in enumerate(exec_comp_query_dict, 1):

        if not (index % 100):
            logger.info('Loading row {} of {} ({})'.format(str(index),
                                                           str(total_rows),
                                                           datetime.now(timezone.utc) - start_time))

        leo_update_dict = {
            "officer_1_name": row['high_comp_officer1_full_na'],
            "officer_1_amount": row['high_comp_officer1_amount'],
            "officer_2_name": row['high_comp_officer2_full_na'],
            "officer_2_amount": row['high_comp_officer2_amount'],
            "officer_3_name": row['high_comp_officer3_full_na'],
            "officer_3_amount": row['high_comp_officer3_amount'],
            "officer_4_name": row['high_comp_officer4_full_na'],
            "officer_4_amount": row['high_comp_officer4_amount'],
            "officer_5_name": row['high_comp_officer5_full_na'],
            "officer_5_amount": row['high_comp_officer5_amount'],
        }

        any_data = False
        for attr, value in leo_update_dict.items():
            if value and value != "":
                any_data = True
                break

        if not any_data:
            continue

        duns_number = row['awardee_or_recipient_uniqu']

        # Deal with multiples that we have in our LE table
        legal_entities = LegalEntity.objects.filter(recipient_unique_id=duns_number)
        if not legal_entities.exists():
            logger.info('No record in data store for DUNS {}. Skipping...'.format(duns_number))
            continue

        for le in legal_entities:
            leo, _ = LegalEntityOfficers.objects.get_or_create(legal_entity=le)
            for attr, value in leo_update_dict.items():
                if value == "":
                    value = None
                setattr(leo, attr, value)
            leo.save()

    # Update the date for the last time the data load was run
    update_last_load_date("exec_comp", start_date)
    def handle(self, *args, **options):
        logger.info("==== Starting FPDS nightly data load ====")

        if options.get("date"):
            date = options.get("date")[0]
            date = datetime.strptime(date, "%Y-%m-%d").date()
        else:
            default_last_load_date = datetime.now(timezone.utc) - timedelta(days=1)
            date = get_last_load_date("fpds", default=default_last_load_date).date()
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Processing data for FPDS starting from %s" % date)

        with timer("retrieval of deleted FPDS IDs", logger.info):
            ids_to_delete = self.get_deleted_fpds_data_from_s3(date=date)

        if len(ids_to_delete) > 0:
            with timer("deletion of all stale FPDS data", logger.info):
                self.delete_stale_fpds(ids_to_delete=ids_to_delete)
        else:
            logger.info("No FPDS records to delete at this juncture")

        with timer("retrieval of new/modified FPDS data ID list", logger.info):
            total_insert = self.get_fpds_transaction_ids(date=date)

        if len(total_insert) > 0:
            # Add FPDS records
            with timer("insertion of new FPDS data in batches", logger.info):
                self.insert_all_new_fpds(total_insert)

            # Update Awards based on changed FPDS records
            with timer("updating awards to reflect their latest associated transaction info", logger.info):
                update_awards(tuple(AWARD_UPDATE_ID_LIST))

            # Update FPDS-specific Awards based on the info in child transactions
            with timer("updating contract-specific awards to reflect their latest transaction info", logger.info):
                update_contract_awards(tuple(AWARD_UPDATE_ID_LIST))

            # Update AwardCategories based on changed FPDS records
            with timer("updating award category variables", logger.info):
                update_award_categories(tuple(AWARD_UPDATE_ID_LIST))

            # Check the linkages from file C to FPDS records and update any that are missing
            with timer("updating C->D linkages", logger.info):
                update_c_to_d_linkages("contract")
        else:
            logger.info("No FPDS records to insert or modify at this juncture")

        # Update the date for the last time the data load was run
        update_last_load_date("fpds", processing_start_datetime)

        logger.info("FPDS NIGHTLY UPDATE COMPLETE")
    def handle(self, *args, **options):
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Starting FABS data load script...")

        do_not_log_deletions = options["do_not_log_deletions"]

        # "Reload all" supersedes all other processing options.
        reload_all = options["reload_all"]
        if reload_all:
            submission_ids = None
            afa_ids = None
            start_datetime = None
            end_datetime = None
        else:
            submission_ids = tuple(options["submission_ids"]
                                   ) if options["submission_ids"] else None
            afa_ids = read_afa_ids_from_file(
                options['afa_id_file']) if options['afa_id_file'] else None
            start_datetime = options["start_datetime"]
            end_datetime = options["end_datetime"]

        # If no other processing options were provided, then this is an incremental load.
        is_incremental_load = not any((reload_all, submission_ids, afa_ids,
                                       start_datetime, end_datetime))

        if is_incremental_load:
            last_load_date = get_last_load_date()
            submission_ids = get_new_submission_ids(last_load_date)
            logger.info("Processing data for FABS starting from %s" %
                        last_load_date)

        if is_incremental_load and not submission_ids:
            logger.info("No new submissions. Exiting.")

        else:
            with timer("obtaining delete records", logger.info):
                ids_to_delete = get_fabs_records_to_delete(
                    submission_ids, afa_ids, start_datetime, end_datetime)

            with timer("retrieving/diff-ing FABS Data", logger.info):
                ids_to_upsert = get_fabs_transaction_ids(
                    submission_ids, afa_ids, start_datetime, end_datetime)

            update_award_ids = delete_fabs_transactions(
                ids_to_delete, do_not_log_deletions)
            upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

        logger.info("FABS UPDATE FINISHED!")
Example 10
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            printf({"msg": "Closing old indices and adding aliases"})
            set_final_index_config(self.elasticsearch_client, self.config["index_name"])
            swap_aliases(self.elasticsearch_client, self.config["index_name"], self.config["load_type"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(self.elasticsearch_client, self.config["index_name"], settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            msg = "Storing datetime {} for next incremental load"
            printf({"msg": msg.format(self.config["processing_start_datetime"])})
            update_last_load_date("es_transactions", self.config["processing_start_datetime"])
Example 11
    def complete_process(self) -> None:
        client = instantiate_elasticsearch_client()
        if self.config["create_new_index"]:
            set_final_index_config(client, self.config["index_name"])
            if self.config["skip_delete_index"]:
                logger.info(format_log("Skipping deletion of old indices"))
            else:
                logger.info(format_log("Closing old indices and adding aliases"))
                swap_aliases(client, self.config)

        if self.config["is_incremental_load"]:
            toggle_refresh_on(client, self.config["index_name"])
            logger.info(
                format_log(f"Storing datetime {self.config['processing_start_datetime']} for next incremental load")
            )
            update_last_load_date(f"{self.config['stored_date_key']}", self.config["processing_start_datetime"])
    def cleanup(self) -> None:
        """Finalize the execution and cleanup for the next script run"""
        logger.info(f"Processed {self.upsert_records:,} transction records (insert/update)")
        if self.successful_run and (self.is_incremental or self.options["reload_all"]):
            logger.info("Updated last run time for next incremental load")
            update_last_load_date(self.last_load_record, self.start_time)

        if hasattr(self, "file_path") and self.file_path.exists():
            # If the script fails before the file is created, skip
            # If the file still exists, remove
            self.file_path.unlink()

        if self.successful_run:
            logger.info(f"Loading {self.destination_table_name} completed successfully")
        else:
            logger.info("Failed state on exit")
            raise SystemExit(1)
Example 13
    def handle(self, *args, **options):
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Starting FABS data load script...")

        # "Reload all" supersedes all other processing options.
        reload_all = options["reload_all"]
        if reload_all:
            afa_ids = None
            start_datetime = None
            end_datetime = None
        else:
            afa_ids = read_afa_ids_from_file(
                options["afa_id_file"]) if options["afa_id_file"] else None
            start_datetime = options["start_datetime"]
            end_datetime = options["end_datetime"]

        # If no other processing options were provided, then this is an incremental load.
        is_incremental_load = not any(
            (reload_all, afa_ids, start_datetime, end_datetime))

        if is_incremental_load:
            start_datetime = get_last_load_date()
            logger.info("Processing data for FABS starting from %s" %
                        start_datetime)

        with timer("obtaining delete records", logger.info):
            delete_records = retrieve_deleted_fabs_transactions(
                start_datetime, end_datetime)
            ids_to_delete = [
                item for sublist in delete_records.values() for item in sublist
                if item
            ]

        with timer("retrieving/diff-ing FABS Data", logger.info):
            ids_to_upsert = get_fabs_transaction_ids(afa_ids, start_datetime,
                                                     end_datetime)

        update_award_ids = delete_fabs_transactions(ids_to_delete)
        upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

        logger.info("FABS UPDATE FINISHED!")
    def handle(self, *args, **options):
        processing_start_datetime = datetime.now(timezone.utc)

        logger.info("Starting FABS data load script...")

        do_not_log_deletions = options["do_not_log_deletions"]

        # "Reload all" supersedes all other processing options.
        reload_all = options["reload_all"]
        if reload_all:
            submission_ids = None
            afa_ids = None
            start_datetime = None
            end_datetime = None
        else:
            submission_ids = tuple(options["submission_ids"]) if options["submission_ids"] else None
            afa_ids = read_afa_ids_from_file(options['afa_id_file']) if options['afa_id_file'] else None
            start_datetime = options["start_datetime"]
            end_datetime = options["end_datetime"]

        # If no other processing options were provided, then this is an incremental load.
        is_incremental_load = not any((reload_all, submission_ids, afa_ids, start_datetime, end_datetime))

        if is_incremental_load:
            last_load_date = get_last_load_date()
            submission_ids = get_new_submission_ids(last_load_date)
            logger.info("Processing data for FABS starting from %s" % last_load_date)

        if is_incremental_load and not submission_ids:
            logger.info("No new submissions. Exiting.")

        else:
            with timer("obtaining delete records", logger.info):
                ids_to_delete = get_fabs_records_to_delete(submission_ids, afa_ids, start_datetime, end_datetime)

            with timer("retrieving/diff-ing FABS Data", logger.info):
                ids_to_upsert = get_fabs_transaction_ids(submission_ids, afa_ids, start_datetime, end_datetime)

            update_award_ids = delete_fabs_transactions(ids_to_delete, do_not_log_deletions)
            upsert_fabs_transactions(ids_to_upsert, update_award_ids)

        if is_incremental_load:
            update_last_load_date("fabs", processing_start_datetime)

        logger.info("FABS UPDATE FINISHED!")
Example 15
    def handle(self, *args, **options):

        # Record script execution start time to update the FPDS last updated date in DB as appropriate
        update_time = datetime.now(timezone.utc)

        if options["reload_all"]:
            self.load_fpds_incrementally(None)

        elif options["date"]:
            self.load_fpds_incrementally(options["date"])

        elif options["ids"]:
            self.modified_award_ids.extend(
                load_fpds_transactions(options["ids"]))

        elif options["file"]:
            self.load_fpds_from_file(options["file"])

        elif options["since_last_load"]:
            last_load = get_last_load_date("fpds")
            if not last_load:
                raise ValueError(
                    "No last load date for FPDS stored in the database")
            self.load_fpds_incrementally(last_load)

        self.update_award_records(awards=self.modified_award_ids,
                                  skip_cd_linkage=False)

        logger.info(f"Script took {datetime.now(timezone.utc) - update_time}")

        if failed_ids:
            failed_id_str = ", ".join([str(id) for id in failed_ids])
            logger.error(
                f"The following detached_award_procurement_ids failed to load: [{failed_id_str}]"
            )
            raise SystemExit(1)

        if options["reload_all"] or options["since_last_load"]:
            # we wait until after the load finishes to update the load date because if this crashes we'll need to load again
            update_last_load_date("fpds", update_time)

        logger.info(f"Successfully Completed")
Example 16
    def complete_process(self) -> None:
        if self.config["create_new_index"]:
            set_final_index_config(self.elasticsearch_client,
                                   self.config["index_name"])
            if self.config["skip_delete_index"]:
                printf({"msg": "Skipping deletion of old indices"})
            else:
                printf({"msg": "Closing old indices and adding aliases"})
                swap_aliases(self.elasticsearch_client,
                             self.config["index_name"],
                             self.config["load_type"])

        if self.config["snapshot"]:
            printf({"msg": "Taking snapshot"})
            take_snapshot(self.elasticsearch_client, self.config["index_name"],
                          settings.ES_REPOSITORY)

        if self.config["is_incremental_load"]:
            printf({
                "msg":
                f"Storing datetime {self.config['processing_start_datetime']} for next incremental load"
            })
            update_last_load_date(f"es_{self.config['load_type']}",
                                  self.config["processing_start_datetime"])