def retrieve_deleted_transactions(
        regex: str,
        date_fmt: str,
        start_datetime: datetime,
        end_datetime: Optional[datetime] = None) -> dict:
    with Timer("Obtaining S3 Object list"):
        logger.info(f"file regex pattern: {regex}")
        logger.info(f"date format: {date_fmt}")
        logger.info(f"start date: {start_datetime}")
        logger.info(f"end date: {end_datetime}")
        logger.info(
            f"S3 bucket name: {settings.DELETED_TRANSACTION_JOURNAL_FILES}")
        objects = access_s3_object_list(
            settings.DELETED_TRANSACTION_JOURNAL_FILES, regex_pattern=regex)
        if objects is None:
            raise Exception("Problem accessing S3 for deleted records")
        objects = limit_objects_to_date_range(objects, date_fmt,
                                              start_datetime, end_datetime)
        logger.info(f"{len(objects):,} files found in date range.")

    deleted_records = defaultdict(list)
    for obj in objects:
        with Timer(f"Read delete ids from {obj.key}"):
            object_data = access_s3_object(
                settings.DELETED_TRANSACTION_JOURNAL_FILES, obj)
            reader = csv.reader(
                object_data.read().decode("utf-8").splitlines())
            next(reader)  # skip the header

            transaction_id_list = [row[0] for row in reader]
            logger.info(f"{len(transaction_id_list):,} delete ids found")

            if transaction_id_list:
                file_date = obj.key[:obj.key.find("_")]
                deleted_records[file_date].extend(transaction_id_list)

    return deleted_records
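
The date filtering above happens in limit_objects_to_date_range, which is not shown in this example. A minimal sketch of one possible implementation, assuming each journal key begins with a date segment formatted per date_fmt and terminated by an underscore (the same convention the loop above uses to derive file_date):

from datetime import datetime
from typing import List, Optional


def limit_objects_to_date_range(objects: List,
                                date_fmt: str,
                                start_datetime: datetime,
                                end_datetime: Optional[datetime] = None) -> List:
    """Hypothetical helper: keep only objects whose key date falls in range."""
    in_range = []
    for obj in objects:
        # Parse the leading "<date>_" segment of the key, e.g. "20210301_journal.csv"
        file_date = datetime.strptime(obj.key[:obj.key.find("_")], date_fmt)
        if file_date < start_datetime:
            continue
        if end_datetime is not None and file_date > end_datetime:
            continue
        in_range.append(obj)
    return in_range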
Example #2
def retrieve_deleted_transactions(
        regex: str,
        start_datetime: datetime,
        end_datetime: Optional[datetime] = None) -> dict:
    with Timer("Obtaining S3 Object list"):
        objects = retrieve_s3_bucket_object_list(
            settings.DELETED_TRANSACTION_JOURNAL_FILES)
        logger.info(
            f"{len(objects):,} files found in bucket '{settings.DELETED_TRANSACTION_JOURNAL_FILES}'."
        )
        objects = [
            o for o in objects if re.fullmatch(regex, o.key) is not None
        ]
        logger.info(f"{len(objects):,} files match file pattern '{regex}'.")
        objects = limit_objects_to_date_range(objects, regex, start_datetime,
                                              end_datetime)
        logger.info(
            f"{len(objects):,} files found in date range {start_datetime} through {end_datetime or 'the end of time'}."
        )

    deleted_records = defaultdict(list)
    for obj in objects:
        object_data = access_s3_object(
            settings.DELETED_TRANSACTION_JOURNAL_FILES, obj)
        reader = csv.reader(object_data.read().decode("utf-8").splitlines())
        next(reader)  # skip the header

        transaction_id_list = [row[0] for row in reader]
        logger.info(
            f"{len(transaction_id_list):,} delete ids found in {obj.key}")

        if transaction_id_list:
            file_date = obj.key[:obj.key.find("_")]
            deleted_records[file_date].extend(transaction_id_list)

    return deleted_records
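
A usage sketch for this variant; the key pattern and date range below are illustrative assumptions, not the project's documented journal-file naming convention.

from datetime import datetime, timedelta, timezone

# Illustrative call; the regex is an assumed key pattern.
one_week_ago = datetime.now(timezone.utc) - timedelta(days=7)
deleted = retrieve_deleted_transactions(
    regex=r"\d{8}_.*\.csv",
    start_datetime=one_week_ago,
)
for file_date, ids in deleted.items():
    print(f"{file_date}: {len(ids):,} deleted transaction ids")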
Example #3
def gather_deleted_ids(config):
    """
    Connect to S3 and gather all of the transaction ids stored in CSV files
    generated by the broker when transactions are removed from the DB.
    """

    if not config["process_deletes"]:
        logger.info(
            format_log("Skipping the S3 CSV fetch for deleted transactions",
                       process="ES Delete"))
        return

    logger.info(
        format_log("Gathering all deleted transactions from S3",
                   process="ES Delete"))
    start = perf_counter()

    bucket_objects = retrieve_s3_bucket_object_list(
        bucket_name=config["s3_bucket"])
    logger.info(
        format_log(
            f"{len(bucket_objects):,} files found in bucket '{config['s3_bucket']}'",
            process="ES Delete"))

    if config["verbose"]:
        logger.info(
            format_log(f"CSV data from {config['starting_date']} to now",
                       process="ES Delete"))

    filtered_csv_list = [
        x for x in bucket_objects
        if (x.key.endswith(".csv") and not x.key.startswith("staging")
            and x.last_modified >= config["starting_date"])
    ]

    if config["verbose"]:
        logger.info(
            format_log(f"Found {len(filtered_csv_list)} csv files",
                       process="ES Delete"))

    deleted_ids = {}

    for obj in filtered_csv_list:
        object_data = access_s3_object(bucket_name=config["s3_bucket"],
                                       obj=obj)

        # Read the CSV into a dataframe. dtype=str keeps ids as strings so
        # pandas does not infer dates or numeric types for them.
        data = pd.read_csv(object_data, dtype=str)

        if "detached_award_proc_unique" in data:
            new_ids = [
                "CONT_TX_" + x.upper()
                for x in data["detached_award_proc_unique"].values
            ]
        elif "afa_generated_unique" in data:
            new_ids = [
                "ASST_TX_" + x.upper()
                for x in data["afa_generated_unique"].values
            ]
        else:
            logger.info(
                format_log(f"[Missing valid col] in {obj.key}",
                           process="ES Delete"))
            # No recognized id column in this file; skip it so the loop below
            # never reuses new_ids from a previous iteration.
            continue
        for uid in new_ids:
            if uid in deleted_ids:
                if deleted_ids[uid]["timestamp"] < obj.last_modified:
                    deleted_ids[uid]["timestamp"] = obj.last_modified
            else:
                deleted_ids[uid] = {"timestamp": obj.last_modified}

    if config["verbose"]:
        for uid, deleted_dict in deleted_ids.items():
            logger.info(
                format_log(
                    f"id: {uid} last modified: {deleted_dict['timestamp']}",
                    process="ES Delete"))

    logger.info(
        format_log(
            f"Gathering {len(deleted_ids):,} deleted transactions took {perf_counter() - start:.2f}s",
            process="ES Delete",
        ))
    return deleted_ids
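
For reference, a hypothetical config covering the keys gather_deleted_ids reads above; the bucket name is a placeholder, and starting_date must be timezone-aware because it is compared against S3 last_modified timestamps.

from datetime import datetime, timezone

# Hypothetical config; keys mirror those accessed in gather_deleted_ids.
config = {
    "process_deletes": True,
    "s3_bucket": "example-deleted-transactions-bucket",  # placeholder bucket name
    "starting_date": datetime(2021, 1, 1, tzinfo=timezone.utc),  # tz-aware, like obj.last_modified
    "verbose": False,
}

deleted_ids = gather_deleted_ids(config)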
Example #4
def gather_deleted_ids(config):
    """
    Connect to S3 and gather all of the transaction ids stored in CSV files
    generated by the broker when transactions are removed from the DB.
    """

    if not config["process_deletes"]:
        printf({"msg": "Skipping the S3 CSV fetch for deleted transactions"})
        return
    printf({"msg": "Gathering all deleted transactions from S3"})
    start = perf_counter()

    bucket_objects = access_s3_object_list(bucket_name=config["s3_bucket"])
    if bucket_objects is None:
        raise Exception(f"Issue connecting to {config['s3_bucket']} in s3")

    if config["verbose"]:
        printf({"msg": f"CSV data from {config['starting_date']} to now"})

    filtered_csv_list = [
        x for x in bucket_objects
        if (x.key.endswith(".csv") and not x.key.startswith("staging")
            and x.last_modified >= config["starting_date"])
    ]

    if config["verbose"]:
        printf({"msg": f"Found {len(filtered_csv_list)} csv files"})

    deleted_ids = {}

    for obj in filtered_csv_list:
        object_data = access_s3_object(bucket_name=config["s3_bucket"],
                                       obj=obj)

        # Read the CSV into a dataframe. dtype=str keeps ids as strings so
        # pandas does not infer dates or numeric types for them.
        data = pd.read_csv(object_data, dtype=str)

        if "detached_award_proc_unique" in data:
            new_ids = [
                "CONT_TX_" + x.upper()
                for x in data["detached_award_proc_unique"].values
            ]
        elif "afa_generated_unique" in data:
            new_ids = [
                "ASST_TX_" + x.upper()
                for x in data["afa_generated_unique"].values
            ]
        else:
            printf({"msg": f"  [Missing valid col] in {obj.key}"})
            # No recognized id column in this file; skip it so the loop below
            # never reuses new_ids from a previous iteration.
            continue
        for uid in new_ids:
            if uid in deleted_ids:
                if deleted_ids[uid]["timestamp"] < obj.last_modified:
                    deleted_ids[uid]["timestamp"] = obj.last_modified
            else:
                deleted_ids[uid] = {"timestamp": obj.last_modified}

    if config["verbose"]:
        for uid, deleted_dict in deleted_ids.items():
            printf({
                "msg":
                "id: {} last modified: {}".format(
                    uid, str(deleted_dict["timestamp"]))
            })

    printf({
        "msg":
        "Gathering {} deleted transactions took {}s".format(
            len(deleted_ids),
            perf_counter() - start)
    })
    return deleted_ids
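
The printf helper is external to this example; a minimal stand-in that satisfies the calls above might look like the following (the real helper likely adds job metadata).

from datetime import datetime


def printf(context: dict) -> None:
    # Minimal stand-in: print the message with a timestamp prefix.
    print(f"{datetime.now().isoformat()} {context.get('msg', '')}")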