def _match_from_charges(*, db_booking: entities.Booking,
                        ingested_booking: entities.Booking, name: str,
                        orphaned_entities: List[Entity]) -> None:
    """Matches the charge children of one kind between two bookings.

    |name| selects which kind of child is matched and should be 'bond' or
    'sentence'. It is used to build three names dynamically:
    county_matching_utils.is_<name>_match (the field comparison), the
    '<name>_id' attribute set on matched ingested objects, and the
    module-level '_drop_<name>' function applied to unmatched db objects.

    For every ingested object a db counterpart is requested from
    get_next_available_match. When one is returned, its id is copied onto
    the ingested object. Every db object that ends up without an ingested
    counterpart is passed to '_drop_<name>' and appended to
    |orphaned_entities|, which is mutated in place.
    """
    # For each booking: objects of kind |name| found under its charges, and
    # a relationship map (indexed by object id below) that yields each
    # object's parents.
    db_obj_map, db_relationship_map = _build_maps_from_charges(
        db_booking.charges, name)
    ing_obj_map, ing_relationship_map = _build_maps_from_charges(
        ingested_booking.charges, name)

    def _is_match_with_relationships(*, db_entity: Entity,
                                     ingested_entity: Entity) -> bool:
        # Ingested objects are keyed by a generated id, db objects by their
        # own id.
        ingested_key = generate_id_from_obj(ingested_entity)
        db_key = db_entity.get_id()

        # Field-level comparison, resolved by name on every call.
        fields_match = getattr(county_matching_utils, f"is_{name}_match")(
            db_entity=db_entity, ingested_entity=ingested_entity)

        # Relationships may only grow between scrapes: every parent known
        # to the db must still be a parent of the ingested object. Parents
        # that were added are fine, parents that were removed are not.
        db_parents = db_relationship_map[db_key]
        ingested_parents = ing_relationship_map[ingested_key]
        parents_preserved = db_parents.issubset(ingested_parents)

        return fields_match and parents_preserved

    # db id -> ingested object that claimed it. Also handed to
    # get_next_available_match on every call.
    claimed_by_db_id: Dict[int, Entity] = {}
    for ingested_obj in ing_obj_map.values():
        candidate = get_next_available_match(
            ingested_obj,
            list(db_obj_map.values()),
            claimed_by_db_id,
            _is_match_with_relationships,
        )
        if not candidate:
            continue

        candidate_id = candidate.get_id()
        logging.debug("successfully matched to %s with id %s",
                      name, candidate_id)
        setattr(ingested_obj, f"{name}_id", candidate_id)
        claimed_by_db_id[cast(int, candidate_id)] = ingested_obj

    # Anything left unclaimed in the db has no ingested counterpart.
    for existing_obj in db_obj_map.values():
        existing_id = existing_obj.get_id()
        if existing_id in claimed_by_db_id:
            continue

        logging.debug("Did not match %s to any ingested %s, dropping",
                      existing_id, name)
        # The drop function is looked up by name among module globals.
        globals()[f"_drop_{name}"](existing_obj)
        orphaned_entities.append(existing_obj)
Example #2
0
def match_charges(*, db_booking: entities.Booking,
                  ingested_booking: entities.Booking):
    """
    Pairs the charges on |ingested_booking| with the charges on |db_booking|.

    When an ingested charge is paired with a db charge, the db charge's
    charge_id is copied onto the ingested charge. Every db charge left
    unpaired is passed to _drop_charge and then appended to
    |ingested_booking|.charges.

    How charges are paired:

    The scheme aims to reduce turnover in our charge, sentence, and bond
    entities, i.e. we would rather update an existing entity than replace
    it with a new one.

    Each ingested charge is first compared with is_charge_match_with_children,
    which also considers the charge's children (bonds/sentences). Only if
    that finds nothing is the charge compared with is_charge_match, which
    looks at the charge fields alone. An ingested charge that matches
    neither way is left untouched here: no charge_id is assigned to it.

    Since the fallback can pair charges whose children differ, ingested
    charges are processed in descending order of
    _charge_relationship_count, so charges with more children get first
    pick.

    Examples:
        1. Two identical charges are ingested: A has a bond, B does not.
        Yesterday's scrape left the same two charges in the DB as C (with a
        bond) and D (without).

        A could be paired with either C or D. Handling A before B means A
        takes C and B is then paired with D.

        Had B gone first, it could have taken C, which would add a new bond
        to B. A would then be left with D and its bond would be dropped.
        That churn is what the ordering avoids.

        2. One charge (A) is scraped and it equals a DB charge (B) in every
        respect, except that A now carries a bond. The children-aware
        comparison finds nothing, so the fields-only comparison is tried
        and pairs A with B. B then gets a new bond created in the DB.
    """
    # Charges with the most children are matched first (see docstring).
    ingested_by_priority = sorted(ingested_booking.charges,
                                  key=_charge_relationship_count,
                                  reverse=True)

    # db charge_id -> ingested charge that claimed it.
    claimed_by_db_id: Dict[int, entities.Charge] = {}

    for candidate in ingested_by_priority:
        # Strict (children-aware) match first; fields-only as the fallback.
        # The `or` short-circuits, so the fallback runs only when needed.
        existing = get_next_available_match(
            candidate,
            db_booking.charges,
            claimed_by_db_id,
            is_charge_match_with_children,
        ) or get_next_available_match(
            candidate,
            db_booking.charges,
            claimed_by_db_id,
            is_charge_match,
        )

        if not existing:
            continue

        logging.debug("Successfully matched to charge with ID %s",
                      existing.charge_id)
        claimed_by_db_id[cast(int, existing.charge_id)] = candidate
        candidate.charge_id = existing.charge_id

    # Collect unclaimed db charges first and extend afterwards, so the
    # ingested list is not modified while db charges are being visited.
    unclaimed_db_charges = []
    for existing_charge in db_booking.charges:
        if existing_charge.charge_id in claimed_by_db_id:
            continue
        _drop_charge(existing_charge)
        unclaimed_db_charges.append(existing_charge)
    ingested_booking.charges.extend(unclaimed_db_charges)