コード例 #1
0
 def test_nonnullFieldsEntityMatch_placeholder(self):
     """Two charges built purely from defaults must not be reported as a
     match by nonnull_fields_entity_match.
     """
     ingested_tree = EntityTree(entity=StateCharge.new_with_defaults(),
                                ancestor_chain=[])
     db_tree = EntityTree(entity=StateCharge.new_with_defaults(),
                          ancestor_chain=[])

     is_match = nonnull_fields_entity_match(ingested_entity=ingested_tree,
                                            db_entity=db_tree)

     self.assertFalse(is_match)
コード例 #2
0
 def test_nonnullFieldsEntityMatch_placeholder(self) -> None:
     """Two schema charges carrying only a state code and a
     PRESENT_WITHOUT_INFO status must not be reported as a match.
     """
     ingested_tree = EntityTree(
         entity=schema.StateCharge(
             state_code=_STATE_CODE,
             status=ChargeStatus.PRESENT_WITHOUT_INFO),
         ancestor_chain=[])
     db_tree = EntityTree(
         entity=schema.StateCharge(
             state_code=_STATE_CODE,
             status=ChargeStatus.PRESENT_WITHOUT_INFO),
         ancestor_chain=[])

     is_match = nonnull_fields_entity_match(ingested_entity=ingested_tree,
                                            db_entity=db_tree)

     self.assertFalse(is_match)
コード例 #3
0
    def test_getAllEntityTreesOfCls(self):
        """get_all_entity_trees_of_cls returns one tree per entity of the
        requested class, each carrying the chain of ancestors above it.
        """
        first_group = schema.StateSentenceGroup(sentence_group_id=_ID)
        second_group = schema.StateSentenceGroup(sentence_group_id=_ID_2)
        person = schema.StatePerson(
            person_id=_ID, sentence_groups=[first_group, second_group])

        # Sentence groups hang off the person, so the person is their only
        # ancestor.
        expected_group_trees = [
            EntityTree(entity=group, ancestor_chain=[person])
            for group in (first_group, second_group)
        ]
        self.assertEqual(
            expected_group_trees,
            get_all_entity_trees_of_cls([person], schema.StateSentenceGroup))

        # The person is itself a source, so it has no ancestors.
        expected_person_trees = [EntityTree(entity=person, ancestor_chain=[])]
        self.assertEqual(
            expected_person_trees,
            get_all_entity_trees_of_cls([person], schema.StatePerson))
コード例 #4
0
 def test_nonnullFieldsEntityMatch_externalIdCompare(self):
     """Charges match on external id only once both sides carry the same
     external id.
     """
     ingested_charge = StateCharge.new_with_defaults(
         external_id=_EXTERNAL_ID)
     db_charge = StateCharge.new_with_defaults()

     def charges_match():
         # Fresh trees are built on every call so that each check sees the
         # current state of both charges.
         return nonnull_fields_entity_match(
             ingested_entity=EntityTree(entity=ingested_charge,
                                        ancestor_chain=[]),
             db_entity=EntityTree(entity=db_charge, ancestor_chain=[]))

     # Only the ingested charge has an external id.
     self.assertFalse(charges_match())

     # Both charges now share the same external id.
     db_charge.external_id = _EXTERNAL_ID
     self.assertTrue(charges_match())
コード例 #5
0
 def test_baseEntityMatch_flatFieldsCompare(self):
     """base_entity_match rejects charges whose county codes differ and
     accepts them once the county codes agree.
     """
     ingested_charge = StateCharge.new_with_defaults(
         state_code=_STATE_CODE, county_code=_COUNTY_CODE)
     db_charge = StateCharge.new_with_defaults(state_code=_STATE_CODE)

     def charges_match():
         # Fresh trees are built on every call so that each check sees the
         # current state of both charges.
         return base_entity_match(
             ingested_entity=EntityTree(entity=ingested_charge,
                                        ancestor_chain=[]),
             db_entity=EntityTree(entity=db_charge, ancestor_chain=[]))

     # The DB charge has no county code yet.
     self.assertFalse(charges_match())

     # Both charges now carry the same county code.
     db_charge.county_code = _COUNTY_CODE
     self.assertTrue(charges_match())
コード例 #6
0
def _match_persons(
        *, ingested_persons: List[StatePerson], db_persons: List[StatePerson]) \
        -> MatchedEntities:
    """Matches every person in |ingested_persons| against |db_persons|.

    Each person is wrapped in a root EntityTree (empty ancestor chain) and the
    two sets of trees are handed to _match_entity_trees.

    Returns a MatchedEntities holding:
    - people: for each ingested person, either the merged person(s) it matched
        to (each kept only once) or, when nothing was merged, the ingested
        person itself; followed by any unmatched DB persons that are
        placeholders.
    - error_count: the error count reported by the matching step.
    - total_root_entities: the number of root-class entities found in
        |ingested_persons|, computed before matching runs.
    """
    db_roots = [
        EntityTree(entity=person, ancestor_chain=[]) for person in db_persons
    ]
    ingested_roots = [
        EntityTree(entity=person, ancestor_chain=[])
        for person in ingested_persons
    ]

    root_entity_cls = get_root_entity_cls(ingested_persons)
    total_root_entities = get_total_entities_of_cls(ingested_persons,
                                                    root_entity_cls)
    outcome = _match_entity_trees(
        ingested_entity_trees=ingested_roots,
        db_entity_trees=db_roots,
        root_entity_cls=root_entity_cls)

    people = []
    for individual_result in outcome.individual_match_results:
        merged_trees = individual_result.merged_entity_trees
        if merged_trees:
            # Several ingested people may match the same DB person; keep a
            # single reference to that object.
            for merged_tree in merged_trees:
                if merged_tree.entity not in people:
                    people.append(merged_tree.entity)
        else:
            people.append(individual_result.ingested_entity_tree.entity)

    # Among unmatched DB persons only placeholders are kept: they may have
    # lost children during matching and therefore need updating.
    people.extend(
        unmatched for unmatched in outcome.unmatched_db_entities
        if is_placeholder(unmatched))

    return MatchedEntities(people=people,
                           error_count=outcome.error_count,
                           total_root_entities=total_root_entities)
コード例 #7
0
def _get_all_entity_trees_of_cls_helper(
        tree: EntityTree,
        cls: Type[DatabaseEntity],
        seen_ids: Set[int],
        seen_trees: List[EntityTree],
        direction_checker: SchemaEdgeDirectionChecker):
    """
    Recursively walks the graph rooted at |tree| collecting objects of type
    |cls|.

    Each newly found object has its id() added to |seen_ids| and its
    EntityTree appended to |seen_trees|; both collections are mutated in
    place and nothing is returned. The walk descends through every set
    forward-edge field of the current entity.
    """
    node = tree.entity
    node_cls = node.__class__

    # An object of type |cls| cannot be reached from here when |cls| is
    # ranked higher than the current entity's class.
    if direction_checker.is_higher_ranked(cls, node_cls):
        return

    node_key = id(node)
    if node_cls == cls and node_key not in seen_ids:
        seen_ids.add(node_key)
        seen_trees.append(tree)
        return

    forward_fields = get_set_entity_field_names(
        node, EntityFieldType.FORWARD_EDGE)
    for field_name in forward_fields:
        children = node.get_field_as_list(field_name)
        for subtree in tree.generate_child_trees(children):
            _get_all_entity_trees_of_cls_helper(
                subtree, cls, seen_ids, seen_trees, direction_checker)
コード例 #8
0
def _get_match_results_for_all_children(
        ingested_entity_tree: EntityTree, db_entity_trees: List[EntityTree],
        root_entity_cls) \
        -> List[Tuple[str, MatchResults]]:
    """Matches each set child field of the |ingested_entity_tree| against the
    corresponding children of the |db_entity_trees|. Every child field is
    matched independently, so different children may match to different DB
    parents.

    Returns one tuple per set forward-edge field of the ingested entity:
    - str: the name of the child field
    - MatchResults: the outcome of matching that field's children against the
        children of the provided |db_entity_trees|
    """
    ingested_entity = ingested_entity_tree.entity
    results = []

    for field_name in get_set_entity_field_names(
            ingested_entity, EntityFieldType.FORWARD_EDGE):
        field_value = get_field(ingested_entity, field_name)
        db_child_trees = generate_child_entity_trees(field_name,
                                                     db_entity_trees)

        # Normalize a singular child into a one-element list.
        children = field_value if isinstance(field_value, list) \
            else [field_value]

        field_match_results = _match_entity_trees(
            ingested_entity_trees=ingested_entity_tree.generate_child_trees(
                children),
            db_entity_trees=db_child_trees,
            root_entity_cls=root_entity_cls)
        results.append((field_name, field_match_results))

    return results
コード例 #9
0
 def test_nonnullFieldsEntityMatch_externalIdCompare(self) -> None:
     """Schema charges match on external id only once both sides carry the
     same external id.
     """
     ingested_charge = schema.StateCharge(
         state_code=_STATE_CODE,
         status=ChargeStatus.PRESENT_WITHOUT_INFO,
         external_id=_EXTERNAL_ID,
     )
     db_charge = schema.StateCharge(
         state_code=_STATE_CODE,
         status=ChargeStatus.PRESENT_WITHOUT_INFO,
     )

     def charges_match():
         # Fresh trees are built on every call so that each check sees the
         # current state of both charges.
         return nonnull_fields_entity_match(
             ingested_entity=EntityTree(entity=ingested_charge,
                                        ancestor_chain=[]),
             db_entity=EntityTree(entity=db_charge, ancestor_chain=[]),
         )

     # Only the ingested charge has an external id.
     self.assertFalse(charges_match())

     # Both charges now share the same external id.
     db_charge.external_id = _EXTERNAL_ID
     self.assertTrue(charges_match())
コード例 #10
0
def get_all_entity_trees_of_cls(sources: Sequence[DatabaseEntity], cls: Type[DatabaseEntity]) -> List[EntityTree]:
    """Collects the EntityTrees of every unique entity of type |cls| reachable from |sources|.

    Each source is treated as a root (empty ancestor chain). Uniqueness is tracked by object id across all
    sources, so an entity reachable from several sources is only returned once.
    """
    visited_ids: Set[int] = set()
    collected_trees: List[EntityTree] = []
    checker = SchemaEdgeDirectionChecker.state_direction_checker()

    # Lazily wrap each source so that tree creation stays interleaved with traversal.
    root_trees = (EntityTree(entity=source, ancestor_chain=[]) for source in sources)
    for root_tree in root_trees:
        _get_all_entity_trees_of_cls_helper(root_tree, cls, visited_ids, collected_trees, checker)

    return collected_trees
コード例 #11
0
    def test_nonnullFieldsEntityMatch_flatFieldsCompare(self) -> None:
        """Flat fields only block a match when both sides set them to
        different values; a field missing on one side is tolerated.
        """
        shared_fields = dict(
            state_code=_STATE_CODE,
            ncic_code="1234",
            status=ChargeStatus.PRESENT_WITHOUT_INFO,
        )
        ingested_charge = schema.StateCharge(county_code=_COUNTY_CODE,
                                             **shared_fields)
        db_charge = schema.StateCharge(**shared_fields)

        def charges_match():
            # Fresh trees are built on every call so that each check sees
            # the current state of both charges.
            return nonnull_fields_entity_match(
                ingested_entity=EntityTree(entity=ingested_charge,
                                           ancestor_chain=[]),
                db_entity=EntityTree(entity=db_charge, ancestor_chain=[]),
            )

        # A field that is merely missing on one side still counts as a match.
        self.assertTrue(charges_match())

        # A field with a different value on each side is not a match.
        db_charge.county_code = _COUNTY_CODE_ANOTHER
        self.assertFalse(charges_match())

        # With every field identical the charges match.
        db_charge.county_code = _COUNTY_CODE
        self.assertTrue(charges_match())
コード例 #12
0
    def test_nonnullFieldsEntityMatch_flatFieldsCompare(self):
        """Flat fields only block a match when both sides set them to
        different values; a field missing on one side is tolerated.
        """
        ingested_charge = StateCharge.new_with_defaults(
            state_code=_STATE_CODE,
            ncic_code='1234',
            county_code=_COUNTY_CODE)
        db_charge = StateCharge.new_with_defaults(
            state_code=_STATE_CODE,
            ncic_code='1234')

        def charges_match():
            # Fresh trees are built on every call so that each check sees
            # the current state of both charges.
            return nonnull_fields_entity_match(
                ingested_entity=EntityTree(entity=ingested_charge,
                                           ancestor_chain=[]),
                db_entity=EntityTree(entity=db_charge, ancestor_chain=[]))

        # A field that is merely missing on one side still counts as a match.
        self.assertTrue(charges_match())

        # A field with a different value on each side is not a match.
        db_charge.county_code = _COUNTY_CODE_ANOTHER
        self.assertFalse(charges_match())

        # With every field identical the charges match.
        db_charge.county_code = _COUNTY_CODE
        self.assertTrue(charges_match())
コード例 #13
0
    def test_generateChildEntitiesWithAncestorChain(self):
        """generate_child_entity_trees returns one tree per child in the named
        field, with the parent appended to the parent's own ancestor chain.
        """
        fine = StateFine.new_with_defaults(fine_id=_ID)
        fine_another = StateFine.new_with_defaults(fine_id=_ID_2)
        person = StatePerson.new_with_defaults(person_id=_ID)
        sentence_group = StateSentenceGroup.new_with_defaults(
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
            state_code=_STATE_CODE,
            fines=[fine, fine_another],
            # The back-reference is a single person, not a list of persons.
            person=person,
            sentence_group_id=_ID)
        sentence_group_tree = EntityTree(entity=sentence_group,
                                         ancestor_chain=[person])

        # Each fine's chain is the sentence group's chain plus the sentence
        # group itself.
        expected_child_trees = [
            EntityTree(entity=fine, ancestor_chain=[person, sentence_group]),
            EntityTree(entity=fine_another,
                       ancestor_chain=[person, sentence_group]),
        ]

        self.assertEqual(
            expected_child_trees,
            generate_child_entity_trees('fines', [sentence_group_tree]))
コード例 #14
0
    def test_generateChildEntitiesWithAncestorChain(self) -> None:
        """generate_child_entity_trees returns one tree per child in the named
        field, with the parent appended to the parent's own ancestor chain.
        """
        first_fine = schema.StateFine(state_code=_STATE_CODE, fine_id=_ID)
        second_fine = schema.StateFine(state_code=_STATE_CODE, fine_id=_ID_2)
        person = schema.StatePerson(state_code=_STATE_CODE, person_id=_ID)
        sentence_group = schema.StateSentenceGroup(
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
            state_code=_STATE_CODE,
            fines=[first_fine, second_fine],
            person=person,
            sentence_group_id=_ID,
        )
        parent_tree = EntityTree(entity=sentence_group,
                                 ancestor_chain=[person])

        # Each fine's chain is the sentence group's chain plus the sentence
        # group itself.
        expected_child_trees = [
            EntityTree(entity=child, ancestor_chain=[person, sentence_group])
            for child in (first_fine, second_fine)
        ]

        actual_child_trees = generate_child_entity_trees(
            "fines", [parent_tree])
        self.assertEqual(expected_child_trees, actual_child_trees)
コード例 #15
0
def _merge_multiparent_entities_from_map(
        multiparent_map: Dict[str, List[_EntityWithParents]]):
    """Collapses every list of entities in |multiparent_map| into one entity.

    Within each list the entities are folded pairwise, left to right, via
    _match_matched_tree. After each fold the ingested-side entity is swapped
    out on its linked parents for the merged entity, and its linked parents
    are appended to those of the DB-side entity, which carries forward into
    the next fold. Entities and parent lists are mutated in place; nothing is
    returned.
    """
    for group in multiparent_map.values():
        survivor = None
        for candidate in group:
            if not survivor:
                survivor = candidate
                continue

            # Whichever side has an id is treated as the DB entity. If
            # neither does, both are ingested and the order is irrelevant
            # for matching.
            if candidate.entity.get_id():
                db_side, ingested_side = candidate, survivor
            else:
                db_side, ingested_side = survivor, candidate

            # Merge the pair via entity matching.
            db_tree = EntityTree(db_side.entity, [])
            ingested_tree = EntityTree(ingested_side.entity, [])
            match_result = _match_matched_tree(
                ingested_entity_tree=ingested_tree,
                db_match_tree=db_tree,
                matched_entities_by_db_ids={},
                root_entity_cls=StatePerson)
            merged_entity = one(match_result.merged_entity_trees).entity

            # Entity matching already updates the DB-side entity in place,
            # so only the ingested-side entity has to be replaced.
            _replace_entity(entity=merged_entity,
                            to_replace=ingested_side.entity,
                            linked_parents=ingested_side.linked_parents)
            db_side.linked_parents.extend(ingested_side.linked_parents)
            survivor = db_side
コード例 #16
0
def _match_matched_tree(
        *, ingested_entity_tree: EntityTree, db_match_tree: EntityTree,
        matched_entities_by_db_ids: Dict[int, Entity],
        root_entity_cls) \
        -> IndividualMatchResult:
    """Given an |ingested_entity_tree| and its matched |db_match_tree|, merges
    any updated information from the ingested entity onto the DB entity and
    continues entity matching for all children of the provided objects.

    For every set child field of the ingested entity, the DB entity's field is
    overwritten with the merged children (or the ingested child when nothing
    merged). List fields additionally keep the unmatched DB children.

    Returns the results of matching as an IndividualMatchResult whose single
    merged tree reuses the ancestor chain of |db_match_tree|.

    Raises EntityMatchingError when a child match result is falsy, or when a
    singular child field leaves DB entities unmatched.
    """
    ingested_entity = ingested_entity_tree.entity
    db_entity = db_match_tree.entity

    _add_match_to_matched_entities_cache(
        db_entity_match=db_entity,
        ingested_entity=ingested_entity,
        matched_entities_by_db_ids=matched_entities_by_db_ids)

    match_results_by_child = _get_match_results_for_all_children(
        ingested_entity_tree=ingested_entity_tree,
        db_entity_trees=[db_match_tree],
        root_entity_cls=root_entity_cls)

    running_error_count = 0
    for child_field_name, match_results in match_results_by_child:
        running_error_count += match_results.error_count
        ingested_child_field = getattr(ingested_entity, child_field_name)

        # Gather, in order, the tree that represents each child after
        # matching: its merged trees when present, else the ingested tree.
        resolved_trees: List[EntityTree] = []
        for child_match_result in match_results.individual_match_results:
            if not child_match_result:
                raise EntityMatchingError(
                    f"Expected child_match_result to be set, but instead got "
                    f"{child_match_result}",
                    ingested_entity_tree.entity.get_entity_name())

            merged_trees = child_match_result.merged_entity_trees
            if merged_trees:
                resolved_trees.extend(merged_trees)
            else:
                resolved_trees.append(child_match_result.ingested_entity_tree)

        # Write the resolved child(ren) back onto the DB entity.
        resolved_children = [tree.entity for tree in resolved_trees]
        if not isinstance(ingested_child_field, list):
            if match_results.unmatched_db_entities:
                raise EntityMatchingError(
                    f"Singular ingested entity field {child_field_name} "
                    f"with value: {ingested_child_field} should "
                    f"match one of the provided db options, but it does not. "
                    f"Found unmatched db entities: "
                    f"{match_results.unmatched_db_entities}",
                    ingested_entity.get_entity_name())
            set_field(db_entity, child_field_name, one(resolved_children))
            continue

        resolved_children.extend(match_results.unmatched_db_entities)
        set_field(db_entity, child_field_name, resolved_children)

    merged_entity = merge_flat_fields(new_entity=ingested_entity,
                                      old_entity=db_entity)
    return IndividualMatchResult(
        ingested_entity_tree=ingested_entity_tree,
        merged_entity_trees=[
            EntityTree(entity=merged_entity,
                       ancestor_chain=db_match_tree.ancestor_chain)
        ],
        error_count=running_error_count)
コード例 #17
0
def _match_unmatched_tree(
        ingested_unmatched_entity_tree: EntityTree,
        db_entity_trees: List[EntityTree],
        root_entity_cls) \
        -> IndividualMatchResult:
    """
    Attempts to match the provided |ingested_unmatched_entity_tree| to any
    placeholder DB trees in the provided |db_entity_trees| based off of any
    child matches. When such a match is found, the merged child is moved off of
    the placeholder DB entity and onto the ingested entity.

    Args:
        ingested_unmatched_entity_tree: Tree of the ingested entity that has
            no direct DB match. Its entity's child fields are overwritten in
            place with the resolved children.
        db_entity_trees: Candidate DB trees; only those whose entity satisfies
            is_placeholder() are considered.
        root_entity_cls: Passed through unchanged to child matching.

    Returns:
        The results of matching as an IndividualMatchResult. Its
        merged_entity_trees holds a single tree for the ingested entity when
        at least one child merged with a non-empty placeholder ancestor chain,
        and is empty otherwise. Its error_count is the sum of the error counts
        of all child match results.

    Raises:
        EntityMatchingError: If the child bookkeeping variables are unset when
            a child is resolved, or if merged children come from placeholders
            with differing ancestor chains.
    """
    db_placeholder_trees = [
        tree for tree in db_entity_trees if is_placeholder(tree.entity)
    ]

    error_count = 0
    match_results_by_child = _get_match_results_for_all_children(
        ingested_entity_tree=ingested_unmatched_entity_tree,
        db_entity_trees=db_placeholder_trees,
        root_entity_cls=root_entity_cls)

    # If the ingested entity is updated because of a child entity match, we
    # should update our ingested entity's ancestor chain to reflect that of its
    # counterpart DB. This is necessary for above layers of entity matching
    # which rely on knowing the parent of any merged entities.
    ancestor_chain_updated: List[Entity] = []

    # Initialize so pylint doesn't yell.
    child_match_result = None
    child_field_name = None

    def resolve_child_match_result():
        """Resolves any child matches by moving matched children off of their DB
        placeholder parent and onto the ingested, unmatched entity.

        Reads |child_field_name|, |child_match_result|, |updated_child_trees|
        and |ingested_entity| from the enclosing scope at call time; they are
        (re)bound by the loop below before each call.
        """
        if not child_field_name or not child_match_result:
            raise EntityMatchingError(
                f"Expected child_field_name and child_match_result to be set, "
                f"but instead got {child_field_name} and {child_match_result} "
                f"respectively.",
                ingested_unmatched_entity_tree.entity.get_entity_name())

        # If child is unmatched, keep track of unchanged child
        if not child_match_result.merged_entity_trees:
            updated_child_trees.append(child_match_result.ingested_entity_tree)
        else:
            # For each matched child, remove child from the DB placeholder and
            # keep track of merged child(ren).
            for merged_child_tree in child_match_result.merged_entity_trees:
                updated_child_trees.append(merged_child_tree)
                placeholder_tree = merged_child_tree.generate_parent_tree()
                remove_child_from_entity(
                    entity=placeholder_tree.entity,
                    child_field_name=child_field_name,
                    child_to_remove=merged_child_tree.entity)

                # For now we only handle the case where all placeholders with
                # matched children have the same parent chain. If they do not,
                # we throw an error.
                if ancestor_chain_updated:
                    if ancestor_chain_updated != \
                            placeholder_tree.ancestor_chain:
                        raise EntityMatchingError(
                            f"Expected all placeholder DB entities matched to "
                            f"an ingested unmatched entity to have the same "
                            f"ancestor chain, but they did not. Found "
                            f"conflicting ancestor chains: "
                            f"{ancestor_chain_updated} and "
                            f"{placeholder_tree.ancestor_chain}",
                            ingested_entity.get_entity_name())
                else:
                    # First merged child seen: adopt its placeholder's
                    # ancestor chain as the reference chain.
                    ancestor_chain_updated.extend(
                        placeholder_tree.ancestor_chain)

    # Bound here, after the closure is defined but before it is first called.
    ingested_entity = ingested_unmatched_entity_tree.entity
    for child_field_name, match_results in match_results_by_child:
        error_count += match_results.error_count
        ingested_child_field = get_field(ingested_entity, child_field_name)
        # Reset per field; the closure appends to this list.
        updated_child_trees: List[EntityTree] = []
        for child_match_result in match_results.individual_match_results:
            resolve_child_match_result()

        # Update the ingested entity with the updated child(ren).
        updated_children = [mc.entity for mc in updated_child_trees]
        if isinstance(ingested_child_field, list):
            set_field(ingested_entity, child_field_name, updated_children)
        else:
            # A singular field must resolve to exactly one child.
            set_field(ingested_entity, child_field_name, one(updated_children))

    # Only report the ingested entity as merged when a child match supplied
    # an ancestor chain for it.
    updated_entities = []
    if ancestor_chain_updated:
        updated_entities.append(
            EntityTree(entity=ingested_entity,
                       ancestor_chain=ancestor_chain_updated))

    return IndividualMatchResult(
        ingested_entity_tree=ingested_unmatched_entity_tree,
        merged_entity_trees=updated_entities,
        error_count=error_count)