def test_match_release_fuzzy(es_client, caplog):
    """
    Check how many fuzzy matches come back for a handful of known inputs.

    The expected counts are tied to the current index contents, so if the
    index changes, this test may fail as well.
    """
    # Lookups by release identifier: (identifier, expected number of matches).
    ident_cases = (
        ("wtv64ahbdzgwnan7rllwr3nurm", 1),
        ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
    )
    for idx, (ident, expected) in enumerate(ident_cases):
        release = anything_to_entity(ident, ReleaseEntity)
        matches = match_release_fuzzy(release, es=es_client)
        found = len(matches)
        logger.info("[{}] given {}, found {}".format(idx, release.title, found))
        assert found == expected

    # Partial data: (release document, expected number of matches).
    partial_cases = (
        ({"title": "digital libraries", "ext_ids": {}}, 5),
        ({"title": "unlikelytitle", "ext_ids": {}}, 0),
        ({"title": "Imminent dystopia", "ext_ids": {}}, 2),
        ({"title": "", "contribs": [{"raw_name": "Aristoteles"}], "ext_ids": {}}, 5),
        # Disabled cases, kept for reference:
        # ({
        #     "title": "Letter",
        #     "contribs": [{"raw_name": "Claudel"}],
        #     "ext_ids": {}
        # }, 1),
        # ({
        #     "title": "The Future of Digital Scholarship",
        #     "contribs": [{
        #         "raw_name": "Costantino Thanos"
        #     }],
        #     "ext_ids": {}
        # }, 5),
    )
    for idx, (doc, expected) in enumerate(partial_cases):
        release = entity_from_dict(doc, ReleaseEntity)
        matches = match_release_fuzzy(release, es=es_client)
        found = len(matches)
        with caplog.at_level(logging.INFO):
            matched_titles = [match.title for match in matches]
            logging.info("[{}] given title '{}', found {}, {}".format(
                idx, release.title, found, matched_titles))
        # The document is attached so a failure shows which case broke.
        assert found == expected, doc
def run_release_match(args):
    """
    Given a release, print similar releases to stdout.

    Reads ``value``, ``size``, ``es_url`` and ``output_format`` from ``args``.
    If resolving the entity or running the fuzzy match raises, the error is
    reported on stderr and nothing else is printed.

    Output formats:

    - "tsv": one line per match with ident, work_id, container_id and title,
      tab-separated
    - "json": a single JSON document with the query entity, the matches and
      the match count

    Any other output format prints nothing.
    """
    try:
        entity = anything_to_entity(args.value, ReleaseEntity)
        similar = match_release_fuzzy(entity, size=args.size, es=args.es_url)
    except Exception as err:
        print("fuzzy match failed: {}".format(err), file=sys.stderr)
        return

    if args.output_format == "tsv":
        for hit in similar:
            columns = (hit.ident, hit.work_id, hit.container_id, hit.title)
            print("\t".join(map(str, columns)))

    if args.output_format == "json":
        matches = [
            {
                "ident": hit.ident,
                "work_id": hit.work_id,
                "container_id": hit.container_id,
                "title": hit.title,
            }
            for hit in similar
        ]
        payload = {
            "entity": entity_to_dict(entity),
            "matches": matches,
            "match_count": len(matches),
        }
        print(json.dumps(payload))
def close_fuzzy_release_matches(
        release: ReleaseEntity,
        es_client: Any,
        fatcat_api_client: Optional[Any] = None,
        match_limit: int = 5) -> "Optional[List[FuzzyReleaseMatchResult]]":
    """
    This high-level helper function runs a fuzzy match (using elasticsearch),
    verifies all the results, and returns the "closest" matching results (if
    any).

    es_client is required, and used in the matching process.

    fatcat_api_client is optional and used both for entity-to-dict conversion
    efficiency and for fetching current entities from the fatcat API

    match_limit sets the maximum result size from the initial fuzzy match call

    Returns None if the initial fuzzy match call yields no candidates at all.
    Otherwise returns a sorted list of simple result objects
    (FuzzyReleaseMatchResult dataclass) with fields:

        status: fuzzycat.common.Status
        reason: fuzzycat.common.Reason
        release: ReleaseEntity

    The list may be empty, namely when every candidate was verified as
    DIFFERENT or TODO.

    Status is one of the fuzzycat.common.Status, with "strongest match" in this
    sorted order (as given by STATUS_SORT):

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    DIFFERENT and TODO matches are never returned. All other verified
    candidates are returned, strongest first; eg, if there is any EXACT match
    it comes first, and an AMBIGUOUS result only comes first if there is no
    stronger match.
    """
    # NOTE: the return annotation is a quoted forward reference so that it is
    # not evaluated when the function is defined.
    candidates = match_release_fuzzy(release, size=match_limit, es=es_client)
    if not candidates:
        return None

    release_dict = entity_to_dict(release, api_client=fatcat_api_client)

    # Verify each candidate against the given release; keep everything except
    # TODO and DIFFERENT outcomes.
    matches = []
    for candidate in candidates:
        outcome = verify(release_dict,
                         entity_to_dict(candidate, api_client=fatcat_api_client))
        if outcome.status in (Status.TODO, Status.DIFFERENT):
            continue
        matches.append(
            FuzzyReleaseMatchResult(outcome.status, outcome.reason, candidate))

    # Stable sort: candidates with equal status keep their original order.
    matches.sort(key=lambda match: STATUS_SORT[match.status])
    return matches
def fuzzy_match(
    release: ReleaseEntity, es_client: Any, api_client: Any, timeout: float = 10.0
) -> Optional[Tuple[str, str, ReleaseEntity]]:
    """
    Look for an existing release entity with metadata similar to `release`,
    using fuzzycat (and elasticsearch).

    Returns None if there was no match of any kind, or a single tuple

        (status: str, reason: str, existing: ReleaseEntity)

    if there was a match.

    The status string is the name of one of the fuzzycat.common.Status values;
    the "strongest match" is picked, in this order:

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    Eg, if there is any EXACT match that is always returned; an AMBIGUOUS
    result is only returned if all the candidate matches were ambiguous.

    Raises NotImplementedError if any candidate verifies as Status.TODO (TODO
    has the highest priority in the ordering below, so it always wins).

    TODO: actually do something with timeout (it is currently unused)
    """
    status_cls = fuzzycat.common.Status

    # priority order of verified matches; a lower number wins
    priority = {
        status_cls.TODO: 0,
        status_cls.EXACT: 10,
        status_cls.STRONG: 20,
        status_cls.WEAK: 30,
        status_cls.AMBIGUOUS: 40,
        status_cls.DIFFERENT: 60,
    }

    # TODO: the size here is a first guess; what should it really be?
    candidates = match_release_fuzzy(release, size=10, es=es_client)
    if not candidates:
        return None

    inner_client = api_client.api_client
    release_dict = entity_to_dict(release, api_client=inner_client)

    # pairs of (verification outcome, candidate release)
    verified = []
    for candidate in candidates:
        candidate_dict = entity_to_dict(candidate, api_client=inner_client)
        outcome = fuzzycat.verify.verify(release_dict, candidate_dict)
        verified.append((outcome, candidate))

    # choose the "closest" match; on ties the earliest candidate is kept
    best, existing = min(verified, key=lambda pair: priority[pair[0].status])

    if best.status == status_cls.DIFFERENT:
        return None
    if best.status == status_cls.TODO:
        raise NotImplementedError("fuzzycat verify hit a Status.TODO")
    return (best.status.name, best.reason.value, existing)