Ejemplo n.º 1
0
def test_match_release_fuzzy(es_client, caplog):
    """
    Check the number of fuzzy matches returned for a handful of releases.

    The expected counts are tied to the current index contents, so if the
    index changes, this test may fail as well.
    """
    # Lookups by release identifier: (ident, expected number of matches).
    ident_cases = (
        ("wtv64ahbdzgwnan7rllwr3nurm", 1),
        ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
    )
    for idx, (ident, expected) in enumerate(ident_cases):
        release = anything_to_entity(ident, ReleaseEntity)
        matches = match_release_fuzzy(release, es=es_client)
        found = len(matches)
        logger.info("[{}] given {}, found {}".format(idx, release.title, found))
        assert found == expected

    # Partial data: (release document, expected number of matches).
    partial_cases = (
        ({"title": "digital libraries", "ext_ids": {}}, 5),
        ({"title": "unlikelytitle", "ext_ids": {}}, 0),
        ({"title": "Imminent dystopia", "ext_ids": {}}, 2),
        # Empty title; only a contributor name is supplied.
        ({
            "title": "",
            "contribs": [{"raw_name": "Aristoteles"}],
            "ext_ids": {},
        }, 5),
        # Disabled cases, kept for reference (why they were switched off is
        # not recorded here):
        # ({
        #     "title": "Letter",
        #     "contribs": [{"raw_name": "Claudel"}],
        #     "ext_ids": {}
        # }, 1),
        # ({
        #     "title": "The Future of Digital Scholarship",
        #     "contribs": [{
        #         "raw_name": "Costantino Thanos"
        #     }],
        #     "ext_ids": {}
        # }, 5),
    )
    for idx, (doc, expected) in enumerate(partial_cases):
        release = entity_from_dict(doc, ReleaseEntity)
        matches = match_release_fuzzy(release, es=es_client)
        found = len(matches)
        titles = [match.title for match in matches]
        # Emit the matched titles through the root logger while caplog is
        # set to INFO level.
        with caplog.at_level(logging.INFO):
            logging.info("[{}] given title '{}', found {}, {}".format(
                idx, release.title, found, titles))
        # The document is the assertion message, so a failure names the case.
        assert found == expected, doc
Ejemplo n.º 2
0
def run_release_match(args):
    """
    Given a release, print similar releases.

    Reads args.value (resolved to a release entity), args.size and
    args.es_url (passed on to the fuzzy matcher) and args.output_format.

    With output format "tsv", one tab-separated line per match is printed
    (ident, work_id, container_id, title). With "json", a single JSON
    document is printed holding the query entity, the matches and the match
    count. Any other format prints nothing.

    If resolving the entity or running the match raises, the error is
    reported on stderr and the function returns without printing results.
    """
    try:
        release = anything_to_entity(args.value, ReleaseEntity)
        candidates = match_release_fuzzy(release,
                                         size=args.size,
                                         es=args.es_url)
    except Exception as err:
        print("fuzzy match failed: {}".format(err), file=sys.stderr)
        return

    if args.output_format == "tsv":
        for hit in candidates:
            columns = (hit.ident, hit.work_id, hit.container_id, hit.title)
            print("\t".join(map(str, columns)))

    if args.output_format == "json":
        matches = [{
            "ident": hit.ident,
            "work_id": hit.work_id,
            "container_id": hit.container_id,
            "title": hit.title,
        } for hit in candidates]
        payload = {
            "entity": entity_to_dict(release),
            "matches": matches,
            "match_count": len(matches)
        }
        print(json.dumps(payload))
Ejemplo n.º 3
0
def close_fuzzy_release_matches(
        release: ReleaseEntity,
        es_client: Any,
        fatcat_api_client: Optional[Any] = None,
        match_limit: int = 5) -> "Optional[list[FuzzyReleaseMatchResult]]":
    """
    This high-level helper function runs a fuzzy match (using elasticsearch),
    verifies all the results, and returns the "closest" matching results (if
    any).

    es_client is required, and used in the matching process.

    fatcat_api_client is optional and used both for entity-to-dict conversion
    efficiency and for fetching current entities from the fatcat API

    match_limit sets the maximum result size from the initial fuzzy match call

    Returns None if the fuzzy match call produced no candidates at all.
    Otherwise returns a list of simple result objects
    (FuzzyReleaseMatchResult dataclass), sorted by STATUS_SORT, with fields:

        status: fuzzycat.common.Status
        reason: fuzzycat.common.Reason
        release: ReleaseEntity

    The list is empty when every candidate verified as DIFFERENT or TODO.

    Status is one of the fuzzycat.common.Status, with "strongest match" in this
    sorted order:

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    DIFFERENT and TODO matches are never returned.

    Eg, if there is any EXACT match it comes first in the returned list;
    weaker results (down to AMBIGUOUS) follow it rather than being dropped.
    """
    # NOTE: the return annotation is quoted so that spelling out the list
    # type does not require any additional typing import in this module.

    candidates = match_release_fuzzy(release, size=match_limit, es=es_client)
    if not candidates:
        return None

    # Convert the query release once; it is compared against every candidate.
    release_dict = entity_to_dict(release, api_client=fatcat_api_client)

    results = []
    for candidate in candidates:
        verdict = verify(
            release_dict,
            entity_to_dict(candidate, api_client=fatcat_api_client),
        )
        # TODO and DIFFERENT verdicts are not usable matches; drop them.
        if verdict.status in (Status.TODO, Status.DIFFERENT):
            continue
        results.append(
            FuzzyReleaseMatchResult(verdict.status, verdict.reason, candidate))

    # sorted() is stable, so equally-ranked results keep candidate order.
    return sorted(results, key=lambda r: STATUS_SORT[r.status])
def fuzzy_match(
    release: ReleaseEntity, es_client: Any, api_client: Any, timeout: float = 10.0
) -> Optional[Tuple[str, str, ReleaseEntity]]:
    """
    Look for an existing release entity with metadata similar to `release`,
    using fuzzycat (and elasticsearch).

    Candidates are fetched with match_release_fuzzy, each one is checked with
    fuzzycat.verify.verify, and the single best-ranked result is reported.

    Returns None if there were no candidates, or if the best-ranked candidate
    verified as DIFFERENT. Otherwise returns a single tuple
    (status: str, reason: str, existing: ReleaseEntity), where status is the
    name of a fuzzycat.common.Status member and reason is the value of the
    verification reason.

    Ranking of usable statuses, strongest first:

    - EXACT
    - STRONG
    - WEAK
    - AMBIGUOUS

    Eg, if there is any EXACT match that is always returned; an AMBIGUOUS
    result is only returned if all the candidate matches were ambiguous.

    Raises NotImplementedError if any candidate verifies as Status.TODO,
    because TODO is ranked ahead of every other status.

    TODO: actually do something with timeout
    """

    status_enum = fuzzycat.common.Status

    # Lower rank wins when picking the verified match to report.
    rank = {
        status_enum.TODO: 0,
        status_enum.EXACT: 10,
        status_enum.STRONG: 20,
        status_enum.WEAK: 30,
        status_enum.AMBIGUOUS: 40,
        status_enum.DIFFERENT: 60,
    }

    # TODO: the size here is a first guess; what should it really be?
    hits = match_release_fuzzy(release, size=10, es=es_client)
    if not hits:
        return None

    wanted = entity_to_dict(release, api_client=api_client.api_client)

    # Pairs of (verification outcome, candidate release), in candidate order.
    checked = []
    for hit in hits:
        hit_dict = entity_to_dict(hit, api_client=api_client.api_client)
        checked.append((fuzzycat.verify.verify(wanted, hit_dict), hit))

    # min() yields the first pair with the lowest rank, i.e. the same pair
    # that would sit at the front of a stable sort on that rank.
    outcome, existing = min(checked, key=lambda pair: rank[pair[0].status])

    if outcome.status == status_enum.DIFFERENT:
        return None
    if outcome.status == status_enum.TODO:
        raise NotImplementedError("fuzzycat verify hit a Status.TODO")
    return (outcome.status.name, outcome.reason.value, existing)