Beispiel #1
0
    def test_pairs(self):
        # Each entry is ((new case name, candidate case name), expected result).
        expectations = [
            # Identical names are expected to match.
            (("Testacular", "Testacular"), True),
            # The abbreviated "Ass'n" is expected not to match "Association".
            (("Ass'n Managers v. Lissner", "Association Managers v. Lissner"), False),
        ]

        for arguments, expected in expectations:
            actual = case_name_in_candidate(*arguments)
            self.assertEqual(actual, expected)
Beispiel #2
0
    def test_pairs(self):
        # Table of (case name, candidate case name, expected result) rows.
        cases = [
            # Same name on both sides: expected to be found in the candidate.
            ('Testacular', 'Testacular', True),
            # "Ass'n" versus the spelled-out "Association": expected to fail.
            ("Ass'n Managers v. Lissner", 'Association Managers v. Lissner', False),
        ]

        for case_name, candidate_name, expected in cases:
            result = case_name_in_candidate(case_name, candidate_name)
            self.assertEqual(result, expected)
def find_duplicates(doc, case_path):
    """Return the IDs of the existing items that duplicate ``doc``.

    The check runs in three stages:

    1. If ``needs_dup_check`` reports that the document does not need a
       check (logged as being outside the date range for the court), nothing
       can be a duplicate.
    2. The candidates returned by ``dup_finder.get_dup_stats`` are compared
       by docket number and, when there is exactly one candidate, by case
       name as well.
    3. Whatever is left is filtered with ``dup_helpers.filter_by_stats`` and
       judged by cosine similarity; ambiguous candidates are shown to the
       operator, who decides interactively.

    :param doc: The new document. Its ``citation.docket_number``,
        ``citation.case_name``, ``citation.neutral_cite`` and
        ``docket.court_id`` attributes are read.
    :param case_path: Path of the new document on disk; only used to print a
        ``file://`` link during manual review.
    :return: A list holding the ``id`` of every candidate judged to be a
        duplicate. The list is empty when no duplicate was found.
    """
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print("  - Not a duplicate: Outside of date range for selected court.")
        return []
    log_print("  - Could be a duplicate: Inside of date range for selected court.")

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print("  - Not a duplicate: No candidate matches found.")
        return []

    if len(candidates) == 1:
        candidate = candidates[0]
        new_docket = doc.citation.docket_number
        old_docket = candidate.get("docketNumber")

        # Reduce both docket numbers to their non-zero digits so formatting
        # differences (separators, prefixes, zero padding) do not matter.
        new_digits = re.sub(r"(\D|0)", "", new_docket) if new_docket else ""
        old_digits = re.sub(r"(\D|0)", "", old_docket) if old_docket is not None else ""

        # An empty string is a substring of every string, so a docket number
        # without any non-zero digit must not take part in the comparison;
        # otherwise it would "match" every candidate.
        if new_digits and old_digits:
            # One in the other or vice versa
            if old_digits in new_digits or new_digits in old_digits:
                log_print("  - Duplicate found: Only one candidate returned and docket number matches.")
                return [candidate["id"]]

            if doc.docket.court_id == "cit":
                # CIT documents have neutral citations in the database. Look that up and compare against that.
                candidate_doc = Document.objects.get(pk=candidate["id"])
                if (
                    doc.citation.neutral_cite
                    and candidate_doc.citation.neutral_cite
                    and candidate_doc.citation.neutral_cite in new_docket
                ):
                    log_print(
                        "  - Duplicate found: One candidate from CIT and its neutral citation matches the new document's docket number."
                    )
                    return [candidate["id"]]

            # Logged for every court, including CIT when its neutral citation
            # did not match, so the outcome is always visible in the log.
            log_print("  - Not a duplicate: Only one candidate but docket number differs.")
            return []

        log_print("  - Skipping docket_number dup check.")

        candidate_name = candidate.get("caseName")
        if doc.citation.case_name == candidate_name:
            log_print("  - Duplicate found: Only one candidate and case name is a perfect match.")
            return [candidate["id"]]

        if dup_helpers.case_name_in_candidate(doc.citation.case_name, candidate_name):
            log_print(
                "  - Duplicate found: All words in new document's case name are in the candidate's case name (%s)"
                % candidate_name
            )
            return [candidate["id"]]

    else:
        # More than one candidate.
        if doc.citation.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(doc, candidates)
            if len(dups_by_docket_number) > 1:
                log_print("  - Duplicates found: %s candidates matched by docket number." % len(dups_by_docket_number))
                return [can["id"] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print("  - Duplicate found: Multiple candidates returned, but one matched by docket number.")
                return [dups_by_docket_number[0]["id"]]
            else:
                log_print("  - Could be a duplicate: Unable to find good match via docket number.")
        else:
            log_print("  - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for manual review.
    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(candidates, stats)
    log_print("  - %s candidates before filtering. With stats: %s" % (stats["candidate_count"], stats["cos_sims"]))
    log_print(
        "  - %s candidates after filtering. Using filtered stats: %s"
        % (filtered_stats["candidate_count"], filtered_stats["cos_sims"])
    )
    cos_sims = filtered_stats["cos_sims"]

    if len(filtered_candidates) == 0:
        log_print("  - Not a duplicate: After filtering no good candidates remained.")
        return []

    if len(filtered_candidates) == 1 and cos_sims[0] > 0.93:
        log_print(
            "  - Duplicate found: One candidate after filtering and cosine similarity is high (%s)" % cos_sims[0]
        )
        return [filtered_candidates[0]["id"]]

    high_sims_count = len([sim for sim in cos_sims if sim > 0.98])
    low_sims_count = len([sim for sim in cos_sims if sim < 0.95])
    # Exactly one very high score while all the others are low: that single
    # candidate is trusted without asking. The test does not depend on the
    # candidate being inspected, so it is evaluated once up front.
    single_clear_match = high_sims_count == 1 and low_sims_count == filtered_stats["candidate_count"] - 1

    duplicates = []
    if single_clear_match:
        for candidate, sim in zip(filtered_candidates, cos_sims):
            if sim > 0.98:
                duplicates.append(candidate["id"])
                break
    else:
        # Have to determine by "hand"
        for k, candidate in enumerate(filtered_candidates):
            log_print("  %s) Case name: %s" % (k + 1, doc.citation.case_name))
            log_print("                 %s" % candidate["caseName"])
            log_print("      Docket nums: %s" % doc.citation.docket_number)
            log_print("                   %s" % candidate.get("docketNumber", "None"))
            log_print("      Cosine Similarity: %s" % cos_sims[k])
            log_print("      Candidate URL: file://%s" % case_path)
            log_print("      Match URL: https://www.courtlistener.com%s" % (candidate["absolute_url"]))

            choice = raw_input("Is this a duplicate? [Y/n]: ")
            # The prompt advertises "Y" as the default, so an empty answer
            # and any capitalisation of "y"/"yes" count as confirmation.
            if choice.strip().lower() in ("", "y", "yes"):
                duplicates.append(candidate["id"])

    # NOTE: these messages are also emitted when the match was selected
    # automatically above; the wording is kept as it was.
    if len(duplicates) == 0:
        log_print("  - Not a duplicate: Manual determination found no matches.")
    elif len(duplicates) == 1:
        log_print("  - Duplicate found: Manual determination found one match.")
    else:
        log_print("  - Duplicates found: Manual determination found %s matches." % len(duplicates))
    return duplicates
def find_duplicates(doc, case_path):
    """Return the IDs of the existing items that duplicate ``doc``.

    The check runs in three stages:

    1. If ``needs_dup_check`` reports that the document does not need a
       check (logged as being outside the date range for the court), nothing
       can be a duplicate.
    2. The candidates returned by ``dup_finder.get_dup_stats`` are compared
       by docket number and, when there is exactly one candidate, by case
       name as well.
    3. Whatever is left is filtered with ``dup_helpers.filter_by_stats`` and
       judged by cosine similarity; ambiguous candidates are shown to the
       operator, who decides interactively.

    :param doc: The new document. Its ``citation.docket_number``,
        ``citation.case_name``, ``citation.neutral_cite`` and
        ``docket.court_id`` attributes are read.
    :param case_path: Path of the new document on disk; only used to print a
        ``file://`` link during manual review.
    :return: A list holding the ``id`` of every candidate judged to be a
        duplicate. The list is empty when no duplicate was found.
    """
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print(
            "  - Not a duplicate: Outside of date range for selected court.")
        return []
    log_print(
        "  - Could be a duplicate: Inside of date range for selected court.")

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print("  - Not a duplicate: No candidate matches found.")
        return []

    if len(candidates) == 1:
        candidate = candidates[0]
        new_docket = doc.citation.docket_number
        old_docket = candidate.get('docketNumber')

        # Reduce both docket numbers to their non-zero digits so formatting
        # differences (separators, prefixes, zero padding) do not matter.
        new_digits = ""
        if new_docket:
            new_digits = re.sub(r"(\D|0)", "", new_docket)
        old_digits = ""
        if old_docket is not None:
            old_digits = re.sub(r"(\D|0)", "", old_docket)

        # An empty string is a substring of every string, so a docket number
        # without any non-zero digit must not take part in the comparison;
        # otherwise it would "match" every candidate.
        if new_digits and old_digits:
            # One in the other or vice versa
            if old_digits in new_digits or new_digits in old_digits:
                log_print(
                    "  - Duplicate found: Only one candidate returned and docket number matches."
                )
                return [candidate['id']]

            if doc.docket.court_id == 'cit':
                # CIT documents have neutral citations in the database. Look that up and compare against that.
                candidate_doc = Document.objects.get(pk=candidate['id'])
                new_cite = doc.citation.neutral_cite
                old_cite = candidate_doc.citation.neutral_cite
                if new_cite and old_cite and old_cite in new_docket:
                    log_print(
                        "  - Duplicate found: One candidate from CIT and its neutral citation matches the new document's docket number."
                    )
                    return [candidate['id']]

            # Logged for every court, including CIT when its neutral citation
            # did not match, so the outcome is always visible in the log.
            log_print(
                "  - Not a duplicate: Only one candidate but docket number differs."
            )
            return []

        log_print("  - Skipping docket_number dup check.")

        candidate_name = candidate.get('caseName')
        if doc.citation.case_name == candidate_name:
            log_print(
                "  - Duplicate found: Only one candidate and case name is a perfect match."
            )
            return [candidate['id']]

        if dup_helpers.case_name_in_candidate(doc.citation.case_name,
                                              candidate_name):
            log_print(
                "  - Duplicate found: All words in new document's case name are in the candidate's case name (%s)"
                % candidate_name)
            return [candidate['id']]

    else:
        # More than one candidate.
        if doc.citation.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(
                doc, candidates)
            if len(dups_by_docket_number) > 1:
                log_print(
                    "  - Duplicates found: %s candidates matched by docket number."
                    % len(dups_by_docket_number))
                return [can['id'] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print(
                    "  - Duplicate found: Multiple candidates returned, but one matched by docket number."
                )
                return [dups_by_docket_number[0]['id']]
            else:
                log_print(
                    "  - Could be a duplicate: Unable to find good match via docket number."
                )
        else:
            log_print("  - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for manual review.
    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(
        candidates, stats)
    log_print("  - %s candidates before filtering. With stats: %s" %
              (stats['candidate_count'], stats['cos_sims']))
    log_print("  - %s candidates after filtering. Using filtered stats: %s" %
              (filtered_stats['candidate_count'], filtered_stats['cos_sims']))
    cos_sims = filtered_stats['cos_sims']

    if len(filtered_candidates) == 0:
        log_print(
            "  - Not a duplicate: After filtering no good candidates remained."
        )
        return []

    if len(filtered_candidates) == 1 and cos_sims[0] > 0.93:
        log_print(
            "  - Duplicate found: One candidate after filtering and cosine similarity is high (%s)"
            % cos_sims[0])
        return [filtered_candidates[0]['id']]

    high_sims_count = len([sim for sim in cos_sims if sim > 0.98])
    low_sims_count = len([sim for sim in cos_sims if sim < 0.95])
    # Exactly one very high score while all the others are low: that single
    # candidate is trusted without asking. The test does not depend on the
    # candidate being inspected, so it is evaluated once up front.
    single_clear_match = (
        high_sims_count == 1 and
        low_sims_count == filtered_stats['candidate_count'] - 1)

    duplicates = []
    if single_clear_match:
        for candidate, sim in zip(filtered_candidates, cos_sims):
            if sim > 0.98:
                duplicates.append(candidate['id'])
                break
    else:
        # Have to determine by "hand"
        for k, candidate in enumerate(filtered_candidates):
            log_print("  %s) Case name: %s" %
                      (k + 1, doc.citation.case_name))
            log_print("                 %s" % candidate['caseName'])
            log_print("      Docket nums: %s" % doc.citation.docket_number)
            log_print("                   %s" %
                      candidate.get('docketNumber', 'None'))
            log_print("      Cosine Similarity: %s" % cos_sims[k])
            log_print("      Candidate URL: file://%s" % case_path)
            log_print("      Match URL: https://www.courtlistener.com%s" %
                      (candidate['absolute_url']))

            choice = raw_input("Is this a duplicate? [Y/n]: ")
            # The prompt advertises "Y" as the default, so an empty answer
            # and any capitalisation of "y"/"yes" count as confirmation.
            if choice.strip().lower() in ('', 'y', 'yes'):
                duplicates.append(candidate['id'])

    # NOTE: these messages are also emitted when the match was selected
    # automatically above; the wording is kept as it was.
    if len(duplicates) == 0:
        log_print(
            "  - Not a duplicate: Manual determination found no matches.")
    elif len(duplicates) == 1:
        log_print(
            "  - Duplicate found: Manual determination found one match.")
    else:
        log_print(
            "  - Duplicates found: Manual determination found %s matches."
            % len(duplicates))
    return duplicates