Example #1
0
    def test_pairs(self):
        """Check case_name_in_candidate against known argument/result pairs."""
        # Each entry is ((new case name, candidate case name), expected result).
        expectations = [
            (('Testacular', 'Testacular'), True),
            (("Ass'n Managers v. Lissner", 'Association Managers v. Lissner'), False),
        ]

        for (new_name, candidate_name), expected in expectations:
            self.assertEqual(
                case_name_in_candidate(new_name, candidate_name), expected)
def find_duplicates(doc, case_path):
    """Return the IDs of existing items that `doc` appears to duplicate.

    The function works in three stages:

      1. Stop immediately if needs_dup_check() says no check is needed.
      2. Fetch candidates from dup_finder and try to settle the question
         with docket numbers, CIT neutral citations and case names.
      3. Filter the remaining candidates by their stats and decide by cosine
         similarity, prompting the operator on the console when the scores
         are not conclusive.

    :param doc: The new document. Its case_name, docket and citation
        attributes are read.
    :param case_path: Path of the item being imported; only used to print a
        file:// URL during manual review.
    :return: A list of candidate IDs judged to be duplicates. An empty list
        means no duplicate was identified.
    """
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print(
            "  - Not a duplicate: Outside of date range for selected court.")
        return []
    log_print(
        "  - Could be a duplicate: Inside of date range for selected court."
    )

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print("  - Not a duplicate: No candidate matches found.")
        return []
    elif len(candidates) == 1:
        candidate = candidates[0]

        if doc.docket.docket_number and candidate.get(
                'docketNumber') is not None:
            # Compare only the non-zero digits so that punctuation, letters
            # and zero padding do not prevent a match.
            # NOTE(review): a docket number without any non-zero digit
            # normalizes to "" and therefore "matches" anything -- confirm
            # that this is acceptable.
            new_digits = re.sub(r"(\D|0)", "", doc.docket.docket_number)
            candidate_digits = re.sub(r"(\D|0)", "",
                                      candidate['docketNumber'])

            # One in the other or vice versa
            if candidate_digits in new_digits or \
                    new_digits in candidate_digits:
                log_print(
                    "  - Duplicate found: Only one candidate returned and docket number matches."
                )
                return [candidate['id']]

            if doc.docket.court_id == 'cit':
                # CIT documents have neutral citations in the database. Look
                # that up and compare against that.
                candidate_doc = Document.objects.get(pk=candidate['id'])
                if doc.citation.neutral_cite and candidate_doc.citation.neutral_cite:
                    # Read the neutral citation from the citation object,
                    # i.e. the same attribute the guard above just checked.
                    if candidate_doc.citation.neutral_cite in doc.docket.docket_number:
                        log_print(
                            '  - Duplicate found: One candidate from CIT and its neutral citation matches the new document\'s docket number.'
                        )
                        return [candidate['id']]

            # Logged for every court (CIT included) so that each return path
            # leaves a reason in the log.
            log_print(
                "  - Not a duplicate: Only one candidate but docket number differs."
            )
            return []
        else:
            log_print("  - Skipping docket_number dup check.")

        if doc.case_name == candidate.get('caseName'):
            log_print(
                "  - Duplicate found: Only one candidate and case name is a perfect match."
            )
            return [candidate['id']]

        if dup_helpers.case_name_in_candidate(doc.case_name,
                                              candidate.get('caseName')):
            log_print(
                "  - Duplicate found: All words in new document's case name are in the candidate's case name (%s)"
                % candidate.get('caseName'))
            return [candidate['id']]

    else:
        # More than one candidate.
        if doc.docket.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(
                doc, candidates)
            if len(dups_by_docket_number) > 1:
                log_print(
                    "  - Duplicates found: %s candidates matched by docket number."
                    % len(dups_by_docket_number))
                return [can['id'] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print(
                    "  - Duplicate found: Multiple candidates returned, but one matched by docket number."
                )
                return [dups_by_docket_number[0]['id']]
            else:
                log_print(
                    "  - Could be a duplicate: Unable to find good match via docket number."
                )
        else:
            log_print("  - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for
    #    manual review.
    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(
        candidates, stats)
    cos_sims = filtered_stats['cos_sims']
    log_print("  - %s candidates before filtering. With stats: %s" %
              (stats['candidate_count'], stats['cos_sims']))
    log_print("  - %s candidates after filtering. Using filtered stats: %s" %
              (filtered_stats['candidate_count'], cos_sims))

    if len(filtered_candidates) == 0:
        log_print(
            "  - Not a duplicate: After filtering no good candidates remained."
        )
        return []

    if len(filtered_candidates) == 1 and cos_sims[0] > 0.93:
        log_print(
            "  - Duplicate found: One candidate after filtering and cosine similarity is high (%s)"
            % cos_sims[0])
        return [filtered_candidates[0]['id']]

    duplicates = []
    high_sims_count = len([sim for sim in cos_sims if sim > 0.98])
    low_sims_count = len([sim for sim in cos_sims if sim < 0.95])

    # This condition does not depend on the candidate being inspected, so it
    # is evaluated once instead of on every loop iteration.
    one_clear_winner = (
        high_sims_count == 1 and  # Only one high score
        # All but one have low scores
        low_sims_count == filtered_stats['candidate_count'] - 1
    )

    if one_clear_winner:
        # If only one of the items is very high, then we can ignore the
        # others and assume it's right.
        for k, candidate in enumerate(filtered_candidates):
            if cos_sims[k] > 0.98:
                duplicates.append(candidate['id'])
                break
    else:
        # Have to determine by "hand"
        for k, candidate in enumerate(filtered_candidates):
            log_print("  %s) Case name: %s" % (k + 1, doc.case_name))
            log_print("                 %s" % candidate['caseName'])
            log_print("      Docket nums: %s" % doc.docket.docket_number)
            log_print("                   %s" %
                      candidate.get('docketNumber', 'None'))
            log_print("      Cosine Similarity: %s" % cos_sims[k])
            log_print("      Candidate URL: file://%s" % case_path)
            log_print("      Match URL: https://www.courtlistener.com%s" %
                      (candidate['absolute_url']))

            answer = raw_input("Is this a duplicate? [Y/n]: ")
            # An empty answer takes the advertised default ("Y"). Case and
            # surrounding whitespace are ignored so that typing "Y" is not
            # silently treated as "no".
            answer = answer.strip().lower() or "y"
            if answer in ('y', 'yes'):
                duplicates.append(candidate['id'])

    if len(duplicates) == 0:
        log_print(
            "  - Not a duplicate: Manual determination found no matches.")
        return []
    elif len(duplicates) == 1:
        log_print(
            "  - Duplicate found: Manual determination found one match.")
        return [duplicates[0]]
    else:
        log_print(
            "  - Duplicates found: Manual determination found %s matches."
            % len(duplicates))
        return duplicates
Example #3
0
def find_duplicates(doc, case_path):
    """Return the IDs of existing items that `doc` appears to duplicate.

    The function works in three stages:

      1. Stop immediately if needs_dup_check() says no check is needed.
      2. Fetch candidates from dup_finder and try to settle the question
         with docket numbers, CIT neutral citations and case names.
      3. Filter the remaining candidates by their stats and decide by cosine
         similarity, prompting the operator on the console when the scores
         are not conclusive.

    :param doc: The new document. Its case_name, docket and citation
        attributes are read.
    :param case_path: Path of the item being imported; only used to print a
        file:// URL during manual review.
    :return: A list of candidate IDs judged to be duplicates. An empty list
        means no duplicate was identified.
    """
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print(
            "  - Not a duplicate: Outside of date range for selected court.")
        return []
    log_print(
        "  - Could be a duplicate: Inside of date range for selected court.")

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print("  - Not a duplicate: No candidate matches found.")
        return []
    elif len(candidates) == 1:
        candidate = candidates[0]

        if doc.docket.docket_number and candidate.get(
                'docketNumber') is not None:
            # Compare only the non-zero digits so that punctuation, letters
            # and zero padding do not prevent a match.
            # NOTE(review): a docket number without any non-zero digit
            # normalizes to "" and therefore "matches" anything -- confirm
            # that this is acceptable.
            new_digits = re.sub(r"(\D|0)", "", doc.docket.docket_number)
            candidate_digits = re.sub(r"(\D|0)", "",
                                      candidate['docketNumber'])

            # One in the other or vice versa
            if candidate_digits in new_digits or \
                    new_digits in candidate_digits:
                log_print(
                    "  - Duplicate found: Only one candidate returned and docket number matches.")
                return [candidate['id']]

            if doc.docket.court_id == 'cit':
                # CIT documents have neutral citations in the database. Look
                # that up and compare against that.
                candidate_doc = Document.objects.get(pk=candidate['id'])
                if doc.citation.neutral_cite and candidate_doc.citation.neutral_cite:
                    # Read the neutral citation from the citation object,
                    # i.e. the same attribute the guard above just checked.
                    if candidate_doc.citation.neutral_cite in doc.docket.docket_number:
                        log_print(
                            '  - Duplicate found: One candidate from CIT and its neutral citation matches the new document\'s docket number.')
                        return [candidate['id']]

            # Logged for every court (CIT included) so that each return path
            # leaves a reason in the log.
            log_print(
                "  - Not a duplicate: Only one candidate but docket number differs.")
            return []
        else:
            log_print("  - Skipping docket_number dup check.")

        if doc.case_name == candidate.get('caseName'):
            log_print(
                "  - Duplicate found: Only one candidate and case name is a perfect match.")
            return [candidate['id']]

        if dup_helpers.case_name_in_candidate(doc.case_name,
                                              candidate.get('caseName')):
            log_print(
                "  - Duplicate found: All words in new document's case name are in the candidate's case name (%s)" %
                candidate.get('caseName'))
            return [candidate['id']]

    else:
        # More than one candidate.
        if doc.docket.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(
                doc, candidates)
            if len(dups_by_docket_number) > 1:
                log_print(
                    "  - Duplicates found: %s candidates matched by docket number." %
                    len(dups_by_docket_number))
                return [can['id'] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print(
                    "  - Duplicate found: Multiple candidates returned, but one matched by docket number.")
                return [dups_by_docket_number[0]['id']]
            else:
                log_print(
                    "  - Could be a duplicate: Unable to find good match via docket number.")
        else:
            log_print("  - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for
    #    manual review.
    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(
        candidates, stats)
    cos_sims = filtered_stats['cos_sims']
    log_print("  - %s candidates before filtering. With stats: %s" % (
        stats['candidate_count'], stats['cos_sims']))
    log_print("  - %s candidates after filtering. Using filtered stats: %s" % (
        filtered_stats['candidate_count'], cos_sims))

    if len(filtered_candidates) == 0:
        log_print(
            "  - Not a duplicate: After filtering no good candidates remained.")
        return []

    if len(filtered_candidates) == 1 and cos_sims[0] > 0.93:
        log_print(
            "  - Duplicate found: One candidate after filtering and cosine similarity is high (%s)" %
            cos_sims[0])
        return [filtered_candidates[0]['id']]

    duplicates = []
    high_sims_count = len([sim for sim in cos_sims if sim > 0.98])
    low_sims_count = len([sim for sim in cos_sims if sim < 0.95])

    # This condition does not depend on the candidate being inspected, so it
    # is evaluated once instead of on every loop iteration.
    one_clear_winner = (
        high_sims_count == 1 and  # Only one high score
        # All but one have low scores
        low_sims_count == filtered_stats['candidate_count'] - 1
    )

    if one_clear_winner:
        # If only one of the items is very high, then we can ignore the
        # others and assume it's right.
        for k, candidate in enumerate(filtered_candidates):
            if cos_sims[k] > 0.98:
                duplicates.append(candidate['id'])
                break
    else:
        # Have to determine by "hand"
        for k, candidate in enumerate(filtered_candidates):
            log_print("  %s) Case name: %s" % (k + 1, doc.case_name))
            log_print("                 %s" % candidate['caseName'])
            log_print("      Docket nums: %s" % doc.docket.docket_number)
            log_print("                   %s" % candidate.get(
                'docketNumber', 'None'))
            log_print("      Cosine Similarity: %s" % cos_sims[k])
            log_print("      Candidate URL: file://%s" % case_path)
            log_print("      Match URL: https://www.courtlistener.com%s" %
                      (candidate['absolute_url']))

            answer = raw_input("Is this a duplicate? [Y/n]: ")
            # An empty answer takes the advertised default ("Y"). Case and
            # surrounding whitespace are ignored so that typing "Y" is not
            # silently treated as "no".
            answer = answer.strip().lower() or "y"
            if answer in ('y', 'yes'):
                duplicates.append(candidate['id'])

    if len(duplicates) == 0:
        log_print(
            "  - Not a duplicate: Manual determination found no matches.")
        return []
    elif len(duplicates) == 1:
        log_print(
            "  - Duplicate found: Manual determination found one match.")
        return [duplicates[0]]
    else:
        log_print(
            "  - Duplicates found: Manual determination found %s matches." %
            len(duplicates))
        return duplicates