def test_pairs(self):
    """Check case_name_in_candidate() against known inputs and results."""
    # Each entry is ((case name, candidate case name), expected result).
    expectations = (
        (('Testacular', 'Testacular'), True),
        (("Ass'n Managers v. Lissner", 'Association Managers v. Lissner'),
         False),
    )
    for (case_name, candidate_name), expected in expectations:
        actual = case_name_in_candidate(case_name, candidate_name)
        self.assertEqual(actual, expected)
def find_duplicates(doc, case_path):
    """Return the IDs of the candidates that `doc` is a duplicate of.

    The checks run in three stages and return as soon as one is conclusive:

     1. If needs_dup_check() says no check is needed, nothing is a
        duplicate.
     2. Candidates are fetched with dup_finder.get_dup_stats(). A single
        candidate is compared by docket number, then (for the 'cit' court)
        by neutral citation, then by case name. Several candidates are
        compared by docket number only.
     3. Anything still undecided is narrowed with
        dup_helpers.filter_by_stats() and judged on cosine similarity. When
        the scores do not single out one candidate, the operator is asked
        about each remaining candidate on the console.

    Progress is reported through log_print() at every step.

    :param doc: The new document. Its docket.docket_number,
        docket.court_id, citation.neutral_cite and case_name are read.
    :param case_path: Path of the item being checked; only shown to the
        operator in the manual-review output.
    :return: A list of candidate IDs judged to be duplicates, or an empty
        list when there are none.
    """
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print(
            " - Not a duplicate: Outside of date range for selected court.")
        return []
    log_print(
        " - Could be a duplicate: Inside of date range for selected court.")

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print(" - Not a duplicate: No candidate matches found.")
        return []
    elif len(candidates) == 1:
        candidate = candidates[0]
        if (doc.docket.docket_number and
                candidate.get('docketNumber') is not None):
            # Compare the docket numbers with every non-digit and every
            # zero removed; one in the other or vice versa is a match.
            candidate_digits = re.sub(r"(\D|0)", "",
                                      candidate['docketNumber'])
            doc_digits = re.sub(r"(\D|0)", "", doc.docket.docket_number)
            if candidate_digits in doc_digits or \
                    doc_digits in candidate_digits:
                log_print(
                    " - Duplicate found: Only one candidate returned and docket number matches."
                )
                return [candidate['id']]
            elif doc.docket.court_id == 'cit':
                # CIT documents have neutral citations in the database.
                # Look that up and compare against that.
                candidate_doc = Document.objects.get(pk=candidate['id'])
                candidate_cite = candidate_doc.citation.neutral_cite
                if doc.citation.neutral_cite and candidate_cite:
                    # Read the citation from the same attribute the guard
                    # above checks (the original read
                    # candidate_doc.neutral_cite here).
                    if candidate_cite in doc.docket.docket_number:
                        log_print(
                            ' - Duplicate found: One candidate from CIT and its neutral citation matches the new document\'s docket number.'
                        )
                        return [candidate['id']]
                # No citation match: fall through to the case name checks.
            else:
                log_print(
                    " - Not a duplicate: Only one candidate but docket number differs."
                )
                return []
        else:
            log_print(" - Skipping docket_number dup check.")

        if doc.case_name == candidate.get('caseName'):
            log_print(
                " - Duplicate found: Only one candidate and case name is a perfect match."
            )
            return [candidate['id']]

        if dup_helpers.case_name_in_candidate(doc.case_name,
                                              candidate.get('caseName')):
            log_print(
                " - Duplicate found: All words in new document's case name are in the candidate's case name (%s)"
                % candidate.get('caseName'))
            return [candidate['id']]
    else:
        # More than one candidate.
        if doc.docket.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(
                doc, candidates)
            if len(dups_by_docket_number) > 1:
                log_print(
                    " - Duplicates found: %s candidates matched by docket number."
                    % len(dups_by_docket_number))
                return [can['id'] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print(
                    " - Duplicate found: Multiple candidates returned, but one matched by docket number."
                )
                return [dups_by_docket_number[0]['id']]
            else:
                log_print(
                    " - Could be a duplicate: Unable to find good match via docket number."
                )
        else:
            log_print(" - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for
    #    manual review.
    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(
        candidates, stats)
    log_print(" - %s candidates before filtering. With stats: %s" %
              (stats['candidate_count'], stats['cos_sims']))
    log_print(" - %s candidates after filtering. Using filtered stats: %s" %
              (filtered_stats['candidate_count'], filtered_stats['cos_sims']))
    if len(filtered_candidates) == 0:
        log_print(
            " - Not a duplicate: After filtering no good candidates remained."
        )
        return []
    elif len(filtered_candidates) == 1 and \
            filtered_stats['cos_sims'][0] > 0.93:
        log_print(
            " - Duplicate found: One candidate after filtering and cosine similarity is high (%s)"
            % filtered_stats['cos_sims'][0])
        return [filtered_candidates[0]['id']]

    cos_sims = filtered_stats['cos_sims']
    high_sims_count = len([sim for sim in cos_sims if sim > 0.98])
    low_sims_count = len([sim for sim in cos_sims if sim < 0.95])
    # If only one of the items is very high and all of the others are low,
    # we can ignore the others and assume the high one is right. This does
    # not change between candidates, so it is decided once, up front.
    one_clear_winner = (
        high_sims_count == 1 and
        low_sims_count == filtered_stats['candidate_count'] - 1)

    duplicates = []
    for k, candidate in enumerate(filtered_candidates):
        sim = cos_sims[k]
        if one_clear_winner:
            if sim > 0.98:
                duplicates.append(candidate['id'])
                break
            # ignore the others
            continue

        # Have to determine by "hand"
        log_print(" %s) Case name: %s" % (k + 1, doc.case_name))
        log_print(" %s" % candidate['caseName'])
        log_print(" Docket nums: %s" % doc.docket.docket_number)
        log_print(" %s" % candidate.get('docketNumber', 'None'))
        log_print(" Cosine Similarity: %s" % sim)
        log_print(" Candidate URL: file://%s" % case_path)
        log_print(" Match URL: https://www.courtlistener.com%s" %
                  (candidate['absolute_url']))
        choice = raw_input("Is this a duplicate? [Y/n]: ")
        # Empty input takes the advertised default (yes). The answer is
        # normalised so that "Y" or "yes" is not silently read as "no".
        choice = choice.strip().lower() or "y"
        if choice in ('y', 'yes'):
            duplicates.append(candidate['id'])

    if not duplicates:
        log_print(
            " - Not a duplicate: Manual determination found no matches.")
        return []
    elif len(duplicates) == 1:
        log_print(
            " - Duplicate found: Manual determination found one match.")
        return [duplicates[0]]
    else:
        log_print(
            " - Duplicates found: Manual determination found %s matches." %
            len(duplicates))
        return duplicates
def find_duplicates(doc, case_path):
    """Return the IDs of the candidates that `doc` is a duplicate of.

    The checks run in three stages and return as soon as one is conclusive:

     1. If needs_dup_check() says no check is needed, nothing is a
        duplicate.
     2. Candidates are fetched with dup_finder.get_dup_stats(). A single
        candidate is compared by docket number, then (for the 'cit' court)
        by neutral citation, then by case name. Several candidates are
        compared by docket number only.
     3. Anything still undecided is narrowed with
        dup_helpers.filter_by_stats() and judged on cosine similarity. When
        the scores do not single out one candidate, the operator is asked
        about each remaining candidate on the console.

    Progress is reported through log_print() at every step.

    :param doc: The new document. Its docket.docket_number,
        docket.court_id, citation.neutral_cite and case_name are read.
    :param case_path: Path of the item being checked; only shown to the
        operator in the manual-review output.
    :return: A list of candidate IDs judged to be duplicates, or an empty
        list when there are none.
    """
    log_print("Running duplicate checks...")

    # 1. Is the item completely outside of the current corpus?
    if not needs_dup_check(doc):
        log_print(
            " - Not a duplicate: Outside of date range for selected court.")
        return []
    log_print(
        " - Could be a duplicate: Inside of date range for selected court.")

    # 2. Can we find any duplicates and information about them?
    stats, candidates = dup_finder.get_dup_stats(doc)
    if len(candidates) == 0:
        log_print(" - Not a duplicate: No candidate matches found.")
        return []
    elif len(candidates) == 1:
        only_candidate = candidates[0]
        if (doc.docket.docket_number and
                only_candidate.get('docketNumber') is not None):
            # Compare the docket numbers with every non-digit and every
            # zero removed; one in the other or vice versa is a match.
            candidate_digits = re.sub(r"(\D|0)", "",
                                      only_candidate['docketNumber'])
            doc_digits = re.sub(r"(\D|0)", "", doc.docket.docket_number)
            if candidate_digits in doc_digits or \
                    doc_digits in candidate_digits:
                log_print(
                    " - Duplicate found: Only one candidate returned and docket number matches.")
                return [only_candidate['id']]
            elif doc.docket.court_id == 'cit':
                # CIT documents have neutral citations in the database.
                # Look that up and compare against that.
                candidate_doc = Document.objects.get(
                    pk=only_candidate['id'])
                candidate_cite = candidate_doc.citation.neutral_cite
                if doc.citation.neutral_cite and candidate_cite:
                    # Read the citation from the same attribute the guard
                    # above checks (the original read
                    # candidate_doc.neutral_cite here).
                    if candidate_cite in doc.docket.docket_number:
                        log_print(
                            ' - Duplicate found: One candidate from CIT and its neutral citation matches the new document\'s docket number.')
                        return [only_candidate['id']]
                # No citation match: fall through to the case name checks.
            else:
                log_print(
                    " - Not a duplicate: Only one candidate but docket number differs.")
                return []
        else:
            log_print(" - Skipping docket_number dup check.")

        if doc.case_name == only_candidate.get('caseName'):
            log_print(
                " - Duplicate found: Only one candidate and case name is a perfect match.")
            return [only_candidate['id']]

        if dup_helpers.case_name_in_candidate(
                doc.case_name, only_candidate.get('caseName')):
            log_print(
                " - Duplicate found: All words in new document's case name are in the candidate's case name (%s)" %
                only_candidate.get('caseName'))
            return [only_candidate['id']]
    else:
        # More than one candidate.
        if doc.docket.docket_number:
            dups_by_docket_number = dup_helpers.find_same_docket_numbers(
                doc, candidates)
            if len(dups_by_docket_number) > 1:
                log_print(
                    " - Duplicates found: %s candidates matched by docket number." %
                    len(dups_by_docket_number))
                return [can['id'] for can in dups_by_docket_number]
            elif len(dups_by_docket_number) == 1:
                log_print(
                    " - Duplicate found: Multiple candidates returned, but one matched by docket number.")
                return [dups_by_docket_number[0]['id']]
            else:
                log_print(
                    " - Could be a duplicate: Unable to find good match via docket number.")
        else:
            log_print(" - Skipping docket_number dup check.")

    # 3. Filter out obviously bad cases and then pass remainder forward for
    #    manual review.
    filtered_candidates, filtered_stats = dup_helpers.filter_by_stats(
        candidates, stats)
    log_print(" - %s candidates before filtering. With stats: %s" % (
        stats['candidate_count'], stats['cos_sims']))
    log_print(" - %s candidates after filtering. Using filtered stats: %s" % (
        filtered_stats['candidate_count'], filtered_stats['cos_sims']))
    if len(filtered_candidates) == 0:
        log_print(
            " - Not a duplicate: After filtering no good candidates remained.")
        return []
    elif len(filtered_candidates) == 1 and \
            filtered_stats['cos_sims'][0] > 0.93:
        log_print(
            " - Duplicate found: One candidate after filtering and cosine similarity is high (%s)" %
            filtered_stats['cos_sims'][0])
        return [filtered_candidates[0]['id']]

    cos_sims = filtered_stats['cos_sims']
    high_sims_count = len([sim for sim in cos_sims if sim > 0.98])
    low_sims_count = len([sim for sim in cos_sims if sim < 0.95])
    # If only one of the items is very high and all of the others are low,
    # we can ignore the others and assume the high one is right. This does
    # not change between candidates, so it is decided once, up front.
    one_clear_winner = (
        high_sims_count == 1 and
        low_sims_count == filtered_stats['candidate_count'] - 1)

    duplicates = []
    for k, candidate in enumerate(filtered_candidates):
        sim = cos_sims[k]
        if one_clear_winner:
            if sim > 0.98:
                duplicates.append(candidate['id'])
                break
            # ignore the others
            continue

        # Have to determine by "hand"
        log_print(" %s) Case name: %s" % (k + 1, doc.case_name))
        log_print(" %s" % candidate['caseName'])
        log_print(" Docket nums: %s" % doc.docket.docket_number)
        log_print(" %s" % candidate.get('docketNumber', 'None'))
        log_print(" Cosine Similarity: %s" % sim)
        log_print(" Candidate URL: file://%s" % case_path)
        log_print(" Match URL: https://www.courtlistener.com%s" %
                  (candidate['absolute_url']))
        choice = raw_input("Is this a duplicate? [Y/n]: ")
        # Empty input takes the advertised default (yes). The answer is
        # normalised so that "Y" or "yes" is not silently read as "no".
        choice = choice.strip().lower() or "y"
        if choice in ('y', 'yes'):
            duplicates.append(candidate['id'])

    if not duplicates:
        log_print(
            " - Not a duplicate: Manual determination found no matches.")
        return []
    elif len(duplicates) == 1:
        log_print(
            " - Duplicate found: Manual determination found one match.")
        return [duplicates[0]]
    else:
        log_print(
            " - Duplicates found: Manual determination found %s matches." %
            len(duplicates))
        return duplicates