def do_heuristic_match(idb_row, ds):
    """Use cosine similarity of case names from the IDB to try to find a
    match out of several possibilities in the DB.

    :param idb_row: The FJC IDB row to match against
    :param ds: A list of Dockets that might match
    :returns: The best-matching Docket in ds if possible, else None
    """
    case_names = []
    for d in ds:
        case_name = harmonize(d.case_name)
        parts = case_name.lower().split(" v. ")
        if len(parts) == 1:
            case_names.append(case_name)
        elif len(parts) == 2:
            plaintiff, defendant = parts[0], parts[1]
            case_names.append(
                "%s v. %s" % (plaintiff[0:30], defendant[0:30])
            )
        elif len(parts) > 2:
            case_names.append(case_name)
    idb_case_name = harmonize(
        "%s v. %s" % (idb_row.plaintiff, idb_row.defendant)
    )
    results = find_best_match(case_names, idb_case_name, case_sensitive=False)
    if results["ratio"] > 0.65:
        logger.info(
            "Found good match by case name for %s: %s",
            idb_case_name,
            results["match_str"],
        )
        d = ds[results["match_index"]]
    else:
        logger.info(
            "No good match after office and case name filtering. Creating "
            "new item: %s",
            idb_row,
        )
        d = None
    return d
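
# Usage sketch (illustrative only): do_heuristic_match expects candidates
# that have already been narrowed down, e.g. by court and core docket number
# as in do_second_pass below. The variable names here are hypothetical.
#
#   candidates = Docket.objects.filter(
#       docket_number_core=idb_row.docket_number,
#       court=idb_row.district,
#   )
#   d = do_heuristic_match(idb_row, candidates)
#   if d is not None:
#       merge_docket_with_idb(d.pk, idb_row.pk)
#   else:
#       create_new_docket_from_idb(idb_row.pk)
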
def do_second_pass(options):
    """In the first pass, we ignored the duplicates that we got, preferring
    to let them stack up for later analysis. In this pass, we attempt to
    merge those failed items into the DB by more aggressive filtering and
    algorithmic selection.
    """
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
        docket__isnull=True,
    ).order_by('pk')
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        ds = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
            docket_number__startswith='%s:' % idb_row.office,
        ).exclude(
            docket_number__icontains='cr'
        ).exclude(
            case_name__icontains='sealed'
        ).exclude(
            case_name__icontains='suppressed'
        ).exclude(
            case_name__icontains='search warrant'
        )
        count = ds.count()
        if count == 0:
            logger.info("%s: Creating new docket for IDB row: %s",
                        i, idb_row)
            create_new_docket_from_idb(idb_row.pk)
            continue
        elif count == 1:
            d = ds[0]
            logger.info("%s: Merging Docket %s with IDB row: %s",
                        i, d, idb_row)
            merge_docket_with_idb(d.pk, idb_row.pk)
            continue

        logger.info("%s: Still have %s results after office and civil "
                    "docket number filtering. Filtering further.", i, count)
        d = do_heuristic_match(idb_row, ds)
        if d is not None:
            merge_docket_with_idb(d.pk, idb_row.pk)
        else:
            create_new_docket_from_idb(idb_row.pk)
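
# Invocation sketch (hypothetical values; in the management command these
# would come from --offset and --limit arguments). Note the chained
# comparison above: `i >= options['limit'] > 0` only breaks when the limit
# is positive, so a limit of 0 means "process every row".
#
#   do_second_pass({'offset': 0, 'limit': 1000})
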