def manual_pre_edits():
    """ These fixes were determined from inspecting the initial output of main() for error messages. """
    with EditLog(
        description='Swap reporter and volume number for NOTALEPH001894 (was 26 S.C.L. (McMul.)) '
                    'and NOTALEPH001893 (was 7 S.C. Eq. (McCord Eq.))'
    ).record():
        v1 = VolumeMetadata.objects.get(barcode='NOTALEPH001894', reporter__short_name='S.C.L. (McMul.)')
        v2 = VolumeMetadata.objects.get(barcode='NOTALEPH001893', reporter__short_name='S.C. Eq. (McCord Eq.)')
        new_r1 = v2.reporter
        new_volnum1 = v2.volume_number
        new_r2 = v1.reporter
        new_volnum2 = v1.volume_number
        v1.volume_number = new_volnum1
        v1.set_reporter(new_r1)
        v2.volume_number = new_volnum2
        v2.set_reporter(new_r2)

    with EditLog(
        description='Combine reporters 909 and 1075 into just 1075, and 1076 and 919 into just 919. '
                    'This combines the first and second run of S.C.L. (Rich.) and S.C. Eq. (Rich. Eq.), respectively. '
                    'They are combined as they use the same numbering sequence -- see '
                    'https://www.sccourts.org/courtreg/displayRule.cfm?ruleID=268.0&subRuleID=&ruleType=APP'
    ).record():
        for first_id, second_id in ((909, 1075), (1076, 919)):
            r1 = Reporter.objects.get(id=first_id)
            r2 = Reporter.objects.get(id=second_id)
            r2.start_year = r1.start_year
            r2.volume_count += r1.volume_count
            r2.hollis += r1.hollis
            r2.save()
            for v in r1.volumes.all():
                v.set_reporter(r2)
            r1.delete()

    with EditLog(
        description='Mark 32044078662582 (3A Tenn. (Cooke)) as a duplicate of (the appendix to) 32044078663788 (158 Tenn.). '
                    'See E. Lucy Ogden, A Note on 3A Tenn. Reports, 19 Tenn. L. Rev. 74 (1945), explaining that former was separately published '
                    'for attorneys who did not subscribe to latter and should not be cited.'
    ).record():
        old = VolumeMetadata.objects.get(pk='32044078662582')
        new = VolumeMetadata.objects.get(pk='32044078663788')
        if old.duplicate_of != new:
            old.set_duplicate(True, new)

    with EditLog(
        description='Update case citation from "50 50 Ky. (B. Mon.) 178 183" to "50 Ky. (B. Mon.) 183".'
    ).record():
        c = Citation.objects.get(cite="50 50 Ky. (B. Mon.) 178 183")
        c.cite = "50 Ky. (B. Mon.) 183"
        c.save()
def manual_pre_edits(dry_run='true'):
    """ These fixes were determined from inspecting the initial output of main() for error messages. """
    ## delete "U. S." cites and manually detected typo cites
    to_delete = []
    changed_cites = []
    potential_matches = list(Citation.objects.filter(cite__contains=' U. S. '))
    actual_matches = [c for c in potential_matches if ';' not in c.cite and re.match(r'\d+ U\. S\. \d+', c.cite)]
    actual_matches += [
        Citation.objects.get(case_id=1493694, cite="822 F.2d 52"),
        Citation.objects.get(case_id=6210086, cite="For"),
    ]
    for cite in actual_matches:
        print("Would delete %s" % cite)
        to_delete.append(cite)
        changed_cites.append(cite.cite)
    if dry_run == 'false':
        with EditLog(
            description='Delete incorrectly identified cites matching "\\d+ U. S. \\d+" (extra space). '
                        'These all refer to prior history of the case.'
        ).record():
            for cite in to_delete:
                cite.delete()
            CaseMetadata.update_frontend_urls(changed_cites)
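# A quick standalone check of the filter above, using made-up sample strings
# (the real inputs come from the Citation table): cites containing a semicolon
# are left alone because they embed prior-history text alongside the cite.
import re

samples = ["123 U. S. 456", "123 U. S. 456; 124 U. S. 1", "Brown v. Board"]
matches = [s for s in samples
           if ';' not in s and re.match(r'\d+ U\. S\. \d+', s)]
assert matches == ["123 U. S. 456"]  # semicolon and non-cite strings excluded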
def test_data_edit(volume_metadata):
    with EditLog(description="test").record() as edit:
        volume_metadata.publisher = "foo"
        volume_metadata.save()
    transactions = list(edit.transactions.all())
    assert len(transactions) == 1
    volume_metadata.refresh_from_db()
    assert transactions[0].timestamp == volume_metadata.sys_period.lower
def main(dry_run='true', output_missing='false'):
    # download data
    cap_cites_path = base_path / 'us_cites.csv'
    if not cap_cites_path.exists():
        print("pre-loading cap cites")
        us_reporter = Reporter.objects.get(short_name='U.S.')
        with connections['capdb'].cursor() as cursor:
            cursor.execute("""
                select m.id, m.volume_id, cite, name, name_abbreviation, decision_date
                from capdb_casemetadata m, capdb_citation c
                where m.reporter_id=%s and c.case_id=m.id and c.cite like '%%U.S.%%'
            """, [us_reporter.id])
            with cap_cites_path.open('w') as out_file:
                csv_writer = csv.writer(out_file)
                for row in cursor.fetchall():
                    csv_writer.writerow(row)

    # load data
    print("loading data")
    scdb_new_cites_path = base_path / 'SCDB_2019_01_caseCentered_Citation.csv'
    scdb_old_cites_path = base_path / 'SCDB_Legacy_05_caseCentered_Citation.csv'
    cap_cites = list(csv.DictReader((line.replace('\xad', '') for line in cap_cites_path.open()), exported_columns))
    scdb_cites = list(csv.DictReader(scdb_new_cites_path.open(encoding='iso-8859-1'))) + list(csv.DictReader(scdb_old_cites_path.open(encoding='iso-8859-1')))
    scdb_cites = [c for c in scdb_cites if c['usCite']]
    cap_cites_by_id = {c['id']: c for c in cap_cites}
    scdb_cites_by_id = {c['caseId']: c for c in scdb_cites}
    scdb_cites_lookup = group_by(scdb_cites, lambda c: c['usCite'])

    # count terms for tf_idf
    print("counting terms")
    word_counts = Counter()
    word_counts.update(i for c in cap_cites for i in tokenize(c['name']))
    word_counts.update(i for c in scdb_cites for i in tokenize(c['caseName']))
    document_count = len(cap_cites) + len(scdb_cites)

    ### first pass at checking for matches -- find all cases where cites are the same and titles are similar.
    # These are "strong" matches.
    print("checking for matches")
    matched_cap_case_ids = set()
    for cap_cite in tqdm(cap_cites):
        cite = cap_cite['cite']
        # strip nominatives from CAP cites
        cite = re.sub(r'\(.*\) ', '', cite)
        # skip CAP cites that don't look like "123 U.S. 456"
        if not re.match(r'\w+ U\.S\. \w+$', cite):
            continue
        if cite in scdb_cites_lookup:
            candidates = scdb_cites_lookup[cite]
            candidates_by_name = {c['caseName'].lower(): c for c in candidates}
            best = get_best_match(
                [cap_cite['name'].lower(), cap_cite['name_abbreviation'].lower()],
                candidates_by_name.keys(), word_counts, document_count)
            if best:
                c = candidates_by_name[best[0]]
                c['cap_cite'] = cap_cite
                c['match_quality'] = 'strong'
                matched_cap_case_ids.add(cap_cite['id'])
            else:
                for c in candidates:
                    c.setdefault('failed_matches', []).append(cap_cite)

    # apply manual_matches overrides
    for k, v in manual_matches.items():
        c = scdb_cites_by_id[k]
        c['cap_cite'] = cap_cites_by_id[v]
        c['match_quality'] = 'confirmed'
        matched_cap_case_ids.add(c['cap_cite']['id'])

    # write SCDB cites to database
    print("Applying corrections")
    edit_out = csv.writer((base_path / 'scdb_cite_edits.csv').open('w'))
    cite_objs = Citation.objects.filter(case_id__in=matched_cap_case_ids).select_related('case')
    cite_objs_by_case_id = group_by(cite_objs, lambda c: c.case_id)
    to_create = []
    to_update = []
    for scdb_cite in scdb_cites:
        if 'cap_cite' not in scdb_cite:
            continue
        case_id = int(scdb_cite['cap_cite']['id'])
        existing_cite_objs_by_reporter = {get_cite_reporter(c.cite): c for c in cite_objs_by_case_id[case_id]}
        expected_cites = [['SCDB', 'SCDB %s' % scdb_cite['caseId'], 'vendor']]
        for scdb_key, cite_type in [["usCite", "official"], ["sctCite", "parallel"], ["ledCite", "parallel"], ["lexisCite", "vendor"]]:
            cite_val = scdb_cite[scdb_key]
            if cite_val:
                expected_cites.append([get_cite_reporter(cite_val), cite_val, cite_type])
        for reporter, cite_val, cite_type in expected_cites:
            if reporter in existing_cite_objs_by_reporter:
                new_cite = existing_cite_objs_by_reporter.pop(reporter)
                if new_cite.cite == cite_val:
                    edit_out.writerow([case_id, 'skip', new_cite.id, cite_val])
                else:
                    edit_out.writerow([case_id, 'update', new_cite.id, new_cite.cite, cite_val])
                    new_cite.cite = cite_val
                    to_update.append(new_cite)
            else:
                new_cite = Citation(cite=cite_val, type=cite_type, case_id=case_id, normalized_cite=normalize_cite(cite_val))
                to_create.append(new_cite)
                edit_out.writerow([case_id, 'create', new_cite.type, new_cite.cite])
        if existing_cite_objs_by_reporter:
            edit_out.writerow([case_id, 'warning', 'ignored cite'] + [c.cite for c in existing_cite_objs_by_reporter.values()])
    if dry_run == 'false':
        with EditLog(description='Add SCDB cites').record():
            Citation.objects.bulk_create(to_create)
            Citation.objects.bulk_update(to_update, ['cite'])

    if output_missing != 'true':
        return

    ### second pass at checking for matches -- for all SCDB cites that don't have matches, fetch all cases we have for the
    # same volume, and look for similar titles. These are "weak" matches.
print("checking for fallback matches") no_match = [c for c in scdb_cites if 'cap_cite' not in c] missing_by_volume = group_by(no_match, lambda c: c['usCite'].split()[0]) cap_cites_by_volume = group_by(cap_cites, lambda c: c['cite'].split()[0]) # fetch all cases from the DB that belong to volumes where SCDB cases are missing target_volumes = set(c['volume_id'] for v in missing_by_volume for c in cap_cites_by_volume[v]) db_cites = Citation.objects.filter(cite__contains='U.S.', case__volume_id__in=target_volumes).select_related('case') db_cases = [] for cite in db_cites: # skip cases already affirmatively matched if str(cite.id) in matched_cap_case_ids: continue # conform DB objects to export format from csv c = cite.case c.cite = cite c.decision_date = c.decision_date.strftime("%Y-%m-%d") c.name = c.name.replace('\xad', '') c.name_abbreviation = c.name_abbreviation.replace('\xad', '') db_cases.append(c) db_cases_by_volume_id = group_by(db_cases, lambda d: d.volume_id) # check each missing SCDB cite for cases in the same volume with similar titles for v, missing in tqdm(missing_by_volume.items()): if v in cap_cites_by_volume: cases = sum((db_cases_by_volume_id[vol_id] for vol_id in set(c['volume_id'] for c in cap_cites_by_volume[v])), []) cases_by_name = {} for c in cases: cases_by_name[c.name.lower()] = c cases_by_name[c.name_abbreviation.lower()] = c for m in missing: if re.match(r'131 U\.S\. [a-z]+$', m['usCite']): # special case -- skip cites like "131 U.S. lxxxiii", as we know we don't have them continue best = get_best_match([m['caseName']], cases_by_name.keys(), word_counts, document_count, tf_threshold=10) if not best: continue c = cases_by_name[best[0]] m['cap_cite'] = {k: getattr(c, k) for k in exported_columns} m['match_quality'] = 'weak' # output csv_out = csv.writer((base_path / 'scdb_cite_matchup.csv').open('w')) csv_out.writerow(['match quality', 'volume number', 'SCDB name', 'SCDB cite', 'SCDB date', 'SCDB ID', 'CAP name', 'CAP cite', 'CAP date', 'CAP ID', 'CAP vol id']) for c in scdb_cites: match = c.get('cap_cite') match_row = [match['name'], match['cite'], match['decision_date'], match['id'], match['volume_id']] if match else [] csv_out.writerow([c.get('match_quality', 'none'), c['usCite'].split()[0], c['caseName'], c['usCite'], c['dateDecision'], c['caseId']]+match_row)
def main(dry_run='true'):
    # includes all patterns to look for and how they should be modified
    with Path(__file__).parent.joinpath("modification_instructions.json").open() as fp:
        modification_instructions = json.load(fp, object_pairs_hook=OrderedDict)

    # fetch and cache citations
    print("Prefetching citations")
    all_cites = Citation.objects.filter(type="parallel").values_list('id', 'cite')

    # opens and scopes the log file handler
    with Path.home().joinpath("citation_update_logger.tsv").open("a") as logfile:
        # writes the header column on the log
        csvlog = csv.writer(logfile, delimiter='\t', quotechar='"')
        csvlog.writerow([
            "timestamp", "action", "New ID", "Case ID", "New Value", "Old Value", "Old ID",
            "Split String", "Replacement Explainer", "Filtered", "Regex Verified", "Dry Run"
        ])

        for current_pattern, data in modification_instructions.items():
            print("Fixing %s" % current_pattern)
            modifications = data['modifications']
            jack_count = data['counts_and_examples']['count']  # the count from Jack's original report
            example_cases = [pk for entry_set in data['counts_and_examples']['examples'] for pk in entry_set]  # original examples
            print(example_cases)
            regex = '^' + get_escaped_regex(current_pattern) + '$'  # turn the citation pattern into a regex
            matcher = re.compile(regex)
            matching_citation_ids = [id for id, cite in all_cites if matcher.match(cite)]
            matching_cite_query = Citation.objects.filter(id__in=matching_citation_ids)

            # simplify the list of example cases to make sure our search regex gets them
            example_cite_ids = []
            for epk in example_cases:
                case = CaseMetadata.objects.get(pk=epk)
                for cite in case.citations.filter(type="parallel"):
                    example_cite_ids.append(cite.pk)

            to_update = []
            to_insert = []
            to_log = []
            matching_citation_count = 0
            for matching_citation in matching_cite_query:
                matching_citation_count += 1
                csv_replacement_explainer = ""  # string that will say in fairly plain english what was done
                # simple splits don't have a verification regex -- they just split and check that the output
                # doesn't match the original pattern. Record in the log whether a cite was regex verified.
                regex_verified = []
                cite = matching_citation.cite
                case_id = matching_citation.case_id
                if matching_citation.pk in example_cite_ids:
                    example_cite_ids.remove(matching_citation.pk)  # this list should be empty with every pattern

                # The modifications can include a split string, which will split a citation up into multiple citations;
                # 'filters', which is a list of regex substitution pairs; and "kill", which drops one section.
                # The list orders are important because they have the same indexes as the split order,
                # so you can know where to apply what.
                if 'splitstring' in modifications:
                    new_cites = [c.strip() for c in cite.split(modifications['splitstring']) if c.strip()]
                else:
                    new_cites = [cite]

                if 'filters' in modifications:
                    new_cites_filtered = []
                    filters = modifications['filters']
                    csv_replacement_explainer += "Using {} replacement{}".format(len(filters), "" if len(filters) == 1 else "s")
                    assert len(new_cites) == len(filters)
                    for index, (filter_dict, new_cite) in enumerate(zip(filters, new_cites)):
                        if 'kill' in filter_dict:
                            csv_replacement_explainer += ", drop split field {} ({})".format(index, new_cite)
                            continue
                        for pattern in filter_dict['patterns']:
                            csv_replacement_explainer += ", replace '{}' with '{}' in split field {} ({})".format(
                                pattern[0], pattern[1], index, new_cite)
                            new_cite = re.sub(pattern[0], pattern[1], new_cite)
                        # The 'goal' is a pattern that the new citation should match after being processed.
                        if 'goal' in filter_dict:
                            csv_replacement_explainer += " to get '{}'".format(filter_dict['goal'])
                            regex_verified.append(new_cite)
                            if not re.match('^' + get_escaped_regex(filter_dict['goal']) + '$', new_cite):
                                raise Exception(
                                    "Doesn't Match: '{}'\nCurrent Pattern:'{}'\nRegex: '{}'\nCite_Section: '{}'\n"
                                    "Goal: '{}'\nEscaped Goal: '{}'".format(
                                        cite, current_pattern, get_escaped_regex(current_pattern),
                                        new_cite, filter_dict['goal'], get_escaped_regex(filter_dict['goal'])))
                        new_cites_filtered.append(new_cite)
                    new_cites = new_cites_filtered

                # if it matches the original pattern, it's wrong
                for c in new_cites:
                    if re.match(matcher, c):
                        raise Exception(
                            "New Cite Matches Original Regex: '{}'\nCurrent Pattern:'{}'\nRegex: '{}'\n"
                            "Cite_Section: '{}'\n".format(cite, current_pattern, get_escaped_regex(current_pattern), c))

                # update records and log
                for index, new_citation in enumerate(new_cites):
                    reg = new_citation in regex_verified
                    action = "update" if index == 0 else "create"
                    if action == 'update':
                        matching_citation.cite = new_citation
                        matching_citation.normalized_cite = normalize_cite(new_citation)
                        new_citation_obj = matching_citation
                        to_update.append(matching_citation)
                    elif action == 'create':
                        new_citation_obj = Citation(
                            case_id=case_id,
                            type="parallel",
                            cite=new_citation,
                            normalized_cite=normalize_cite(new_citation),
                        )
                        to_insert.append(new_citation_obj)
                    to_log.append((cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj))

            if not to_log:
                print("- nothing to do")
                continue

            if dry_run == 'false':
                with EditLog(description='Fix citations matching %s' % current_pattern).record() as edit:
                    Citation.objects.bulk_update(to_update, ['cite', 'normalized_cite'])
                    Citation.objects.bulk_create(to_insert)
                    timestamp = edit.timestamp
            else:
                timestamp = datetime.now()

            log_filters = str(modifications['filters']) if 'filters' in modifications else None
            log_splitstring = str(modifications['splitstring']) if 'splitstring' in modifications else None
            for cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj in to_log:
                # "timestamp", "action", "New ID", "Case ID", "New Value", "Old Value", "Old ID",
                # "Split String", "Replacement Explainer", "Filtered", "Regex Verified", "Dry Run"
                csvlog.writerow([
                    timestamp, action, new_citation_obj.pk, matching_citation.case_id, new_citation_obj.cite,
                    cite, matching_citation.pk, log_splitstring, csv_replacement_explainer, log_filters, reg, dry_run,
                ])

            if matching_citation_count != jack_count:
                # This didn't happen after a dry run with production data
                print("non-matching Jack Count: {}, Query Count: {}, Pattern: {}".format(
                    jack_count, matching_citation_count, current_pattern))
            if example_cite_ids:
                # This didn't happen after a dry run with production data
                raise Exception("non-matching example in {}: {}".format(
                    current_pattern, ", ".join(str(i) for i in example_cite_ids)))
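# `get_escaped_regex` is defined elsewhere in the codebase. A plausible sketch,
# assuming it turns a literal citation pattern like "123 N.Y.S. 456" into a
# regex where each run of digits matches any run of digits -- this is a guess
# based only on how it is called above, and the real implementation may differ:
import re

def get_escaped_regex(pattern):
    # escape regex metacharacters, then let digit runs match any digit run
    return re.sub(r'\d+', r'\\d+', re.escape(pattern))

# e.g. '^' + get_escaped_regex("123 N.Y.S. 456") + '$' also matches "9 N.Y.S. 27"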
def main(dry_run='true'):
    # handle each line from manual_fixes.csv
    fixes = csv.DictReader(Path(__file__).parent.joinpath('manual_fixes.csv').open())
    for fix in fixes:
        if not fix['official']:
            continue
        with EditLog(
            description='Mark reporter %s as nominative for reporter %s' % (fix['short'], fix['official'])
        ).record():
            nominative_reporter = Reporter.objects.get(id=fix['id'])
            print("Updating %s" % nominative_reporter)
            if nominative_reporter.is_nominative:
                print("- skipping, already fixed")
                continue

            ## associate nominative reporter with official reporter
            if fix['official'].isnumeric():
                official_reporter = Reporter.objects.get(id=fix['official'])
            else:
                official_reporter = Reporter.objects.get(short_name=fix['official'])
            nominative_reporter.nominative_for = official_reporter
            nominative_reporter.is_nominative = True
            nominative_reporter.short_name = fix['nominative']
            print("- update %s to be nominative for %s" % (nominative_reporter, official_reporter))
            if dry_run == 'false':
                nominative_reporter.save()

            ## prepare to process each volume in nominative reporter
            print("- update volumes")
            volumes = natsorted(nominative_reporter.volumes.filter(duplicate=False).order_by('volume_number'),
                                key=lambda v: v.volume_number)
            last_volume_numbers = []
            volume_index = 0

            # the 'official offset' column indicates how official volume numbers are derived from nominative
            # volume numbers. it can be in two formats (see the standalone parsing example after this function)
            official_offsets = {}
            official_offset_default = None
            if ',' in fix['official offset']:
                # the first format is a set of ranges, like "1-2: 18, 3-14: 24", meaning volumes 1 and 2 were renumbered
                # to 18, 19, etc., and 3-14 were renumbered to 24, 25, etc. Parse this into a dict like
                # {1: 18, 2: 19, 3: 24, 4: 25 ...}
                for offset_range in fix['official offset'].split(', '):
                    start_stop, offset = offset_range.split(': ')
                    start, stop = start_stop.split('-')
                    offset = int(offset)
                    for i, vol_num in enumerate(range(int(start), int(stop) + 1)):
                        official_offsets[vol_num] = offset + i
            else:
                # the second format is just a single number
                official_offset_default = int(fix['official offset'])

            ## process each volume
            for volume in volumes:
                ## update volume to have correct volume_number, nominative_volume_number, and references to its
                ## official reporter and its nominative reporter
                volume_number = int(volume.volume_number)
                if volume_number in last_volume_numbers:
                    print(" - WARNING: duplicate volume number %s" % volume_number)
                else:
                    volume_index += 1
                expected_official_volume_number = official_offsets[volume_index] if official_offsets else volume_index + official_offset_default - 1
                expected_nominative_volume_number = volume_index
                if volume_number != expected_nominative_volume_number and volume_number != expected_official_volume_number:
                    print(" - ERROR: Unexpected volume number: %s" % volume_number)
                    continue
                last_volume_numbers = [expected_official_volume_number, expected_nominative_volume_number]
                volume.nominative_volume_number = expected_nominative_volume_number
                volume.volume_number = expected_official_volume_number
                volume.reporter = official_reporter
                volume.nominative_reporter = nominative_reporter
                print(" - update %s to %s,%s" % (volume_number, volume.volume_number, volume.nominative_volume_number))
                if dry_run == 'false':
                    volume.save()

                ## update citations for each case in volume
                # Do some sanity checking here -- if the case is supposed to end up with official citation "5 Mass. 123"
                # and nominative citation "1 Bar 123", then we expect the current official citation to start with either
                # a "1" or "5", followed by either "Mass." or "Bar" or "Mass. (Bar)".
                # figure out what we expect:
                print(" - update cases")
                official_cite_prefix = "%s %s " % (volume.volume_number, official_reporter.short_name)
                nominative_cite_prefix = "%s %s " % (volume.nominative_volume_number, nominative_reporter.short_name)
                expected_short_names = [official_reporter.short_name, nominative_reporter.short_name,
                                        "%s (%s)" % (official_reporter.short_name, nominative_reporter.short_name)]
                expected_prefixes = [alphanum("%s %s" % (n, prefix))
                                     for n in [expected_official_volume_number, expected_nominative_volume_number]
                                     for prefix in expected_short_names]
                if volume.barcode in cite_overrides:
                    wrong_old_prefix, fixed_old_prefix = cite_overrides[volume.barcode]
                else:
                    wrong_old_prefix, fixed_old_prefix = None, None
                for case in volume.case_metadatas.prefetch_related('citations'):
                    # check if existing cite matches expectations:
                    official_cite = next(c for c in case.citations.all() if c.type == 'official')
                    old_official_cite = official_cite.cite
                    old_prefix, old_page_num = old_official_cite.rsplit(' ', 1)
                    if fixed_old_prefix and wrong_old_prefix == old_prefix:
                        old_prefix = fixed_old_prefix
                    if alphanum(old_prefix) not in expected_prefixes:
                        print(" - ERROR: cite %s not expected" % old_official_cite)
                        continue
                    # create new official and nominative cites:
                    official_cite.cite = official_cite_prefix + old_page_num
                    nominative_cite = deepcopy(official_cite)
                    nominative_cite.cite = nominative_cite_prefix + old_page_num
                    nominative_cite.type = 'nominative'
                    nominative_cite.pk = None
                    print(" - update %s to %s and %s" % (old_official_cite, official_cite, nominative_cite))
                    if dry_run == 'false':
                        official_cite.save()
                        nominative_cite.save()

            # on a dry run, cancel so no edit log entry is recorded
            if dry_run != 'false':
                raise EditLog.Cancel
def main(dry_run="true"): with EditLog(description='Fix duplicate volumes').record( dry_run=dry_run != "false"): # mark duplicates for duplicate_volume in to_suppress_to_keep: preferred_volume = VolumeMetadata.objects.get( pk=duplicate_volume[1]) for suppress_this in duplicate_volume[0]: vol = VolumeMetadata.objects.get(pk=suppress_this) if dry_run == "false": vol.set_duplicate(preferred_volume) print("set_duplicate,%s,%s" % (vol.barcode, preferred_volume.barcode)) # mark second parts for parts in part_2: vol_entry_1 = VolumeMetadata.objects.get(pk=parts[0]) vol_entry_2 = VolumeMetadata.objects.get(pk=parts[1]) i = 0 label_1 = "" while not label_1.isdigit(): label_1 = vol_entry_1.page_structures.order_by( '-order')[i].label i += 1 label_1 = int(label_1) i = 0 label_2 = "" while not label_2.isdigit(): label_2 = vol_entry_2.page_structures.order_by( '-order')[i].label i += 1 label_2 = int(label_2) if label_1 > label_2: vol_entry_1, vol_entry_2 = vol_entry_2, vol_entry_1 vol_entry_2.second_part_of = vol_entry_1 if dry_run == "false": vol_entry_2.save() print("second_part_of,%s,%s" % (vol_entry_2.barcode, vol_entry_1.barcode)) # update Wn2d_185 Wn2d_185 = VolumeMetadata.objects.get(pk="Wn2d_185") reporter = Reporter.objects.get(short_name="Wash. 2d") if dry_run == "false": Wn2d_185.set_reporter(reporter) print("set_reporter,%s,%s" % (Wn2d_185.barcode, reporter.pk)) # update 32044057887291 nc_vol = VolumeMetadata.objects.get(pk="32044057887291") if dry_run == "false": nc_vol.set_volume_number("5") print("set_volume_number,%s,%s" % (nc_vol.barcode, "5")) #update 25 Tex. Supp tex_supp = VolumeMetadata.objects.get(pk="32044078588621") if dry_run == "false": tex_supp.set_volume_number("25 Supp.") print("set_volume_number,%s,%s" % (tex_supp.barcode, "25 Supp.")) for case in tex_supp.case_metadatas.all(): cite = "25 Tex. Supp. {}".format(case.first_page) if dry_run == "false": Citation.objects.create(type="parallel", cite=cite, case=case) print("new_citation,%s,%s" % (tex_supp.barcode, cite)) # update 32044078699600 volume = VolumeMetadata.objects.get(pk="32044078699600") duplicate_of = VolumeMetadata.objects.get(pk="32044078592631") if dry_run == "false": volume.set_volume_number('25') volume.set_duplicate(duplicate_of) print("set_volume_number,%s,%s" % (volume.barcode, '25')) print("set_duplicate,%s,%s" % (volume.barcode, duplicate_of.pk)) # update volume numbers for replace in simple_replace: vol = VolumeMetadata.objects.get(pk=replace[0]) if dry_run == "false": vol.set_volume_number(replace[1]) print("set_volume_number,%s,%s" % (vol.barcode, replace[1]))
def main(dry_run='true', log_file='/tmp/fix_reporter_jurs.log'):
    dry_run = dry_run != 'false'
    with open(log_file, 'a') as log:
        for fix in fixes:
            cases = CaseMetadata.objects.filter(reporter_id=fix['Reporter'])
            if fix['Volume'] != '*':
                cases = cases.filter(volume__volume_number=fix['Volume'])
            if fix['Correct Citation']:
                cites = Citation.objects.filter(case__in=cases, cite__contains=fix['Wrong Citation'])
                with EditLog(
                    description='Correct citations for reporter %s volume %s from %s to %s' % (
                        fix['Reporter'], fix['Volume'], fix['Wrong Citation'], fix['Correct Citation']),
                    dry_run=dry_run,
                ).record():
                    actions = Citation.replace_reporter(cites, fix['Wrong Citation'], fix['Correct Citation'], dry_run=dry_run)
                    for cite, old_cite, new_cite in actions:
                        write_log(log, {"action": 'fix_cite', "cite_id": cite.id, "old": old_cite, "new": new_cite})
            elif fix['Correct Reporter']:
                volume = VolumeMetadata.objects.filter(volume_number=fix['Volume'], reporter_id=fix['Reporter']).first()
                if volume is None:
                    print("WARNING: nothing to do for %s; skipping" % fix)
                    continue
                new_reporter = Reporter.objects.get(id=fix['Correct Reporter'])
                write_log(log, {
                    "action": 'fix_reporter',
                    "volume_id": volume.pk,
                    "old_reporter_id": volume.reporter_id,
                    "new_reporter_id": new_reporter.id,
                })
                if not dry_run:
                    with EditLog(description='Correct reporter for volume %s from %s to %s' % (
                            volume.pk, volume.reporter_id, new_reporter.id)).record():
                        volume.set_reporter(new_reporter)
            else:
                cases = cases.filter(jurisdiction__name=fix['Wrong Jur'])
                new_jur = Jurisdiction.objects.get(name=fix['Correct Jur'])
                new_court = None
                if fix['Correct Court']:
                    new_court = Court.objects.get(name=fix['Correct Court'])
                to_update = []
                for case in cases:
                    case.jurisdiction = new_jur
                    message = {
                        "action": 'fix_jur',
                        "case_id": case.pk,
                        "old_jur": fix['Wrong Jur'],
                        "new_jur": fix['Correct Jur'],
                    }
                    if new_court:
                        message.update(old_court=case.court_id, new_court=new_court.id)
                        case.court = new_court
                    to_update.append(case)
                    write_log(log, message)
                if not to_update:
                    print("WARNING: nothing to do for %s; skipping" % fix)
                    continue
                if not dry_run:
                    with EditLog(
                        description='Correct jurisdictions in reporter %s volume %s from %s to %s' % (
                            fix['Reporter'], fix['Volume'], fix['Wrong Jur'], fix['Correct Jur']),
                    ).record():
                        CaseMetadata.objects.bulk_update(to_update, ['court', 'jurisdiction'])
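# `write_log` is defined elsewhere; a minimal sketch consistent with how it is
# called above (one structured record appended per call). The JSON-lines format
# and the timestamp field are assumptions -- the real helper may differ:
import json
from datetime import datetime

def write_log(log, message):
    message = dict(message, timestamp=datetime.now().isoformat())
    log.write(json.dumps(message) + "\n")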
out.writerow([case.id, "skip", second.cite]) continue # type 3: if re.match(r'\d+ F.2d \d+;.*$', second.cite): old_val = second.cite second.cite = old_val.split(';', 1)[0] to_update.append(second) changed_cites.add(old_val) changed_cites.add(second.cite) out.writerow([case.id, "update", old_val, second.cite]) continue # type 4: delete_cites = [c.cite for c in rest] changed_cites.update(delete_cites) out.writerow([case.id, "delete"] + delete_cites) to_delete.extend(rest) # apply edits: if dry_run == 'false': with EditLog( description= 'Remove incorrectly-detected citations from Ct. Cl. reporter. ' 'See https://github.com/harvard-lil/capstone/issues/1192' ).record(): Citation.objects.bulk_update(to_update, ['cite']) for obj in to_delete: obj.delete() CaseMetadata.update_frontend_urls(changed_cites)
                if old_token != old_val:
                    raise Exception("attempt to edit out-of-date token")
                tokens[index] = new_val

    # update case, if needed
    metadata_count = 0
    for field in metadata_fields:
        if field not in data['metadata']:
            continue
        old_val, new_val = data['metadata'][field]
        if getattr(case, field) == old_val:
            metadata_count += 1
            setattr(case, field, new_val)

    if metadata_count or word_count:
        with EditLog(description='Case %s edited by user %s: %s metadata fields, %s words' % (
                case.id, request.user.id, metadata_count, word_count), user_id=request.user.id).record():
            CorrectionLog(description=data['description'], user_id=request.user.id, case=case).save()
            if pages_to_save:
                PageStructure.objects.bulk_update(pages_to_save, ['blocks'])
                case.sync_case_body_cache(blocks_by_id=PageStructure.blocks_by_id(pages))
                # re-extract citations
                existing_cites = {c.cite: c for c in ExtractedCitation.objects.filter(cited_by=case)}
                new_cites = {c.cite: c for c in case.extract_citations()[0]}
                ExtractedCitation.objects.filter(id__in=[v.id for k, v in existing_cites.items() if k not in new_cites]).delete()
                ExtractedCitation.objects.bulk_create([v for k, v in new_cites.items() if k not in existing_cites])
            case.save()
            case.reindex()  # manual reindex for instant results

    return HttpResponse('OK')
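# The ExtractedCitation sync above is a dict-based diff: delete rows whose cite
# no longer appears, insert rows that are new, and leave the intersection alone.
# A generic illustration of the same pattern with plain dicts (made-up data):
existing = {"1 U.S. 1": "row-a", "2 U.S. 2": "row-b"}
new = {"2 U.S. 2": "row-b2", "3 U.S. 3": "row-c"}
to_delete = [v for k, v in existing.items() if k not in new]  # stale rows
to_create = [v for k, v in new.items() if k not in existing]  # new rows
assert (to_delete, to_create) == (["row-a"], ["row-c"])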