def get_filter_query_params(self, request, view):
    def lc_values(values):
        return [value.lower() for value in values if isinstance(value, str)]

    query_params = super().get_filter_query_params(request, view)
    if 'cite' in query_params:
        query_params['cite']['values'] = [
            models.normalize_cite(cite)
            for cite in lc_values(query_params['cite']['values'])
        ]
    if 'court' in query_params:
        query_params['court']['values'] = lc_values(query_params['court']['values'])
    if 'jurisdiction' in query_params:
        query_params['jurisdiction']['values'] = lc_values(query_params['jurisdiction']['values'])
    if 'cites_to' in query_params:
        old_cites_to = query_params['cites_to']['values']
        query_params['cites_to']['values'] = []
        for cite in old_cites_to:
            # check if a case id is passed in
            if cite.isdigit():
                try:
                    case = CaseDocument.get(id=cite)
                    # add all citations relating to the case
                    query_params['cites_to']['values'] += [
                        c['normalized_cite'] for c in case.citations
                    ]
                except NotFoundError:
                    pass
            else:
                query_params['cites_to']['values'].append(normalize_cite(cite))
    return query_params
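# A minimal sketch (values are hypothetical) of the query_params structure this filter backend
# receives from its parent class: each requested filter maps to a dict with a 'values' list,
# which the method above lowercases/normalizes in place.
example_query_params = {
    'cite': {'values': ['1 U.S. 1']},
    'jurisdiction': {'values': ['Ill.']},
    'cites_to': {'values': ['410 U.S. 113', '12345']},  # citation text or a numeric case id
}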
def test_extract_citations(case_factory, tmpdir, settings, elasticsearch):
    from scripts.extract_cites import EDITIONS as processed_editions
    settings.MISSED_CITATIONS_DIR = str(tmpdir)
    blocked_by_date = set(
        k for k in list(EDITIONS.keys()) + list(VARIATIONS_ONLY.keys())
        if all(c['start_year'] > 2000 for c in processed_editions[k]))
    legitimate_cites = [
        "225 F. Supp. 552",  # correct
        ["125 f supp 152", "125 F. Supp. 152"],  # normalized
        ["125 Burnett (Wis.) 152", "125 Bur. 152"],  # normalized
        ["1 F. 2d 2", "1 F.2d 2"],  # not matched as "1 F. 2"
        "2 1/2 Mass. 1",  # special volume numbers
        "3 Suppl. Mass. 2",  # special volume numbers
        "1 La.App. 5 Cir. 2",  # not matched as "1 La.App. 5"
        "2000 WL 12345",  # vendor cite
    ]
    legitimate_cites += ["1 %s 1" % c for c in EDITIONS.keys() if c not in blocked_by_date]
    legitimate_cites += [
        ["1 %s 1" % k, "1 %s 1" % v]
        for k, vv in VARIATIONS_ONLY.items()
        for v in vv
        if k not in blocked_by_date]
    legitimate_cites_normalized = set(
        normalize_cite(c if type(c) is str else c[1]) for c in legitimate_cites)
    legitimate_cites = [c if type(c) is str else c[0] for c in legitimate_cites]
    illegitimate_cites = [
        "2 Dogs 3",  # unrecognized reporter
        "3 Dogs 4",  # duplicate unrecognized reporter
        "1 or 2",  # not matched as 1 Or. 2
        "word1 Mass. 2word",  # not matched if part of larger word
        "1 Mass.\n 2",  # no match across newlines
        "1 A.3d 1",  # no match to reporter that started publishing in 2010
    ]
    illegitimate_cites += ["1 %s 1" % c for c in blocked_by_date]
    case = case_factory(
        body_cache__text=", some text, ".join(legitimate_cites + illegitimate_cites),
        decision_date=datetime(2000, 1, 1))
    fabfile.extract_all_citations()
    update_elasticsearch_from_queue()

    # check extracted cites
    cites = list(ExtractedCitation.objects.all())
    cite_set = set(c.cite for c in cites)
    normalized_cite_set = set(c.normalized_cite for c in cites)
    assert cite_set == set(legitimate_cites)
    assert normalized_cite_set == legitimate_cites_normalized
    assert all(c.cited_by_id == case.pk for c in cites)
def get_filter_query_params(self, request, view):
    query_params = super().get_filter_query_params(request, view)
    if not query_params:
        raise ValidationError("Query parameter 'q' is required")
    extracted_cites = {
        normalize_cite(i[1]): i[0]
        for v in query_params['q']['values']
        for i in extract_citations_from_text(v)
    }
    if not extracted_cites:
        raise ValidationError("No citations found in query.")
    query_params['q']['values'] = set(extracted_cites)
    request.extracted_cites = extracted_cites
    return query_params
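# Illustration only (hypothetical query): extract_citations_from_text yields
# (cite, normalized_cite, ...) tuples, so for q="see 410 U.S. 113" the dict built above maps
# each normalized cite back to the text that matched, and the search values become the
# normalized keys:
#   extracted_cites == {normalize_cite("410 U.S. 113"): "410 U.S. 113"}
#   query_params['q']['values'] == {normalize_cite("410 U.S. 113")}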
def retrieve(self, request, *args, **kwargs):
    # for user's convenience, if user gets /cases/casecitation or /cases/Case Citation (or any non-numeric value)
    # we redirect to /cases/?cite=casecitation
    id = kwargs[self.lookup_field]
    if not id.isdigit():
        normalized_cite = models.normalize_cite(id)
        query_string = urllib.parse.urlencode(
            dict(self.request.query_params, cite=normalized_cite), doseq=True)
        new_url = reverse('cases-list') + "?" + query_string
        return HttpResponseRedirect(new_url)
    if self.request.query_params.get('format') == 'html':
        # if previously-supported format=html is requested, redirect to frontend_url
        case = models.CaseMetadata.objects.filter(id=id).first()
        if case:
            return HttpResponseRedirect(case.get_full_frontend_url())
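# Illustration only: assuming normalize_cite lowercases and strips punctuation and spaces
# (an assumption about its behavior), a request for /cases/1 U.S. 1/ would redirect to
# /cases/?cite=1us1, with any other query parameters carried over by urlencode(..., doseq=True).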
def extract_citations(case):
    misses = defaultdict(lambda: defaultdict(int))
    # Don't count instances where a case cites to itself. Typically this is listing the parallel cite,
    # which leads to false matches when the parallel page has more than one case.
    self_cites = {c.normalized_cite for c in case.citations.all()}
    case_citations = [
        ExtractedCitation(
            cite=cite,
            normalized_cite=normalize_cite(normalized_cite),
            cited_by=case,
            reporter_name_original=reporter_str,
            volume_number_original=vol_num,
            page_number_original=page_num)
        for cite, normalized_cite, vol_num, reporter_str, page_num in extract_citations_from_text(
            case.body_cache.text, case.decision_date.year, misses)
        if normalized_cite not in self_cites
    ]
def extract_citations_from_text(text, max_year=9999, misses=None):
    # use dict.fromkeys to remove dupes while preserving order
    for match in dict.fromkeys(re.findall(cite_extracting_regex, text)):
        vol_num, reporter_str, page_num = match

        # fix known OCR errors
        if reporter_str in TRANSLATIONS:
            reporter_str = TRANSLATIONS[reporter_str]

        # skip strings like 'or' that are known non-citations
        if reporter_str in INVALID_REPORTERS:
            if misses is not None:
                misses['blocked'][reporter_str] += 1
            continue

        # Look for found reporter string in the official and nominative REPORTER dicts.
        # Try exact match, then normalized match.
        candidates = EDITIONS.get(reporter_str) or EDITIONS.get(normalize_cite(reporter_str))
        if not candidates:
            # reporter not found; skip cite and add to misses list
            if misses is not None:
                misses['not_found'][reporter_str] += 1
            continue

        # Find a candidate reporter that was in operation prior to this case being published.
        # Reporters are sorted by end date, so this will prefer newer reporters.
        best_candidate = next((c for c in candidates if c['start_year'] <= max_year), None)
        if not best_candidate:
            if misses is not None:
                misses['invalid_date'][reporter_str] += 1
            continue

        cite = " ".join(match)
        normalized_cite = "%s %s %s" % (vol_num, best_candidate['reporter'], page_num)
        yield cite, normalized_cite, vol_num, reporter_str, page_num
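# A minimal usage sketch (sample strings are hypothetical; assumes the EDITIONS data recognizes
# "F.2d"): each hit yields the matched text plus its normalized form, while unknown reporters
# are tallied in the misses dict instead of being yielded.
from collections import defaultdict

misses = defaultdict(lambda: defaultdict(int))
for cite, normalized_cite, vol, reporter, page in extract_citations_from_text(
        "compare 1 F.2d 2 with 2 Dogs 3", max_year=1950, misses=misses):
    print(cite, "->", normalized_cite)   # e.g. "1 F.2d 2 -> 1 F.2d 2"
# misses['not_found']['Dogs'] would now be 1.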
def main(dry_run='true', output_missing='false'):
    # download data
    cap_cites_path = base_path / 'us_cites.csv'
    if not cap_cites_path.exists():
        print("pre-loading cap cites")
        us_reporter = Reporter.objects.get(short_name='U.S.')
        with connections['capdb'].cursor() as cursor:
            cursor.execute("""
                select m.id, m.volume_id, cite, name, name_abbreviation, decision_date
                from capdb_casemetadata m, capdb_citation c
                where m.reporter_id=%s and c.case_id=m.id and c.cite like '%%U.S.%%'
            """, [us_reporter.id])
            with cap_cites_path.open('w') as out_file:
                csv_writer = csv.writer(out_file)
                for row in cursor.fetchall():
                    csv_writer.writerow(row)

    # load data
    print("loading data")
    scdb_new_cites_path = base_path / 'SCDB_2019_01_caseCentered_Citation.csv'
    scdb_old_cites_path = base_path / 'SCDB_Legacy_05_caseCentered_Citation.csv'
    cap_cites = list(csv.DictReader(
        (line.replace('\xad', '') for line in cap_cites_path.open()), exported_columns))
    scdb_cites = (
        list(csv.DictReader(scdb_new_cites_path.open(encoding='iso-8859-1'))) +
        list(csv.DictReader(scdb_old_cites_path.open(encoding='iso-8859-1'))))
    scdb_cites = [c for c in scdb_cites if c['usCite']]
    cap_cites_by_id = {c['id']: c for c in cap_cites}
    scdb_cites_by_id = {c['caseId']: c for c in scdb_cites}
    scdb_cites_lookup = group_by(scdb_cites, lambda c: c['usCite'])

    # count terms for tf_idf
    print("counting terms")
    word_counts = Counter()
    word_counts.update(i for c in cap_cites for i in tokenize(c['name']))
    word_counts.update(i for c in scdb_cites for i in tokenize(c['caseName']))
    document_count = len(cap_cites) + len(scdb_cites)

    ### first pass at checking for matches -- find all cases where cites are the same and titles are similar.
    # These are "strong" matches.
    print("checking for matches")
    matched_cap_case_ids = set()
    for cap_cite in tqdm(cap_cites):
        cite = cap_cite['cite']
        # strip nominatives from CAP cites
        cite = re.sub(r'\(.*\) ', '', cite)
        # skip CAP cites that don't look like "123 U.S. 456"
        if not re.match(r'\w+ U\.S\. \w+$', cite):
            continue
        if cite in scdb_cites_lookup:
            candidates = scdb_cites_lookup[cite]
            candidates_by_name = {c['caseName'].lower(): c for c in candidates}
            best = get_best_match(
                [cap_cite['name'].lower(), cap_cite['name_abbreviation'].lower()],
                candidates_by_name.keys(), word_counts, document_count)
            if best:
                c = candidates_by_name[best[0]]
                c['cap_cite'] = cap_cite
                c['match_quality'] = 'strong'
                matched_cap_case_ids.add(cap_cite['id'])
            else:
                for c in candidates:
                    c.setdefault('failed_matches', []).append(cap_cite)

    # apply manual_matches overrides
    for k, v in manual_matches.items():
        c = scdb_cites_by_id[k]
        c['cap_cite'] = cap_cites_by_id[v]
        c['match_quality'] = 'confirmed'
        matched_cap_case_ids.add(c['cap_cite']['id'])

    # write SCDB cites to database
    print("Applying corrections")
    edit_out = csv.writer((base_path / 'scdb_cite_edits.csv').open('w'))
    cite_objs = Citation.objects.filter(case_id__in=matched_cap_case_ids).select_related('case')
    cite_objs_by_case_id = group_by(cite_objs, lambda c: c.case_id)
    to_create = []
    to_update = []
    for scdb_cite in scdb_cites:
        if 'cap_cite' not in scdb_cite:
            continue
        case_id = int(scdb_cite['cap_cite']['id'])
        existing_cite_objs_by_reporter = {
            get_cite_reporter(c.cite): c for c in cite_objs_by_case_id[case_id]}
        expected_cites = [['SCDB', 'SCDB %s' % scdb_cite['caseId'], 'vendor']]
        for scdb_key, cite_type in [
                ["usCite", "official"], ["sctCite", "parallel"],
                ["ledCite", "parallel"], ["lexisCite", "vendor"]]:
            cite_val = scdb_cite[scdb_key]
            if cite_val:
                expected_cites.append([get_cite_reporter(cite_val), cite_val, cite_type])
        for reporter, cite_val, cite_type in expected_cites:
            if reporter in existing_cite_objs_by_reporter:
                new_cite = existing_cite_objs_by_reporter.pop(reporter)
                if new_cite.cite == cite_val:
                    edit_out.writerow([case_id, 'skip', new_cite.id, cite_val])
                else:
                    edit_out.writerow([case_id, 'update', new_cite.id, new_cite.cite, cite_val])
                    new_cite.cite = cite_val
                    to_update.append(new_cite)
            else:
                new_cite = Citation(
                    cite=cite_val, type=cite_type, case_id=case_id,
                    normalized_cite=normalize_cite(cite_val))
                to_create.append(new_cite)
                edit_out.writerow([case_id, 'create', new_cite.type, new_cite.cite])
        if existing_cite_objs_by_reporter:
            edit_out.writerow(
                [case_id, 'warning', 'ignored cite'] +
                [c.cite for c in existing_cite_objs_by_reporter.values()])

    if dry_run == 'false':
        with EditLog(description='Add SCDB cites').record():
            Citation.objects.bulk_create(to_create)
            Citation.objects.bulk_update(to_update, ['cite'])

    if output_missing != 'true':
        return

    ### second pass at checking for matches -- for all SCDB cites that don't have matches, fetch all cases
    # we have for the same volume, and look for similar titles. These are "weak" matches.
print("checking for fallback matches") no_match = [c for c in scdb_cites if 'cap_cite' not in c] missing_by_volume = group_by(no_match, lambda c: c['usCite'].split()[0]) cap_cites_by_volume = group_by(cap_cites, lambda c: c['cite'].split()[0]) # fetch all cases from the DB that belong to volumes where SCDB cases are missing target_volumes = set(c['volume_id'] for v in missing_by_volume for c in cap_cites_by_volume[v]) db_cites = Citation.objects.filter(cite__contains='U.S.', case__volume_id__in=target_volumes).select_related('case') db_cases = [] for cite in db_cites: # skip cases already affirmatively matched if str(cite.id) in matched_cap_case_ids: continue # conform DB objects to export format from csv c = cite.case c.cite = cite c.decision_date = c.decision_date.strftime("%Y-%m-%d") c.name = c.name.replace('\xad', '') c.name_abbreviation = c.name_abbreviation.replace('\xad', '') db_cases.append(c) db_cases_by_volume_id = group_by(db_cases, lambda d: d.volume_id) # check each missing SCDB cite for cases in the same volume with similar titles for v, missing in tqdm(missing_by_volume.items()): if v in cap_cites_by_volume: cases = sum((db_cases_by_volume_id[vol_id] for vol_id in set(c['volume_id'] for c in cap_cites_by_volume[v])), []) cases_by_name = {} for c in cases: cases_by_name[c.name.lower()] = c cases_by_name[c.name_abbreviation.lower()] = c for m in missing: if re.match(r'131 U\.S\. [a-z]+$', m['usCite']): # special case -- skip cites like "131 U.S. lxxxiii", as we know we don't have them continue best = get_best_match([m['caseName']], cases_by_name.keys(), word_counts, document_count, tf_threshold=10) if not best: continue c = cases_by_name[best[0]] m['cap_cite'] = {k: getattr(c, k) for k in exported_columns} m['match_quality'] = 'weak' # output csv_out = csv.writer((base_path / 'scdb_cite_matchup.csv').open('w')) csv_out.writerow(['match quality', 'volume number', 'SCDB name', 'SCDB cite', 'SCDB date', 'SCDB ID', 'CAP name', 'CAP cite', 'CAP date', 'CAP ID', 'CAP vol id']) for c in scdb_cites: match = c.get('cap_cite') match_row = [match['name'], match['cite'], match['decision_date'], match['id'], match['volume_id']] if match else [] csv_out.writerow([c.get('match_quality', 'none'), c['usCite'].split()[0], c['caseName'], c['usCite'], c['dateDecision'], c['caseId']]+match_row)
def main(dry_run='true'):
    # includes all patterns to look for and how they should be modified
    with Path(__file__).parent.joinpath("modification_instructions.json").open() as fp:
        modification_instructions = json.load(fp, object_pairs_hook=OrderedDict)

    # fetch and cache citations
    print("Prefetching citations")
    all_cites = Citation.objects.filter(type="parallel").values_list('id', 'cite')

    # opens and scopes the log file handler
    with Path.home().joinpath("citation_update_logger.tsv").open("a") as logfile:
        # write the header row of the log
        csvlog = csv.writer(logfile, delimiter='\t', quotechar='"')
        csvlog.writerow([
            "timestamp", "action", "New ID", "Case ID", "New Value", "Old Value", "Old ID",
            "Split String", "Replacement Explainer", "Filtered", "Regex Verified", "Dry Run"
        ])

        for current_pattern, data in modification_instructions.items():
            print("Fixing %s" % current_pattern)
            modifications = data['modifications']
            jack_count = data['counts_and_examples']['count']  # the count from Jack's original report
            example_cases = [
                pk for entry_set in data['counts_and_examples']['examples'] for pk in entry_set
            ]  # original examples
            print(example_cases)
            regex = '^' + get_escaped_regex(current_pattern) + '$'  # turn the citation pattern into a regex
            matcher = re.compile(regex)
            matching_citation_ids = [id for id, cite in all_cites if matcher.match(cite)]
            matching_cite_query = Citation.objects.filter(id__in=matching_citation_ids)

            # simplify the list of example cases to make sure our search regex gets them
            example_cite_ids = []
            for epk in example_cases:
                case = CaseMetadata.objects.get(pk=epk)
                for cite in case.citations.filter(type="parallel"):
                    example_cite_ids.append(cite.pk)

            to_update = []
            to_insert = []
            to_log = []
            matching_citation_count = 0
            for matching_citation in matching_cite_query:
                matching_citation_count += 1
                csv_replacement_explainer = ""  # string that will say in fairly plain English what was done
                # simple splits don't have a verification regex -- they just split and check that the output
                # doesn't match the original pattern. Record in the log whether it was regex verified.
                regex_verified = []
                cite = matching_citation.cite
                case_id = matching_citation.case_id
                if matching_citation.pk in example_cite_ids:
                    # this list should be empty with every pattern
                    example_cite_ids.remove(matching_citation.pk)

                # The modifications can include a split string, which will split a citation up into multiple
                # citations, 'filters', which is a list of regex substitution pairs, and "kill", which drops
                # one section.
                # The list orders are important because they have the same indexes as the split order,
                # so you can know where to apply what.
                if 'splitstring' in modifications:
                    new_cites = [
                        c.strip() for c in cite.split(modifications['splitstring']) if c.strip()]
                else:
                    new_cites = [cite]

                if 'filters' in modifications:
                    new_cites_filtered = []
                    filters = modifications['filters']
                    csv_replacement_explainer += (
                        "Using {} replacement".format(len(filters)) if len(filters) == 1
                        else "Using {} replacements".format(len(filters)))
                    assert len(new_cites) == len(filters)
                    for index, (filter_dict, new_cite) in enumerate(zip(filters, new_cites)):
                        if 'kill' in filter_dict:
                            # print("Dropping {}".format(new_cite))
                            csv_replacement_explainer += ", drop split field {} ({})".format(index, new_cite)
                            continue
                        for pattern in filter_dict['patterns']:
                            csv_replacement_explainer += ", replace '{}' with '{}' in split field {} ({})".format(
                                pattern[0], pattern[1], index, new_cite)
                            new_cite = re.sub(pattern[0], pattern[1], new_cite)
                        # The 'goal' is a pattern that the new citation should match after being processed.
                        if 'goal' in filter_dict:
                            csv_replacement_explainer += " to get '{}'".format(filter_dict['goal'])
                            regex_verified.append(new_cite)
                            if not re.match('^' + get_escaped_regex(filter_dict['goal']) + '$', new_cite):
                                raise Exception(
                                    "Doesn't Match: '{}'\nCurrent Pattern:'{}'\nRegex: '{}'\nCite_Section: '{}'\n"
                                    "Goal: '{}'\nEscaped Goal: '{}'".format(
                                        cite, current_pattern, get_escaped_regex(current_pattern),
                                        new_cite, filter_dict['goal'], get_escaped_regex(filter_dict['goal'])))
                        new_cites_filtered.append(new_cite)
                    new_cites = new_cites_filtered

                # if it matches the original pattern, it's wrong
                for c in new_cites:
                    if re.match(matcher, c):
                        raise Exception(
                            "New Cite Matches Original Regex: '{}'\nCurrent Pattern:'{}'\nRegex: '{}'\n"
                            "Cite_Section: '{}'\n".format(
                                cite, current_pattern, get_escaped_regex(current_pattern), c))

                # update records and log
                for index, new_citation in enumerate(new_cites):
                    reg = True if new_citation in regex_verified else False
                    action = "update" if index == 0 else "create"
                    if action == 'update':
                        # print("Updating: {}".format(new_citation))
                        matching_citation.cite = new_citation
                        matching_citation.normalized_cite = normalize_cite(new_citation)
                        new_citation_obj = matching_citation
                        to_update.append(matching_citation)
                    elif action == 'create':
                        # print("Creating: {}".format(new_citation))
                        new_citation_obj = Citation(
                            case_id=case_id,
                            type="parallel",
                            cite=new_citation,
                            normalized_cite=normalize_cite(new_citation),
                        )
                        to_insert.append(new_citation_obj)
                    to_log.append(
                        (cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj))

            if not to_log:
                print("- nothing to do")
                continue

            if dry_run == 'false':
                with EditLog(description='Fix citations matching %s' % current_pattern).record() as edit:
                    Citation.objects.bulk_update(to_update, ['cite', 'normalized_cite'])
                    Citation.objects.bulk_create(to_insert)
                    timestamp = edit.timestamp
            else:
                timestamp = datetime.now()

            log_filters = str(modifications['filters']) if 'filters' in modifications else None
            log_splitstring = str(modifications['splitstring']) if 'splitstring' in modifications else None
            for cite, reg, action, csv_replacement_explainer, matching_citation, new_citation_obj in to_log:
                # "timestamp", "action", "New ID", "Case ID", "New Value", "Old Value", "Old ID",
                # "Split String", "Replacement Explainer", "Filtered", "Regex Verified", "Dry Run"
                csvlog.writerow([
                    timestamp,
                    action,
                    new_citation_obj.pk,
                    matching_citation.case_id,
                    new_citation_obj.cite,
                    cite,
                    matching_citation.pk,
                    log_splitstring,
                    csv_replacement_explainer,
                    log_filters,
                    reg,
                    dry_run,
                ])

            if matching_citation_count != jack_count:
                # This didn't happen after a dry run with production data
                print("non-matching Jack Count: {}, Query Count: {}, Pattern: {}".format(
                    jack_count, matching_citation_count, current_pattern))
            if len(example_cite_ids) > 0:
                # This didn't happen after a dry run with production data
                raise Exception("non-matching example in {}: {}".format(
                    current_pattern, ", ".join(str(pk) for pk in example_cite_ids)))
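# A hypothetical modification_instructions.json entry, reverse-engineered from the fields the loop
# above reads; the pattern and values below are invented for illustration, not taken from the real file.
example_instruction = {
    "123 Mass. 456 (1877)": {
        "modifications": {
            "splitstring": ";",
            "filters": [
                {"patterns": [[r" \(1877\)$", ""]], "goal": "123 Mass. 456"},
                {"kill": True},
            ],
        },
        "counts_and_examples": {"count": 2, "examples": [[12345], [67890]]},
    },
}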
def load_cluster(args):
    """
    Load a single CourtListener cluster with its opinions from disk, and return metadata.
    This is called within a process pool; see ingest_courtlistener for how it's used.
    """
    cluster_member, opinions_dir = args
    with cluster_member.open() as f:
        cluster = json.load(f)

    # skip clusters without citations
    if not cluster['citations']:
        return None

    # load text of all opinions for this cluster as a single string with html stripped, for simhashing
    opinion_texts = []
    for opinion_url in cluster['sub_opinions']:
        opinion_id = opinion_url.split('/')[-2]
        try:
            with opinions_dir.joinpath(f'{opinion_id}.json').open() as f:
                opinion = json.load(f)
        except FileNotFoundError:
            print("- Opinion file not found:", opinion_id)
            continue
        opinion_text = next((opinion[k] for k in [
            'html_with_citations', 'plain_text', 'html', 'html_lawbox', 'html_columbia', 'xml_harvard'
        ] if opinion[k]), '')
        opinion_texts.append(re.sub(r'<.+?>', '', opinion_text))

    # process citations
    citations = []
    for c in cluster['citations']:
        cite = f"{c['volume']} {c['reporter']} {c['page']}"
        try:
            page_int = int(c['page'])
        except (TypeError, ValueError):
            page_int = None
        citations.append({
            'cite': cite,
            'normalized_cite': normalize_cite(cite),
            'type': c['type'],
            'volume': c['volume'],
            'reporter': c['reporter'],
            'page': c['page'],
            'page_int': page_int,
        })

    # return metadata
    return {
        'id': f"cl-{cluster['id']}",
        'source': 'cl',
        'source_id': cluster['id'],
        'citations': citations,
        'name_short': cluster['case_name'],
        'name_full': cluster['case_name_full'],
        'decision_date': cluster['date_filed'],
        'frontend_url': 'https://www.courtlistener.com' + cluster['absolute_url'],
        'api_url': cluster['resource_uri'].replace(':80/', '/'),
        'simhash': get_simhash("\n".join(opinion_texts)),
    }
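# ingest_courtlistener is not shown here; a minimal sketch of how load_cluster might be driven
# from a process pool (the function name, paths, and pool size are assumptions, not the project's code):
from multiprocessing import Pool
from pathlib import Path

def ingest_courtlistener_sketch(clusters_dir, opinions_dir):
    args = [(p, opinions_dir) for p in Path(clusters_dir).glob('*.json')]
    with Pool(4) as pool:
        for metadata in pool.imap_unordered(load_cluster, args):
            if metadata is None:
                continue  # cluster had no citations
            yield metadata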
def append(k, v):
    for key in (k, normalize_cite(k)):
        if v not in editions[key]:
            editions[key].append(v)
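# normalize_cite itself is not shown in this section; a rough sketch of the behavior the callers
# above rely on (an assumption about the real implementation, hence the distinct name): lowercase
# the cite and strip everything except letters and digits, so "1 U.S. 1" and "1 us 1" normalize
# to the same key.
import re

def normalize_cite_sketch(cite):
    return re.sub(r'[^0-9a-z]', '', cite.lower())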
def fetch(request):
    """ Extract citations from text and link to PDFs. """
    # zip file download
    error = None
    if request.method == 'POST' and request.POST.get('download'):
        if not request.user.is_authenticated:
            return HttpResponseForbidden()
        case_ids = set(request.POST.getlist('case_ids'))
        if not request.user.unlimited_access_in_effect() and request.user.case_allowance_remaining < len(case_ids):
            error = "You do not have sufficient downloads remaining to fetch the requested cases"
        else:
            cases_by_id = {
                c.id: c for c in CaseMetadata.objects.filter(pk__in=case_ids).prefetch_related('citations')
            }
            try:
                tmp = tempfile.NamedTemporaryFile(delete=False)
                with ZipFile(tmp.name, 'w') as zip:
                    for case_id in case_ids:
                        api_response = api_request(
                            request, CaseDocumentViewSet, 'retrieve', {'id': case_id},
                            {'format': 'pdf', 'full_case': 'true'})
                        if not hasattr(api_response, 'data') or api_response.status_code != 200:
                            return api_response
                        zip.writestr(
                            "cases/" + cases_by_id[int(case_id)].get_pdf_name(),
                            api_response.data)
                return FileResponse(open(tmp.name, 'rb'), as_attachment=True, filename='cases.zip')
            finally:
                os.remove(tmp.name)

    # prefer POST because it doesn't record queried text in server logs, but also accept GET to allow linking to search results
    text = request.POST.get('q', '') or request.GET.get('q', '')
    citations = None
    if text:
        citations = list(extract_citations_from_text(text))
        if citations:
            # extract citations
            citations = [{
                'cite': c[0],
                'normalized_cite': normalize_cite(c[1]),
                'before': '',
                'after': '',
            } for c in citations]

            # get possible cases matching each extracted cite
            cases = CaseMetadata.objects.in_scope().filter(
                citations__normalized_cite__in=[c['normalized_cite'] for c in citations]
            ).prefetch_related('citations').distinct()
            cases_by_cite = defaultdict(list)
            for case in cases:
                for cite in case.citations.all():
                    cases_by_cite[cite.normalized_cite].append(case)
            for result in citations:
                result['cases'] = cases_by_cite.get(result['normalized_cite'], [])

                # add context before and after matched cite
                context_before = 40
                context_after = 30
                m = re.search(
                    r'([^\n]{,%s})\b%s\b([^\n]{,%s})' % (
                        context_before, re.escape(result['cite']), context_after),
                    text)
                if m:
                    result['before'] = ('... ' if len(m[1]) == context_before else '') + m[1]
                    result['after'] = m[2] + (' ...' if len(m[2]) == context_after else '')
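# Illustration only (hypothetical input): for text "As held in 410 U.S. 113, the Court ruled",
# the context regex above captures up to 40 characters before and 30 after the cite on the same
# line, so the result for that cite might look like
#   {'cite': '410 U.S. 113', 'before': 'As held in ', 'after': ', the Court ruled', 'cases': [...]}
# with '... '/' ...' prepended/appended only when a capture hits its length cap.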