def test_make_html(self):
    """Can we make basic HTML conversions properly?"""
    expected = (
        '<pre class="inline">asdf </pre><span class="citation '
        'no-link"><span class="volume">22</span> <span '
        'class="reporter">U.S.</span> <span class="page">33</span>'
        '</span><pre class="inline"> asdf</pre>'
    )
    # Both the plain "U.S." form and the spaced "U. S." variant
    # (Issue #409) must produce identical markup.
    for text in ('asdf 22 U.S. 33 asdf', 'asdf 22 U. S. 33 asdf'):
        opinion = Opinion(plain_text=text)
        found = get_citations(text)
        self.assertEqual(
            expected,
            create_cited_html(opinion, found),
        )
def test_make_html(self):
    """Can we make basic HTML conversions properly?"""
    good_html = (
        '<pre class="inline">asdf </pre><span class="citation '
        'no-link"><span class="volume">22</span> <span '
        'class="reporter">U.S.</span> <span class="page">33</span>'
        '</span><pre class="inline"> asdf</pre>'
    )

    def render(text):
        # Helper: run the full extract-then-annotate pipeline on text.
        return create_cited_html(Opinion(plain_text=text),
                                 get_citations(text))

    # Simple example
    self.assertEqual(good_html, render('asdf 22 U.S. 33 asdf'))
    # Using a variant format for U.S. (Issue #409)
    self.assertEqual(good_html, render('asdf 22 U. S. 33 asdf'))
def get_document_citations(opinion):
    """Identify and return citations from the html or plain text of the
    opinion.
    """
    # Prefer the richest HTML source that is populated; only the
    # plain-text fallback is parsed with html=False.
    if opinion.html_columbia:
        return find_citations.get_citations(opinion.html_columbia)
    if opinion.html_lawbox:
        return find_citations.get_citations(opinion.html_lawbox)
    if opinion.html:
        return find_citations.get_citations(opinion.html)
    if opinion.plain_text:
        return find_citations.get_citations(opinion.plain_text, html=False)
    return []
def get_document_citations(opinion):
    """Identify and return citations from the html or plain text of the
    opinion.
    """
    # Sources in preference order; only plain_text needs html=False.
    sources = (
        ('html_columbia', {}),
        ('html_lawbox', {}),
        ('html', {}),
        ('plain_text', {'html': False}),
    )
    for attr, kwargs in sources:
        content = getattr(opinion, attr)
        if content:
            return find_citations.get_citations(content, **kwargs)
    return []
def test_citation_matching_issue621(self):
    """Make sure that a citation like 1 Wheat 9 doesn't match 9 Wheat 1"""
    # The fixture contains a reference to 9 F. 1, so we expect no results.
    cite = get_citations('1 F. 9 (1795)')[0]
    self.assertEqual([], match_citation(cite))
def test_citation_matching_issue621(self):
    """Make sure that a citation like 1 Wheat 9 doesn't match 9 Wheat 1"""
    # The fixture contains a reference to 9 F. 1, so we expect no results.
    citation_str = '1 F. 9 (1795)'
    found = get_citations(citation_str)
    results = match_citation(found[0])
    self.assertEqual(results, [])
def test_find_tc_citations(self):
    """Can we parse tax court citations properly?

    Fix: the original used Python 2 ``print`` statements, which are
    syntax errors on Python 3 (the file elsewhere uses Python 3 type
    hints); converted to the print() function, with ``end=" "``
    replicating the old trailing-comma behavior.
    """
    test_pairs = (
        # Test with atypical formatting for Tax Court Memos
        ('the 1 T.C. No. 233',
         [Citation(volume=1, reporter='T.C. No.', page=233,
                   canonical_reporter=u'T.C. No.', lookup_index=0,
                   reporter_index=2, reporter_found='T.C. No.')]),
        ('word T.C. Memo. 2019-233',
         [Citation(volume=2019, reporter='T.C. Memo.', page=233,
                   canonical_reporter=u'T.C. Memo.', lookup_index=0,
                   reporter_index=1, reporter_found='T.C. Memo.')]),
        ('something T.C. Summary Opinion 2019-233',
         [Citation(volume=2019, reporter='T.C. Summary Opinion', page=233,
                   canonical_reporter=u'T.C. Summary Opinion',
                   lookup_index=0, reporter_index=1,
                   reporter_found='T.C. Summary Opinion')]),
        ('T.C. Summary Opinion 2018-133',
         [Citation(volume=2018, reporter='T.C. Summary Opinion', page=133,
                   canonical_reporter=u'T.C. Summary Opinion',
                   lookup_index=0, reporter_index=0,
                   reporter_found='T.C. Summary Opinion')]),
        ('1 UNITED STATES TAX COURT REPORT (2018)',
         [Citation(volume=1, reporter='T.C.', page=2018,
                   canonical_reporter=u'T.C.', lookup_index=0,
                   reporter_index=1,
                   reporter_found='UNITED STATES TAX COURT REPORT')]),
        ('U.S. of A. 1 UNITED STATES TAX COURT REPORT (2018)',
         [Citation(volume=1, reporter='T.C.', page=2018,
                   canonical_reporter=u'T.C.', lookup_index=0,
                   reporter_index=4,
                   reporter_found='UNITED STATES TAX COURT REPORT')]),
        ('U.S. 1234 1 U.S. 1',
         [Citation(volume=1, reporter='U.S.', page=1,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   reporter_index=3, court='scotus',
                   reporter_found='U.S.')]),
    )
    for q, a in test_pairs:
        print("Testing citation extraction for %s..." % q, end=" ")
        cites_found = get_citations(q)
        self.assertEqual(
            cites_found,
            a,
            msg='%s\n%s\n\n !=\n\n%s' % (
                q,
                ",\n".join([str(cite.__dict__) for cite in cites_found]),
                ",\n".join([str(cite.__dict__) for cite in a]),
            )
        )
        print("✓")
def make_citation(cite_str, cluster, cite_type):
    """Create and return a citation object for the input values."""
    # Parse the string and take the first hit; IndexError propagates if
    # nothing parses, exactly as before.
    parsed = get_citations(cite_str)[0]
    return Citation(
        cluster=cluster,
        volume=parsed.volume,
        reporter=parsed.reporter,
        page=parsed.page,
        type=cite_type,
    )
def make_citation(cite_str, cluster, cite_type):
    """Create and return a citation object for the input values."""
    first_cite = get_citations(cite_str)[0]
    # Assemble the constructor arguments explicitly before building.
    kwargs = {
        'cluster': cluster,
        'volume': first_cite.volume,
        'reporter': first_cite.reporter,
        'page': first_cite.page,
        'type': cite_type,
    }
    return Citation(**kwargs)
def get_citations_from_tree(complete_html_tree, case_path):
    """Extract citations from an opinion's HTML tree.

    Looks in <center> elements first, then falls back to the <title>
    element, then to recorded manual fixes, finally prompting the
    operator when the 'input_citations' DEBUG flag is set.

    Fix: ``raw_input`` was removed in Python 3; fall back to ``input``
    when ``raw_input`` is undefined so the prompt works on both.

    :param complete_html_tree: lxml tree of the full case HTML.
    :param case_path: Path to the case file; keys the fixes dict and is
        opened in Firefox when prompting.
    :return: list of citation objects (possibly empty).
    """
    path = ('//center[descendant::text()[not('
            'starts-with(normalize-space(.), "No.") or '
            'starts-with(normalize-space(.), "Case No.") or '
            'starts-with(normalize-space(.), "Record No.")'
            ')]]')

    citations = []
    for e in complete_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        citations.extend(get_citations(text, html=False,
                                       do_defendant=False))

    if not citations:
        # Fall back on the <title> element.
        path = '//title/text()'
        text = complete_html_tree.xpath(path)[0]
        citations = get_citations(text, html=False, do_post_citation=False,
                                  do_defendant=False)

    if not citations:
        # Fall back on recorded fixes, prompting the operator if needed.
        try:
            citations = fixes[case_path]['citations']
        except KeyError:
            if 'input_citations' in DEBUG:
                subprocess.Popen(
                    ['firefox', 'file://%s' % case_path],
                    shell=False
                ).communicate()
                # raw_input exists only on Python 2; input is its
                # Python 3 equivalent.
                try:
                    read_line = raw_input
                except NameError:
                    read_line = input
                input_citation = read_line(
                    ' No citations found. What should be here? ')
                citation_objects = get_citations(
                    input_citation, html=False, do_post_citation=False,
                    do_defendant=False)
                add_fix(case_path, {'citations': citation_objects})
                citations = citation_objects

    if 'citations' in DEBUG and len(citations):
        cite_strs = [str(cite.__dict__) for cite in citations]
        log_print(" Citations found: %s" % ',\n '.join(cite_strs))
    elif 'citations' in DEBUG:
        log_print(" No citations found!")
    return citations
def get_citations_from_tree(complete_html_tree, case_path):
    """Extract citations from an opinion's HTML tree.

    Looks in <center> elements first, then falls back to the <title>
    element, then to recorded manual fixes, finally prompting the
    operator when the 'input_citations' DEBUG flag is set.

    :param complete_html_tree: lxml tree of the full case HTML.
    :param case_path: path to the case file; keys the fixes dict and is
        opened in Firefox when prompting.
    :return: list of citation objects (possibly empty).
    """
    # Centers whose text starts with docket-number prefixes are excluded.
    path = ('//center[descendant::text()[not('
            'starts-with(normalize-space(.), "No.") or '
            'starts-with(normalize-space(.), "Case No.") or '
            'starts-with(normalize-space(.), "Record No.")'
            ')]]')
    citations = []
    for e in complete_html_tree.xpath(path):
        text = tostring(e, method='text', encoding='unicode')
        citations.extend(get_citations(text, html=False, do_defendant=False))
    if not citations:
        # Fall back on the <title> element.
        path = '//title/text()'
        text = complete_html_tree.xpath(path)[0]
        citations = get_citations(text, html=False, do_post_citation=False,
                                  do_defendant=False)
    if not citations:
        # Fall back on manual fixes; prompt the operator as a last resort.
        try:
            citations = fixes[case_path]['citations']
        except KeyError:
            if 'input_citations' in DEBUG:
                subprocess.Popen(
                    ['firefox', 'file://%s' % case_path],
                    shell=False).communicate()
                # NOTE(review): raw_input is Python 2-only; this breaks
                # under Python 3 (use input there) — confirm target runtime.
                input_citation = raw_input(
                    ' No citations found. What should be here? ')
                citation_objects = get_citations(input_citation, html=False,
                                                 do_post_citation=False,
                                                 do_defendant=False)
                add_fix(case_path, {'citations': citation_objects})
                citations = citation_objects

    if 'citations' in DEBUG and len(citations):
        cite_strs = [str(cite.__dict__) for cite in citations]
        log_print(" Citations found: %s" % ',\n '.join(cite_strs))
    elif 'citations' in DEBUG:
        log_print(" No citations found!")
    return citations
def find_tax_court_citation(opinion_text):
    """
    Returns a dictionary representation of our Citation object.

    Return the citation object or nothing. Iterates over lines of text
    beacuse we assume our citations won't wrap.

    :param opinion_text: The plain_text of our opinion from the scrape.
    :return: citation object or None
    """
    tc_report_marker = "UNITED STATES TAX COURT REPORT"
    # Only the first 250 lines are scanned; citations appear near the top.
    for line in opinion_text.split("\n")[:250]:
        found = find_citations.get_citations(line, html=False)
        if not found:
            continue

        if tc_report_marker in opinion_text:
            # Full-report documents: accept the cite naming that reporter.
            for cite in found:
                if tc_report_marker in cite.reporter_found:
                    cite.type = Citation.SPECIALTY
                    return cite
        else:
            for cite in found:
                if ("T.C." not in cite.reporter
                        and "T. C." not in cite.reporter):
                    # If not the first cite - Skip
                    return None
                if cite.reporter_index > 2:
                    # Reporter must be the first or second term on the
                    # line, otherwise skip.
                    return None
                remainder = line.replace(cite.reporter_found, "").strip()
                extra_words = [w for w in remainder.split(" ") if w != ""]
                if len(extra_words) > 3:
                    # More than three non-reporter components: skip.
                    return None
                if cite.reporter in ("T.C.", "T.C. No."):
                    cite.type = Citation.SPECIALTY
                else:
                    cite.type = Citation.NEUTRAL
                return cite
def find_cites(case_data: Dict[str, str]) -> List[FoundCitation]:
    """Extract citations from raw string.

    :param case_data: Case information from the anon 2020 db.
    :return: Citation objects found in the raw string.
    """
    results: List[FoundCitation] = []
    # Citations are stored as quoted substrings inside the field.
    quoted = re.findall(
        r"\"(.*?)\"", case_data["lexis_ids_normalized"], re.DOTALL
    )
    for candidate in quoted:
        parsed = get_citations(candidate)
        if parsed:
            results.append(parsed[0])
    return results
def do_citations(cluster, scdb_info):
    """
    Handle the citation fields.

    :param cluster: The Cluster to be changed.
    :param scdb_info: A dict with the SCDB information.
    """
    # SCDB field name -> (reporter abbreviation, Citation type).
    fields = {
        "usCite": ("U.S.", Citation.FEDERAL),
        "sctCite": ("S. Ct.", Citation.FEDERAL),
        "ledCite": ("L. Ed.", Citation.FEDERAL),
        "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
    }
    for scdb_field, (reporter_abbr, cite_type) in fields.items():
        raw_cite = scdb_info[scdb_field]
        if not raw_cite:
            continue
        try:
            citation_obj = get_citations(
                raw_cite,
                html=False,
                do_post_citation=False,
                do_defendant=False,
                disambiguate=False,
            )[0]
        except IndexError:
            logger.warning("Unable to parse citation for: %s", raw_cite)
            continue

        existing = cluster.citations.filter(reporter=reporter_abbr)
        if existing.count() == 1:
            # Update the existing citation.
            cite = existing[0]
            cite.volume = citation_obj.volume
            cite.reporter = citation_obj.reporter
            cite.page = citation_obj.page
            cite.save()
        else:
            try:
                # Create a new citation
                Citation.objects.create(
                    cluster=cluster,
                    volume=citation_obj.volume,
                    reporter=citation_obj.reporter,
                    page=citation_obj.page,
                    type=cite_type,
                )
            except IntegrityError:
                # Violated unique_together constraint. Fine.
                pass
def do_citations(cluster, scdb_info):
    """
    Handle the citation fields.

    Fixes relative to the original:
      - Skip empty SCDB values instead of feeding them to get_citations.
      - The original assigned attributes on the QuerySet returned by
        ``.filter()`` and called ``.save()`` on it, which never updates a
        row; now the single matched Citation instance is updated.
      - ``logger.warn`` is deprecated; use ``logger.warning``.

    :param cluster: The Cluster to be changed.
    :param scdb_info: A dict with the SCDB information.
    """
    fields = {
        'usCite': ("U.S.", Citation.FEDERAL),
        'sctCite': ("S. Ct.", Citation.FEDERAL),
        'ledCite': ("L. Ed.", Citation.FEDERAL),
        'lexisCite': ("U.S. LEXIS", Citation.LEXIS),
    }
    for scdb_field, reporter_info in fields.items():
        if not scdb_info[scdb_field]:
            # Nothing recorded for this field; skip it.
            continue
        try:
            citation_obj = get_citations(
                scdb_info[scdb_field],
                html=False,
                do_post_citation=False,
                do_defendant=False,
                disambiguate=False,
            )[0]
        except IndexError:
            logger.warning("Unable to parse citation for: %s",
                           scdb_info[scdb_field])
        else:
            cites = cluster.citations.filter(reporter=reporter_info[0])
            if cites.count() == 1:
                # Update the existing citation row.
                cite = cites[0]
                cite.volume = citation_obj.volume
                cite.reporter = citation_obj.reporter
                cite.page = citation_obj.page
                cite.save()
            else:
                try:
                    # Create a new citation
                    Citation.objects.create(
                        cluster=cluster,
                        volume=citation_obj.volume,
                        reporter=citation_obj.reporter,
                        page=citation_obj.page,
                        type=reporter_info[1],
                    )
                except IntegrityError:
                    # Violated unique_together constraint. Fine.
                    pass
def get_query_citation(cd):
    """Extract citations from the query string and return them, or return
    None
    """
    query = cd.get('q')
    if not query:
        return None
    citations = get_citations(query, html=False)
    if len(citations) != 1:
        # If it's not exactly one citation, the user gets no special help.
        return None
    matches = match_citation(citations[0])
    if len(matches) >= 1:
        # Just return the first result if there is more than one. This
        # should be rare, and they're ordered by relevance.
        return matches.result.docs[0]
    return matches
def get_query_citation(cd):
    """Extract citations from the query string and return them, or return
    None
    """
    if not cd.get('q'):
        return None
    found = get_citations(cd['q'], html=False)
    result = None
    if len(found) == 1:
        # Exactly one citation in the query: try to match it.
        result = match_citation(found[0])
        if len(result) >= 1:
            # Matches are relevance-ordered; the first is the best.
            return result.result.docs[0]
    return result
def get_query_citation(cd):
    """Extract citations from the query string and return them, or return
    None
    """
    query = cd.get("q")
    if not query:
        return None
    found = get_citations(
        query, html=False, do_post_citation=False, do_defendant=False
    )
    if len(found) != 1:
        # Zero or several citations: no special handling.
        return None
    matches = match_citation(found[0])
    if len(matches) == 1:
        # If more than one match, don't show the tip
        return matches.result.docs[0]
    return matches
def test_identifying_parallel_citations(self):
    """Given a string, can we identify parallel citations

    Fix: Python 2 ``print`` statements converted to the print()
    function (``end=" "`` replicates the old trailing comma) so the
    test is valid under Python 3.
    """
    tests = (
        # A pair consisting of a test string and the number of parallel
        # citations that should be identifiable in that string.
        # Simple case
        ("1 U.S. 1 (22 U.S. 33)", 1, 2),
        # Too far apart
        ("1 U.S. 1 too many words 22 U.S. 33", 0, 0),
        # Three citations
        ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3),
        # Parallel citation after a valid citation too early on
        ("1 U.S. 1 too many words, then 22 U.S. 33, 13 WL 33223", 1, 2),
    )
    for q, citation_group_count, expected_num_parallel_citations in tests:
        print("Testing parallel citation identification for: %s..." % q,
              end=" ")
        citations = get_citations(q)
        citation_groups = identify_parallel_citations(citations)
        computed_num_citation_groups = len(citation_groups)
        self.assertEqual(
            computed_num_citation_groups,
            citation_group_count,
            msg="Did not have correct number of citation groups. Got %s, "
                "not %s." % (computed_num_citation_groups,
                             citation_group_count),
        )
        if not citation_groups:
            # Add an empty list to make testing easier.
            citation_groups = [[]]
        computed_num_parallel_citation = len(list(citation_groups)[0])
        self.assertEqual(
            computed_num_parallel_citation,
            expected_num_parallel_citations,
            msg="Did not identify correct number of parallel citations in "
                "the group. Got %s, not %s" % (
                    computed_num_parallel_citation,
                    expected_num_parallel_citations,
                ),
        )
        print("✓")
def handle(self, *args, **options):
    """Backfill structured Citation rows from each cluster's legacy
    citation string fields, resumable via the start_at option.
    """
    super(Command, self).handle(*args, **options)
    qs = OpinionCluster.objects.all()
    start_at = options['start_at']
    if start_at:
        # Resume support: skip clusters with a pk below start_at.
        qs = qs.filter(pk__gte=start_at)
    for i, cluster in enumerate(queryset_generator(qs)):
        for field in cluster.citation_fields:
            citation_str = getattr(cluster, field)
            if citation_str:
                # Split the citation and add it to the DB.
                try:
                    citation_obj = get_citations(
                        citation_str,
                        html=False,
                        do_post_citation=False,
                        do_defendant=False,
                        disambiguate=False,
                    )[0]
                except IndexError:
                    # Nothing parsed from the string; log and move on.
                    msg = "Errored out on: %s in %s" % (citation_str,
                                                        cluster.pk)
                    print(msg)
                    logger.info(msg)
                    continue
                try:
                    Citation.objects.create(
                        cluster=cluster,
                        volume=citation_obj.volume,
                        reporter=citation_obj.reporter,
                        page=citation_obj.page,
                        type=map_model_field_to_citation_type(field)
                    )
                except IntegrityError:
                    # Violated unique_together constraint. Fine.
                    pass
        if i % 1000 == 0:
            # Progress heartbeat every 1000 clusters.
            msg = "Completed %s items (last: %s)"
            print(msg % (i, cluster.pk))
            logger.info(msg, i, cluster.pk)
def generate_citation(opinion_text, cluster_id):
    """
    Returns a dictionary representation of our Citation object.

    This data will only be returned if found, otherwise none is returned and
    no Citation object is added to the system. It could be a failed parse or
    the data could simply not be available.

    :param opinion_text: The plain_text of our opinion from the scrape.
    :param cluster_id: The id of the associated Opinion_Cluster related
    to this opinion
    :return: cite_dict => Returns dictionary of the citation data
    """
    # Citations are assumed not to wrap, so scan line by line; only the
    # first 250 lines are checked.
    for line_of_text in opinion_text.split("\n")[:250]:
        cites = find_citations.get_citations(line_of_text, html=False)
        if not cites:
            continue

        for cite in cites:
            # Only Tax Court reporters are of interest here.
            if "T.C." not in cite.reporter and "T. C." not in cite.reporter:
                continue

            # Exact "T.C." / "T.C. No." cites are specialty reporters;
            # anything else T.C.-like is treated as a neutral cite.
            if "T.C." == cite.reporter:
                cite_type = Citation.SPECIALTY
            elif "T.C. No." == cite.reporter:
                cite_type = Citation.SPECIALTY
            else:
                cite_type = Citation.NEUTRAL

            if not Citation.objects.filter(
                volume=cite.volume,
                reporter=cite.reporter,
                page=cite.page,
                cluster_id=cluster_id,
            ):
                # New citation: tag its type and return it to the caller.
                cite.type = cite_type
                return cite
            else:
                # Duplicate; fall through and implicitly return None.
                logger.info("Citation already in the system. Return None.")
def test_identifying_parallel_citations(self):
    """Given a string, can we identify parallel citations

    Fix: Python 2 ``print`` statements are syntax errors on Python 3;
    converted to print() calls (``end=" "`` keeps the old
    trailing-comma behavior).
    """
    tests = (
        # A pair consisting of a test string and the number of parallel
        # citations that should be identifiable in that string.
        # Simple case
        ("1 U.S. 1 (22 U.S. 33)", 1, 2),
        # Too far apart
        ("1 U.S. 1 too many words 22 U.S. 33", 0, 0),
        # Three citations
        ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3),
        # Parallel citation after a valid citation too early on
        ("1 U.S. 1 too many words, then 22 U.S. 33, 13 WL 33223", 1, 2),
    )
    for q, citation_group_count, expected_num_parallel_citations in tests:
        print("Testing parallel citation identification for: %s..." % q,
              end=" ")
        citations = get_citations(q)
        citation_groups = identify_parallel_citations(citations)
        computed_num_citation_groups = len(citation_groups)
        self.assertEqual(
            computed_num_citation_groups,
            citation_group_count,
            msg="Did not have correct number of citation groups. Got %s, "
                "not %s." % (computed_num_citation_groups,
                             citation_group_count)
        )
        if not citation_groups:
            # Add an empty list to make testing easier.
            citation_groups = [[]]
        computed_num_parallel_citation = len(list(citation_groups)[0])
        self.assertEqual(
            computed_num_parallel_citation,
            expected_num_parallel_citations,
            msg="Did not identify correct number of parallel citations in "
                "the group. Got %s, not %s" % (
                    computed_num_parallel_citation,
                    expected_num_parallel_citations,
                )
        )
        print('✓')
def get_query_citation(cd: Dict[str, Any]) -> Optional[List[Citation]]:
    """Extract citations from the query string and return them, or return
    None
    """
    query = cd.get("q")
    if not query:
        return None
    raw_hits = get_citations(
        query, html=False, do_post_citation=False, do_defendant=False
    )
    # Keep only full Citation objects (drops ids, supra, etc.).
    hits = [item for item in raw_hits if isinstance(item, Citation)]
    if len(hits) != 1:
        # Zero or several citations: user gets no special help.
        return None
    matches = match_citation(hits[0])
    if len(matches) == 1:
        # If more than one match, don't show the tip
        return matches.result.docs[0]
    return matches
def test_disambiguate_citations(self):
    """Can ambiguous reporter abbreviations be resolved correctly?

    Fix: Python 2 ``print`` statements converted to the print()
    function (``end=" "`` replicates the old trailing comma) so the
    test runs under Python 3.
    """
    test_pairs = [
        # 1. P.R.R --> Correct abbreviation for a reporter.
        ('1 P.R.R. 1',
         [Citation(volume=1, reporter='P.R.R.', page=1,
                   canonical_reporter=u'P.R.R.', lookup_index=0,
                   reporter_index=1, reporter_found='P.R.R.')]),
        # 2. U. S. --> A simple variant to resolve.
        ('1 U. S. 1',
         [Citation(volume=1, reporter='U.S.', page=1,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=1,
                   reporter_found='U. S.')]),
        # 3. A.2d --> Not a variant, but needs to be looked up in the
        #    EDITIONS variable.
        ('1 A.2d 1',
         [Citation(volume=1, reporter='A.2d', page=1,
                   canonical_reporter=u'A.', lookup_index=0,
                   reporter_index=1, reporter_found='A.2d')]),
        # 4. A. 2d --> An unambiguous variant of an edition
        ('1 A. 2d 1',
         [Citation(volume=1, reporter='A.2d', page=1,
                   canonical_reporter=u'A.', lookup_index=0,
                   reporter_index=1, reporter_found='A. 2d')]),
        # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
        #    resolvable by year
        ('1 P.R. 1 (1831)',
         # Of the three, only Pen & W. was being published this year.
         [Citation(volume=1, reporter='Pen. & W.', page=1,
                   canonical_reporter=u'Pen. & W.', lookup_index=0,
                   year=1831, reporter_index=1, reporter_found='P.R.')]),
        # 5.1: W.2d --> A variant of an edition that either resolves to
        #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
        ('1 W.2d 1 (1854)',
         # Of the two, only Wis. 2d was being published this year.
         [Citation(volume=1, reporter='Wis. 2d', page=1,
                   canonical_reporter=u'Wis.', lookup_index=0,
                   year=1854, reporter_index=1, reporter_found='W.2d')]),
        # 5.2: Wash. --> A non-variant that has more than one reporter for
        #      the key, but is resolvable by year
        ('1 Wash. 1 (1890)',
         [Citation(volume=1, reporter='Wash.', page=1,
                   canonical_reporter=u'Wash.', lookup_index=1,
                   year=1890, reporter_index=1, reporter_found='Wash.')]),
        # 6. Cr. --> A variant of Cranch, which is ambiguous, except with
        #    paired with this variation.
        ('1 Cra. 1',
         [Citation(volume=1, reporter='Cranch', page=1,
                   canonical_reporter=u'Cranch', lookup_index=0,
                   court='scotus', reporter_index=1,
                   reporter_found='Cra.')]),
        # 7. Cranch. --> Not a variant, but could refer to either Cranch's
        #    Supreme Court cases or his DC ones. In this case, we cannot
        #    disambiguate. Years are not known, and we have no further
        #    clues. We must simply drop Cranch from the results.
        ('1 Cranch 1 1 U.S. 23',
         [Citation(volume=1, reporter='U.S.', page=23,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=4,
                   reporter_found='U.S.')]),
        # 8. Unsolved problem. In theory, we could use parallel citations
        #    to resolve this, because Rob is getting cited next to La., but
        #    we don't currently know the proximity of citations to each
        #    other, so can't use this.
        #      - Rob. --> Either:
        #        8.1: A variant of Robards (1862-1865) or
        #        8.2: Robinson's Louisiana Reports (1841-1846) or
        #        8.3: Robinson's Virgina Reports (1842-1865)
        # ('1 Rob. 1 1 La. 1',
        #  [Citation(volume=1, reporter='Rob.', page=1,
        #            canonical_reporter='Rob.', lookup_index=0),
        #   Citation(volume=1, reporter='La.', page=1,
        #            canonical_reporter='La.', lookup_index=0)]),
    ]
    for pair in test_pairs:
        print("Testing disambiguation for %s..." % pair[0], end=" ")
        citations = get_citations(pair[0], html=False)
        self.assertEqual(
            citations, pair[1],
            msg='%s\n%s != \n%s' % (
                pair[0],
                [cite.__dict__ for cite in citations],
                [cite.__dict__ for cite in pair[1]]
            )
        )
        print("✓")
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Reconstruct the IA URL from the on-disk path for log messages.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            # Empty/invalid JSON file.
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        # Only the first listed citation is used for the cluster.
        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            list(set(itertools.chain.from_iterable(judge_list + author_list)))
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        # Docket, cluster, citation, and opinions are created atomically.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Iterate over other xml fields in Harvard data set
            # and save as string list for further processing at a later date.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {}
            while json_fields:
                key = json_fields.pop(0)
                data_set[key] = "|".join([x.text for x in soup.find_all(key)])

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            # One Opinion row per <opinion> element in the casebody.
            for op in soup.find_all("opinion"):
                joined_by_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(judge_list)))
                    )
                )
                author_str = titlecase(
                    " ".join(
                        list(set(itertools.chain.from_iterable(author_list)))
                    )
                )
                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
def citation_redirector(request, reporter=None, volume=None, page=None):
    """Take a citation URL and use it to redirect the user to the canonical
    page for that citation.

    This uses the same infrastructure as the thing that identifies citations
    in the text of opinions.

    Fix: ``dict.keys()`` returns a non-subscriptable view on Python 3;
    the lookup-field extraction now materializes it to a list first.
    """
    if request.method == 'POST':
        form = CitationRedirectorForm(request.POST)
        if form.is_valid():
            # Redirect to the page with the right values
            cd = form.cleaned_data
            return HttpResponseRedirect(
                reverse('citation_redirector', kwargs=cd))
        else:
            # Error in form, somehow.
            return render(request, 'citation_redirect_info_page.html', {
                'show_homepage': True,
                'form': form,
                'private': True
            })
    else:
        if all(_ is None for _ in (reporter, volume, page)):
            # No parameters. Show the standard page.
            form = CitationRedirectorForm()
            return render(request, 'citation_redirect_info_page.html', {
                'show_homepage': True,
                'form': form,
                'private': False,
            })
        else:
            # We have a citation. Look it up, redirect the user or show
            # disambiguation.
            citation_str = " ".join([volume, reporter, page])
            try:
                citation = get_citations(citation_str)[0]
                # Corrects typos/variations.
                citation_str = citation.base_citation()
                # dict.keys() cannot be indexed on Python 3; take the
                # first key via list().
                lookup_fields = [
                    list(map_citations_to_models([citation]).keys())[0]
                ]
            except IndexError:
                # Unable to disambiguate the citation. Try looking in *all*
                # citation fields.
                lookup_fields = OpinionCluster().citation_fields

            # We were able to get a match, expand it if it's a federal/state
            # match.
            if (len(lookup_fields) == 1 and
                    lookup_fields[0] == 'federal_cite_one'):
                lookup_fields = [
                    'federal_cite_one', 'federal_cite_two',
                    'federal_cite_three'
                ]
            elif (len(lookup_fields) == 1 and
                    lookup_fields[0] == 'state_cite_one'):
                lookup_fields = [
                    'state_cite_one', 'state_cite_two',
                    'state_cite_three'
                ]
            q = Q()
            for lookup_field in lookup_fields:
                q |= Q(**{lookup_field: citation_str})
            clusters = OpinionCluster.objects.filter(q)

            # Show the correct page....
            if clusters.count() == 0:
                # No results for an otherwise valid citation.
                return render(
                    request, 'citation_redirect_info_page.html', {
                        'none_found': True,
                        'citation_str': citation_str,
                        'private': True,
                    })
            elif clusters.count() == 1:
                # Total success. Redirect to correct location.
                return HttpResponseRedirect(
                    clusters[0].get_absolute_url())
            elif clusters.count() > 1:
                # Multiple results. Show them.
                return render(
                    request, 'citation_redirect_info_page.html', {
                        'too_many': True,
                        'citation_str': citation_str,
                        'clusters': clusters,
                        'private': True,
                    })
def make_and_save(item, skipdupes=False, min_dates=None, start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    min_date: if not none, will skip cases after min_date

    :param item: Dict of parsed Columbia case data (dates, citations, panel,
        opinions, court_id, docket, ...).
    :param skipdupes: If True, silently skip detected duplicates instead of
        raising.
    :param min_dates: Optional dict of court_id -> date; cases dated on or
        after it are skipped. When provided, the duplicate check is skipped.
    :param start_dates: Optional dict of court_id -> court founding date;
        cases dated on or before it are skipped.
    :param testing: If True (the default), build everything but save nothing.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # DECIDED is a weaker signal than FILED; only use it as a
                # fallback.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # special rule for Kentucky
    if item['court_id'] == 'kycourtapp' and main_date <= date(1975, 12, 31):
        item['court_id'] = 'kycourtapphigh'

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return
    if start_dates is not None:
        if start_dates.get(item['court_id']) is not None:
            if main_date <= start_dates[item['court_id']]:
                print(main_date, 'before court founding:',
                      start_dates[item['court_id']], ' -- skipping.')
                return

    docket = Docket(source=Docket.COLUMBIA,
                    date_argued=date_argued,
                    date_reargued=date_reargued,
                    date_cert_granted=date_cert_granted,
                    date_cert_denied=date_cert_denied,
                    date_reargument_denied=date_reargument_denied,
                    court_id=item['court_id'],
                    case_name_short=item['case_name_short'] or '',
                    case_name=item['case_name'] or '',
                    case_name_full=item['case_name_full'] or '',
                    docket_number=item['docket'] or '')

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a
            # few trivial words) in the citation, then it's not a citation at
            # all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # string.lowercase is Python 2 only.
            num_letters = sum(
                non_trivial.count(letter) for letter in string.lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue

            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'." %
                            (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished'
                             if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map)
    panel = [
        find_person(n, item['court_id'], case_date=panel_date)
        for n in item['panel']
    ]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'],
                                 item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # Only the first opinion may be the lead opinion; demote later
        # "lead" opinions to addenda.
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [
            find_person(n, item['court_id'], case_date=panel_date)
            for n in opinion_info['joining']
        ]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything
            # (best-effort rollback, then re-raise the original error)
            try:
                docket.delete()
            except:
                pass
            raise
def citation_redirector(request, reporter=None, volume=None, page=None):
    """Take a citation URL and use it to redirect the user to the canonical
    page for that citation.

    This uses the same infrastructure as the thing that identifies citations
    in the text of opinions.

    :param request: The HttpRequest; POSTs come from the citation form, GETs
        carry the citation pieces in the URL path.
    :param reporter: Reporter abbreviation from the URL, or None.
    :param volume: Volume number (string) from the URL, or None.
    :param page: Page number (string) from the URL, or None.
    :return: An HttpResponse (info page, 404, 300, or redirect).
    """
    if request.method == 'POST':
        form = CitationRedirectorForm(request.POST)
        if form.is_valid():
            # Redirect to the page with the right values
            cd = form.cleaned_data
            return HttpResponseRedirect(
                reverse('citation_redirector', kwargs=cd)
            )
        else:
            # Error in form, somehow.
            return render_to_response(
                'citation_redirect_info_page.html',
                {'show_homepage': True,
                 'form': form,
                 'private': True},
                RequestContext(request),
            )
    else:
        if all(_ is None for _ in (reporter, volume, page)):
            # Show the most basic page
            form = CitationRedirectorForm()
            return render_to_response(
                'citation_redirect_info_page.html',
                {
                    'show_homepage': True,
                    'form': form,
                    'private': False,
                },
                RequestContext(request),
            )
        else:
            # Look up the citation, redirect the user or show disambiguation.
            citation_str = " ".join([volume, reporter, page])
            try:
                citation = get_citations(citation_str)[0]
                citation_str = citation.base_citation()  # Corrects typos/variations.
                # NOTE: .keys()[0] relies on Python 2 dict semantics.
                lookup_fields = [map_citations_to_models([citation]).keys()[0]]
            except IndexError:
                # Unable to disambiguate the citation. Try looking in *all*
                # citation fields.
                lookup_fields = OpinionCluster().citation_fields

            # We were able to get a match, expand it if it's a federal/state
            # match.
            if (len(lookup_fields) == 1 and
                    lookup_fields[0] == 'federal_cite_one'):
                lookup_fields = ['federal_cite_one', 'federal_cite_two',
                                 'federal_cite_three']
            elif (len(lookup_fields) == 1 and
                    lookup_fields[0] == 'state_cite_one'):
                lookup_fields = ['state_cite_one', 'state_cite_two',
                                 'state_cite_three']
            q = Q()
            for lookup_field in lookup_fields:
                q |= Q(**{lookup_field: citation_str})
            clusters = OpinionCluster.objects.filter(q)

            # Show the correct page....
            if clusters.count() == 0:
                # No results for an otherwise valid citation.
                return render_to_response(
                    'citation_redirect_info_page.html',
                    {
                        'none_found': True,
                        'citation_str': citation_str,
                        'private': True,
                    },
                    RequestContext(request),
                    status=404,
                )
            elif clusters.count() == 1:
                # Total success. Redirect to correct location.
                return HttpResponseRedirect(
                    clusters[0].get_absolute_url()
                )
            elif clusters.count() > 1:
                # Multiple results. Show them. 300 == Multiple Choices.
                return render_to_response(
                    'citation_redirect_info_page.html',
                    {
                        'too_many': True,
                        'citation_str': citation_str,
                        'clusters': clusters,
                        'private': True,
                    },
                    RequestContext(request),
                    status=300,
                )
def make_objects(self, item, court, sha1_hash, content):
    """Takes the meta data from the scraper and associates it with objects.

    :param item: Dict of scraped metadata for one case.
    :param court: The Court object the case belongs to.
    :param sha1_hash: SHA1 of the downloaded binary content.
    :param content: The raw binary document to save to disk.
    :return: Tuple of (docket, opinion, cluster, citations, error), where
        error is True if the binary could not be written to disk (the
        failure is logged and recorded in ErrorLog).
    """
    blocked = item['blocked_statuses']
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = (item.get('case_name_shorts') or
                       self.cnt.make_case_name_short(item['case_names']))

    docket = Docket(
        docket_number=item.get('docket_numbers', ''),
        case_name=item['case_names'],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        source=Docket.SCRAPER,
    )

    west_cite_str = item.get('west_citations', '')
    state_cite_str = item.get('west_state_citations', '')
    neutral_cite_str = item.get('neutral_citations', '')
    cluster = OpinionCluster(
        judges=item.get('judges', ''),
        date_filed=item['case_dates'],
        date_filed_is_approximate=item['date_filed_is_approximate'],
        case_name=item['case_names'],
        case_name_short=case_name_short,
        source='C',
        precedential_status=item['precedential_statuses'],
        nature_of_suit=item.get('nature_of_suit', ''),
        blocked=blocked,
        date_blocked=date_blocked,
        # These three fields are replaced below.
        federal_cite_one=west_cite_str,
        state_cite_one=state_cite_str,
        neutral_cite=neutral_cite_str,
        syllabus=item.get('summaries', ''),
    )

    # Build Citation objects for each non-empty citation string. This was
    # previously the same block copy-pasted three times.
    citations = []
    for cite_str, cite_type in ((west_cite_str, Citation.WEST),
                                (state_cite_str, Citation.STATE),
                                (neutral_cite_str, Citation.NEUTRAL)):
        if not cite_str:
            continue
        citation_obj = get_citations(cite_str)[0]
        citations.append(
            Citation(
                cluster=cluster,
                volume=citation_obj.volume,
                reporter=citation_obj.reporter,
                page=citation_obj.page,
                type=cite_type,
            ))

    opinion = Opinion(
        type='010combined',
        sha1=sha1_hash,
        download_url=item['download_urls'],
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        file_name = trunc(item['case_names'].lower(), 75) + extension
        opinion.file_with_date = cluster.date_filed
        # save=False: caller persists once the whole object graph is ready.
        opinion.local_path.save(file_name, cf, save=False)
    except Exception:
        # Deliberately broad (but not a bare except): any disk failure is
        # recorded and reported via the `error` flag instead of raising.
        msg = ('Unable to save binary to disk. Deleted '
               'item: %s.\n %s' %
               (item['case_names'], traceback.format_exc()))
        logger.critical(msg.encode('utf-8'))
        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
        error = True

    return docket, opinion, cluster, citations, error
def make_and_save(item, skipdupes=False, min_dates=None, testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    min_date: if not none, will skip cases after min_date

    :param item: Dict of parsed Columbia case data (dates, citations, panel,
        opinions, court_id, docket, ...).
    :param skipdupes: If True, silently skip detected duplicates instead of
        raising.
    :param min_dates: Optional dict of court_id -> date; cases dated on or
        after it are skipped. When provided, the duplicate check is skipped.
    :param testing: If True (the default), build everything but save nothing.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # DECIDED is a weaker signal than FILED; only use it as a
                # fallback.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" %
                          date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a
            # few trivial words) in the citation, then it's not a citation at
            # all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # string.lowercase is Python 2 only.
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue

            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (
                                c, item['court_id'], item['docket']
                            ))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished'
                             if item['unpublished'] else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    panel = [find_person(n, item['court_id'], case_date=panel_date)
             for n in item['panel']]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # Only the first opinion may be the lead opinion; demote later
        # "lead" opinions to addenda.
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [find_person(n, item['court_id'], case_date=panel_date)
                     for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster, panel, opinions)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)."
                                % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything
            # (best-effort rollback, then re-raise the original error)
            try:
                docket.delete()
            except:
                pass
            raise
def make_and_save(item):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: Dict of parsed case data (dates, citations, panel, opinions,
        court_id, docket, ...). Objects are saved to the database as they are
        built; there is no rollback on failure.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # DECIDED is a weaker signal than FILED; only use it as a
                # fallback.
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                print("Found unknown date tag '%s' with date '%s'." %
                      date_info)

    docket = Docket(
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )
    docket.save()

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            raise Exception(
                "Failed to get a citation from the string '%s'." % c)
        elif len(found) > 1:
            raise Exception(
                "Got multiple citations from string '%s' when there should have been one."
                % c)
        found_citations.append(found[0])
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        docket=docket,
        precedential_status=('Unpublished'
                             if item['unpublished'] else 'Published'),
        date_filed=date_filed,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    cluster.save()

    # Judges are looked up as of the argument date when we have one, since
    # that is when the panel sat.
    if date_argued is not None:
        paneldate = date_argued
    else:
        paneldate = date_filed
    panel = [find_person(n, item['court_id'], paneldate)
             for n in item['panel']]
    panel = [x for x in panel if x is not None]
    for member in panel:
        cluster.panel.add(member)

    for opinion_info in item['opinions']:
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 date_filed or date_argued)
        opinion = Opinion(
            cluster=cluster,
            author=author,
            type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=opinion_info['opinion']
        )
        opinion.save()
        joined_by = [find_person(n, item['court_id'], paneldate)
                     for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        for joiner in joined_by:
            opinion.joined_by.add(joiner)
def test_find_citations(self):
    """Can we find and make Citation objects from strings?"""
    # Each pair is (input text, expected list of Citation objects).
    test_pairs = (
        # Basic test
        ('1 U.S. 1', [
            Citation(volume=1, reporter='U.S.', page=1,
                     canonical_reporter=u'U.S.', lookup_index=0,
                     court='scotus', reporter_index=1,
                     reporter_found='U.S.')
        ]),
        # Basic test of non-case name before citation (should not be found)
        ('lissner test 1 U.S. 1', [
            Citation(volume=1, reporter='U.S.', page=1,
                     canonical_reporter=u'U.S.', lookup_index=0,
                     court='scotus', reporter_index=3,
                     reporter_found='U.S.')
        ]),
        # Test with plaintiff and defendant
        ('lissner v. test 1 U.S. 1', [
            Citation(plaintiff='lissner', defendant='test', volume=1,
                     reporter='U.S.', page=1, canonical_reporter=u'U.S.',
                     lookup_index=0, court='scotus', reporter_index=4,
                     reporter_found='U.S.')
        ]),
        # Test with plaintiff, defendant and year
        ('lissner v. test 1 U.S. 1 (1982)', [
            Citation(plaintiff='lissner', defendant='test', volume=1,
                     reporter='U.S.', page=1, year=1982,
                     canonical_reporter=u'U.S.', lookup_index=0,
                     court='scotus', reporter_index=4,
                     reporter_found='U.S.')
        ]),
        # Test with different reporter than all of above.
        ('bob lissner v. test 1 F.2d 1 (1982)', [
            Citation(plaintiff='lissner', defendant='test', volume=1,
                     reporter='F.2d', page=1, year=1982,
                     canonical_reporter=u'F.', lookup_index=0,
                     reporter_index=5, reporter_found='F.2d')
        ]),
        # Test with court and extra information
        ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)', [
            Citation(plaintiff='lissner', defendant='test', volume=1,
                     reporter='U.S.', page=12, year=1982,
                     extra=u'347-348', court='ca4',
                     canonical_reporter=u'U.S.', lookup_index=0,
                     reporter_index=5, reporter_found='U.S.')
        ]),
        # Test with text before and after and a variant reporter
        ('asfd 22 U. S. 332 (1975) asdf', [
            Citation(volume=22, reporter='U.S.', page=332, year=1975,
                     canonical_reporter=u'U.S.', lookup_index=0,
                     court='scotus', reporter_index=2,
                     reporter_found='U. S.')
        ]),
        # Test with finding reporter when it's a second edition
        ('asdf 22 A.2d 332 asdf', [
            Citation(volume=22, reporter='A.2d', page=332,
                     canonical_reporter=u'A.', lookup_index=0,
                     reporter_index=2, reporter_found='A.2d')
        ]),
        # Test finding a variant second edition reporter
        ('asdf 22 A. 2d 332 asdf', [
            Citation(volume=22, reporter='A.2d', page=332,
                     canonical_reporter=u'A.', lookup_index=0,
                     reporter_index=2, reporter_found='A. 2d')
        ]),
        # Test finding a variant of an edition resolvable by variant alone.
        ('171 Wn.2d 1016', [
            Citation(volume=171, reporter='Wash. 2d', page=1016,
                     canonical_reporter=u'Wash.', lookup_index=1,
                     reporter_index=1, reporter_found='Wn.2d')
        ]),
        # Test finding two citations where one of them has abutting
        # punctuation.
        ('2 U.S. 3, 4-5 (3 Atl. 33)', [
            Citation(volume=2, reporter="U.S.", page=3, extra=u'4-5',
                     canonical_reporter=u"U.S.", lookup_index=0,
                     reporter_index=1, reporter_found="U.S.",
                     court='scotus'),
            Citation(volume=3, reporter="A.", page=33,
                     canonical_reporter=u"A.", lookup_index=0,
                     reporter_index=5, reporter_found="Atl.")
        ]),
        # Test with the page number as a Roman numeral
        ('12 Neb. App. lxiv (2004)', [
            Citation(volume=12, reporter='Neb. Ct. App.', page='lxiv',
                     year=2004, canonical_reporter=u'Neb. Ct. App.',
                     lookup_index=0, reporter_index=1,
                     reporter_found='Neb. App.')
        ]),
        # Test with the 'digit-REPORTER-digit' corner-case formatting
        ('2007-NMCERT-008', [
            Citation(volume=2007, reporter='NMCERT', page=8,
                     canonical_reporter=u'NMCERT', lookup_index=0,
                     reporter_index=1, reporter_found='NMCERT')
        ]),
        ('2006-Ohio-2095', [
            Citation(volume=2006, reporter='Ohio', page=2095,
                     canonical_reporter=u'Ohio', lookup_index=0,
                     reporter_index=1, reporter_found='Ohio')
        ]),
    )
    for q, a in test_pairs:
        # Python 2 print statement; trailing comma suppresses the newline.
        print "Testing citation extraction for %s..." % q,
        cites_found = get_citations(q)
        self.assertEqual(
            cites_found, a,
            msg='%s\n%s\n\n !=\n\n%s' % (
                q,
                ",\n".join([str(cite.__dict__) for cite in cites_found]),
                ",\n".join([str(cite.__dict__) for cite in a]),
            ))
        print "✓"
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    # A volume without a reporter is ambiguous; bail out.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]])

        if OpinionCluster.objects.filter(
                filepath_json_harvard=file_path).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" %
                           ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info("No citation found for %s." %
                        data["citations"][0]["cite"])
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(itertools.chain.from_iterable(judge_list +
                                                      author_list)))))
        judges = titlecase(judges)
        docket_string = (data["docket_number"].replace(
            "Docket No.", "").replace("Docket Nos.", "").strip())

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]

        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]

        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # Inner atomic block so a failed save can be retried below
                # without poisoning the outer transaction.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    # Docket number too long for the column; truncate it and
                    # preserve the full value in the correction field.
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number" %
                        trunc(docket_string, length=5000, ellipsis="..."))
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase("".join(
                        find_judge_names(author_tag_str)))
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s",
                            citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def test_disambiguate_citations(self):
    """Can ambiguous reporter strings be resolved to the right reporter?"""
    # Each pair is (input text, expected list of Citation objects).
    test_pairs = [
        # 1. P.R.R --> Correct abbreviation for a reporter.
        ('1 P.R.R. 1', [
            Citation(volume=1, reporter='P.R.R.', page=1,
                     canonical_reporter=u'P.R.R.', lookup_index=0,
                     reporter_index=1, reporter_found='P.R.R.')
        ]),
        # 2. U. S. --> A simple variant to resolve.
        ('1 U. S. 1', [
            Citation(volume=1, reporter='U.S.', page=1,
                     canonical_reporter=u'U.S.', lookup_index=0,
                     court='scotus', reporter_index=1,
                     reporter_found='U. S.')
        ]),
        # 3. A.2d --> Not a variant, but needs to be looked up in the
        #    EDITIONS variable.
        ('1 A.2d 1', [
            Citation(volume=1, reporter='A.2d', page=1,
                     canonical_reporter=u'A.', lookup_index=0,
                     reporter_index=1, reporter_found='A.2d')
        ]),
        # 4. A. 2d --> An unambiguous variant of an edition
        ('1 A. 2d 1', [
            Citation(volume=1, reporter='A.2d', page=1,
                     canonical_reporter=u'A.', lookup_index=0,
                     reporter_index=1, reporter_found='A. 2d')
        ]),
        # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
        #    resolvable by year
        (
            '1 P.R. 1 (1831)',
            # Of the three, only Pen & W. was being published this year.
            [
                Citation(volume=1, reporter='Pen. & W.', page=1,
                         canonical_reporter=u'Pen. & W.', lookup_index=0,
                         year=1831, reporter_index=1,
                         reporter_found='P.R.')
            ]),
        # 5.1: W.2d --> A variant of an edition that either resolves to
        #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
        (
            '1 W.2d 1 (1854)',
            # Of the two, only Wis. 2d was being published this year.
            [
                Citation(volume=1, reporter='Wis. 2d', page=1,
                         canonical_reporter=u'Wis.', lookup_index=0,
                         year=1854, reporter_index=1,
                         reporter_found='W.2d')
            ]),
        # 5.2: Wash. --> A non-variant that has more than one reporter for
        #      the key, but is resolvable by year
        ('1 Wash. 1 (1890)', [
            Citation(volume=1, reporter='Wash.', page=1,
                     canonical_reporter=u'Wash.', lookup_index=1,
                     year=1890, reporter_index=1, reporter_found='Wash.')
        ]),
        # 6. Cr. --> A variant of Cranch, which is ambiguous, except with
        #    paired with this variation.
        ('1 Cra. 1', [
            Citation(volume=1, reporter='Cranch', page=1,
                     canonical_reporter=u'Cranch', lookup_index=0,
                     court='scotus', reporter_index=1,
                     reporter_found='Cra.')
        ]),
        # 7. Cranch. --> Not a variant, but could refer to either Cranch's
        #    Supreme Court cases or his DC ones. In this case, we cannot
        #    disambiguate. Years are not known, and we have no further
        #    clues. We must simply drop Cranch from the results.
        ('1 Cranch 1 1 U.S. 23', [
            Citation(volume=1, reporter='U.S.', page=23,
                     canonical_reporter=u'U.S.', lookup_index=0,
                     court='scotus', reporter_index=4,
                     reporter_found='U.S.')
        ]),
        # 8. Unsolved problem. In theory, we could use parallel citations
        #    to resolve this, because Rob is getting cited next to La., but
        #    we don't currently know the proximity of citations to each
        #    other, so can't use this.
        #       - Rob. --> Either:
        #                  8.1: A variant of Robards (1862-1865) or
        #                  8.2: Robinson's Louisiana Reports (1841-1846) or
        #                  8.3: Robinson's Virgina Reports (1842-1865)
        # ('1 Rob. 1 1 La. 1',
        #  [Citation(volume=1, reporter='Rob.', page=1,
        #            canonical_reporter='Rob.',
        #            lookup_index=0),
        #   Citation(volume=1, reporter='La.', page=1,
        #            canonical_reporter='La.',
        #            lookup_index=0)]),
    ]
    for pair in test_pairs:
        # Python 2 print statement; trailing comma suppresses the newline.
        print "Testing disambiguation for %s..." % pair[0],
        citations = get_citations(pair[0], html=False)
        self.assertEqual(citations, pair[1],
                         msg='%s\n%s != \n%s' %
                             (pair[0],
                              [cite.__dict__ for cite in citations],
                              [cite.__dict__ for cite in pair[1]]))
        print "✓"
def test_find_citations(self):
    """Can we find and make Citation objects from strings?

    Each pair is (query string, expected list of Citation objects),
    exercising basic citations, party names, years, courts, reporter
    variants/editions, Roman-numeral pages, and neutral-citation forms.
    """
    test_pairs = (
        # Basic test
        ('1 U.S. 1',
         [Citation(volume=1, reporter='U.S.', page=1,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=1,
                   reporter_found='U.S.')]),
        # Basic test of non-case name before citation (should not be found)
        ('lissner test 1 U.S. 1',
         [Citation(volume=1, reporter='U.S.', page=1,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=3,
                   reporter_found='U.S.')]),
        # Test with plaintiff and defendant
        ('lissner v. test 1 U.S. 1',
         [Citation(plaintiff='lissner', defendant='test', volume=1,
                   reporter='U.S.', page=1,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=4,
                   reporter_found='U.S.')]),
        # Test with plaintiff, defendant and year
        ('lissner v. test 1 U.S. 1 (1982)',
         [Citation(plaintiff='lissner', defendant='test', volume=1,
                   reporter='U.S.', page=1, year=1982,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=4,
                   reporter_found='U.S.')]),
        # Test with different reporter than all of above.
        ('bob lissner v. test 1 F.2d 1 (1982)',
         [Citation(plaintiff='lissner', defendant='test', volume=1,
                   reporter='F.2d', page=1, year=1982,
                   canonical_reporter=u'F.', lookup_index=0,
                   reporter_index=5, reporter_found='F.2d')]),
        # Test with court and extra information
        ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)',
         [Citation(plaintiff='lissner', defendant='test', volume=1,
                   reporter='U.S.', page=12, year=1982,
                   extra=u'347-348', court='ca4',
                   canonical_reporter=u'U.S.', lookup_index=0,
                   reporter_index=5, reporter_found='U.S.')]),
        # Test with text before and after and a variant reporter
        ('asfd 22 U. S. 332 (1975) asdf',
         [Citation(volume=22, reporter='U.S.', page=332, year=1975,
                   canonical_reporter=u'U.S.', lookup_index=0,
                   court='scotus', reporter_index=2,
                   reporter_found='U. S.')]),
        # Test with finding reporter when it's a second edition
        ('asdf 22 A.2d 332 asdf',
         [Citation(volume=22, reporter='A.2d', page=332,
                   canonical_reporter=u'A.', lookup_index=0,
                   reporter_index=2, reporter_found='A.2d')]),
        # Test finding a variant second edition reporter
        ('asdf 22 A. 2d 332 asdf',
         [Citation(volume=22, reporter='A.2d', page=332,
                   canonical_reporter=u'A.', lookup_index=0,
                   reporter_index=2, reporter_found='A. 2d')]),
        # Test finding a variant of an edition resolvable by variant alone.
        ('171 Wn.2d 1016',
         [Citation(volume=171, reporter='Wash. 2d', page=1016,
                   canonical_reporter=u'Wash.', lookup_index=1,
                   reporter_index=1, reporter_found='Wn.2d')]),
        # Test finding two citations where one of them has abutting
        # punctuation.
        ('2 U.S. 3, 4-5 (3 Atl. 33)',
         [Citation(volume=2, reporter="U.S.", page=3, extra=u'4-5',
                   canonical_reporter=u"U.S.", lookup_index=0,
                   reporter_index=1, reporter_found="U.S.",
                   court='scotus'),
          Citation(volume=3, reporter="A.", page=33,
                   canonical_reporter=u"A.", lookup_index=0,
                   reporter_index=5, reporter_found="Atl.")]),
        # Test with the page number as a Roman numeral
        ('12 Neb. App. lxiv (2004)',
         [Citation(volume=12, reporter='Neb. Ct. App.', page='lxiv',
                   year=2004, canonical_reporter=u'Neb. Ct. App.',
                   lookup_index=0, reporter_index=1,
                   reporter_found='Neb. App.')]),
        # Test with the 'digit-REPORTER-digit' corner-case formatting
        ('2007-NMCERT-008',
         [Citation(volume=2007, reporter='NMCERT', page=8,
                   canonical_reporter=u'NMCERT', lookup_index=0,
                   reporter_index=1, reporter_found='NMCERT')]),
        ('2006-Ohio-2095',
         [Citation(volume=2006, reporter='Ohio', page=2095,
                   canonical_reporter=u'Ohio', lookup_index=0,
                   reporter_index=1, reporter_found='Ohio')]),
        # Illinois neutral citations keep non-numeric page strings intact.
        ('2017 IL App (4th) 160407WC',
         [Citation(volume=2017, reporter='IL App (4th)',
                   page='160407WC',
                   canonical_reporter=u'IL App (4th)', lookup_index=0,
                   reporter_index=1,
                   reporter_found='IL App (4th)')]),
        ('2017 IL App (1st) 143684-B',
         [Citation(volume=2017, reporter='IL App (1st)',
                   page='143684-B',
                   canonical_reporter=u'IL App (1st)', lookup_index=0,
                   reporter_index=1,
                   reporter_found='IL App (1st)')])
    )
    for q, a in test_pairs:
        # Trailing comma keeps the "✓" on the same line (Python 2 print).
        print "Testing citation extraction for %s..." % q,
        cites_found = get_citations(q)
        # On failure, dump both sides' attribute dicts for easy diffing.
        self.assertEqual(
            cites_found,
            a,
            msg='%s\n%s\n\n    !=\n\n%s' % (
                q,
                ",\n".join([str(cite.__dict__) for cite in cites_found]),
                ",\n".join([str(cite.__dict__) for cite in a]),
            )
        )
        print "✓"