def test_citation_matching_issue621(self) -> None:
    """Verify that volume/page order matters when matching citations.

    A citation like "1 Wheat 9" must never be matched against the
    reversed "9 Wheat 1".
    """
    # The fixture only contains a reference to 9 F. 1, so matching the
    # reversed citation below should come back empty.
    query = "1 F. 9 (1795)"
    parsed = get_citations(query)[0]
    self.assertEqual([], match_citation(parsed))
def get_document_citations(
    opinion: Opinion,
) -> List[Union[NonopinionCitation, Citation]]:
    """Identify and return citations from the html or plain text of the
    opinion.

    The HTML fields are checked in order of preference and the first
    non-empty one is parsed; plain text is the last resort.

    :param opinion: The Opinion whose best-available text is scanned.
    :return: The citations found, or an empty list if no text is available.
    """
    # Collapse the original if/elif cascade: every HTML field used the
    # same get_citations() call, so iterate them in preference order.
    html_fields = (
        opinion.html_anon_2020,
        opinion.html_columbia,
        opinion.html_lawbox,
        opinion.html,
    )
    for html in html_fields:
        if html:
            return get_citations(text=html, clean=("html", "whitespace"))
    if opinion.plain_text:
        return get_citations(text=opinion.plain_text, clean=("whitespace",))
    return []
def make_citation(
    cite_str: str,
    cluster: OpinionCluster,
    cite_type: int,
) -> Citation:
    """Build a Citation model instance from a raw citation string.

    :param cite_str: The citation text to parse.
    :param cluster: The OpinionCluster the citation belongs to.
    :param cite_type: The Citation type constant to assign.
    :return: An unsaved Citation object.
    """
    parsed = get_citations(cite_str)[0]
    return Citation(
        cluster=cluster,
        volume=parsed.volume,
        reporter=parsed.reporter,
        page=parsed.page,
        type=cite_type,
    )
def find_cites(case_data: Dict[str, str]) -> List[FoundCitation]:
    """Extract citations from raw string.

    :param case_data: Case information from the anon 2020 db.
    :return: Citation objects found in the raw string.
    """
    # The normalized lexis ids arrive as a blob of quoted strings; pull
    # out every quoted chunk and try to parse a citation from each.
    cites = re.findall(
        r"\"(.*?)\"", case_data["lexis_ids_normalized"], re.DOTALL
    )
    found_citations = []
    for cite in cites:
        fc = get_citations(clean_text(cite, ["html", "inline_whitespace"]))
        if fc:  # Idiomatic truthiness instead of len(fc) > 0.
            # Keep only the first citation parsed from each chunk.
            found_citations.append(fc[0])
    return found_citations
def find_tax_court_citation(opinion_text):
    """Scan scraped opinion text for a Tax Court citation.

    Iterates over lines of text because we assume our citations won't wrap.
    Only the first 250 lines are examined since citations appear near the
    top of the document.

    :param opinion_text: The plain_text of our opinion from the scrape.
    :return: A citation object (with its ``type`` attribute set), or None.
    """
    for line_of_text in opinion_text.split("\n")[:250]:
        cites = get_citations(line_of_text)
        if not cites:
            continue

        if "UNITED STATES TAX COURT REPORT" in opinion_text:
            # Official report volume: accept the first cite whose reporter
            # string contains the report banner.
            for cite in cites:
                if "UNITED STATES TAX COURT REPORT" in cite.reporter_found:
                    cite.type = Citation.SPECIALTY
                    return cite
        else:
            for cite in cites:
                # Only T.C.-style reporters qualify; anything else on the
                # first cite-bearing line means this isn't a T.C. header.
                if (
                    "T.C." not in cite.reporter
                    and "T. C." not in cite.reporter
                ):
                    # If not the first cite - Skip
                    return None

                if cite.reporter_index > 2:
                    # If reporter not in first or second term in the line
                    # we skip.
                    return None

                # Strip the reporter from the line; whatever remains should
                # be just the volume/page numbers for a true header line.
                alt_cite = line_of_text.replace(
                    cite.reporter_found, ""
                ).strip()
                other_words = alt_cite.split(" ")
                if len([x for x in other_words if x != ""]) > 3:
                    # If line has more than three non reporter components
                    # skip.
                    return None

                # T.C. and T.C. No. are specialty reporters; everything
                # else that reached this point is treated as neutral.
                if "T.C." == cite.reporter:
                    cite_type = Citation.SPECIALTY
                elif "T.C. No." == cite.reporter:
                    cite_type = Citation.SPECIALTY
                else:
                    cite_type = Citation.NEUTRAL

                cite.type = cite_type
                return cite
def do_citations(cluster, scdb_info):
    """
    Handle the citation fields.

    :param cluster: The Cluster to be changed.
    :param scdb_info: A dict with the SCDB information.
    """
    # Map each SCDB column to the reporter string it holds and the
    # Citation type constant the new record should carry.
    fields = {
        "usCite": ("U.S.", Citation.FEDERAL),
        "sctCite": ("S. Ct.", Citation.FEDERAL),
        "ledCite": ("L. Ed.", Citation.FEDERAL),
        "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
    }
    for scdb_field, reporter_info in fields.items():
        if not scdb_info[scdb_field]:
            # Empty SCDB value; nothing to parse for this reporter.
            continue
        try:
            # get_citations returns a list; IndexError means the SCDB
            # string produced no parseable citation.
            citation_obj = get_citations(
                scdb_info[scdb_field],
                do_post_citation=False,
                do_defendant=False,
                disambiguate=False,
            )[0]
        except IndexError:
            logger.warning(
                "Unable to parse citation for: %s", scdb_info[scdb_field]
            )
        else:
            cites = cluster.citations.filter(reporter=reporter_info[0])
            if cites.count() == 1:
                # Update the existing citation.
                cite = cites[0]
                cite.volume = citation_obj.volume
                cite.reporter = citation_obj.reporter
                cite.page = citation_obj.page
                cite.save()
            else:
                try:
                    # Create a new citation
                    Citation.objects.create(
                        cluster=cluster,
                        volume=citation_obj.volume,
                        reporter=citation_obj.reporter,
                        page=citation_obj.page,
                        type=reporter_info[1],
                    )
                except IntegrityError:
                    # Violated unique_together constraint. Fine.
                    pass
def test_make_html_from_html(self) -> None: """Can we convert the HTML of an opinion into modified HTML?""" # fmt: off test_pairs = [ # Id. citation with HTML tags ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> ' 'at 73.</p>\n<p>Nathaniel Gorham of Massachusetts</p></div>', '<div><p>the improper views of the Legislature." 2<span class="' 'citation no-link"> <i><span class="id_token">id.,</span></i> at ' '73.</span></p>\n<p>Nathaniel Gorham of Massachusetts</p></div>'), # Id. citation with an intervening HTML tag # (We expect the HTML to be unchanged, since it's too risky to # modify with another tag in the way) ('<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> ' 'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts' '</p></div>', '<div><p>the improper views of the Legislature.\" 2 <i>id.,</i> ' 'at <b>73, bolded</b>.</p>\n<p>Nathaniel Gorham of Massachusetts' '</p></div>'), # Ibid. citation with HTML tags ('<div><p>possess any peculiar knowledge of the mere policy of ' 'public measures.\" <i>Ibid.</i> Gerry of Massachusetts ' 'like</p></div>', '<div><p>possess any peculiar knowledge of the mere policy of ' 'public measures."<span class="citation no-link"> <i><span class=' '"id_token">Ibid.</span></i> Gerry of Massachusetts </span>like' '</p></div>'), ] # fmt: on for s, expected_html in test_pairs: print("Testing html to html conversion for %s..." % s, end=" ") opinion = Opinion(html=s) citations = get_citations(s, clean=("html", "whitespace")) created_html = create_cited_html(opinion, citations) self.assertEqual( created_html, expected_html, msg="\n%s\n\n !=\n\n%s" % (created_html, expected_html), ) print("✓")
def test_identifying_parallel_citations(self) -> None:
    """Given a string, can we identify parallel citations"""
    # Each case: (input text, expected number of citation groups,
    # expected number of parallel citations in the first group).
    test_cases = (
        # Simple case
        ("1 U.S. 1 (22 U.S. 33)", 1, 2),
        # Too far apart
        ("1 U.S. 1 too many words 22 U.S. 33", 0, 0),
        # Three citations
        ("1 U.S. 1, (44 U.S. 33, 99 U.S. 100)", 1, 3),
        # Parallel citation after a valid citation too early on
        ("1 U.S. 1 too many words, then 22 U.S. 33, 13 WL 33223", 1, 2),
    )
    for query, want_groups, want_parallel in test_cases:
        print(
            "Testing parallel citation identification for: %s..." % query,
            end=" ",
        )
        groups = identify_parallel_citations(get_citations(query))
        got_groups = len(groups)
        self.assertEqual(
            got_groups,
            want_groups,
            msg="Did not have correct number of citation groups. Got %s, "
            "not %s." % (got_groups, want_groups),
        )
        if not groups:
            # Add an empty list to make testing easier.
            groups = [[]]
        got_parallel = len(list(groups)[0])
        self.assertEqual(
            got_parallel,
            want_parallel,
            msg="Did not identify correct number of parallel citations in "
            "the group. Got %s, not %s" % (got_parallel, want_parallel),
        )
        print("✓")
def get_query_citation(cd: Dict[str, Any]) -> Optional[List[Citation]]:
    """Extract citations from the query string and return them, or return
    None when the query is empty.
    """
    if not cd.get("q"):
        return None
    citations = get_citations(
        cd["q"], do_post_citation=False, do_defendant=False
    )
    # Keep only full Citation objects; other citation kinds can't be
    # matched against the index.
    citations = [c for c in citations if isinstance(c, Citation)]
    matches = None
    if len(citations) == 1:
        # If it's not exactly one citation, user doesn't get special help.
        matches = match_citation(citations[0])
        if len(matches) == 1:
            # Exactly one hit: return the matched document directly.
            # NOTE(review): this returns a search-result doc, not a
            # List[Citation] as the annotation claims — confirm the
            # intended return type with callers.
            return matches.result.docs[0]
    return matches
def make_and_save(item, skipdupes=False, min_dates=None, start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: Dict of parsed case data (dates, citations, opinions, ...).
    :param skipdupes: If True, silently skip duplicates instead of raising.
    :param min_dates: If not None, dict of court_id -> date; cases on/after
        that date for their court are skipped (and dupe-checking is off).
    :param start_dates: If not None, dict of court_id -> founding date;
        cases on/before that date are skipped.
    :param testing: If True, nothing is written to the database.
    """
    date_filed = date_argued = date_reargued = None
    date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item["dates"]:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print(
                        "\nFound unknown date tag '%s' with date '%s'.\n"
                        % date_info
                    )

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item["file"])

    # special rule for Kentucky
    if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31):
        item["court_id"] = "kycourtapphigh"

    if min_dates is not None:
        if min_dates.get(item["court_id"]) is not None:
            if main_date >= min_dates[item["court_id"]]:
                print(
                    main_date,
                    "after",
                    min_dates[item["court_id"]],
                    " -- skipping.",
                )
                return
    if start_dates is not None:
        if start_dates.get(item["court_id"]) is not None:
            if main_date <= start_dates[item["court_id"]]:
                print(
                    main_date,
                    "before court founding:",
                    start_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item["court_id"],
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        docket_number=item["docket"] or "",
    )

    # get citation objects in a list for addition to the cluster
    found_citations = []
    for c in item["citations"]:
        found = get_citations(clean_text(c, ["html", "inline_whitespace"]))
        if not found:
            # if the docket number --is-- the citation string, we're likely
            # dealing with a somewhat common triplet of (docket number,
            # date, jurisdiction), which isn't a citation at all (so
            # there's no problem)
            if item["docket"]:
                docket_no = item["docket"].lower()
                if "claim no." in docket_no:
                    docket_no = docket_no.split("claim no.")[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, "")
                docket_no = docket_no.strip(".").strip()
                if docket_no and docket_no in c.lower():
                    continue

            # if there are a trivial number of letters (except for months
            # and a few trivial words) in the citation, then it's not a
            # citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, "")
            # BUG FIX: string.lowercase exists only in Python 2; Python 3
            # renamed it to string.ascii_lowercase.
            num_letters = sum(
                non_trivial.count(letter)
                for letter in string.ascii_lowercase
            )
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue

            # otherwise, this is a problem
            raise Exception(
                "Failed to get a citation from the string '%s' in "
                "court '%s' with docket '%s'."
                % (c, item["court_id"], item["docket"])
            )
        else:
            found_citations.extend(found.to_model())

    cluster = OpinionCluster(
        judges=item.get("judges", "") or "",
        precedential_status=(
            "Unpublished" if item["unpublished"] else "Published"
        ),
        date_filed=main_date,
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        source="Z",
        attorneys=item["attorneys"] or "",
        posture=item["posture"] or "",
    )
    panel = lookup_judges_by_last_name_list(
        item["panel"], item["court_id"], panel_date
    )

    opinions = []
    for i, opinion_info in enumerate(item["opinions"]):
        if opinion_info["author"] is None:
            author = None
        else:
            author = lookup_judge_by_last_name(
                opinion_info["author"], item["court_id"], panel_date
            )

        converted_text = convert_columbia_html(opinion_info["opinion"])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]]
        # Only the first opinion may be the lead; later "lead" opinions
        # are demoted to addenda.
        if opinion_type == Opinion.LEAD and i > 0:
            opinion_type = Opinion.ADDENDUM

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info["per_curiam"],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info["sha1"],
            # This is surely not updated for the new S3 world. If you're
            # reading this, you'll need to update this code.
            local_path=opinion_info["local_path"],
        )
        joined_by = lookup_judges_by_last_name_list(
            item["joining"], item["court_id"], panel_date
        )
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print("Duplicate. skipping.")
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for citation in found_citations:
                citation.cluster = cluster
                citation.save()
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print(
                "Created item at: %s%s"
                % (domain, cluster.get_absolute_url())
            )
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA.  (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        # Rebuild the IA download URL from the tail of the local path.
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning(
                "Empty json: missing case at: %s" % ia_download_url
            )
            continue
        except Exception as e:
            logger.warning(
                "Unknown error %s for: %s" % (e, ia_download_url)
            )
            continue

        # Only the first citation in the file is used for identification.
        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(
                            judge_list + author_list
                        )
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        # Everything below is one all-or-nothing database transaction.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # Nested atomic block so a failed save can be retried
                # without poisoning the outer transaction.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    # Docket number too long for the column; truncate it
                    # and preserve the full value in the correction field.
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )
            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifiying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        # Strip page-number markers embedded in the tag.
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info(
                    "Adding opinion for: %s", citation.base_citation()
                )
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def handle(self, *args, **options):
    """Identify parallel citations and save them as requested.

    This process proceeds in two phases. The first phase is to work through
    the entire corpus, identifying citations that occur very near to each
    other. These are considered parallel citations, and they are built into
    a graph data structure where citations are nodes and each parallel
    citation is an edge. The weight of each edge is determined by the number
    of times a parallel citation has been identified between two citations.
    This should solve problems like typos or other issues with our heuristic
    approach.

    The second phase of this process is to update the database with the high
    quality citations. This can only be done by matching the citations with
    actual items in the database and then updating them with parallel
    citations that are sufficiently likely to be good.
    """
    super(Command, self).handle(*args, **options)
    # Require either --doc_id or --all; refuse to guess a default scope.
    no_option = not any([options.get("doc_id"), options.get("all")])
    if no_option:
        raise CommandError(
            "Please specify if you want all items or a specific item."
        )
    if not options["update_database"]:
        logger.info(
            "--update_database is not set. No changes will be made to the "
            "database."
        )

    logger.info(
        "## Entering phase one: Building a network object of "
        "all citations.\n"
    )
    q = Opinion.objects.all()
    if options.get("doc_id"):
        q = q.filter(pk__in=options["doc_id"])
    count = q.count()
    opinions = queryset_generator(q, chunksize=10000)

    node_count = edge_count = completed = 0
    subtasks = []
    for o in opinions:
        # Queue a celery signature per opinion; batches of 50 are run
        # together and their citation groups merged into the graph.
        subtasks.append(
            identify_parallel_citations.s(
                get_citations(get_and_clean_opinion_text(o).cleaned_text)
            )
        )
        last_item = count == completed + 1
        if (completed % 50 == 0) or last_item:
            job = group(subtasks)
            result = job.apply_async().join()
            [
                self.add_groups_to_network(citation_groups)
                for citation_groups in result
            ]
            subtasks = []
        completed += 1
        if completed % 250 == 0 or last_item:
            # Only do this once in a while.
            node_count = len(self.g.nodes())
            edge_count = len(self.g.edges())
            sys.stdout.write(
                "\r Completed %s of %s. (%s nodes, %s edges)"
                % (completed, count, node_count, edge_count)
            )
            sys.stdout.flush()

    logger.info(
        "\n\n## Entering phase two: Saving the best edges to "
        "the database.\n\n"
    )
    # NOTE(review): nx.connected_component_subgraphs was removed in
    # networkx 2.4 — confirm the pinned networkx version supports it.
    for sub_graph in nx.connected_component_subgraphs(self.g):
        self.handle_subgraph(sub_graph, options)

    logger.info(f"\n\n## Done. Added {self.update_count} new citations.")

    self.do_solr(options)
def test_make_html_from_plain_text(self) -> None:
    """Can we convert the plain text of an opinion into HTML?

    Each pair below is (input plain text, expected annotated HTML).
    """
    # fmt: off

    # Expected markup shared by the first two full-citation cases.
    full_citation_html = ('<pre class="inline">asdf </pre><span class="'
                          'citation no-link"><span class="volume">22'
                          '</span> <span class="reporter">U.S.</span> '
                          '<span class="page">33</span> </span><pre class='
                          '"inline">asdf</pre>')
    test_pairs = [
        # Simple example for full citations
        ('asdf 22 U.S. 33 asdf',
         full_citation_html),

        # Using a variant format for U.S. (Issue #409)
        ('asdf 22 U. S. 33 asdf',
         full_citation_html),

        # Full citation across line break
        ('asdf John v. Doe, 123\nU.S. 456, upholding foo bar',
         '<pre class="inline">asdf John v. Doe, </pre><span class="'
         'citation no-link"><span class="volume">123</span>\n<span class='
         '"reporter">U.S.</span> <span class="page">456</span></span><pre'
         ' class="inline">, upholding foo bar</pre>'),

        # Basic short form citation
        ('existing text asdf, 515 U.S., at 240. foobar',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> <span '
         'class="volume">515</span> <span class="reporter">U.S.</span>, '
         'at <span class="page">240</span></span><pre class="inline">. '
         'foobar</pre>'),

        # Short form citation with no comma after reporter in original
        ('existing text asdf, 1 U. S. at 2. foobar',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> <span class'
         '="volume">1</span> <span class="reporter">U.S.</span> at <span '
         'class="page">2</span></span><pre class="inline">. foobar</pre>'),

        # Short form citation across line break
        ('asdf.’ ” 123 \n U.S., at 456. Foo bar foobar',
         '<pre class="inline">asdf.’ </pre><span class="'
         'citation no-link"><span class="antecedent_guess">”'
         '</span> <span class="volume">123</span> \n <span class='
         '"reporter">U.S.</span>, at <span class="page">456</span></span>'
         '<pre class="inline">. Foo bar foobar</pre>'),

        # First kind of supra citation (standard kind)
        ('existing text asdf, supra, at 2. foobar',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> supra, at '
         '<span class="page">2</span></span><pre class="inline">. foobar'
         '</pre>'),

        # Second kind of supra citation (with volume)
        ('existing text asdf, 123 supra, at 2. foo bar',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> <span '
         'class="volume">123</span> supra, at <span class="page">2</span>'
         '</span><pre class="inline">. foo bar</pre>'),

        # Third kind of supra citation (sans page)
        ('existing text asdf, supra, foo bar',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> supra'
         '</span><pre class="inline">, foo bar</pre>'),

        # Fourth kind of supra citation (with period)
        ('existing text asdf, supra. foo bar',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> supra'
         '</span><pre class="inline">. foo bar</pre>'),

        # Supra citation across line break
        ('existing text asdf, supra, at\n99 (quoting foo)',
         '<pre class="inline">existing text </pre><span class="citation '
         'no-link"><span class="antecedent_guess">asdf,</span> supra, '
         'at\n<span class="page">99</span> </span><pre class="inline">'
         '(quoting foo)</pre>'),

        # Id. citation ("Id., at 123")
        ('asdf, id., at 123. Lorem ipsum dolor sit amet',
         '<pre class="inline">asdf</pre><span class="citation no-link">, '
         '<span class="id_token">id.,</span> at 123. </span><pre class="'
         'inline">Lorem ipsum dolor sit amet</pre>'),

        # Duplicate Id. citation
        ('asd, id., at 123. Lo rem ip sum. asdf, id., at 123. Lo rem ip.',
         '<pre class="inline">asd</pre><span class="citation no-link">, '
         '<span class="id_token">id.,</span> at 123. </span><pre class="'
         'inline">Lo rem ip sum. asdf</pre><span class="citation '
         'no-link">, <span class="id_token">id.,</span> at 123. </span>'
         '<pre class="inline">Lo rem ip.</pre>'),

        # Id. citation across line break
        ('asdf." Id., at 315.\n    Lorem ipsum dolor sit amet',
         '<pre class="inline">asdf."</pre><span class="citation no-link"> '
         '<span class="id_token">Id.,</span> at 315.\n</span><pre class="'
         'inline">    Lorem ipsum dolor sit amet</pre>'),

        # Ibid. citation ("... Ibid.")
        ('asdf, Ibid. Lorem ipsum dolor sit amet',
         '<pre class="inline">asdf</pre><span class="citation no-link">, '
         '<span class="id_token">Ibid.</span> Lorem ipsum dolor </span>'
         '<pre class="inline">sit amet</pre>'),

        # NonopinionCitation (currently nothing should happen here)
        ('Lorem ipsum dolor sit amet. U.S. Code §3617. Foo bar.',
         '<pre class="inline">Lorem ipsum dolor sit amet. U.S. Code '
         '§3617. Foo bar.</pre>'),
    ]

    # fmt: on
    for s, expected_html in test_pairs:
        print(
            "Testing plain text to html conversion for %s..." % s,
            end=" ",
        )
        opinion = Opinion(plain_text=s)
        # Parse citations from the raw text, then annotate the opinion.
        citations = get_citations(s)
        created_html = create_cited_html(opinion, citations)
        self.assertEqual(
            created_html,
            expected_html,
            msg="\n%s\n\n !=\n\n%s" % (created_html, expected_html),
        )
        print("✓")