def setUp(self):
    """Build a docket plus a two-opinion cluster (one citing the other),
    then scrape the test oral-argument site to add audio content."""
    docket = Docket(
        case_name='foo',
        court=Court.objects.get(pk='test'),
        source=Docket.DEFAULT,
    )
    docket.save()

    # Must be more than a year old for all tests to be runnable.
    filing_date = now().date() - timedelta(days=400)
    self.doc_cluster = OpinionCluster(
        case_name='foo',
        docket=docket,
        date_filed=filing_date,
    )
    self.doc_cluster.save(index=False)

    lead = Opinion(cluster=self.doc_cluster, type='Lead Opinion')
    lead.save(index=False)
    concurrence = Opinion(cluster=self.doc_cluster, type='Concurrence')
    concurrence.save(index=False)
    OpinionsCited.objects.create(
        citing_opinion=concurrence,
        cited_opinion=lead,
    )

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    OralArgumentCommand().scrape_court(site, full_crawl=True)
def setUp(self) -> None:
    """Create fixture data: a docket, an opinion cluster with a citing
    pair of opinions, and scraped oral-argument audio."""
    test_court = Court.objects.get(pk="test")
    docket = Docket(
        case_name="foo",
        court=test_court,
        source=Docket.DEFAULT,
    )
    docket.save()

    # Must be more than a year old for all tests to be runnable.
    old_enough = now().date() - timedelta(days=400)
    self.doc_cluster = OpinionCluster(
        case_name="foo",
        docket=docket,
        date_filed=old_enough,
    )
    self.doc_cluster.save(index=False)

    cited = Opinion(cluster=self.doc_cluster, type="Lead Opinion")
    cited.save(index=False)
    citing = Opinion(cluster=self.doc_cluster, type="Concurrence")
    citing.save(index=False)
    OpinionsCited.objects.create(
        citing_opinion=citing,
        cited_opinion=cited,
    )

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    OralArgumentCommand().scrape_court(site, full_crawl=True)
class StaticFilesTest(TestCase):
    """Tests that `serve_static_file` serves uploaded audio, text, and PDF
    files with the correct status, Content-Type, and inline disposition."""

    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
        'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        """Create a docket with one audio item and two opinions (one txt,
        one pdf) pointing at the fixture files above."""
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()
        self.audio = Audio(local_path_original_file=self.good_mp3_path,
                           local_path_mp3=self.good_mp3_path,
                           docket=self.docket,
                           blocked=False,
                           case_name_full='Ander v. Leo',
                           date_created=datetime.date(2014, 6, 9))
        self.audio.save(index=False)
        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)
        self.txtopinion = Opinion(cluster=self.opinioncluster,
                                  type='Lead Opinion',
                                  local_path=self.good_txt_path)
        self.txtopinion.save(index=False)
        self.pdfopinion = Opinion(cluster=self.opinioncluster,
                                  type='Lead Opinion',
                                  local_path=self.good_pdf_path)
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        """An mp3 is served inline with the audio/mpeg content type."""
        request = HttpRequest()
        # Removed an unused `file_path = self.audio.local_path_mp3` local;
        # the view is called with the known-good path directly.
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        """A txt file is served inline as text/plain with its contents."""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        self.assertIn('FOR THE DISTRICT OF COLUMBIA CIRCUIT',
                      response.content)

    def test_serve_static_file_serves_pdf(self):
        """A pdf is served inline with the application/pdf content type."""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])
def add_oc_and_o(self, old_document, old_citation, old_docket, new_docket):
    """Add the OpinionCluster and Opinion, updating existing items if
    present.

    Copies every field from the old Document/Citation rows into the new
    OpinionCluster/Opinion schema, reusing the old primary keys so that
    cross-references keep working after the migration.

    :param old_document: Document row from the old schema.
    :param old_citation: Citation row attached to ``old_document``.
    :param old_docket: Old Docket; supplies the case-name fields.
    :param new_docket: Already-migrated Docket to attach the cluster to.
    """
    new_opinion_cluster = OpinionClusterNew(
        pk=old_document.pk,  # Reuse the old PK so IDs match across schemas.
        docket=new_docket,
        judges=self._none_to_blank(old_document.judges),
        date_modified=old_document.date_modified,
        # NOTE(review): date_created is copied from date_modified —
        # presumably the old schema has no creation timestamp; confirm.
        date_created=old_document.date_modified,
        date_filed=old_document.date_filed,
        slug=self._none_to_blank(old_citation.slug),
        citation_id=old_document.citation_id,
        case_name_short=old_docket.case_name_short,
        case_name=old_docket.case_name,
        case_name_full=old_docket.case_name_full,
        # Citation fields are nullable in the old schema but blank in the
        # new one, hence the _none_to_blank conversions throughout.
        federal_cite_one=self._none_to_blank(old_citation.federal_cite_one),
        federal_cite_two=self._none_to_blank(old_citation.federal_cite_two),
        federal_cite_three=self._none_to_blank(old_citation.federal_cite_three),
        state_cite_one=self._none_to_blank(old_citation.state_cite_one),
        state_cite_two=self._none_to_blank(old_citation.state_cite_two),
        state_cite_three=self._none_to_blank(old_citation.state_cite_three),
        state_cite_regional=self._none_to_blank(old_citation.state_cite_regional),
        specialty_cite_one=self._none_to_blank(old_citation.specialty_cite_one),
        scotus_early_cite=self._none_to_blank(old_citation.scotus_early_cite),
        lexis_cite=self._none_to_blank(old_citation.lexis_cite),
        westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
        neutral_cite=self._none_to_blank(old_citation.neutral_cite),
        scdb_id=self._none_to_blank(old_document.supreme_court_db_id),
        source=old_document.source,
        nature_of_suit=old_document.nature_of_suit,
        citation_count=old_document.citation_count,
        precedential_status=old_document.precedential_status,
        date_blocked=old_document.date_blocked,
        blocked=old_document.blocked,
    )
    new_opinion_cluster.save(
        using='default',
        index=False,  # Skip search indexing during the bulk migration.
    )

    new_opinion = OpinionNew(
        pk=old_document.pk,  # Same PK reuse as the cluster above.
        cluster=new_opinion_cluster,
        date_modified=old_document.date_modified,
        date_created=old_document.time_retrieved,
        type='010combined',  # All migrated opinions are "combined" type.
        sha1=old_document.sha1,
        download_url=old_document.download_url,
        local_path=old_document.local_path,
        plain_text=old_document.plain_text,
        html=self._none_to_blank(old_document.html),
        html_lawbox=self._none_to_blank(old_document.html_lawbox),
        html_with_citations=old_document.html_with_citations,
        extracted_by_ocr=old_document.extracted_by_ocr,
    )
    new_opinion.save(
        using='default',
        index=False,
    )
def test_save_old_opinion(self):
    """Can we save opinions older than 1900?

    Pre-1900 dates historically broke file-path generation because
    `strftime` could not handle years before 1900.
    """
    docket = Docket(case_name=u"Blah", court_id='test',
                    source=Docket.DEFAULT)
    docket.save()
    oc = OpinionCluster(
        case_name=u"Blah",
        docket=docket,
        date_filed=datetime.date(1899, 1, 1),
    )
    oc.save()
    o = Opinion(cluster=oc, type='Lead Opinion')
    try:
        # ContentFile takes the payload directly; the old
        # StringIO.StringIO('blah').read() round-trip was Python 2-only
        # and redundant.
        cf = ContentFile('blah')
        o.file_with_date = datetime.date(1899, 1, 1)
        o.local_path.save('file_name.pdf', cf, save=False)
        o.save(index=False)
    except ValueError as e:
        # Chain the original error so the real traceback is preserved.
        raise ValueError("Unable to save a case older than 1900. Did you "
                         "try to use `strftime`...again?") from e
def migrate_opinions_oral_args_and_dockets(self):
    """Migrate dockets, audio files, and opinions from the old database to
    the new one, preserving primary keys.

    For each old docket, takes at most one attached document and one audio
    file, creates the new Docket, then an OpinionCluster/Opinion pair (if a
    document exists) and an Audio row (if audio exists). All saves go to
    the 'default' DB alias with Solr indexing disabled.
    """
    self.stdout.write("Migrating dockets, audio files, and opinions to new "
                      "database...")
    q = DocketOld.objects.using('old').all()
    old_dockets = queryset_generator(q)  # Memory-friendly iteration.
    num_dockets = q.count()
    progress = 0
    self._print_progress(progress, num_dockets)
    for old_docket in old_dockets:
        # First do the docket, then create the cluster and opinion objects.
        # Only the first audio file / document per docket is migrated.
        try:
            old_audio = old_docket.audio_files.all()[0]
        except IndexError:
            old_audio = None
        try:
            old_document = old_docket.documents.all()[0]
        except IndexError:
            old_document = None
        if old_document is not None:
            old_citation = old_document.citation
            old_doc_case_name, old_doc_case_name_full, old_doc_case_name_short = self._get_case_names(old_citation.case_name)
        if old_audio is not None:
            old_audio_case_name, old_audio_case_name_full, old_audio_case_name_short = self._get_case_names(old_audio.case_name)

        court = CourtNew.objects.get(pk=old_docket.court_id)  # Courts are in place thanks to initial data.

        # NOTE(review): if old_document is None, old_doc_case_name and
        # old_citation below are undefined and this raises NameError —
        # presumably every old docket has a document; confirm.
        new_docket = DocketNew(
            pk=old_docket.pk,  # Keep the same PK across databases.
            date_modified=old_docket.date_modified,
            # NOTE(review): date_created copied from date_modified; the old
            # schema apparently lacks a creation timestamp.
            date_created=old_docket.date_modified,
            court=court,
            case_name=old_doc_case_name,
            case_name_full=old_doc_case_name_full,
            case_name_short=old_doc_case_name_short,
            slug=self._none_to_blank(old_docket.slug),
            docket_number=self._none_to_blank(old_citation.docket_number),
            date_blocked=old_docket.date_blocked,
            blocked=old_docket.blocked,
        )
        if old_audio is not None:
            new_docket.date_argued = old_audio.date_argued
        new_docket.save(using='default')

        if old_document is not None:
            new_opinion_cluster = OpinionClusterNew(
                pk=old_document.pk,  # PK reuse, as with the docket.
                docket=new_docket,
                judges=self._none_to_blank(old_document.judges),
                date_modified=old_document.date_modified,
                date_created=old_document.date_modified,
                date_filed=old_document.date_filed,
                slug=self._none_to_blank(old_citation.slug),
                citation_id=old_document.citation_id,
                case_name_short=old_doc_case_name_short,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                # Old nullable citation fields become blank strings.
                federal_cite_one=self._none_to_blank(
                    old_citation.federal_cite_one),
                federal_cite_two=self._none_to_blank(
                    old_citation.federal_cite_two),
                federal_cite_three=self._none_to_blank(
                    old_citation.federal_cite_three),
                state_cite_one=self._none_to_blank(
                    old_citation.state_cite_one),
                state_cite_two=self._none_to_blank(
                    old_citation.state_cite_two),
                state_cite_three=self._none_to_blank(
                    old_citation.state_cite_three),
                state_cite_regional=self._none_to_blank(
                    old_citation.state_cite_regional),
                specialty_cite_one=self._none_to_blank(
                    old_citation.specialty_cite_one),
                scotus_early_cite=self._none_to_blank(
                    old_citation.scotus_early_cite),
                lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                scdb_id=self._none_to_blank(
                    old_document.supreme_court_db_id),
                source=old_document.source,
                nature_of_suit=old_document.nature_of_suit,
                citation_count=old_document.citation_count,
                precedential_status=old_document.precedential_status,
                date_blocked=old_document.date_blocked,
                blocked=old_document.blocked,
            )
            new_opinion_cluster.save(
                using='default',
                index=False,  # No Solr indexing during migration.
            )

            new_opinion = OpinionNew(
                pk=old_document.pk,
                cluster=new_opinion_cluster,
                date_modified=old_document.date_modified,
                date_created=old_document.time_retrieved,
                type='010combined',  # All migrated opinions are combined.
                sha1=old_document.sha1,
                download_url=old_document.download_url,
                local_path=old_document.local_path,
                plain_text=old_document.plain_text,
                html=self._none_to_blank(old_document.html),
                html_lawbox=self._none_to_blank(old_document.html_lawbox),
                html_with_citations=old_document.html_with_citations,
                extracted_by_ocr=old_document.extracted_by_ocr,
            )
            new_opinion.save(
                using='default',
                index=False,
            )

        if old_audio is not None:
            new_audio_file = AudioNew(
                pk=old_audio.pk,
                docket=new_docket,
                source=old_audio.source,
                case_name=old_audio_case_name,
                case_name_short=old_audio_case_name_short,
                case_name_full=old_audio_case_name_full,
                judges=self._none_to_blank(old_audio.judges),
                date_created=old_audio.time_retrieved,
                date_modified=old_audio.date_modified,
                sha1=old_audio.sha1,
                download_url=old_audio.download_url,
                local_path_mp3=old_audio.local_path_mp3,
                local_path_original_file=old_audio.local_path_original_file,
                duration=old_audio.duration,
                processing_complete=old_audio.processing_complete,
                date_blocked=old_audio.date_blocked,
                blocked=old_audio.blocked,
            )
            new_audio_file.save(
                using='default',
                index=False,
            )

        progress += 1
        self._print_progress(progress, num_dockets)
    self.stdout.write(u'')  # Newline
def merge_or_add_opinions(
    cluster_id: int,
    html_str: str,
    data: Dict[str, Any],
    date_argued: datetime.date,
    date_filed: datetime.date,
    case_names: Dict[str, str],
    status: str,
    docket_number: str,
    found_citations: List[FoundCitation],
) -> Optional[Docket]:
    """Merge opinions if applicable.

    If opinion not in system, merge or add to cluster.
    If opinion in system came from harvard, add new opinion to cluster, else
    we merge new opinion data into scraped opinion.

    :param cluster_id: Opinion Cluster id.
    :param html_str: HTML opinion to add.
    :param data: Case data to import.
    :param date_argued: Date case was argued.
    :param date_filed: Date case was filed.
    :param case_names: A dict with the three case name types
    :param status: Whether it's precedential
    :param docket_number: The docket number
    :param found_citations: A list of FoundCitation objects.
    :return: The docket the opinion was merged into, or None if the
        cluster already has an anon-2020 opinion.
    """
    # Bail out early if this cluster already has an anon-2020 opinion.
    does_exist = (Opinion.objects.filter(cluster_id=cluster_id).exclude(
        html_anon_2020="").exists())
    if does_exist:
        logger.info(f"Opinion already in database at {cluster_id}")
        return

    logger.info(f"Starting merger of opinions in cluster {cluster_id}.")
    cluster = OpinionCluster.objects.get(pk=cluster_id)
    docket = cluster.docket

    # Dates are uniformly good in our dataset
    # validation and is_approx not needed

    # Merge docket information. Incoming values win only when truthy;
    # otherwise the existing value is kept.
    docket.add_anon_2020_source()
    docket.date_argued = date_argued or docket.date_argued
    docket.docket_number = docket_number or docket.docket_number
    docket.case_name_short = (case_names["case_name_short"]
                              or docket.case_name_short)
    docket.case_name = case_names["case_name"] or docket.case_name
    docket.case_name_full = (case_names["case_name_full"]
                             or docket.case_name_full)

    # Merge cluster information, same truthy-wins policy as the docket.
    cluster.date_filed = date_filed or cluster.date_filed
    cluster.precedential_status = status or cluster.precedential_status
    cluster.attorneys = data["representation"] or cluster.attorneys
    cluster.disposition = data["summary_disposition"] or cluster.disposition
    cluster.summary = data["summary_court"] or cluster.summary
    cluster.history = data["history"] or cluster.history
    cluster.cross_reference = (data["history_docket_numbers"]
                               or cluster.cross_reference)
    cluster.correction = data["publication_status_note"] or cluster.correction
    if data["judges"]:
        # Strip the set-literal braces the source wraps judges in.
        cluster.judges = (data["judges"].replace("{", "").replace("}", "")
                          or cluster.judges)
    cluster.case_name_short = (case_names["case_name_short"]
                               or cluster.case_name_short)
    cluster.case_name = case_names["case_name"] or cluster.case_name
    cluster.case_name_full = (case_names["case_name_full"]
                              or cluster.case_name_full)
    docket.save()
    cluster.save()

    # Add citations to cluster if applicable
    for citation in found_citations:
        Citation.objects.get_or_create(
            volume=citation.volume,
            reporter=citation.reporter,
            page=citation.page,
            type=map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]),
            cluster_id=cluster.id,
        )

    # Merge with scrape or add opinion to cluster with harvard
    if OpinionCluster.objects.get(pk=cluster_id).source == "C":
        # Scraped cluster: attach the anon-2020 HTML to the existing opinion.
        opinion = Opinion.objects.get(cluster_id=cluster_id)
        logger.info("Merge with Harvard data")
        opinion.html_anon_2020 = html_str
    else:
        # Harvard-sourced cluster: add the anon-2020 HTML as a new opinion.
        opinion = Opinion(
            cluster_id=cluster.id,
            type=Opinion.COMBINED,
            html_anon_2020=html_str,
            extracted_by_ocr=False,
        )
    opinion.save()
    logger.info(f"Finished merging opinion in cluster {cluster_id}.")
    return docket
def add_new_records(
    html_str: str,
    data: Dict[str, Any],
    date_argued: datetime.date,
    date_filed: datetime.date,
    case_names: Dict[str, str],
    status: str,
    docket_number: str,
    found_citations: List[FoundCitation],
    court_id: str,
) -> Docket:
    """Create new records in the DB based on parsed data

    :param html_str: HTML opinion to add
    :param data: Case data to import
    :param date_argued: Date case was argued.
    :param date_filed: Date case was filed.
    :param case_names: A dict with the three case name types
    :param status: Whether it's precedential
    :param docket_number: The docket number
    :param found_citations: A list of FoundCitation objects.
    :param court_id: The CL id of the court
    :return: The newly created Docket.
    """
    docket = Docket.objects.create(
        **case_names,
        docket_number=docket_number,
        court_id=court_id,
        source=Docket.ANON_2020,
        ia_needs_upload=False,
        date_argued=date_argued,
    )

    logger.info("Add cluster for: %s", found_citations[0].base_citation())
    judges = data["judges"] or ""
    cluster = OpinionCluster(
        **case_names,
        precedential_status=status,
        docket_id=docket.id,
        source=docket.ANON_2020,
        date_filed=date_filed,
        # Missing fields come through as falsy; store blank strings.
        attorneys=data["representation"] or "",
        disposition=data["summary_disposition"] or "",
        summary=data["summary_court"] or "",
        history=data["history"] or "",
        cross_reference=data["history_docket_numbers"] or "",
        correction=data["publication_status_note"] or "",
        # Strip the set-literal braces the source wraps judges in.
        judges=judges.replace("{", "").replace("}", "") or "",
    )
    cluster.save(index=False)  # Don't push to Solr here.

    for citation in found_citations:
        logger.info("Adding citation for: %s", citation.base_citation())
        # get_or_create keeps reruns idempotent for citations.
        Citation.objects.get_or_create(
            volume=citation.volume,
            reporter=citation.reporter,
            page=citation.page,
            type=map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]),
            cluster_id=cluster.id,
        )

    op = Opinion(
        cluster_id=cluster.id,
        type=Opinion.COMBINED,
        html_anon_2020=html_str,
        extracted_by_ocr=False,
    )
    op.save()
    logger.info(
        f"Finished importing cluster {cluster.id}; {found_citations[0].base_citation()}"
    )
    return docket
def make_and_save(item, skipdupes=False, min_dates=None, start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: The parsed case dict from `parse_opinions`.
    :param skipdupes: If True, silently skip duplicates instead of raising.
    :param min_dates: Optional {court_id: date}; skip cases on/after it.
    :param start_dates: Optional {court_id: date}; skip cases on/before it
        (before court founding).
    :param testing: If True, build objects but do not save them.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = \
        date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item["dates"]:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n"
                          % date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item["file"])

    # special rule for Kentucky
    if item["court_id"] == "kycourtapp" and main_date <= date(1975, 12, 31):
        item["court_id"] = "kycourtapphigh"

    if min_dates is not None:
        if min_dates.get(item["court_id"]) is not None:
            if main_date >= min_dates[item["court_id"]]:
                print(
                    main_date,
                    "after",
                    min_dates[item["court_id"]],
                    " -- skipping.",
                )
                return
    if start_dates is not None:
        if start_dates.get(item["court_id"]) is not None:
            if main_date <= start_dates[item["court_id"]]:
                print(
                    main_date,
                    "before court founding:",
                    start_dates[item["court_id"]],
                    " -- skipping.",
                )
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item["court_id"],
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        docket_number=item["docket"] or "",
    )

    # get citation objects in a list for addition to the cluster
    found_citations = []
    for c in item["citations"]:
        found = get_citations(clean_text(c, ["html", "inline_whitespace"]))
        if not found:
            # if the docket number --is-- citation string, we're likely
            # dealing with a somewhat common triplet of (docket number,
            # date, jurisdiction), which isn't a citation at all (so
            # there's no problem)
            if item["docket"]:
                docket_no = item["docket"].lower()
                if "claim no." in docket_no:
                    docket_no = docket_no.split("claim no.")[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, "")
                docket_no = docket_no.strip(".").strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for
            # months and a few trivial words) in the citation,
            # then it's not a citation at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, "")
            # BUGFIX: string.lowercase is Python 2-only and raises
            # AttributeError on Python 3; ascii_lowercase is the portable
            # name for the same constant.
            num_letters = sum(
                non_trivial.count(letter)
                for letter in string.ascii_lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate
            # a bad citation, then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (c, item["court_id"], item["docket"]))
        else:
            found_citations.extend(found.to_model())

    cluster = OpinionCluster(
        judges=item.get("judges", "") or "",
        precedential_status=("Unpublished" if item["unpublished"]
                             else "Published"),
        date_filed=main_date,
        case_name_short=item["case_name_short"] or "",
        case_name=item["case_name"] or "",
        case_name_full=item["case_name_full"] or "",
        source="Z",
        attorneys=item["attorneys"] or "",
        posture=item["posture"] or "",
    )
    panel = lookup_judges_by_last_name_list(item["panel"], item["court_id"],
                                            panel_date)

    opinions = []
    for i, opinion_info in enumerate(item["opinions"]):
        if opinion_info["author"] is None:
            author = None
        else:
            author = lookup_judge_by_last_name(opinion_info["author"],
                                               item["court_id"], panel_date)

        converted_text = convert_columbia_html(opinion_info["opinion"])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info["type"]]
        # Only the first opinion may be the lead; demote later "leads".
        if opinion_type == Opinion.LEAD and i > 0:
            opinion_type = Opinion.ADDENDUM

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info["per_curiam"],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info["sha1"],
            # This is surely not updated for the new S3 world. If you're
            # reading this, you'll need to update this code.
            local_path=opinion_info["local_path"],
        )
        joined_by = lookup_judges_by_last_name_list(item["joining"],
                                                    item["court_id"],
                                                    panel_date)
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print("Duplicate. skipping.")
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for citation in found_citations:
                citation.cluster = cluster
                citation.save()
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s"
                  % (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
def parse_harvard_opinions(reporter, volume, make_searchable):
    """Parse downloaded CaseLaw Corpus from internet archive and add them to
    our database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    # A volume without a reporter is ambiguous; refuse to run.
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        # Skip files already imported (keyed on the source JSON path).
        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        # All DB writes for this case happen atomically.
        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                # Inner atomic block so a failed save can be retried with a
                # truncated docket number without poisoning the outer one.
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    # Docket number too long for the column; truncate and
                    # stash the full value in the correction field.
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )
            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifiying Per Curiam
                for elem in [op.find("author")]:
                    if elem is not None:
                        # Drop page-number markers embedded in author tags.
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = True if author_tag_str == "Per Curiam" else False
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def make_and_save(item, skipdupes=False, min_dates=None, testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    min_date: if not none, will skip cases after min_date

    :param item: The parsed case dict from `parse_opinions`.
    :param skipdupes: If True, silently skip duplicates instead of raising.
    :param min_dates: Optional {court_id: date}; skip cases on/after it.
    :param testing: If True, build objects but do not save them.
    """
    date_filed = date_argued = date_reargued = date_reargument_denied = \
        date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n"
                          % date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued
                 or date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied
                  or date_filed or unknown_date)

    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return

    docket = Docket(
        source=Docket.COLUMBIA,
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely
            # dealing with a somewhat common triplet of (docket number,
            # date, jurisdiction), which isn't a citation at all (so
            # there's no problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue

            # there are a trivial number of letters (except for months and a
            # few trivial words) in the citation, then it's not a citation
            # at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # NOTE(review): string.lowercase is Python 2-only; under
            # Python 3 this raises AttributeError. The portable name is
            # string.ascii_lowercase. Left as-is because this module
            # appears to be Python 2-era code.
            num_letters = sum(non_trivial.count(letter)
                              for letter in string.lowercase)
            if num_letters < 3:
                continue

            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (
                                c, item['court_id'], item['docket']
                            ))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished']
                             else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    panel = [find_person(n, item['court_id'], case_date=panel_date)
             for n in item['panel']]
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)

        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # Only the first opinion may be the lead; demote later "leads" to
        # addenda.
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'

        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            # type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [find_person(n, item['court_id'], case_date=panel_date)
                     for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster, panel, opinions)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)."
                                % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s"
                  % (domain, cluster.get_absolute_url()))
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
def make_and_save(item):
    """Associate case data from `parse_opinions` with objects and save them.

    Builds and saves a Docket, an OpinionCluster, and one Opinion per entry
    in ``item['opinions']``, plus panel / joined_by m2m rows.

    :param item: dict of parsed case data. Keys read here: 'dates',
        'court_id', 'case_name_short', 'case_name', 'case_name_full',
        'docket', 'citations', 'unpublished', 'attorneys', 'posture',
        'panel', 'opinions'.
    :raises Exception: if a citation string yields zero citations, or more
        than one.
    """
    # Sort every tagged date in the source data into one of the typed
    # date variables below.
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # a "decided" date only stands in when no filed date exists
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                print("Found unknown date tag '%s' with date '%s'." % date_info)

    docket = Docket(
        date_argued=date_argued,
        date_reargued=date_reargued,
        date_cert_granted=date_cert_granted,
        date_cert_denied=date_cert_denied,
        date_reargument_denied=date_reargument_denied,
        court_id=item['court_id'],
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        docket_number=item['docket'] or ''
    )
    docket.save()

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            raise Exception("Failed to get a citation from the string '%s'." % c)
        elif len(found) > 1:
            raise Exception("Got multiple citations from string '%s' when there should have been one." % c)
        found_citations.append(found[0])
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        docket=docket,
        precedential_status=('Unpublished' if item['unpublished'] else 'Published'),
        date_filed=date_filed,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map
    )
    # NOTE(review): other save() calls in this codebase pass index=False;
    # this one indexes immediately -- confirm that is intended here.
    cluster.save()

    # judges on the panel are dated by argument when available, else filing
    if date_argued is not None:
        paneldate = date_argued
    else:
        paneldate = date_filed
    panel = [find_person(n, item['court_id'], paneldate) for n in item['panel']]
    # drop panel names that couldn't be matched to a person
    panel = [x for x in panel if x is not None]
    for member in panel:
        cluster.panel.add(member)

    for opinion_info in item['opinions']:
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'], date_filed or date_argued)
        opinion = Opinion(
            cluster=cluster,
            author=author,
            type=OPINION_TYPE_MAPPING[opinion_info['type']],
            html_columbia=opinion_info['opinion']
        )
        opinion.save()
        # joining judges are matched the same way as the panel
        joined_by = [find_person(n, item['court_id'], paneldate) for n in opinion_info['joining']]
        joined_by = [x for x in joined_by if x is not None]
        for joiner in joined_by:
            opinion.joined_by.add(joiner)
class StaticFilesTest(TestCase):
    """Check that stored documents are served with the right headers.

    Exercises `serve_static_file` against an MP3, a plain-text opinion, and
    a PDF opinion, asserting HTTP 200, the expected Content-Type, and an
    inline Content-Disposition for each.
    """

    good_mp3_path = "mp3/2014/06/09/ander_v._leo.mp3"
    good_txt_path = "txt/2015/12/28/opinion_text.txt"
    good_pdf_path = (
        "pdf/2013/06/12/"
        + "in_re_motion_for_consent_to_disclosure_of_court_records.pdf")

    def setUp(self):
        """Create one docket carrying an audio file plus a cluster with a
        txt opinion and a pdf opinion, all pointing at the paths above.
        """
        self.court = Court.objects.get(pk="test")
        self.docket = Docket(case_name=u"Docket", court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()
        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full="Ander v. Leo",
            date_created=datetime.date(2014, 6, 9),
        )
        # index=False throughout: skip indexing side effects, consistent
        # with the other save() calls in this test module.
        self.audio.save(index=False)
        self.opinioncluster = OpinionCluster(
            case_name=u"Hotline Bling",
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_txt_path,
        )
        self.txtopinion.save(index=False)
        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_pdf_path,
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        """An MP3 is served inline with an audio/mpeg Content-Type."""
        request = HttpRequest()
        # (removed an unused `file_path` local; the call below already
        # passes good_mp3_path explicitly)
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "audio/mpeg")
        self.assertIn("inline;", response["Content-Disposition"])

    def test_serve_static_file_serves_txt(self):
        """A text opinion is served inline as text/plain with its body."""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "text/plain")
        self.assertIn("inline;", response["Content-Disposition"])
        self.assertIn("FOR THE DISTRICT OF COLUMBIA CIRCUIT",
                      response.content)

    def test_serve_static_file_serves_pdf(self):
        """A PDF opinion is served inline as application/pdf."""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "application/pdf")
        self.assertIn("inline;", response["Content-Disposition"])
def add_oc_and_o(self, old_document, old_citation, old_docket, new_docket):
    """Add the OpinionCluster and Opinion, updating existing items if
    present.

    Copies fields from the legacy Document/Citation/Docket rows into the
    new OpinionCluster and Opinion models. The legacy primary key is reused
    for both new rows, which is what makes re-running this update existing
    items rather than duplicate them.

    :param old_document: legacy Document being migrated.
    :param old_citation: legacy Citation tied to that document.
    :param old_docket: legacy Docket the document belonged to (source of
        the case-name fields).
    :param new_docket: already-migrated Docket to attach the cluster to.
    """
    new_opinion_cluster = OpinionClusterNew(
        pk=old_document.pk,  # reuse the legacy pk so reruns update in place
        docket=new_docket,
        judges=self._none_to_blank(old_document.judges),
        date_modified=old_document.date_modified,
        # NOTE(review): date_created is copied from date_modified --
        # presumably the legacy model has no creation stamp; confirm.
        date_created=old_document.date_modified,
        date_filed=old_document.date_filed,
        slug=self._none_to_blank(old_citation.slug),
        citation_id=old_document.citation_id,
        # case names come from the old docket, not the document itself
        case_name_short=old_docket.case_name_short,
        case_name=old_docket.case_name,
        case_name_full=old_docket.case_name_full,
        # the citation fields are nullable on the legacy model; coerce
        # None to '' for the new blank-not-null fields
        federal_cite_one=self._none_to_blank(
            old_citation.federal_cite_one),
        federal_cite_two=self._none_to_blank(
            old_citation.federal_cite_two),
        federal_cite_three=self._none_to_blank(
            old_citation.federal_cite_three),
        state_cite_one=self._none_to_blank(old_citation.state_cite_one),
        state_cite_two=self._none_to_blank(old_citation.state_cite_two),
        state_cite_three=self._none_to_blank(
            old_citation.state_cite_three),
        state_cite_regional=self._none_to_blank(
            old_citation.state_cite_regional),
        specialty_cite_one=self._none_to_blank(
            old_citation.specialty_cite_one),
        scotus_early_cite=self._none_to_blank(
            old_citation.scotus_early_cite),
        lexis_cite=self._none_to_blank(old_citation.lexis_cite),
        westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
        neutral_cite=self._none_to_blank(old_citation.neutral_cite),
        scdb_id=self._none_to_blank(old_document.supreme_court_db_id),
        source=old_document.source,
        nature_of_suit=old_document.nature_of_suit,
        citation_count=old_document.citation_count,
        precedential_status=old_document.precedential_status,
        date_blocked=old_document.date_blocked,
        blocked=old_document.blocked,
    )
    # index=False: presumably skips pushing to the search index during the
    # migration -- verify against the model's save() implementation.
    new_opinion_cluster.save(
        using='default',
        index=False,
    )
    new_opinion = OpinionNew(
        pk=old_document.pk,  # same pk as the cluster (1:1 migration)
        cluster=new_opinion_cluster,
        date_modified=old_document.date_modified,
        date_created=old_document.time_retrieved,
        # '010combined' -- presumably every legacy document maps to the
        # "combined opinion" type; confirm against OPINION_TYPE choices.
        type='010combined',
        sha1=old_document.sha1,
        download_url=old_document.download_url,
        local_path=old_document.local_path,
        plain_text=old_document.plain_text,
        html=self._none_to_blank(old_document.html),
        html_lawbox=self._none_to_blank(old_document.html_lawbox),
        html_with_citations=old_document.html_with_citations,
        extracted_by_ocr=old_document.extracted_by_ocr,
    )
    new_opinion.save(
        using='default',
        index=False,
    )
def make_and_save(item, skipdupes=False, min_dates=None, start_dates=None,
                  testing=True):
    """Associates case data from `parse_opinions` with objects. Saves these
    objects.

    :param item: dict of parsed case data (see keys read below).
    :param skipdupes: when True, silently skip items detected as duplicates
        instead of raising.
    :param min_dates: optional {court_id: date} map; items whose main date
        is on/after the court's entry are skipped. (Duplicate checking is
        also skipped whenever min_dates is given.)
    :param start_dates: optional {court_id: date} map; items whose main
        date is on/before the court's founding date are skipped.
    :param testing: when True (the default), build the objects but do NOT
        save anything to the database.
    :raises Exception: if no usable date is found, if a citation string
        can't be parsed (and isn't excusable), or if duplicates are found
        and skipdupes is False.
    """
    # Sort every tagged date in the source data into one of the typed
    # date variables below.
    date_filed = date_argued = date_reargued = date_reargument_denied = date_cert_granted = date_cert_denied = None
    unknown_date = None
    for date_cluster in item['dates']:
        for date_info in date_cluster:
            # check for any dates that clearly aren't dates
            if date_info[1].year < 1600 or date_info[1].year > 2020:
                continue
            # check for untagged dates that will be assigned to date_filed
            if date_info[0] is None:
                date_filed = date_info[1]
                continue
            # try to figure out what type of date it is based on its tag
            # string
            if date_info[0] in FILED_TAGS:
                date_filed = date_info[1]
            elif date_info[0] in DECIDED_TAGS:
                # a "decided" date only stands in when no filed date exists
                if not date_filed:
                    date_filed = date_info[1]
            elif date_info[0] in ARGUED_TAGS:
                date_argued = date_info[1]
            elif date_info[0] in REARGUE_TAGS:
                date_reargued = date_info[1]
            elif date_info[0] in REARGUE_DENIED_TAGS:
                date_reargument_denied = date_info[1]
            elif date_info[0] in CERT_GRANTED_TAGS:
                date_cert_granted = date_info[1]
            elif date_info[0] in CERT_DENIED_TAGS:
                date_cert_denied = date_info[1]
            else:
                unknown_date = date_info[1]
                if date_info[0] not in UNKNOWN_TAGS:
                    print("\nFound unknown date tag '%s' with date '%s'.\n" % date_info)

    # the main date (used for date_filed in OpinionCluster) and panel dates
    # (used for finding judges) are ordered in terms of which type of dates
    # best reflect them
    main_date = (date_filed or date_argued or date_reargued or
                 date_reargument_denied or unknown_date)
    panel_date = (date_argued or date_reargued or date_reargument_denied or
                  date_filed or unknown_date)
    if main_date is None:
        raise Exception("Failed to get a date for " + item['file'])

    # special rule for Kentucky
    if item['court_id'] == 'kycourtapp' and main_date <= date(1975, 12, 31):
        item['court_id'] = 'kycourtapphigh'

    # skip items outside the per-court date windows, when given
    if min_dates is not None:
        if min_dates.get(item['court_id']) is not None:
            if main_date >= min_dates[item['court_id']]:
                print(main_date, 'after', min_dates[item['court_id']],
                      ' -- skipping.')
                return
    if start_dates is not None:
        if start_dates.get(item['court_id']) is not None:
            if main_date <= start_dates[item['court_id']]:
                print(main_date, 'before court founding:',
                      start_dates[item['court_id']], ' -- skipping.')
                return

    docket = Docket(source=Docket.COLUMBIA,
                    date_argued=date_argued,
                    date_reargued=date_reargued,
                    date_cert_granted=date_cert_granted,
                    date_cert_denied=date_cert_denied,
                    date_reargument_denied=date_reargument_denied,
                    court_id=item['court_id'],
                    case_name_short=item['case_name_short'] or '',
                    case_name=item['case_name'] or '',
                    case_name_full=item['case_name_full'] or '',
                    docket_number=item['docket'] or '')

    # get citations in the form of, e.g. {'federal_cite_one': '1 U.S. 1', ...}
    found_citations = []
    for c in item['citations']:
        found = get_citations(c)
        if not found:
            # if the docket number --is-- citation string, we're likely dealing
            # with a somewhat common triplet of (docket number, date,
            # jurisdiction), which isn't a citation at all (so there's no
            # problem)
            if item['docket']:
                docket_no = item['docket'].lower()
                if 'claim no.' in docket_no:
                    docket_no = docket_no.split('claim no.')[0]
                for junk in DOCKET_JUNK:
                    docket_no = docket_no.replace(junk, '')
                docket_no = docket_no.strip('.').strip()
                if docket_no and docket_no in c.lower():
                    continue
            # if there are a trivial number of letters (except for months and
            # a few trivial words) in the citation, then it's not a citation
            # at all
            non_trivial = c.lower()
            for trivial in TRIVIAL_CITE_WORDS:
                non_trivial = non_trivial.replace(trivial, '')
            # NOTE(review): string.lowercase is Python 2 only; under
            # Python 3 this would need string.ascii_lowercase.
            num_letters = sum(
                non_trivial.count(letter) for letter in string.lowercase)
            if num_letters < 3:
                continue
            # if there is a string that's known to indicate a bad citation,
            # then it's not a citation
            if any(bad in c for bad in BAD_CITES):
                continue
            # otherwise, this is a problem
            raise Exception("Failed to get a citation from the string '%s' in "
                            "court '%s' with docket '%s'."
                            % (c, item['court_id'], item['docket']))
        else:
            found_citations.extend(found)
    citations_map = map_citations_to_models(found_citations)

    cluster = OpinionCluster(
        judges=item.get('judges', '') or "",
        precedential_status=('Unpublished' if item['unpublished']
                             else 'Published'),
        date_filed=main_date,
        case_name_short=item['case_name_short'] or '',
        case_name=item['case_name'] or '',
        case_name_full=item['case_name_full'] or '',
        source='Z',
        attorneys=item['attorneys'] or '',
        posture=item['posture'] or '',
        **citations_map)
    panel = [
        find_person(n, item['court_id'], case_date=panel_date)
        for n in item['panel']
    ]
    # drop panel names that couldn't be matched to a person
    panel = [x for x in panel if x is not None]

    opinions = []
    for i, opinion_info in enumerate(item['opinions']):
        if opinion_info['author'] is None:
            author = None
        else:
            author = find_person(opinion_info['author'], item['court_id'],
                                 case_date=panel_date)
        converted_text = convert_columbia_html(opinion_info['opinion'])
        opinion_type = OPINION_TYPE_MAPPING[opinion_info['type']]
        # only the first opinion may be the lead; later "lead" opinions are
        # demoted to addenda
        if opinion_type == '020lead' and i > 0:
            opinion_type = '050addendum'
        opinion = Opinion(
            author=author,
            per_curiam=opinion_info['per_curiam'],
            type=opinion_type,
            html_columbia=converted_text,
            sha1=opinion_info['sha1'],
            local_path=opinion_info['local_path'],
        )
        joined_by = [
            find_person(n, item['court_id'], case_date=panel_date)
            for n in opinion_info['joining']
        ]
        joined_by = [x for x in joined_by if x is not None]
        opinions.append((opinion, joined_by))

    if min_dates is None:
        # check to see if this is a duplicate
        dups = find_dups(docket, cluster)
        if dups:
            if skipdupes:
                print('Duplicate. skipping.')
            else:
                raise Exception("Found %s duplicate(s)." % len(dups))

    # save all the objects
    if not testing:
        try:
            docket.save()
            cluster.docket = docket
            cluster.save(index=False)
            for member in panel:
                cluster.panel.add(member)
            for opinion, joined_by in opinions:
                opinion.cluster = cluster
                opinion.save(index=False)
                for joiner in joined_by:
                    opinion.joined_by.add(joiner)
            if settings.DEBUG:
                domain = "http://127.0.0.1:8000"
            else:
                domain = "https://www.courtlistener.com"
            print("Created item at: %s%s" %
                  (domain, cluster.get_absolute_url()))
        # NOTE(review): the bare excepts below are deliberate best-effort
        # cleanup -- the outer one re-raises after trying to roll back the
        # docket -- but `except Exception:` would be safer style.
        except:
            # if anything goes wrong, try to delete everything
            try:
                docket.delete()
            except:
                pass
            raise
class StaticFilesTest(TestCase):
    """Check that stored documents are served with the right headers.

    Exercises `serve_static_file` against an MP3, a plain-text opinion, and
    a PDF opinion, asserting HTTP 200, the expected Content-Type, and an
    inline Content-Disposition for each.
    """

    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
        'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        """Create one docket carrying an audio file plus a cluster with a
        txt opinion and a pdf opinion, all pointing at the paths above.
        """
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()
        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9)
        )
        # index=False throughout: skip indexing side effects, consistent
        # with the other save() calls in this test module.
        self.audio.save(index=False)
        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path
        )
        self.txtopinion.save(index=False)
        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        """An MP3 is served inline with an audio/mpeg Content-Type."""
        request = HttpRequest()
        # (removed an unused `file_path` local; the call below already
        # passes good_mp3_path explicitly)
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        """A text opinion is served inline as text/plain with its body."""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        self.assertIn(
            'FOR THE DISTRICT OF COLUMBIA CIRCUIT',
            response.content
        )

    def test_serve_static_file_serves_pdf(self):
        """A PDF opinion is served inline as application/pdf."""
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])