def import_anon_2020_db(
    import_dir: str,
    skip_until: Optional[str],
    make_searchable: Optional[bool],
) -> None:
    """Import data from the anon 2020 DB into our system.

    Iterate over thousands of directories, each containing a tax case with its
    case JSON and a preprocessed HTML object. Check whether we already have a
    copy of the opinion in our system and either add the opinion to an
    existing case or create a new docket, cluster, citations, and opinion in
    our database.

    :param import_dir: Location of the directory of import data.
    :param skip_until: ID of the case we should begin processing at, if any.
    :param make_searchable: Should we add content to Solr?
    :return: None.
    """
    directories = iglob(f"{import_dir}/*/????-*.json")
    for dir in directories:
        logger.info(f"Importing case id: {dir.split('/')[-2]}")
        if skip_until:
            if skip_until in dir:
                skip_until = None
            continue

        # Prepare data and html
        with open(dir, "rb") as f:
            data = json.load(f)
        with open(dir.replace("json", "html"), "rb") as f:
            soup = bs4(f.read(), "html.parser")

        case_names = do_case_name(soup, data)
        court_id = find_court_id(data["court"])
        date_argued, date_filed = process_dates(data)
        docket_number = do_docket_number(data)
        html_str = soup.find("div", {"class": "container"}).decode_contents()
        found_cites = find_cites(data)
        status = check_publication_status(found_cites)

        cluster_id = None
        if found_cites:
            cluster_id = attempt_cluster_lookup(found_cites, docket_number)

        if cluster_id is not None:
            # Matching citations. Merge.
            docket = merge_or_add_opinions(
                cluster_id,
                html_str,
                data,
                date_argued,
                date_filed,
                case_names,
                status,
                docket_number,
                found_cites,
            )
        else:
            # No matching citations. Create new records.
            docket = add_new_records(
                html_str,
                data,
                date_argued,
                date_filed,
                case_names,
                status,
                docket_number,
                found_cites,
                court_id,
            )

        if make_searchable and docket:
            add_items_to_solr.delay([docket.pk], "search.Docket")
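# Usage sketch (illustrative, not from the source): import_anon_2020_db reads
# naturally as the handle() of a Django management command. The command class,
# option names, and help text below are assumptions for illustration only.
from typing import Any

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Import the anonymous 2020 tax-case corpus."  # hypothetical wrapper

    def add_arguments(self, parser: Any) -> None:
        parser.add_argument("--import-dir", required=True)
        parser.add_argument("--skip-until", default=None)
        parser.add_argument("--make-searchable", action="store_true")

    def handle(self, *args: Any, **options: Any) -> None:
        import_anon_2020_db(
            import_dir=options["import_dir"],
            skip_until=options["skip_until"],
            make_searchable=options["make_searchable"],
        )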
def find_citations_for_opinion_by_pks(self, opinion_pks, index=True):
    """Find citations for search.Opinion objects.

    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Returns a list of Citation objects, i.e., something like
        # [FullCitation, FullCitation, ShortformCitation, FullCitation,
        #  SupraCitation, SupraCitation, ShortformCitation, FullCitation]
        citations = get_document_citations(opinion)

        # If no citations are found, continue
        if not citations:
            continue

        # Match all those different Citation objects to Opinion objects, using
        # a variety of heuristics.
        try:
            citation_matches = match_citations.get_citation_matches(
                opinion, citations
            )
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Consolidate duplicate matches, keeping a counter of how often each
        # match appears (so we know how many times an opinion cites another).
        # keys = cited opinion
        # values = number of times that opinion is cited
        grouped_matches = Counter(citation_matches)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate a list of the IDs of every opinion whose cluster will need
        # updating.
        all_cited_opinions = opinion.opinions_cited.all().values_list(
            "pk", flat=True
        )
        opinion_ids_to_update = set()
        for matched_opinion in grouped_matches:
            if matched_opinion.pk not in all_cited_opinions:
                opinion_ids_to_update.add(matched_opinion.pk)

        # Then, increment the citation_count fields for those matched clusters
        # all at once. Trigger a single Solr update as well, if required.
        opinion_clusters_to_update = OpinionCluster.objects.filter(
            sub_opinions__pk__in=opinion_ids_to_update
        )
        opinion_clusters_to_update.update(
            citation_count=F("citation_count") + 1
        )
        if index:
            add_items_to_solr.delay(
                opinion_clusters_to_update.values_list("pk", flat=True),
                "search.OpinionCluster",
            )

        # Generate the citing opinion's new HTML (with inline citation links)
        opinion.html_with_citations = create_cited_html(opinion, citations)

        # Nuke existing citations
        OpinionsCited.objects.filter(citing_opinion_id=opinion.pk).delete()

        # Create the new ones.
        OpinionsCited.objects.bulk_create(
            [
                OpinionsCited(
                    citing_opinion_id=opinion.pk,
                    cited_opinion_id=matched_opinion.pk,
                    depth=grouped_matches[matched_opinion],
                )
                for matched_opinion in grouped_matches
            ]
        )

        # Save all the changes to the citing opinion
        opinion.save()

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
def save_model(self, request, obj, form, change):
    obj.save()
    from cl.search.tasks import add_items_to_solr

    add_items_to_solr.delay([obj.citing_opinion_id], "search.Opinion")
def save_model(self, request, obj, form, change):
    obj.save()
    from cl.search.tasks import add_items_to_solr

    add_items_to_solr.delay([obj.pk], "search.OpinionCluster")
def save_model(self, request, obj, form, change):
    obj.save()
    from cl.search.tasks import add_items_to_solr

    add_items_to_solr.delay([obj.person_id], "people_db.Person")
def delete_model(self, request, obj):
    obj.delete()
    from cl.search.tasks import add_items_to_solr

    add_items_to_solr.delay([obj.person_id], "people_db.Person")
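# Context sketch (assumption): the save_model()/delete_model() overrides above
# belong on Django ModelAdmin subclasses, so that edits made through the admin
# re-index the affected record in Solr. The class name and registration below
# are illustrative only; the real admin classes are not shown in this snippet.
from django.contrib import admin


@admin.register(Position)  # "Position" stands in for whichever model is edited
class PositionAdmin(admin.ModelAdmin):
    def save_model(self, request, obj, form, change):
        obj.save()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.person_id], "people_db.Person")

    def delete_model(self, request, obj):
        obj.delete()
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr.delay([obj.person_id], "people_db.Person")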
def parse_harvard_opinions(reporter, volume, make_searchable):
    """Parse the downloaded CaseLaw Corpus from the Internet Archive and add
    the cases to our database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, the code will cycle through all downloaded files.

    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param volume: The volume (int) of the reporters (optional) (ex. 10)
    :param make_searchable: Boolean to indicate saving to Solr
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning(
                "Empty json: missing case at: %s" % ia_download_url
            )
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML.
        # Flag them for a later crawl by using the placeholder '[[Image]]'.
        judge_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the list of judges
        judges = ", ".join(
            sorted(
                list(
                    set(
                        itertools.chain.from_iterable(judge_list + author_list)
                    )
                )
            )
        )
        judges = titlecase(judges)
        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01 to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.canonical_reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )

            new_op_pks = []
            for op in soup.find_all("opinion"):
                # This code cleans author tags for processing.
                # It is particularly useful for identifying Per Curiam.
                for elem in [op.find("author")]:
                    if elem is not None:
                        [x.extract() for x in elem.find_all("page-number")]

                auth = op.find("author")
                if auth is not None:
                    author_tag_str = titlecase(auth.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )
                else:
                    author_str = ""
                    author_tag_str = ""

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True, set the author string to Per Curiam.
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info(
                    "Adding opinion for: %s", citation.base_citation()
                )
                op = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                op.save(index=False)
                new_op_pks.append(op.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
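# Minimal sketch (assumption) of the partial-date handling that the comment on
# validate_dt above describes: a "YYYY-MM" decision date gets "-01" appended
# and is flagged approximate. The real validate_dt helper is not shown here and
# may differ; this only illustrates the (date, is_approximate) return contract.
from datetime import date, datetime
from typing import Tuple


def validate_dt_sketch(date_str: str) -> Tuple[date, bool]:
    """Return (date, is_approximate) for a YYYY-MM-DD or YYYY-MM string."""
    try:
        return datetime.strptime(date_str, "%Y-%m-%d").date(), False
    except ValueError:
        # Only year and month were supplied; pin the day to the first.
        return datetime.strptime(date_str + "-01", "%Y-%m-%d").date(), True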
def find_citations_for_opinion_by_pks(
    self,
    opinion_pks: List[int],
    index: bool = True,
) -> None:
    """Find citations for search.Opinion objects.

    :param opinion_pks: An iterable of search.Opinion PKs
    :param index: Whether to add the item to Solr
    :return: None
    """
    opinions: List[Opinion] = Opinion.objects.filter(pk__in=opinion_pks)
    for opinion in opinions:
        # Memoize parsed versions of the opinion's text
        get_and_clean_opinion_text(opinion)

        # Extract the citations from the opinion's text
        citations: List[CitationBase] = get_citations(opinion.cleaned_text)

        # If no citations are found, continue
        if not citations:
            continue

        # Resolve all those different citation objects to Opinion objects,
        # using a variety of heuristics.
        try:
            citation_resolutions: Dict[
                MatchedResourceType, List[SupportedCitationType]
            ] = do_resolve_citations(citations, opinion)
        except ResponseNotReady as e:
            # Threading problem in httplib, which is used in the Solr query.
            raise self.retry(exc=e, countdown=2)

        # Generate the citing opinion's new HTML with inline citation links
        opinion.html_with_citations = create_cited_html(
            opinion, citation_resolutions
        )

        # Delete the unmatched citations
        citation_resolutions.pop(NO_MATCH_RESOURCE, None)

        # Increase the citation count for the cluster of each matched opinion
        # if that cluster has not already been cited by this opinion. First,
        # calculate a list of the IDs of every opinion whose cluster will need
        # updating.
        all_cited_opinions = opinion.opinions_cited.all().values_list(
            "pk", flat=True
        )
        opinion_ids_to_update = set()
        for _opinion in citation_resolutions.keys():
            if _opinion.pk not in all_cited_opinions:
                opinion_ids_to_update.add(_opinion.pk)

        # Finally, commit these changes to the database in a single
        # transaction block. Trigger a single Solr update as well, if
        # required.
        with transaction.atomic():
            opinion_clusters_to_update = OpinionCluster.objects.filter(
                sub_opinions__pk__in=opinion_ids_to_update
            )
            opinion_clusters_to_update.update(
                citation_count=F("citation_count") + 1
            )
            if index:
                add_items_to_solr.delay(
                    opinion_clusters_to_update.values_list("pk", flat=True),
                    "search.OpinionCluster",
                )

            # Nuke existing citations
            OpinionsCited.objects.filter(
                citing_opinion_id=opinion.pk
            ).delete()

            # Create the new ones.
            OpinionsCited.objects.bulk_create(
                [
                    OpinionsCited(
                        citing_opinion_id=opinion.pk,
                        cited_opinion_id=_opinion.pk,
                        depth=len(_citations),
                    )
                    for _opinion, _citations in citation_resolutions.items()
                ]
            )

            # Save all the changes to the citing opinion (send to Solr later)
            opinion.save(index=False)

    # If a Solr update was requested, do a single one at the end with all the
    # pks of the passed opinions
    if index:
        add_items_to_solr.delay(opinion_pks, "search.Opinion")
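# Usage sketch (assumption): find_citations_for_opinion_by_pks is a bound
# Celery task (it calls self.retry), so callers typically queue it in modest
# batches rather than one giant PK list. The queryset filter and chunk size
# below are illustrative, not taken from the source.
opinion_pks = list(
    Opinion.objects.filter(html_with_citations="").values_list("pk", flat=True)
)
chunk_size = 100
for start in range(0, len(opinion_pks), chunk_size):
    find_citations_for_opinion_by_pks.delay(
        opinion_pks[start : start + chunk_size],
        index=True,
    )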
def fix_fjc_positions(self, infile=None):
    """Addresses issue #624.

    We had some errant regexes in the district court assignments. This code
    reassigns the court fields for these judges where the new regexes differ
    from the old ones.

    :param infile: The import file with fjc-data.xlsx
    :return: None
    """
    if infile is None:
        self.ensure_input_file()
        infile = self.options["input_file"]
    textfields = [
        "firstname",
        "midname",
        "lastname",
        "gender",
        "Place of Birth (City)",
        "Place of Birth (State)",
        "Place of Death (City)",
        "Place of Death (State)",
    ]
    df = pd.read_excel(infile, 0)
    for x in textfields:
        df[x] = df[x].replace(np.nan, "", regex=True)
    df["Employment text field"].replace(
        to_replace=r";\sno", value=r", no", inplace=True, regex=True
    )
    for i, item in df.iterrows():
        fjc_id = item["Judge Identification Number"]
        p = Person.objects.get(fjc_id=fjc_id)
        logger.info(
            "Doing person with FJC ID: %s, "
            "https://courtlistener.com%s" % (fjc_id, p.get_absolute_url())
        )

        exclusions = []
        for posnum in range(1, 7):
            if posnum > 1:
                pos_str = " (%s)" % posnum
            else:
                pos_str = ""

            if pd.isnull(item["Court Name" + pos_str]):
                continue

            courtid = match_court_string(
                item["Court Name" + pos_str], federal_district=True
            )
            if courtid is None:
                raise Exception(
                    "Unable to match court string: %s"
                    % item["Court Name" + pos_str]
                )

            date_termination = process_date_string(
                item["Date of Termination" + pos_str]
            )
            date_start = process_date_string(
                item["Commission Date" + pos_str]
            )
            date_recess_appointment = process_date_string(
                item["Recess Appointment date" + pos_str]
            )
            if pd.isnull(date_start) and not pd.isnull(
                date_recess_appointment
            ):
                date_start = date_recess_appointment
            if pd.isnull(date_start):
                # If we still have no start date, skip.
                continue

            positions = Position.objects.filter(
                person=p,
                date_start=date_start,
                date_termination=date_termination,
                position_type="jud",
            ).exclude(pk__in=exclusions)
            position_count = positions.count()
            if position_count < 1:
                logger.info(
                    "Couldn't find position to match '%s' on '%s' "
                    "with exclusions: %s" % (p, date_start, exclusions)
                )
                add_positions_from_row(
                    item, p, self.debug, fix_nums=[posnum]
                )
                if not self.debug:
                    add_items_to_solr.delay([p.pk], "people_db.Person")
                continue
            elif position_count == 1:
                # Good case. Press on!
                position = positions[0]
                exclusions.append(position.pk)
            elif position_count > 1:
                logger.info(
                    "Got too many results for '%s' on '%s'. Got %s"
                    % (p, date_start, position_count)
                )
                continue

            if position.court.pk == courtid:
                logger.info(
                    "Court IDs are both '%s'. No changes made." % courtid
                )
            else:
                logger.info(
                    "Court IDs are different! Old: %s, New: %s"
                    % (position.court.pk, courtid)
                )
                court = Court.objects.get(pk=courtid)
                position.court = court

                if not self.debug:
                    position.save()
                    add_items_to_solr.delay([p.pk], "people_db.Person")
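# Illustration (derived from the loop above; no new data): the FJC spreadsheet
# repeats each position column with a numeric suffix, so the pos_str built from
# posnum selects column names like "Court Name", "Court Name (2)", and so on up
# to "Court Name (6)".
for posnum in range(1, 7):
    pos_str = " (%s)" % posnum if posnum > 1 else ""
    print("Court Name" + pos_str)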
def save_model(self, request, obj, form, change):
    obj.save()
    from cl.search.tasks import add_items_to_solr

    add_items_to_solr.delay([obj.pk], 'search.Opinion')