def get_next_date_range(court_id, span=7):
    """Get the next start and end query dates for a court.

    Look up the most recent log entry for the court that did not fail and
    build the next date range from it. The start of the range is the date of
    that log entry, pulled back so that it is never less than five days
    before today. The end of the range is ``span`` days after the start,
    capped at today.

    If the court is still in progress, return (None, None).

    :param court_id: A PACER Court ID
    :param span: The number of days to go forward from the start date
    :return: A (start_date, end_date) tuple, or (None, None) if the latest
    non-failed log entry is still in progress.
    :raises PACERFreeDocumentLog.DoesNotExist: If the court has no non-failed
    log entries.
    """
    court_id = map_pacer_to_cl_id(court_id)
    try:
        last_completion_log = (
            PACERFreeDocumentLog.objects.filter(court_id=court_id)
            .exclude(status=PACERFreeDocumentLog.SCRAPE_FAILED)
            .latest("date_queried")
        )
    except PACERFreeDocumentLog.DoesNotExist:
        logger.warning("FAILED ON: %s", court_id)
        raise

    if last_completion_log.status == PACERFreeDocumentLog.SCRAPE_IN_PROGRESS:
        return None, None

    # Sample the clock once so that both ends of the range are computed
    # against the same calendar day, even if we run across midnight.
    today = now().date()

    # Ensure that we go back five days from the last time we had success if
    # that success was in the last few days.
    last_complete_date = min(
        today - timedelta(days=5),
        last_completion_log.date_queried,
    )
    next_end_date = min(today, last_complete_date + timedelta(days=span))
    return last_complete_date, next_end_date
def get_docket_ids(last_x_days: int) -> Set[int]:
    """Get docket IDs to update via iquery

    IDs are gathered from four places: docket URLs visited recently
    according to Matomo (only if ``settings.MATOMO_TOKEN`` is set), dockets
    with docket alerts, favorited dockets, and RECAP-sourced dockets that
    have no case name.

    :param last_x_days: How many of the last days relative to today should we
    inspect? E.g. 1 means just today, 2 means today and yesterday, etc.
    :return: docket IDs for which we should crawl iquery
    """
    docket_ids = set()
    if hasattr(settings, "MATOMO_TOKEN"):
        try:
            r = requests.get(
                settings.MATOMO_REPORT_URL,
                timeout=10,
                params={
                    "idSite": settings.MATOMO_SITE_ID,
                    "module": "API",
                    "method": "Live.getLastVisitsDetails",
                    "period": "day",
                    "format": "json",
                    "date": "last%s" % last_x_days,
                    "token_auth": settings.MATOMO_TOKEN,
                },
            )
            r.raise_for_status()
            j = r.json()
        except (
            ConnectionRefusedError,
            JSONDecodeError,
            RequestException,
        ) as e:
            # Matomo is a best-effort source; carry on with the DB sources.
            logger.warning(
                "iQuery scraper was unable to get results from Matomo. Got "
                "exception: %s",
                e,
            )
        else:
            docket_url_re = re.compile(
                r"^https://www\.courtlistener\.com/docket/([0-9]+)/"
            )
            for item in j:
                for actiondetail in item["actionDetails"]:
                    url = actiondetail.get("url")
                    if url is None:
                        continue
                    match = docket_url_re.search(url)
                    if match is None:
                        continue
                    # The regex yields a str, but everything else in this
                    # set is an int PK. Convert so the same docket is not
                    # present twice (once as "123" and once as 123).
                    docket_ids.add(int(match.group(1)))

    # Add in docket IDs that have docket alerts or are favorited
    docket_ids.update(DocketAlert.objects.values_list("docket", flat=True))
    docket_ids.update(
        Favorite.objects.exclude(docket_id=None).values_list(
            "docket_id", flat=True
        )
    )
    # Add in RECAP dockets that have no case name. The results go into a
    # set, so ordering is irrelevant; clear it rather than paying for a
    # random sort in the database.
    docket_ids.update(
        Docket.objects.filter(
            case_name__isnull=True,
            source__in=Docket.RECAP_SOURCES,
        )
        .order_by()
        .values_list("pk", flat=True)
    )
    return docket_ids
def get_documents(options):
    """Download documents from PACER if we don't already have them.

    Runs the module's QUERY_STRING search against Solr, then walks the
    results. Documents we already have are just tagged; the others are
    queued for download, extraction and indexing.

    :param options: A dict with the keys "queue" (the celery queue to use),
    "offset" (skip results before this index) and "limit" (stop at this
    index; values of zero or less mean no limit).
    :return: None
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    page_size = 20000
    main_query = build_main_query_from_query_string(
        QUERY_STRING,
        {"rows": page_size, "fl": ["id", "docket_id"]},
        {"group": False, "facet": False, "highlight": False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode="r")
    try:
        results = si.query().add_extra(**main_query).execute()
    finally:
        # Close the connection even if the query blows up, so that it
        # isn't leaked.
        si.conn.http_connection.close()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        logger.info(
            "Doing item %s w/rd: %s, d: %s",
            i,
            result["id"],
            result["docket_id"],
        )

        try:
            rd = RECAPDocument.objects.get(pk=result["id"])
        except RECAPDocument.DoesNotExist:
            logger.warning(
                "Unable to find RECAP Document with id %s", result["id"]
            )
            continue

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG)
            continue

        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG).set(
                queue=q
            ),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
        ).apply_async()
def set_if_falsy(obj, attribute, new_value):
    """Set an attribute to new_value, but only if it is currently falsy.

    The attribute is set when its current value is falsy (and isn't the
    number zero) and the new value isn't blank. Otherwise the attribute is
    left alone and the two values are compared: strings with whitespace
    removed, ints after converting the new value with int().

    :param obj: The object to update. The change is not saved here.
    :param attribute: The name of the attribute to check and set.
    :param new_value: The value to use if the attribute has none. Usually a
    str, but other types are tolerated.
    :return ok: False if the attribute already had a value that differs from
    new_value. True otherwise, i.e. if the value was set or if there was
    nothing conflicting to set.
    """
    current_value = getattr(obj, attribute)
    if isinstance(current_value, str):
        current_value = current_value.strip()

    does_not_currently_have_a_value = not current_value
    # Zero is falsy, but it's a real value that we must not overwrite.
    current_value_not_zero = current_value != 0
    if isinstance(new_value, str):
        new_value_not_blank = new_value.strip() != ""
    else:
        # Non-string values don't have .strip(); only None counts as blank.
        new_value_not_blank = new_value is not None

    if (
        does_not_currently_have_a_value
        and current_value_not_zero
        and new_value_not_blank
    ):
        logger.info("Updating %s with %s.", attribute, new_value)
        setattr(obj, attribute, new_value)
        return True

    # Report if there's a difference -- that might spell trouble.
    values_differ = False
    if isinstance(current_value, str) and isinstance(new_value, str):
        # Handles strings and normalizes them for comparison.
        values_differ = "".join(current_value.split()) != "".join(
            new_value.split()
        )
    elif isinstance(current_value, int):
        # Handles ints, which need no normalization for comparison.
        try:
            values_differ = current_value != int(new_value)
        except (TypeError, ValueError):
            # The new value isn't a number at all, so it can't be equal.
            values_differ = True

    if values_differ:
        logger.warning(
            "WARNING: Didn't set '%s' attribute on obj %s "
            "because it already had a value, but the new value "
            "('%s') differs from current value ('%s')",
            attribute,
            obj.pk,
            new_value,
            current_value,
        )
        return False

    # The values were the same.
    logger.info(
        "'%s' field unchanged -- old and new values were the same: %s",
        attribute,
        new_value,
    )
    return True
def do_citations(cluster, scdb_info):
    """Add or update the cluster's citations using the SCDB data.

    For each SCDB citation field that has a value, parse the value. If the
    cluster has exactly one citation for the corresponding reporter, update
    it in place. Otherwise try to create a new citation, ignoring the error
    if it violates a uniqueness constraint.

    :param cluster: The Cluster to be changed.
    :param scdb_info: A dict with the SCDB information.
    """
    # SCDB field name --> (reporter to look up on the cluster, citation type)
    reporters_by_field = {
        "usCite": ("U.S.", Citation.FEDERAL),
        "sctCite": ("S. Ct.", Citation.FEDERAL),
        "ledCite": ("L. Ed.", Citation.FEDERAL),
        "lexisCite": ("U.S. LEXIS", Citation.LEXIS),
    }
    for scdb_field, (reporter, cite_type) in reporters_by_field.items():
        raw_cite = scdb_info[scdb_field]
        if not raw_cite:
            continue

        try:
            parsed = get_citations(
                raw_cite,
                html=False,
                do_post_citation=False,
                do_defendant=False,
                disambiguate=False,
            )[0]
        except IndexError:
            logger.warning("Unable to parse citation for: %s", raw_cite)
            continue

        existing = cluster.citations.filter(reporter=reporter)
        if existing.count() != 1:
            # Zero or several existing cites; try to add a new one.
            try:
                Citation.objects.create(
                    cluster=cluster,
                    volume=parsed.volume,
                    reporter=parsed.reporter,
                    page=parsed.page,
                    type=cite_type,
                )
            except IntegrityError:
                # Violated unique_together constraint. Fine.
                pass
            continue

        # Exactly one existing cite: bring it in line with the parsed one.
        cite = existing[0]
        cite.volume = parsed.volume
        cite.reporter = parsed.reporter
        cite.page = parsed.page
        cite.save()
def do_first_pass(options):
    """Match the CV_2017 IDB rows against Dockets and queue up the work.

    For each IDB row, look for Dockets with the same core docket number and
    court. With no match, queue a task to create a docket from the row; with
    one match, queue a task to merge the row into it; with several, just log
    a warning.

    :param options: A dict with the keys "queue" (the celery queue to use),
    "offset" (skip rows before this index) and "limit" (stop at this index;
    values of zero or less mean no limit).
    :return: None
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q)
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
    ).order_by("pk")

    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        # TODO: See conversation in #courtlistener channel from 2019-07-11,
        # In which it appears we matched a criminal case with a civil one.
        # The code below doesn't protect against that, but it should (and I
        # think it does in the `do_second_pass` code, below.
        matches = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
        )
        match_count = matches.count()

        if match_count == 0:
            logger.info("%s: Creating new docket for IDB row: %s", i, idb_row)
            create_new_docket_from_idb.apply_async(
                args=(idb_row.pk,),
                queue=q,
            )
            continue

        if match_count == 1:
            d = matches[0]
            logger.info(
                "%s: Merging Docket %s with IDB row: %s", i, d, idb_row
            )
            merge_docket_with_idb.apply_async(
                args=(d.pk, idb_row.pk),
                queue=q,
            )
            continue

        # More than one candidate; we can't tell which is the right one.
        logger.warning(
            "%s: Unable to merge. Got %s dockets for row: %s",
            i,
            match_count,
            idb_row,
        )
def find_missing_or_incorrect_docket_numbers(options):
    """Compare tax court docket numbers in the DB to those in the text.

    For each tax court cluster with plain text, parse the docket numbers out
    of the opinion and log how they compare to what's on the docket. If
    options["fix"] is truthy and they differ, save the parsed value.

    :param options: A dict with a "fix" key saying whether to save changes.
    :return: Nothing
    """
    should_fix = options["fix"]
    ocs = OpinionCluster.objects.filter(docket__court="tax").exclude(
        sub_opinions__plain_text=""
    )
    logger.info("%s clusters found", ocs.count())

    for oc in ocs:
        logger.info("Analyzing cluster %s", oc.id)
        ops = oc.sub_opinions.all()
        assert ops.count() == 1

        # These clusters were extracted from the tax courts, so they should
        # only have one opinion each; the loop sees just that one.
        for op in ops:
            logger.warning(
                "Reference url: https://www.courtlistener.com/opinion/%s/x",
                oc.id,
            )
            dockets_in_db = oc.docket.docket_number.strip()
            found_dockets = get_tax_docket_numbers(op.plain_text)

            if found_dockets == dockets_in_db:
                # DB and text agree. Nothing to report or fix.
                if dockets_in_db == "":
                    logger.info("No docket numbers found in db or text.")
                else:
                    logger.info("Docket numbers appear correct.")
                continue

            if dockets_in_db == "":
                logger.warning(
                    "Docket No(s). found for the first time: %s",
                    found_dockets,
                )
            elif found_dockets == "":
                logger.warning(
                    "Docket No(s). not found in text but Docket No(s). %s in db",
                    dockets_in_db,
                )
            else:
                logger.warning(
                    "Dockets in db (%s) != (%s) docket parsed from text",
                    dockets_in_db,
                    found_dockets,
                )

            if should_fix:
                oc.docket.docket_number = found_dockets
                oc.docket.save()
def process_citations(data, debug):
    """Walk through the citations and add them one at a time.

    Citations that already exist are left alone. New ones are saved unless
    debug is on. At the end, the citing opinions of the new citations are
    sent to Solr (again, unless debug is on).

    :param data: An object whose iterrows() yields (index, item) pairs,
    where each item has "citing" and "cited" keys holding opinion IDs.
    :param debug: If truthy, don't save anything and don't update Solr.
    """
    updated_ids = set()
    for _, item in data.iterrows():
        citing_id = item["citing"]
        cited_id = item["cited"]
        logger.info(
            "\nAdding citation from %s to %s" % (citing_id, cited_id)
        )

        lookup = {
            "citing_opinion_id": citing_id,
            "cited_opinion_id": cited_id,
        }
        try:
            cite = OpinionsCited.objects.get(**lookup)
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(**lookup)
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        else:
            msg = "Citation already exists. Doing nothing:\n"

        # Build the message inside the try block: looking up the related
        # opinions is what raises if one of them doesn't exist.
        try:
            details = (
                " %s"
                " %s: %s\n"
                " From: %s\n"
                " To: %s\n"
                % (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion)
            )
            logger.info(details)
        except Opinion.DoesNotExist:
            logger.warning(
                " Unable to create citation. Underlying Opinion doesn't "
                "exist."
            )

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, "search.Opinion")
    logger.info("Done.")
def create_or_update_row(values: Dict[str, str]) -> FjcIntegratedDatabase:
    """Create an FJC row from the values, or update the one that matches.

    Existing rows are narrowed down in steps: first by district, docket
    number, origin and date filed, then additionally by defendant. After each
    step, if nothing matches a new row is created, and if exactly one row
    matches it is updated. If several rows still match after the last step,
    nothing is changed.

    :param values: The field values for the row, keyed by field name.
    :return: The row that was created or updated. Note that None is returned
    if too many existing rows matched.
    """
    narrowing_filters = (
        {
            "district": values["district"],
            "docket_number": values["docket_number"],
            "origin": values["origin"],
            "date_filed": values["date_filed"],
        },
        # Match on defendant (that'll work better on criminal cases). It can
        # change over time, but if we find a match that's a very strong
        # indicator and we should use it.
        {"defendant": values["defendant"]},
    )

    candidates = FjcIntegratedDatabase.objects.all()
    for fjc_filter in narrowing_filters:
        # Filters accumulate; each one narrows the previous result.
        candidates = candidates.filter(**fjc_filter)
        match_count = candidates.count()
        if match_count > 1:
            # Still ambiguous. Narrow further if we can.
            continue

        if match_count == 1:
            candidates.update(date_modified=now(), **values)
            fjc_row = candidates[0]
            logger.info("Updated row: %s" % fjc_row)
        else:
            fjc_row = FjcIntegratedDatabase.objects.create(**values)
            logger.info("Added row: %s", fjc_row)
        return fjc_row

    # Ran out of filters b/c too many matches.
    logger.warning(
        "Got %s results when looking up row by filters: %s",
        match_count,
        fjc_filter,
    )
    return None
def map_judges_to_photos(self):
    """Identify which of the judges in the DB have photos.

    We iterate over the entire collection of judges, identifying which have
    photos. We could instead iterate over the photos, but that increases the
    risk of duplicate issues.

    Unless self.debug is on, every person is first marked as not having a
    photo, and then the people that are the only match for a photo are
    marked as having one. A summary of the counts is logged at the end.
    """
    # Map each photo path to the (initially empty) list of people it matches.
    photo_dir = os.path.join(judge_root, "orig")
    judge_map = {path: [] for path in os.listdir(photo_dir)}

    # Attach each person to the first of their slugs that names a photo.
    people = Person.objects.filter(is_alias_of=None)
    for person in people:
        hit = next(
            (name for name in self.make_slugs(person) if name in judge_map),
            None,
        )
        if hit is not None:
            judge_map[hit].append(person)

    # After iterating, set all people to not have photos.
    if not self.debug:
        people.update(has_photo=False)

    found = missed = multi = 0
    for path, matches in judge_map.items():
        match_count = len(matches)
        if match_count == 0:
            logger.warning("Did not find a judge for %s" % path)
            missed += 1
        elif match_count == 1:
            person = matches[0]
            found += 1
            if not self.debug:
                logger.info("Updating judge %s" % person)
                person.has_photo = True
                person.save()
        else:
            logger.warning("Found more than one match for %s:" % path)
            for person in matches:
                dob = granular_date(person, "date_dob", iso=True)
                logger.warning("Found: %s - %s" % (person, dob))
            multi += 1

    logger.info(
        "\n\n%s Matches\n%s Missed\n%s Multiple results"
        % (found, missed, multi)
    )
def scrape_court(self, site, full_crawl=False, ocr_available=True):
    """Download, dedupe and save the opinions found by a scraper.

    :param site: The scraper site object. It is iterated for items and
    supplies the court ID, URL, hash, cookies and HTTP method.
    :param full_crawl: Passed to the duplicate checker. When truthy, the
    site hash is not updated at the end.
    :param ocr_available: Passed along to the content extraction task.
    :return: None
    """
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    module_name = site.court_id.split(".")[-1]
    court_str = module_name.split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info(f"Using cookies: {site.cookies}")

    for i, item in enumerate(site):
        msg, r = get_binary_content(
            item["download_urls"],
            site.cookies,
            method=site.method,
        )
        if msg:
            logger.warning(msg)
            ErrorLog(log_level="WARNING", court=court, message=msg).save()
            continue

        content = site.cleanup_content(r.content)

        current_date = item["case_dates"]
        try:
            next_date = site[i + 1]["case_dates"]
        except IndexError:
            next_date = None

        # request.content is sometimes a str, sometimes unicode, so
        # force it all to be bytes, pleasing hashlib.
        sha1_hash = sha1(force_bytes(content))

        nevada_unpublished = (
            court_str == "nev"
            and item["precedential_statuses"] == "Unpublished"
        )
        if nevada_unpublished:
            # Nevada's non-precedential cases have different SHA1 sums
            # every time, so look them up by URL instead.
            lookup_value, lookup_by = item["download_urls"], "download_url"
        else:
            lookup_value, lookup_by = sha1_hash, "sha1"

        proceed = dup_checker.press_on(
            Opinion,
            current_date,
            next_date,
            lookup_value=lookup_value,
            lookup_by=lookup_by,
        )
        if dup_checker.emulate_break:
            break
        if not proceed:
            continue

        # Not a duplicate, carry on
        logger.info(
            f"Adding new document found at: {item['download_urls'].encode()}"
        )
        dup_checker.reset()

        docket, opinion, cluster, citations = make_objects(
            item, court, sha1_hash, content
        )
        items_to_save = {
            "docket": docket,
            "opinion": opinion,
            "cluster": cluster,
            "citations": citations,
        }
        save_everything(items=items_to_save, index=False)
        extract_doc_content.delay(
            opinion.pk, ocr_available=ocr_available, citation_jitter=True
        )

        logger.info(
            f"Successfully added doc {opinion.pk}: {item['case_names'].encode()}"
        )

    # Update the hash if everything finishes properly.
    logger.info(f"{site.court_id}: Successfully crawled opinions.")
    if not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def parse_harvard_opinions(reporter, volume, make_searchable):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    Each file that isn't skipped gets a docket, a cluster, a citation and
    one opinion per <opinion> tag, all created in a single transaction.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :param make_searchable: Boolean to indicate saving to solr
    :return: None
    """
    if volume and not reporter:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        already_imported = OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists()
        if already_imported:
            logger.info("Skipping - already in system %s" % ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s" % ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s" % (e, ia_download_url))
            continue

        cites = get_citations(data["citations"][0]["cite"])
        if not cites:
            logger.info(
                "No citation found for %s." % data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name, file_path):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            extract_judge_last_name(tag.text)
            for tag in soup.find_all("judges")
        ]
        author_list = [
            extract_judge_last_name(tag.text)
            for tag in soup.find_all("author")
        ]
        # Flatten and dedupe list of judges
        unique_names = set(
            itertools.chain.from_iterable(judge_list + author_list)
        )
        judges = titlecase(", ".join(sorted(unique_names)))

        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        short_fields = ["attorneys", "disposition", "otherdate", "seealso"]
        long_fields = [
            "syllabus",
            "summary",
            "history",
            "headnotes",
            "correction",
        ]
        short_data = parse_extra_fields(soup, short_fields, False)
        long_data = parse_extra_fields(soup, long_fields, True)

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )
            # Save inside a nested atomic block so that a failed save can be
            # retried below without losing the outer transaction.
            try:
                with transaction.atomic():
                    docket.save()
            except OperationalError as e:
                if "exceeds maximum" in str(e):
                    # Save a truncated docket number instead, and put the
                    # full one at the front of the correction field.
                    docket.docket_number = (
                        "%s, See Corrections for full Docket Number"
                        % trunc(docket_string, length=5000, ellipsis="...")
                    )
                    docket.save()
                    long_data["correction"] = "%s <br> %s" % (
                        data["docket_number"],
                        long_data["correction"],
                    )

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=short_data["attorneys"],
                disposition=short_data["disposition"],
                syllabus=long_data["syllabus"],
                summary=long_data["summary"],
                history=long_data["history"],
                other_dates=short_data["otherdate"],
                cross_reference=short_data["seealso"],
                headnotes=long_data["headnotes"],
                correction=long_data["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )
            cluster.save(index=False)

            logger.info("Adding citation for: %s", citation.base_citation())
            cite_type = map_reporter_db_cite_type(
                REPORTERS[citation.canonical_reporter][0]["cite_type"]
            )
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=cite_type,
                cluster_id=cluster.id,
            )

            new_op_pks = []
            for op_tag in soup.find_all("opinion"):
                author_tag = op_tag.find("author")
                if author_tag is None:
                    author_tag_str = ""
                    author_str = ""
                else:
                    # Clean the author tag for processing by dropping any
                    # page numbers in it. It is particularly useful for
                    # identifying Per Curiam.
                    for page_number in author_tag.find_all("page-number"):
                        page_number.extract()
                    author_tag_str = titlecase(author_tag.text.strip(":"))
                    author_str = titlecase(
                        "".join(extract_judge_last_name(author_tag_str))
                    )

                per_curiam = author_tag_str == "Per Curiam"
                # If Per Curiam is True set author string to Per Curiam
                if per_curiam:
                    author_str = "Per Curiam"

                op_type = map_opinion_type(op_tag.get("type"))
                opinion_xml = str(op_tag)
                logger.info("Adding opinion for: %s", citation.base_citation())
                opinion = Opinion(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    per_curiam=per_curiam,
                    extracted_by_ocr=True,
                )
                # Don't index now; do so later if desired
                opinion.save(index=False)
                new_op_pks.append(opinion.pk)

        if make_searchable:
            add_items_to_solr.delay(new_op_pks, "search.Opinion")

        logger.info("Finished: %s", citation.base_citation())
def lookup_row(row):
    """Lookup the row provided in the FJC DB.

    The case name is split into plaintiff and defendant at the first " v. ".
    Then a series of progressively looser party-name filters is tried
    against the civil FJC rows for the court that were filed in the five
    years before the opinion. The first filter that matches anything decides
    the outcome.

    :param row: A row dict as pulled from the CSV using the csv DictReader
    :returns: The FJC row that matched. With two to four matches, the one
    filed most recently is used. None is returned if the case name has no
    " v. ", if nothing matched, or if five or more rows matched.
    """
    # Split on the first " v. " only. partition() tells us directly whether
    # the separator was there, so there's no exception to misinterpret.
    plaintiff, separator, defendant = (
        row["Case Name"].lower().partition(" v. ")
    )
    if not separator:
        logger.warning("Unable to find ' v. ' in case name.")
        return

    opinion_date = datetime.strptime(row["Date"], "%m/%d/%Y")
    orig_query = (
        FjcIntegratedDatabase.objects.filter(
            # All of these are civil.
            dataset_source=CV_2017,
            # Ensure the correct court.
            district__fjc_court_id=row["AO ID"],
            # The docket must have been filed *before* the date of the
            # opinion.
            date_filed__lte=opinion_date,
            # But not more than five years prior to the opinion.
            date_filed__gte=opinion_date - timedelta(days=365 * 5),
        )
        .exclude(
            # FJC Ids are duplicated across bankruptcy and district. Since we
            # only know the FJC court ID, just exclude bankruptcy cases as a
            # rule. That will ensure we limit ourselves to the correct
            # jurisdiction.
            district__jurisdiction=Court.FEDERAL_BANKRUPTCY,
        )
        .order_by("-date_filed")
    )

    # Start with the strictest, then broaden when you fail. Truncate at 30
    # chars (that's all the field can contain).
    filter_tuples = [
        (
            # Try an exact match on case name.
            (),
            {
                "plaintiff__iexact": plaintiff[:30],
                "defendant__iexact": defendant[:30],
            },
        ),
        (
            # Try a starts with match on case name.
            (),
            {
                "plaintiff__istartswith": plaintiff[:30],
                "defendant__istartswith": defendant[:30],
            },
        ),
        (
            # Try to find a match that contains the first three words from
            # the plaintiff and defendant (in any order). Note Q objects are
            # args, not kwargs, hence different format here.
            (
                make_party_q(defendant, "defendant", slice(None, 3)),
                make_party_q(plaintiff, "plaintiff", slice(None, 3)),
            ),
            {},
        ),
        (
            # Broaden. Try just the first word from plaintiff & defendant
            # matching.
            (
                make_party_q(defendant, "defendant", slice(None, 1)),
                make_party_q(plaintiff, "plaintiff", slice(None, 1)),
            ),
            {},
        ),
        (
            # Explore. Try the second word of the plaintiff instead. It's
            # often a last name and worth a try.
            (
                make_party_q(plaintiff, "plaintiff", slice(1, 2)),
                make_party_q(defendant, "defendant", slice(None, 1)),
            ),
            {},
        ),
    ]

    for args, kwargs in filter_tuples:
        results = orig_query.filter(*args, **kwargs)
        count = results.count()
        if count == 0:
            logger.warning(
                "Unable to find result (args: %s, kwargs: %s). "
                "Broadening if possible.",
                args,
                kwargs,
            )
            continue
        if count == 1:
            logger.info(
                "Got one result. Bingo (args: %s, kwargs: %s).", args, kwargs
            )
            return results[0]
        elif 5 > count > 1:
            # Results are sorted by date filed, newest first.
            logger.info(
                "Got %s results. Choosing closest to document date.", count
            )
            return results[0]
        else:
            logger.warning(
                "Got too many results. Cannot identify correct case "
                "(args: %s, kwargs: %s).",
                args,
                kwargs,
            )
            return
def download_documents(options):
    """We've got good values in the new columns, so just need to look those
    up, and get the documents from PACER.

    For each CSV row, find the docket by docket number and court, pick its
    docket entries filed on the row's date, and then, for each of their
    PACER documents, either just tag it or queue it for download, extraction
    and indexing.

    :param options: A dict with the keys "input_file" (path to the CSV),
    "queue" (the celery queue to use), "queue_length" (passed to the throttle
    as min_items), "offset" (skip rows before this index), "limit" (stop at
    this index; values of zero or less mean no limit) and "task" (when it is
    "add_extra_tags", documents are tagged instead of downloaded).
    :return: None
    """
    q = options["queue"]
    throttle = CeleryThrottle(queue_name=q, min_items=options["queue_length"])
    session = PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
    session.login()

    # The context manager makes sure the file is closed even when a row
    # raises partway through.
    with open(options["input_file"], "r") as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)

        for i, row in enumerate(reader):
            if i < options["offset"]:
                continue
            if i >= options["limit"] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)

            docket_number = (
                row["cl_d_docket_number"]
                or row["cl_d_docket_number (student)"]
                or None
            )
            if not docket_number:
                logger.warning("No docket number found for row: %s", i)
                continue

            court = Court.federal_courts.district_courts().get(
                fjc_court_id=row["AO ID"].rjust(2, "0"),
            )
            try:
                d = Docket.objects.get(
                    docket_number=docket_number, court=court
                )
            except Docket.MultipleObjectsReturned:
                logger.warning("Multiple objects returned for row: %s", i)
                continue
            except Docket.DoesNotExist:
                logger.warning("Could not find docket for row: %s", i)
                continue

            # Got the docket, now get the documents from it, tag & OCR them.
            document_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
            des = d.docket_entries.filter(date_filed=document_date)
            count = des.count()
            if count == 0:
                logger.warning("No docket entries found for row: %s", i)
                continue
            elif count == 1:
                good_des = [des[0]]
            else:
                # More than one item. Apply filtering rules.
                good_des = filter_des(des)

            # We've got our des, now download them.
            for de in good_des:
                rds = de.recap_documents.filter(
                    document_type=RECAPDocument.PACER_DOCUMENT
                )
                for rd in rds:
                    if not rd.pacer_doc_id:
                        logger.warning(
                            "Unable to get pacer_doc_id for item with "
                            "rd_pk: %s. Restricted document?",
                            rd.pk,
                        )
                        continue
                    if options["task"] == "add_extra_tags":
                        # Wherein I belatedly realize we need a tag
                        # specifically for this part of the project.
                        add_tags(rd, TAG_NAME_OPINIONS)
                    else:
                        # Otherwise, do the normal download thing.
                        chain(
                            get_pacer_doc_by_rd.s(
                                rd.pk, session.cookies, tag=TAG_NAME
                            ).set(queue=q),
                            extract_recap_pdf.si(rd.pk).set(queue=q),
                            add_items_to_solr.si(
                                [rd.pk], "search.RECAPDocument"
                            ).set(queue=q),
                        ).apply_async()
def parse_harvard_opinions(reporter, volume):
    """
    Parse downloaded CaseLaw Corpus from internet archive and add them to our
    database.

    Optionally uses a reporter abbreviation to identify cases to download as
    used by IA. (Ex. T.C. => tc)

    Optionally uses a volume integer.

    If neither is provided, code will cycle through all downloaded files.

    Each file that isn't skipped gets a docket, a cluster, a citation and
    one opinion per <opinion> tag, all created in a single transaction.

    :param volume: The volume (int) of the reporters (optional) (ex 10)
    :param reporter: Reporter string as slugify'd (optional) (tc) for T.C.
    :return: None
    """
    if not reporter and volume:
        logger.error("You provided a volume but no reporter. Exiting.")
        return

    for file_path in filepath_list(reporter, volume):
        ia_download_url = "/".join(
            ["https://archive.org/download", file_path.split("/", 9)[-1]]
        )

        if OpinionCluster.objects.filter(
            filepath_json_harvard=file_path
        ).exists():
            logger.info("Skipping - already in system %s", ia_download_url)
            continue

        try:
            with open(file_path) as f:
                data = json.load(f)
        except ValueError:
            logger.warning("Empty json: missing case at: %s", ia_download_url)
            continue
        except Exception as e:
            logger.warning("Unknown error %s for: %s", e, ia_download_url)
            continue

        cites = get_citations(data["citations"][0]["cite"], html=False)
        if not cites:
            logger.info(
                "No citation found for %s.", data["citations"][0]["cite"]
            )
            continue

        case_name = harmonize(data["name_abbreviation"])
        case_name_short = cnt.make_case_name_short(case_name)
        case_name_full = harmonize(data["name"])

        citation = cites[0]
        if skip_processing(citation, case_name):
            continue

        # TODO: Generalize this to handle all court types somehow.
        court_id = match_court_string(
            data["court"]["name"],
            state=True,
            federal_appeals=True,
            federal_district=True,
        )

        soup = BeautifulSoup(data["casebody"]["data"], "lxml")

        # Some documents contain images in the HTML
        # Flag them for a later crawl by using the placeholder '[[Image]]'
        judge_list = [
            find_judge_names(x.text) for x in soup.find_all("judges")
        ]
        author_list = [
            find_judge_names(x.text) for x in soup.find_all("author")
        ]
        # Flatten and dedupe the names. Sort them as well: sets have no
        # stable order, so without sorting the same file could produce a
        # different string on each run.
        judge_names = sorted(set(itertools.chain.from_iterable(judge_list)))
        author_names = sorted(set(itertools.chain.from_iterable(author_list)))
        judges = titlecase(
            ", ".join(
                sorted(
                    set(
                        itertools.chain.from_iterable(
                            judge_list + author_list
                        )
                    )
                )
            )
        )
        # These are the same for every opinion in the file, so build them
        # once here instead of once per opinion.
        joined_by_str = titlecase(" ".join(judge_names))
        author_str = titlecase(" ".join(author_names))

        docket_string = (
            data["docket_number"]
            .replace("Docket No.", "")
            .replace("Docket Nos.", "")
            .strip()
        )

        with transaction.atomic():
            logger.info("Adding docket for: %s", citation.base_citation())
            docket = Docket.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                docket_number=docket_string,
                court_id=court_id,
                source=Docket.HARVARD,
                ia_needs_upload=False,
            )

            # Iterate over other xml fields in Harvard data set
            # and save as string list for further processing at a later date.
            json_fields = [
                "attorneys",
                "disposition",
                "syllabus",
                "summary",
                "history",
                "otherdate",
                "seealso",
                "headnotes",
                "correction",
            ]
            data_set = {
                key: "|".join([x.text for x in soup.find_all(key)])
                for key in json_fields
            }

            # Handle partial dates by adding -01v to YYYY-MM dates
            date_filed, is_approximate = validate_dt(data["decision_date"])

            logger.info("Adding cluster for: %s", citation.base_citation())
            cluster = OpinionCluster.objects.create(
                case_name=case_name,
                case_name_short=case_name_short,
                case_name_full=case_name_full,
                precedential_status="Published",
                docket_id=docket.id,
                source="U",
                date_filed=date_filed,
                date_filed_is_approximate=is_approximate,
                attorneys=data_set["attorneys"],
                disposition=data_set["disposition"],
                syllabus=data_set["syllabus"],
                summary=data_set["summary"],
                history=data_set["history"],
                other_dates=data_set["otherdate"],
                cross_reference=data_set["seealso"],
                headnotes=data_set["headnotes"],
                correction=data_set["correction"],
                judges=judges,
                filepath_json_harvard=file_path,
            )

            logger.info("Adding citation for: %s", citation.base_citation())
            Citation.objects.create(
                volume=citation.volume,
                reporter=citation.reporter,
                page=citation.page,
                type=map_reporter_db_cite_type(
                    REPORTERS[citation.reporter][0]["cite_type"]
                ),
                cluster_id=cluster.id,
            )

            for op in soup.find_all("opinion"):
                op_type = map_opinion_type(op.get("type"))
                opinion_xml = str(op)
                logger.info("Adding opinion for: %s", citation.base_citation())
                Opinion.objects.create(
                    cluster_id=cluster.id,
                    type=op_type,
                    author_str=author_str,
                    xml_harvard=opinion_xml,
                    joined_by_str=joined_by_str,
                    extracted_by_ocr=True,
                )

        logger.info("Finished: %s", citation.base_citation())
def scrape_court(self, site, full_crawl=False, backscrape=False):
    """Download, dedupe and save the oral arguments found by a scraper.

    :param site: The scraper site object. It is iterated for items and
    supplies the court ID, URL, hash, cookies, HTTP method and adapter.
    :param full_crawl: Passed to the duplicate checker. When truthy, the
    site hash is not updated at the end.
    :param backscrape: Passed along when saving the new objects.
    :return: None
    """
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    module_name = site.court_id.split(".")[-1]
    court_str = module_name.split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info("Using cookies: %s" % site.cookies)

    download_error = False
    for i, item in enumerate(site):
        msg, r = get_binary_content(
            item["download_urls"],
            site.cookies,
            site._get_adapter_instance(),
            method=site.method,
        )
        if msg:
            logger.warning(msg)
            ErrorLog(log_level="WARNING", court=court, message=msg).save()
            continue

        content = site.cleanup_content(r.content)

        current_date = item["case_dates"]
        try:
            next_date = site[i + 1]["case_dates"]
        except IndexError:
            next_date = None

        # request.content is sometimes a str, sometimes unicode, so
        # force it all to be bytes, pleasing hashlib.
        sha1_hash = sha1(force_bytes(content))
        onwards = dup_checker.press_on(
            Audio,
            current_date,
            next_date,
            lookup_value=sha1_hash,
            lookup_by="sha1",
        )
        if dup_checker.emulate_break:
            break
        if not onwards:
            continue

        # Not a duplicate, carry on
        logger.info(
            "Adding new document found at: %s"
            % item["download_urls"].encode("utf-8")
        )
        dup_checker.reset()

        docket, audio_file, error = make_objects(
            item, court, sha1_hash, content
        )
        if error:
            # Remember the failure so that the site hash is left alone.
            download_error = True
            continue

        save_everything(
            items={"docket": docket, "audio_file": audio_file},
            index=False,
            backscrape=backscrape,
        )
        # Start processing after a random delay of up to an hour.
        process_audio_file.apply_async(
            (audio_file.pk,), countdown=random.randint(0, 3600)
        )

        logger.info(
            "Successfully added audio file {pk}: {name}".format(
                pk=audio_file.pk,
                name=item["case_names"].encode("utf-8"),
            )
        )

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled oral arguments." % site.court_id)
    if not (download_error or full_crawl):
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def find_missing_or_incorrect_citations(options):
    """Compare tax court citations in the DB to those in the text.

    For each tax court cluster with plain text, parse the citation out of
    the opinion. If options["fix"] is truthy, the cluster's existing
    citations are deleted and the parsed one (if any) is created. Otherwise,
    the differences are only logged.

    :param options: A dict with a "fix" key saying whether to save changes.
    :return: None
    """
    should_fix = options["fix"]
    ocs = OpinionCluster.objects.filter(docket__court="tax").exclude(
        sub_opinions__plain_text=""
    )
    logger.info("%s clusters found", ocs.count())

    for oc in ocs:
        logger.warning(
            "Reference url: https://www.courtlistener.com/opinion/%s/x",
            oc.id,
        )
        cites = oc.citations.all()

        logger.info("Found %s cite(s) for case in db", cites.count())

        if cites.count() > 0 and should_fix:
            logger.warning("Deleting cites in cluster %s", oc.id)
            cites.delete()

        ops = oc.sub_opinions.all()
        assert ops.count() == 1

        # These clusters should only have one opinion each, so the loop
        # sees just that one.
        for op in ops:
            found_cite = find_tax_court_citation(op.plain_text)

            if found_cite is None:
                # Nothing in the text; anything in the DB is suspect.
                if cites.count() > 0:
                    for cite in cites:
                        logger.warning("Have (%s), Expect None", cite)
                        logger.warning("%s should be removed", cite)
                else:
                    logger.info("No citation in db or text: %s", oc.id)
                continue

            found_cite_str = found_cite.base_citation()
            logger.info("Found citation in plain text as %s", found_cite_str)

            if should_fix:
                logger.warning("Creating citation: %s", found_cite_str)
                Citation.objects.create(
                    volume=found_cite.volume,
                    reporter=found_cite.reporter,
                    page=found_cite.page,
                    type=found_cite.type,
                    cluster_id=oc.id,
                )
                continue

            # Report only: say how the DB differs from the text.
            if cites.count() > 0:
                for cite in cites:
                    if str(cite) != found_cite_str:
                        logger.warning(
                            "Have (%s), Expect (%s)",
                            cite,
                            found_cite_str,
                        )
            else:
                logger.warning("Add %s to db", found_cite_str)