def update_docket_metadata(d, docket_data):
    """Update the Docket object with the data from Juriscraper.

    Works on either docket history report or docket report (appellate or
    district) results.
    """
    d = update_case_names(d, docket_data['case_name'])
    mark_ia_upload_needed(d)
    d.docket_number = docket_data['docket_number'] or d.docket_number
    d.date_filed = docket_data['date_filed'] or d.date_filed
    d.date_last_filing = docket_data.get(
        'date_last_filing') or d.date_last_filing
    d.date_terminated = docket_data.get('date_terminated') or d.date_terminated
    d.cause = docket_data.get('cause') or d.cause
    d.nature_of_suit = docket_data.get('nature_of_suit') or d.nature_of_suit
    d.jury_demand = docket_data.get('jury_demand') or d.jury_demand
    d.jurisdiction_type = docket_data.get(
        'jurisdiction') or d.jurisdiction_type
    d.mdl_status = docket_data.get('mdl_status') or d.mdl_status
    judges = get_candidate_judges(docket_data.get('assigned_to_str'),
                                  d.court_id, docket_data['date_filed'])
    if judges is not None and len(judges) == 1:
        d.assigned_to = judges[0]
    d.assigned_to_str = docket_data.get('assigned_to_str') or ''
    judges = get_candidate_judges(docket_data.get('referred_to_str'),
                                  d.court_id, docket_data['date_filed'])
    if judges is not None and len(judges) == 1:
        d.referred_to = judges[0]
    d.referred_to_str = docket_data.get('referred_to_str') or ''
    d.blocked, d.date_blocked = get_blocked_status(d)
    return d
def upload_recap_json(self, pk):
    """Make a JSON object for a RECAP docket and upload it to IA"""
    # This is a pretty highly optimized query that uses only 13 hits to the
    # DB when generating a docket JSON rendering, regardless of how many
    # related objects the docket has, such as docket entries, parties, etc.
    ds = Docket.objects.filter(pk=pk).select_related(
        'originating_court_information',
    ).prefetch_related(
        'panel',
        'parties__attorneys__roles',
        'parties__party_types__criminal_complaints',
        'parties__party_types__criminal_counts',
        # Django appears to have a bug where you can't defer a field on a
        # queryset where you prefetch the values. If you try to, it crashes.
        # We should be able to just do the prefetch below like the ones
        # above and then do the defer statement at the end, but that throws
        # an error.
        Prefetch(
            'docket_entries__recap_documents',
            queryset=RECAPDocument.objects.all().defer('plain_text')
        )
    )
    d = ds[0]

    renderer = JSONRenderer()
    json_str = renderer.render(
        IADocketSerializer(d).data,
        accepted_media_type='application/json; indent=2',
    )

    file_name = get_docket_filename(d.court_id, d.pacer_case_id, 'json')
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket_name,
        files={file_name: StringIO(json_str)},
        title=best_case_name(d),
        collection=settings.IA_COLLECTIONS,
        court_id=d.court_id,
        source_url='https://www.courtlistener.com%s' % d.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. This "
                    "information is uploaded quarterly. To see our most "
                    "recent version please use the source url parameter, "
                    "linked below. To see the canonical source for this "
                    "data, please consult PACER directly.",
    )
    if responses is None:
        increment_failure_count(d)
        return

    if all(r.ok for r in responses):
        d.ia_upload_failure_count = None
        d.ia_date_first_changed = None
        d.filepath_ia_json = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        mark_ia_upload_needed(d)
        d.save()
    else:
        increment_failure_count(d)
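For reference, DRF's JSONRenderer reads the indentation out of accepted_media_type, which is the mechanism upload_recap_json relies on for pretty-printing. A minimal sketch, assuming a configured Django project with djangorestframework installed; the payload dict here is made up, whereas the real task serializes IADocketSerializer(d).data:

from rest_framework.renderers import JSONRenderer

# Hypothetical payload for illustration only.
data = {"case_name": "Lorem v. Ipsum", "docket_number": "1:20-cv-01234"}
json_bytes = JSONRenderer().render(
    data,
    accepted_media_type='application/json; indent=2',
)
print(json_bytes.decode('utf-8'))  # pretty-printed with a 2-space indent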
def update_docket_metadata(d: Docket, docket_data: Dict[str, Any]) -> Docket:
    """Update the Docket object with the data from Juriscraper.

    Works on either docket history report or docket report (appellate or
    district) results.
    """
    d = update_case_names(d, docket_data["case_name"])
    mark_ia_upload_needed(d, save_docket=False)
    d.docket_number = docket_data["docket_number"] or d.docket_number
    d.date_filed = docket_data.get("date_filed") or d.date_filed
    d.date_last_filing = (
        docket_data.get("date_last_filing") or d.date_last_filing
    )
    d.date_terminated = docket_data.get("date_terminated") or d.date_terminated
    d.cause = docket_data.get("cause") or d.cause
    d.nature_of_suit = docket_data.get("nature_of_suit") or d.nature_of_suit
    d.jury_demand = docket_data.get("jury_demand") or d.jury_demand
    d.jurisdiction_type = (
        docket_data.get("jurisdiction") or d.jurisdiction_type
    )
    d.mdl_status = docket_data.get("mdl_status") or d.mdl_status
    judges = get_candidate_judges(
        docket_data.get("assigned_to_str"),
        d.court_id,
        docket_data.get("date_filed"),
    )
    if judges is not None and len(judges) == 1:
        d.assigned_to = judges[0]
    d.assigned_to_str = docket_data.get("assigned_to_str") or ""
    judges = get_candidate_judges(
        docket_data.get("referred_to_str"),
        d.court_id,
        docket_data.get("date_filed"),
    )
    if judges is not None and len(judges) == 1:
        d.referred_to = judges[0]
    d.referred_to_str = docket_data.get("referred_to_str") or ""
    d.blocked, d.date_blocked = get_blocked_status(d)
    return d
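Both versions of update_docket_metadata lean on the same "incoming value or keep the existing one" idiom, so a falsy field from Juriscraper never clobbers data already stored on the docket. A standalone sketch of the idiom, with a hypothetical stand-in class and made-up values:

# Illustration only: a falsy incoming value (None, "") leaves the existing
# attribute untouched; a truthy one replaces it.
class FakeDocket:
    cause = "08:1331 Federal Question"
    jury_demand = ""

docket_data = {"cause": None, "jury_demand": "Plaintiff"}

d = FakeDocket()
d.cause = docket_data.get("cause") or d.cause  # falsy: keeps the old value
d.jury_demand = docket_data.get("jury_demand") or d.jury_demand  # takes new
print(d.cause, "|", d.jury_demand)
# 08:1331 Federal Question | Plaintiff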
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """After querying PACER and downloading a document, save it to the DB.

    :param rd_pk: The primary key of the RECAPDocument to work on
    :param response: A requests.Response object containing the PDF data.
    :param court_id: A CourtListener court ID to use for file names.
    :param pacer_case_id: The pacer_case_id to use in error logs.
    :param pacer_doc_id: The pacer_doc_id to use in error logs.
    :param document_number: The docket entry number for use in file names.
    :param attachment_number: The attachment number (if applicable) for use
    in file names.
    :return: A two-tuple of a boolean indicating success and a corresponding
    error/success message string.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAP Document '%s' " \
              "at '%s' with doc id '%s'" % (rd_pk, court_id, pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id,
                                      document_number, attachment_number)
    cf = ContentFile(response.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.file_size = rd.filepath_local.size
    rd.is_available = True  # We've got the PDF.

    # response.content is sometimes a str, sometimes unicode, so force it
    # all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save()

    # Make sure we mark the docket as needing upload.
    changed = mark_ia_upload_needed(rd.docket_entry.docket)
    if changed:
        rd.docket_entry.docket.save()
    return True, 'Saved item successfully'
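The sha1 line in update_rd_metadata normalizes the response body to bytes before hashing, because hashlib only accepts bytes. A standalone sketch of that step, using a local to_bytes helper in place of Django's force_bytes and a made-up PDF body:

import hashlib


def to_bytes(content):
    # Stand-in for django.utils.encoding.force_bytes: encode text so that
    # hashlib always receives bytes.
    if isinstance(content, bytes):
        return content
    return content.encode('utf-8')


pdf_content = b'%PDF-1.4 hypothetical document body'  # e.g. response.content
sha1_hex = hashlib.sha1(to_bytes(pdf_content)).hexdigest()
print(sha1_hex)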
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument.
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket entry, but were
            # unable to find the document by pacer_doc_id. This happens when
            # pacer_doc_id is missing, for example. Therefore, try to get
            # the document from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (
            exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction.
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
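The existing_document check in process_recap_pdf is a three-part test: the stored SHA-1 matches the new content, the record is marked available, and the file is actually present on disk. A standalone restatement of that test with hypothetical inputs, hashing directly with hashlib rather than the project's sha1 helper:

import hashlib
import os


def is_existing_document(stored_sha1, is_available, stored_path, new_content):
    # Only skip the file copy when all three conditions hold; otherwise the
    # new content should be moved over from the processing queue storage.
    new_sha1 = hashlib.sha1(new_content).hexdigest()
    return all([
        stored_sha1 == new_sha1,
        is_available,
        stored_path and os.path.isfile(stored_path),
    ])


# Content changed and the path is missing, so the document is treated as new.
print(is_existing_document('abc123', True, '/tmp/missing.pdf', b'new bytes'))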
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to
    work on.
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the
        # main document.
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper).
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in
                # case we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async; that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
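The all([...]) guard in process_recap_attachment skips attachment rows that are sealed or otherwise incomplete. A standalone sketch of the same filter over hypothetical Juriscraper-style attachment dicts:

def usable_attachments(attachments):
    # Keep only dicts that pass the four checks used above: a truthy
    # attachment_number and description, a pacer_doc_id, and a non-None
    # page_count. The input dicts below are illustrative examples.
    return [
        a for a in attachments
        if all([
            a["attachment_number"],
            a.get("pacer_doc_id", False),
            a["page_count"] is not None,
            a["description"],
        ])
    ]


attachments = [
    {"attachment_number": 1, "pacer_doc_id": "hypothetical-doc-id",
     "page_count": 3, "description": "Exhibit A"},
    {"attachment_number": None, "pacer_doc_id": None, "page_count": None,
     "description": ""},  # sealed item, filtered out
]
print(len(usable_attachments(attachments)))  # 1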
def process_free_opinion_result(self, row_pk, cnt):
    """Process a single result from the free opinion report"""
    result = PACERFreeDocumentRow.objects.get(pk=row_pk)
    result.court = Court.objects.get(pk=map_pacer_to_cl_id(result.court_id))
    result.case_name = harmonize(result.case_name)
    result.case_name_short = cnt.make_case_name_short(result.case_name)
    row_copy = copy.copy(result)
    # If we don't do this, the doc's date_filed becomes the docket's
    # date_filed. Bad.
    delattr(row_copy, 'date_filed')
    # If we don't do this, we get the PACER court id and it crashes.
    delattr(row_copy, 'court_id')
    # If we don't do this, the id of result tries to smash that of the
    # docket.
    delattr(row_copy, 'id')
    try:
        with transaction.atomic():
            docket = lookup_and_save(row_copy)
            if not docket:
                msg = "Unable to create docket for %s" % result
                logger.error(msg)
                result.error_msg = msg
                result.save()
                self.request.callbacks = None
                return
            docket.blocked, docket.date_blocked = get_blocked_status(docket)
            mark_ia_upload_needed(docket)
            docket.save()

            de, de_created = DocketEntry.objects.update_or_create(
                docket=docket,
                entry_number=result.document_number,
                defaults={
                    'date_filed': result.date_filed,
                    'description': result.description,
                })
            rd, rd_created = RECAPDocument.objects.update_or_create(
                docket_entry=de,
                document_number=result.document_number,
                attachment_number=None,
                defaults={
                    'pacer_doc_id': result.pacer_doc_id,
                    'document_type': RECAPDocument.PACER_DOCUMENT,
                    'is_free_on_pacer': True,
                })
    except IntegrityError as e:
        msg = "Raised IntegrityError: %s" % e
        logger.error(msg)
        if self.request.retries == self.max_retries:
            result.error_msg = msg
            result.save()
            return
        raise self.retry(exc=e)
    except DatabaseError as e:
        msg = "Unable to complete database transaction:\n%s" % e
        logger.error(msg)
        result.error_msg = msg
        result.save()
        self.request.callbacks = None
        return

    if not rd_created and rd.is_available:
        # The item already exists and is available. Fantastic, mark it as
        # free, and call it a day.
        rd.is_free_on_pacer = True
        rd.save()
        result.delete()
        self.request.callbacks = None
        return

    return {
        'result': result,
        'rd_pk': rd.pk,
        'pacer_court_id': result.court_id,
    }
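The three delattr calls in process_free_opinion_result matter because lookup_and_save maps whatever attributes the row copy carries onto the Docket. A toy illustration of copying a row-like object and removing the fields that must not leak, using SimpleNamespace as a stand-in for the real model and made-up values:

import copy
from types import SimpleNamespace

# Hypothetical free-opinion row; only the attribute names matter here.
result = SimpleNamespace(
    id=42,
    court_id='nysd',
    date_filed='2019-01-02',
    case_name='Lorem v. Ipsum',
)

row_copy = copy.copy(result)
for attr in ('date_filed', 'court_id', 'id'):
    # Mirrors the delattr calls above: these values describe the document or
    # the PACER court, not the docket, so they must not be copied over.
    delattr(row_copy, attr)

print(vars(row_copy))  # {'case_name': 'Lorem v. Ipsum'}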
def merge_attachment_page_data(
    court: Court,
    pacer_case_id: int,
    pacer_doc_id: int,
    document_number: int,
    text: str,
    attachment_dicts: List[Dict[str, Union[int, str]]],
    debug: bool = False,
) -> Tuple[List[RECAPDocument], DocketEntry]:
    """Merge attachment page data into the docket.

    :param court: The court object we're working with
    :param pacer_case_id: A PACER case ID
    :param pacer_doc_id: A PACER document ID
    :param document_number: The docket entry number
    :param text: The text of the attachment page
    :param attachment_dicts: A list of Juriscraper-parsed dicts for each
    attachment.
    :param debug: Whether to do saves during this process.
    :return: A list of RECAPDocuments modified or created during the
    process, and the DocketEntry object associated with the RECAPDocuments.
    :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist
    """
    try:
        params = {
            "pacer_doc_id": pacer_doc_id,
            "docket_entry__docket__court": court,
        }
        if pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned as exc:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        raise exc
    except RECAPDocument.DoesNotExist as exc:
        # Can't find the docket to associate with the attachment metadata.
        # It may be possible to go look for orphaned documents at this stage
        # and to then add them here, as we do when adding dockets. This need
        # is particularly acute for those that get free look emails and then
        # go to the attachment page.
        raise exc

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if document_number is None:
        # Bankruptcy attachment page. Use the document number from the
        # main document.
        document_number = main_rd.document_number

    if debug:
        return [], de

    # Save the old HTML to the docket entry.
    pacer_file = PacerHtmlFiles(
        content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
    )
    pacer_file.filepath.save(
        "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
        ContentFile(text),
    )

    # Create/update the attachment items.
    rds_created = []
    rds_affected = []
    for attachment in attachment_dicts:
        sanity_checks = [
            attachment["attachment_number"],
            # Missing on sealed items.
            attachment.get("pacer_doc_id", False),
            # Missing on some restricted docs (see Juriscraper).
            attachment["page_count"] is not None,
            attachment["description"],
        ]
        if not all(sanity_checks):
            continue

        rd, created = RECAPDocument.objects.update_or_create(
            docket_entry=de,
            document_number=document_number,
            attachment_number=attachment["attachment_number"],
            document_type=RECAPDocument.ATTACHMENT,
        )
        if created:
            rds_created.append(rd)
        rds_affected.append(rd)

        for field in ["description", "pacer_doc_id"]:
            if attachment[field]:
                setattr(rd, field, attachment[field])

        # Only set page_count and file_size if they're blank, in case we
        # got the real value by measuring.
        if rd.page_count is None:
            rd.page_count = attachment["page_count"]
        if rd.file_size is None and attachment["file_size_str"]:
            try:
                rd.file_size = convert_size_to_bytes(
                    attachment["file_size_str"]
                )
            except ValueError:
                pass
        rd.save()

        # Do *not* do this async; that can cause race conditions.
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_ia_upload_needed(de.docket, save_docket=True)
    process_orphan_documents(
        rds_created, court.pk, main_rd.docket_entry.docket.date_filed
    )
    return rds_affected, de
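A hedged usage sketch of merge_attachment_page_data, assuming a Django shell inside the CourtListener project where Court and this function are importable; the court ID, PACER IDs, HTML, and attachment dict are all illustrative values, not real data:

# Hypothetical invocation; every literal below is made up.
court = Court.objects.get(pk="dcd")  # assumed CourtListener court ID
attachment_page_html = "<html>attachment page HTML from PACER</html>"
rds_affected, de = merge_attachment_page_data(
    court=court,
    pacer_case_id=178502,            # hypothetical PACER case ID
    pacer_doc_id="04505578698",      # hypothetical PACER doc ID
    document_number=12,
    text=attachment_page_html,
    attachment_dicts=[{
        "attachment_number": 1,
        "pacer_doc_id": "04505578699",
        "page_count": 5,
        "description": "Exhibit A",
        "file_size_str": "1.2 MB",
    }],
    debug=True,  # with debug=True the function returns ([], de), no saves
)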