def process_recap_appellate_docket(self, pk):
    """Merge an appellate docket uploaded through the RECAP API into CL.

    :param pk: Primary key of the ProcessingQueue row to work on.
    :returns: ``None`` when the upload cannot be read (after marking the
    queue item failed) or when parsing yields no data. Otherwise a dict of
    the form:

        {
            // PK of the docket that was looked up or created
            'docket_pk': 22,
            // True when docket entries or RECAP documents were added,
            // i.e. the search index needs refreshing. Always False in
            // debug mode, where nothing is saved.
            'content_updated': True,
        }

    A dict is returned so the value can be consumed by the next task in a
    Celery chain.
    """
    start_time = now()
    queue_item = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(queue_item, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing Appellate RECAP item"
        " (debug is: %s): %s" % (queue_item.debug, queue_item)
    )

    parser = AppellateDocketReport(map_cl_to_pacer_id(queue_item.court_id))

    try:
        html = queue_item.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        # Give up once retries are exhausted, and never retry debug uploads.
        if (self.request.retries == self.max_retries) or queue_item.debug:
            mark_pq_status(queue_item, msg, PROCESSING_STATUS.FAILED)
            return None
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    parser._parse_text(html)
    parsed = parser.data
    logger.info("Parsing completed of item %s" % queue_item)

    if parsed == {}:
        # The parser found nothing usable, so this is not a docket at all
        # (see Juriscraper). Stop the rest of the chain from running.
        msg = "Not a valid docket upload."
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Fold the parsed docket into the existing data.
    docket, match_count = find_docket_object(
        queue_item.court_id,
        queue_item.pacer_case_id,
        parsed["docket_number"],
    )
    if match_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % match_count
        )
        docket = docket.earliest("date_created")

    docket.add_recap_source()
    update_docket_metadata(docket, parsed)
    docket, og_info = update_docket_appellate_metadata(docket, parsed)
    if not docket.pacer_case_id:
        docket.pacer_case_id = queue_item.pacer_case_id

    if queue_item.debug:
        # Debug uploads stop here: nothing below this point is persisted.
        mark_pq_successful(queue_item, d_id=docket.pk)
        self.request.chain = None
        return {"docket_pk": docket.pk, "content_updated": False}

    if og_info is not None:
        og_info.save()
        docket.originating_court_information = og_info
    docket.save()

    # Keep a copy of the raw HTML alongside the docket for later use.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET
    )
    html_record.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(html),
    )

    rds_created, content_updated = add_docket_entries(
        docket, parsed["docket_entries"]
    )
    add_parties_and_attorneys(docket, parsed["parties"])
    process_orphan_documents(
        rds_created, queue_item.court_id, docket.date_filed
    )

    # Alerts only go out when content changed and the lookup matched at
    # least one docket.
    if content_updated and match_count > 0:
        if enqueue_docket_alert(docket.pk):
            send_docket_alert(docket.pk, start_time)

    mark_pq_successful(queue_item, d_id=docket.pk)
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket_history_report(self, pk):
    """Merge an uploaded docket history report into CL.

    :param pk: Primary key of the ProcessingQueue row to work on.
    :returns: ``None`` when the upload cannot be read, parses to nothing, or
    the docket cannot be saved after the final retry. Otherwise a dict with
    the docket PK under 'docket_pk' and, under 'content_updated', whether the
    docket needs Solr re-indexing (always False in debug mode).
    """
    start_time = now()
    queue_item = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(queue_item, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing RECAP item (debug is: %s): %s"
        % (queue_item.debug, queue_item)
    )

    try:
        html = queue_item.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        # Give up once retries are exhausted, and never retry debug uploads.
        if (self.request.retries == self.max_retries) or queue_item.debug:
            mark_pq_status(queue_item, msg, PROCESSING_STATUS.FAILED)
            return None
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    parser = DocketHistoryReport(map_cl_to_pacer_id(queue_item.court_id))
    parser._parse_text(html)
    parsed = parser.data
    logger.info("Parsing completed for item %s" % queue_item)

    if parsed == {}:
        # Nothing parsed: not a usable docket history page. Halt the chain.
        msg = "Not a valid docket history page upload."
        mark_pq_status(queue_item, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Fold the parsed docket into the existing data.
    docket, match_count = find_docket_object(
        queue_item.court_id,
        queue_item.pacer_case_id,
        parsed["docket_number"],
    )
    if match_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % match_count
        )
        docket = docket.earliest("date_created")

    docket.add_recap_source()
    update_docket_metadata(docket, parsed)

    if queue_item.debug:
        # Debug uploads stop here: nothing below this point is persisted.
        mark_pq_successful(queue_item, d_id=docket.pk)
        self.request.chain = None
        return {"docket_pk": docket.pk, "content_updated": False}

    try:
        docket.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            # Out of retries: fail the item and stop the chain.
            mark_pq_status(
                queue_item, error_message, PROCESSING_STATUS.FAILED
            )
            self.request.chain = None
            return None
        mark_pq_status(
            queue_item, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)

    # Keep a copy of the raw HTML alongside the docket for later use.
    html_record = PacerHtmlFiles(
        content_object=docket, upload_type=UPLOAD_TYPE.DOCKET_HISTORY_REPORT
    )
    html_record.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        "docket_history.html",
        ContentFile(html),
    )

    rds_created, content_updated = add_docket_entries(
        docket, parsed["docket_entries"]
    )
    process_orphan_documents(
        rds_created, queue_item.court_id, docket.date_filed
    )

    # Alerts only go out when content changed and the lookup matched at
    # least one docket.
    if content_updated and match_count > 0:
        if enqueue_docket_alert(docket.pk):
            send_docket_alert(docket.pk, start_time)

    mark_pq_successful(queue_item, d_id=docket.pk)
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    The page is parsed, matched to its main RECAPDocument via the parsed
    ``pacer_doc_id`` (plus the court and, when known, the PACER case ID), and
    each usable attachment row is created or updated on that document's
    docket entry.

    :param pk: The primary key of the processing queue item you want to work
    on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    :raises: Whatever ``self.retry`` raises when the main document cannot be
    found yet and retries remain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s", pq.debug, pq)

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    # Open in binary mode: a text-mode read returns ``str`` on Python 3,
    # which has no ``.decode`` and would raise AttributeError here.
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s", pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)

        for attachment in att_data["attachments"]:
            is_usable = all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            )
            if not is_usable:
                continue

            rd, created = RECAPDocument.objects.update_or_create(
                docket_entry=de,
                document_number=att_data["document_number"],
                attachment_number=attachment["attachment_number"],
                document_type=RECAPDocument.ATTACHMENT,
            )
            if created:
                rds_created.append(rd)

            needs_save = False
            for field in ["description", "pacer_doc_id"]:
                if attachment[field]:
                    setattr(rd, field, attachment[field])
                    needs_save = True

            # Only set page_count and file_size if they're blank, in case
            # we got the real value by measuring.
            if rd.page_count is None:
                rd.page_count = attachment["page_count"]
            if rd.file_size is None and attachment["file_size_str"]:
                try:
                    rd.file_size = convert_size_to_bytes(
                        attachment["file_size_str"]
                    )
                except ValueError:
                    # Deliberate best effort: an unparseable size string
                    # simply leaves file_size unset.
                    pass

            if needs_save:
                rd.save()
            for tag in tags:
                tag.tag_object(rd)

            # Do *not* do this async — that can cause race conditions.
            add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    NOTE(review): a second definition of ``process_recap_docket`` appears
    later in this module; unless one of the two is removed, the later one
    appears to be the binding that survives import.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: ``None`` when the upload cannot be read, turns out to be a
    docket history report (which is handed off to
    ``process_recap_docket_history_report``), or parses to nothing.
    Otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode()
    except IOError as exc:
        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info(f"Parsing completed of item {pq}")

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    # Every other caller of find_docket_object in this module unpacks a
    # (docket-or-queryset, count) pair; do the same here so that ``d`` is a
    # docket rather than a tuple.
    # NOTE(review): confirm against find_docket_object's current signature.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest.", docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug uploads stop here: nothing below this point is persisted.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)

    # Match the other docket tasks: only alert when content changed and the
    # lookup matched at least one docket.
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)

    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    NOTE(review): an earlier definition of ``process_recap_docket`` exists
    in this module; being the later one, this definition appears to be the
    binding that survives import. One of the two should be removed.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: ``None`` when the upload cannot be read, turns out to be a
    docket history report (which is handed off to
    ``process_recap_docket_history_report``), or parses to nothing.
    Otherwise a dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    :raises: Whatever ``self.retry`` raises when the upload cannot be read
    and retries remain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s", pq.debug, pq)

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    # Handle read failures the same way the other upload tasks in this
    # module do, so a transient I/O error is retried rather than leaving the
    # queue item stuck in progress.
    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s", pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest.", docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug uploads stop here: nothing below this point is persisted.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)

    # Alerts only go out when content changed and the lookup matched at
    # least one docket.
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)

    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }