def fetch_pacer_doc_by_rd(self, rd_pk, fq_pk):
    """Fetch a PACER PDF by rd_pk

    This is very similar to get_pacer_doc_by_rd, except that it manages
    status as it proceeds and it gets the cookie info from redis.

    :param rd_pk: The PK of the RECAP Document to get.
    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: The RECAPDocument PK on success, else None.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    if rd.is_available:
        # Nothing to fetch; report success and stop any chained tasks.
        msg = "PDF already marked as 'is_available'. Doing nothing."
        mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL)
        self.request.chain = None
        return

    # Consistency fix: without a pacer_doc_id we cannot identify the
    # document on PACER, so abort early instead of attempting a download
    # that cannot succeed (mirrors the guard in the typed variant of this
    # task elsewhere in this module).
    if not rd.pacer_doc_id:
        msg = (
            "Missing 'pacer_doc_id' attribute. Without this attribute we "
            "cannot identify the document properly. Missing pacer_doc_id "
            "attributes usually indicate that the item may not have a "
            "document associated with it, or it may need to be updated via "
            "the docket report to acquire a pacer_doc_id. Aborting request."
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if not cookies:
        msg = "Unable to find cached cookies. Aborting request."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return

    pacer_case_id = rd.docket_entry.docket.pacer_case_id
    try:
        r = download_pacer_pdf_by_rd(
            rd.pk, pacer_case_id, rd.pacer_doc_id, cookies
        )
    except (requests.RequestException, HTTPError):
        msg = "Failed to get PDF from network."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return

    court_id = rd.docket_entry.docket.court_id
    success, msg = update_rd_metadata(
        self,
        rd_pk,
        r,
        court_id,
        pacer_case_id,
        rd.pacer_doc_id,
        rd.document_number,
        rd.attachment_number,
    )

    if success is False:
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return

    return rd.pk
def fetch_attachment_page(self, fq_pk):
    """Fetch a PACER attachment page by rd_pk

    This is very similar to process_recap_attachment, except that it
    manages status as it proceeds and it gets the cookie info from redis.

    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: None
    """
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)
    rd = fq.recap_document

    # Without a pacer_doc_id there is no way to request the attachment
    # page, so flag the queue entry as needing more information.
    if not rd.pacer_doc_id:
        msg = (
            "Unable to get attachment page: Unknown pacer_doc_id for "
            "RECAP Document object %s" % rd.pk
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.NEEDS_INFO)
        return

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if not cookies:
        mark_fq_status(
            fq,
            "Unable to find cached cookies. Aborting request.",
            PROCESSING_STATUS.FAILED,
        )
        return

    try:
        report = get_attachment_page_by_rd(rd.pk, cookies)
    except (requests.RequestException, HTTPError):
        mark_fq_status(
            fq,
            "Failed to get attachment page from network.",
            PROCESSING_STATUS.FAILED,
        )
        return

    page_text = report.response.text
    court_id = rd.docket_entry.docket.court_id
    att_data = get_data_from_att_report(page_text, court_id)
    if att_data == {}:
        mark_fq_status(
            fq,
            "Not a valid attachment page upload",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        return

    try:
        merge_attachment_page_data(
            rd.docket_entry.docket.court,
            rd.docket_entry.docket.pacer_case_id,
            att_data["pacer_doc_id"],
            att_data["document_number"],
            page_text,
            att_data["attachments"],
        )
    except RECAPDocument.MultipleObjectsReturned:
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        return
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        # Out of retries: give up; otherwise queue the task for retry.
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            return
        mark_fq_status(fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
        raise self.retry(exc=exc)

    mark_fq_status(
        fq,
        "Successfully completed fetch and save.",
        PROCESSING_STATUS.SUCCESSFUL,
    )
def fetch_docket(self, fq_pk):
    """Fetch a docket from PACER

    This mirrors code elsewhere that gets dockets, but manages status as it
    goes through the process.

    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: None on failure; otherwise the result of
        fetch_docket_by_pacer_case_id.
    """
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    # Bug fix: this previously called mark_pq_status (the ProcessingQueue
    # helper), but fq is a PacerFetchQueue — use mark_fq_status like every
    # other status update in this task.
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if cookies is None:
        msg = (
            "Cookie cache expired before task could run for user: %s"
            % fq.user_id
        )
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        # Bug fix: previously fell through here and built a PacerSession
        # with cookies=None; abort the task and the chain instead.
        self.request.chain = None
        return None

    court_id = fq.court_id or getattr(fq.docket, "court_id", None)
    s = PacerSession(cookies=cookies)

    try:
        result = fetch_pacer_case_id_and_title(s, fq, court_id)
    except (requests.RequestException, ReadTimeoutError) as exc:
        # Bug fix: the %s placeholder was never interpolated.
        msg = "Network error getting pacer_case_id for fq: %s." % fq.pk
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + "Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except PacerLoginException as exc:
        # Bug fix: the %s placeholder was never interpolated.
        msg = (
            "PacerLoginException while getting pacer_case_id for fq: %s."
            % fq.pk
        )
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + "Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    # result can be one of three values:
    #   None       --> Sealed or missing case
    #   Empty dict --> Didn't run the pacer_case_id lookup (wasn't needed)
    #   Full dict  --> Ran the query, got back results
    if result is None:
        msg = "Cannot find case by docket number (perhaps it's sealed?)"
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    pacer_case_id = getattr(fq.docket, "pacer_case_id", None) or result.get(
        "pacer_case_id"
    )
    if not pacer_case_id:
        msg = "Unable to determine pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    try:
        result = fetch_docket_by_pacer_case_id(s, court_id, pacer_case_id, fq)
    except (requests.RequestException, ReadTimeoutError) as exc:
        # Bug fix: the %s placeholder was never interpolated.
        msg = "Network error getting pacer_case_id for fq: %s." % fq.pk
        if self.request.retries == self.max_retries:
            mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        mark_fq_status(
            fq, msg + "Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY
        )
        raise self.retry(exc=exc)
    except ParsingException:
        msg = "Unable to parse pacer_case_id for docket."
        mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED)
        self.request.chain = None
        return None

    msg = "Successfully got and merged docket. Adding to Solr as final step."
    mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL)
    return result
def fetch_pacer_doc_by_rd(self, rd_pk: int, fq_pk: int) -> Optional[int]:
    """Fetch a PACER PDF by rd_pk

    This is very similar to get_pacer_doc_by_rd, except that it manages
    status as it proceeds and it gets the cookie info from redis.

    :param rd_pk: The PK of the RECAP Document to get.
    :param fq_pk: The PK of the RECAP Fetch Queue to update.
    :return: The RECAPDocument PK
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS)

    def halt_chain(message: str, status) -> None:
        # Record the outcome on the fetch queue and stop any downstream
        # tasks chained after this one.
        mark_fq_status(fq, message, status)
        self.request.chain = None

    if rd.is_available:
        halt_chain(
            "PDF already marked as 'is_available'. Doing nothing.",
            PROCESSING_STATUS.SUCCESSFUL,
        )
        return None

    if not rd.pacer_doc_id:
        halt_chain(
            "Missing 'pacer_doc_id' attribute. Without this attribute we "
            "cannot identify the document properly. Missing pacer_doc_id "
            "attributes usually indicate that the item may not have a "
            "document associated with it, or it may need to be updated via "
            "the docket report to acquire a pacer_doc_id. Aborting request.",
            PROCESSING_STATUS.INVALID_CONTENT,
        )
        return None

    cookies = get_pacer_cookie_from_cache(fq.user_id)
    if not cookies:
        halt_chain(
            "Unable to find cached cookies. Aborting request.",
            PROCESSING_STATUS.FAILED,
        )
        return None

    case_id = rd.docket_entry.docket.pacer_case_id
    try:
        response = download_pacer_pdf_by_rd(
            rd.pk, case_id, rd.pacer_doc_id, cookies
        )
    except (requests.RequestException, HTTPError):
        halt_chain("Failed to get PDF from network.", PROCESSING_STATUS.FAILED)
        return None

    success, msg = update_rd_metadata(
        self,
        rd_pk,
        response,
        rd.docket_entry.docket.court_id,
        case_id,
        rd.pacer_doc_id,
        rd.document_number,
        rd.attachment_number,
    )
    if success is False:
        halt_chain(msg, PROCESSING_STATUS.FAILED)
        return None

    return rd.pk