def get_and_process_pdf(self, data, session, row_pk, index=False):
    """Download the PDF for a free PACER document and attach it to its
    RECAPDocument.

    Bound Celery task: ``self`` is the task instance, used for ``retry()``
    and for clearing ``self.request.callbacks`` to abort the rest of a
    task chain on unrecoverable errors.

    :param data: Dict with keys 'result' (the free-document row object),
        'rd_pk' (PK of the RECAPDocument) and 'pacer_court_id', or None to
        no-op (e.g. when an upstream task aborted).
    :param session: A logged-in PACER session for FreeOpinionReport.
    :param row_pk: PK of the PACERFreeDocumentRow to record errors on.
    :param index: Whether to index the RECAPDocument when saving.
    :return: {'result': ..., 'rd_pk': ...} on success; None on abort.
    """
    if data is None:
        # Upstream task produced nothing; nothing to do.
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        # Transient network trouble: let Celery reschedule this task.
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            # PACER-side hiccup; worth retrying.
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            # Unknown HTTP failure: record it on the row and abort the
            # remainder of the chain.
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        # Download "succeeded" but produced no response (e.g. sealed or
        # unavailable document). Record and abort the chain.
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    # save=False: defer the DB write until all fields below are set.
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Do nothing for items that don't start with zero. For ones that do,
    find the stripped version, fix it, download the correct item, extract
    it and finally save it to Solr.

    :param doc_node: Parsed document node (unused in this code path —
        presumably kept for interface compatibility; verify against callers).
    :param docket_entry: The DocketEntry the document belongs to.
    :param entry_number: The zero-padded document number as a string.
    :param attachment_number: The attachment number, or a falsy value for
        main documents.
    :param document_type: The document type (unused here; see note above).
    :param debug: When truthy, log what would happen but skip downloads
        and saves.
    :return: The updated RECAPDocument, or None if skipped/failed.
    """
    if not entry_number.startswith('0'):
        # Only touch things where the new value leads with a zero.
        return None
    else:
        logger.info(" Doing docket_entry: %s, document_number, "
                    "%s and attachment number: %s" %
                    (docket_entry, entry_number, attachment_number))

    # The broken rows were stored with the leading zero(s) stripped, i.e.
    # as the integer form of the number.
    old_entry_number = int(entry_number)

    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=old_entry_number,
            # Falsy attachment numbers are stored as NULL.
            attachment_number=attachment_number or None,
        )
        logger.info(" Found item.")
    except RECAPDocument.DoesNotExist:
        logger.info(" Failed to find item.")
        return None

    # Restore the zero-padded document number.
    rd.document_number = entry_number
    if rd.is_available:
        new_ia = get_ia_document_url_from_path(self.path, entry_number,
                                               attachment_number)
        logger.info(" Updating IA URL from %s to %s" %
                    (rd.filepath_ia, new_ia))
        rd.filepath_ia = new_ia

        if not os.path.isfile(rd.filepath_local.path):
            # Set the value correctly and get the file from IA if we don't
            # already have it.
            new_local_path = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            logger.info(" Updating local path from %s to %s" %
                        (rd.filepath_local, new_local_path))
            rd.filepath_local = new_local_path
            filename = rd.filepath_ia.rsplit('/', 1)[-1]
            logger.info(" Downloading item with filename %s" % filename)
            if not debug:
                download_recap_item(rd.filepath_ia, filename)
        else:
            logger.info(" File already on disk. Punting.")

        if rd.page_count is None:
            logger.info(" Getting page count.")
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    else:
        logger.info(" Item not available in RECAP. Punting.")
        return None

    if not debug:
        try:
            extract_recap_pdf(rd.pk, check_if_needed=False)
            rd.save(do_extraction=False, index=True)
            logger.info(" Item saved at https://www.courtlistener.com%s" %
                        rd.get_absolute_url())
        except IntegrityError:
            # The corrected number collides with an existing row.
            logger.info(" Integrity error while saving.")
            return None
    else:
        logger.info(" No save requested in debug mode.")

    return rd
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint and save it to
    the database.

    Bound Celery task: ``self`` is the task instance, used for ``retry()``.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated, or None on
        failure.
    """
    # NOTE(fix): the original had a second, stacked string literal after the
    # docstring ("Save a RECAP PDF to the database.") — a dead no-op
    # statement, now merged into the docstring above.
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(
                pacer_doc_id=pq.pacer_doc_id,
            )
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, pq.PROCESSING_FAILED)
                return None
            else:
                mark_pq_status(pq, error_message, pq.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)
            else:
                # If we're here, we've got the docket and docket entry, but
                # were unable to find the document by pacer_doc_id. This
                # happens when pacer_doc_id is missing, for example. ∴, try
                # to get the document from the docket entry.
                try:
                    rd = RECAPDocument.objects.get(
                        docket_entry=de,
                        document_number=pq.document_number,
                        attachment_number=pq.attachment_number,
                        document_type=document_type,
                    )
                except (RECAPDocument.DoesNotExist,
                        RECAPDocument.MultipleObjectsReturned):
                    # Unable to find it. Make a new item.
                    rd = RECAPDocument(
                        docket_entry=de,
                        pacer_doc_id=pq.pacer_doc_id,
                        date_upload=timezone.now(),
                        document_type=document_type,
                    )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.ocr_status = None

    rd.is_available = True
    rd.sha1 = new_sha1

    if not pq.debug:
        try:
            rd.save()
        except IntegrityError:
            # Another task beat us to this unique_together combination.
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, cookies,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-in PACER user.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    att_report = get_attachment_page_by_rd(self, rd_pk, cookies)

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s"
                        % rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            # Stop the rest of the task chain.
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        # Fix: logger.warn is a deprecated alias; use logger.warning.
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data.
    # Else fallback on old.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        tag.tag_object(rd)

    if rd.is_available:
        # Great. Call it a day.
        rd.save()
        return

    pacer_case_id = rd.docket_entry.docket.pacer_case_id
    r = download_pacer_pdf_by_rd(rd.pk, pacer_case_id,
                                 att_found['pacer_doc_id'], cookies)
    court_id = rd.docket_entry.docket.court_id
    success, msg = update_rd_metadata(
        self, rd_pk, r, court_id, pacer_case_id, rd.pacer_doc_id,
        rd.document_number, rd.attachment_number)
    if success is False:
        return

    # Skip OCR for now. It'll happen in a second step.
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, session,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        # Transient network trouble; let Celery reschedule.
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s"
                        % rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        # Fix: logger.warn is a deprecated alias; use logger.warning.
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fallback on old.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, force it all to
    # be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these
    # async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database.

    Bound Celery task. Looks up the ProcessingQueue item, finds or creates
    the matching RECAPDocument, and moves the uploaded file into place.

    :param pk: The PK of the ProcessingQueue item to process.
    :return: The RECAPDocument created/updated, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        # Common case: the document already exists, matched by its PACER
        # doc ID within the right case.
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task
            # that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            # NOTE: Celery's retry() raises rather than rescheduling once
            # retries are exhausted, so the FAILED status above sticks.
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        # save=False defers the DB write until rd.save() below.
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)
    return rd
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint and save it to
    the database.

    Bound Celery task: ``self`` is the task instance, used for ``retry()``.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated, or None on
        failure.
    """
    # NOTE(fix): the original had a second, stacked string literal after the
    # docstring ("Save a RECAP PDF to the database.") — a dead no-op
    # statement, now merged into the docstring above.
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, error_message,
                               PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(docket=d,
                                         entry_number=pq.document_number)
        except DocketEntry.DoesNotExist as exc:
            logger.warning("Unable to find docket entry for processing "
                           "queue '%s'." % pq)
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno,
                                                       exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local,
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.name.split(".")[-1]
            # Write the content to a real temp file so the page counter can
            # read it from disk (the storage backend may be remote).
            with NamedTemporaryFile(
                prefix="rd_page_count_",
                suffix=f".{extension}",
                buffering=0,
            ) as tmp:
                tmp.write(content)
                rd.page_count = get_page_count(tmp.name, extension)
            rd.file_size = rd.filepath_local.size
        rd.ocr_status = None

    rd.is_available = True
    rd.sha1 = new_sha1
    rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            # Another task beat us to this unique_together combination.
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Repair a RECAPDocument whose document number lost its leading zero.

    Entry numbers that do not begin with '0' are ignored. For the rest,
    look up the stripped (integer) form, restore the zero-padded number,
    fix the IA and local file paths, fetch the file when missing, extract
    its text and index it in Solr.
    """
    if not entry_number.startswith('0'):
        # Only touch things where the new value leads with a zero.
        return None

    logger.info(" Doing docket_entry: %s, document_number, "
                "%s and attachment number: %s" %
                (docket_entry, entry_number, attachment_number))

    # The broken rows were saved with the integer form of the number.
    stripped_number = int(entry_number)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=stripped_number,
            attachment_number=attachment_number or None,
        )
    except RECAPDocument.DoesNotExist:
        logger.info(" Failed to find item.")
        return None
    logger.info(" Found item.")

    rd.document_number = entry_number
    if not rd.is_available:
        logger.info(" Item not available in RECAP. Punting.")
        return None

    ia_url = get_ia_document_url_from_path(
        self.path, entry_number, attachment_number)
    logger.info(" Updating IA URL from %s to %s" %
                (rd.filepath_ia, ia_url))
    rd.filepath_ia = ia_url

    if os.path.isfile(rd.filepath_local.path):
        logger.info(" File already on disk. Punting.")
    else:
        # Set the value correctly and get the file from IA if we don't
        # already have it.
        local_path = os.path.join(
            'recap',
            get_local_document_url_from_path(self.path, entry_number,
                                             attachment_number),
        )
        logger.info(" Updating local path from %s to %s" %
                    (rd.filepath_local, local_path))
        rd.filepath_local = local_path
        ia_filename = rd.filepath_ia.rsplit('/', 1)[-1]
        logger.info(" Downloading item with filename %s" % ia_filename)
        if not debug:
            download_recap_item(rd.filepath_ia, ia_filename)

    if rd.page_count is None:
        logger.info(" Getting page count.")
        ext = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, ext)

    if debug:
        logger.info(" No save requested in debug mode.")
        return rd

    try:
        extract_recap_pdf(rd.pk, check_if_needed=False)
        rd.save(do_extraction=False, index=True)
        logger.info(" Item saved at https://www.courtlistener.com%s" %
                    rd.get_absolute_url())
    except IntegrityError:
        logger.info(" Integrity error while saving.")
        return None
    return rd
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database.

    Bound Celery task that processes one ProcessingQueue item: locate (or
    build) the RECAPDocument the upload belongs to, then store the file.

    :param pk: The PK of the ProcessingQueue item to process.
    :return: The RECAPDocument created/updated, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the docket
            # will be in place soon (it could be in a different upload task
            # that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                # Final attempt: record the failure before retry() raises.
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and then
            # create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk. Move
        # the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        # save=False: the DB row is written once, by rd.save() below.
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)
    return rd