def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Make a PACER document."""
    pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=entry_number,
            # Use the attachment number if it is not 0, else use None.
            attachment_number=attachment_number or None,
        )
    except RECAPDocument.DoesNotExist:
        rd = RECAPDocument(
            docket_entry=docket_entry,
            pacer_doc_id=pacer_document_id,
            document_number=entry_number,
        )
    else:
        rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

    rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
    rd.document_type = document_type or rd.document_type

    # If we can't parse the availability node (it returns None), default it
    # to False.
    availability = self.get_bool_from_node(doc_node, 'available')
    rd.is_available = False if availability is None else availability
    rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
    rd.description = (self.get_str_from_node(doc_node, 'short_desc') or
                      rd.description)
    if rd.is_available:
        rd.filepath_ia = get_ia_document_url_from_path(
            self.path, entry_number, attachment_number)
        rd.filepath_local = os.path.join(
            'recap',
            get_local_document_url_from_path(self.path, entry_number,
                                             attachment_number),
        )
        if rd.page_count is None:
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    if document_type == RECAPDocument.ATTACHMENT:
        rd.attachment_number = attachment_number
    if not debug:
        try:
            rd.save(do_extraction=False, index=False)
        except IntegrityError:
            # This happens when a pacer_doc_id has been wrongly set as
            # the document_number, see for example, document 19 and
            # document 00405193374 here:
            # https://ia802300.us.archive.org/23/items/gov.uscourts.ca4.14-1872/gov.uscourts.ca4.14-1872.docket.xml
            logger.error(
                "Unable to create RECAPDocument for document #%s, "
                "attachment #%s on entry: %s due to IntegrityError." %
                (rd.document_number, rd.attachment_number, rd.docket_entry))
            return None
    return rd
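# A note on the `attachment_number or None` lookup above (it recurs in the
# functions below): RECAP data uses attachment number 0 for a main document,
# while the database stores None in that case. A minimal, self-contained
# illustration of the normalization idiom (the values are made up):
for n in (0, None, 3):
    print(n, '->', n or None)  # 0 and None both normalize to None; 3 stays 3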
def get_and_process_pdf(self, data, session, row_pk, index=False):
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
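# The hashing idiom above recurs throughout these tasks: response content may
# arrive as str or bytes, so it is coerced with Django's force_bytes before
# being fed to hashlib. A minimal, self-contained sketch:
import hashlib

from django.utils.encoding import force_bytes


def sha1_of_content(content):
    """Return the hex SHA-1 of response content, str or bytes alike."""
    return hashlib.sha1(force_bytes(content)).hexdigest()


assert sha1_of_content(u'hello') == sha1_of_content(b'hello')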
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Make a PACER document."""
    pacer_document_id = self.get_str_from_node(doc_node, 'pacer_doc_id')
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=entry_number,
            # Use the attachment number if it is not 0, else use None.
            attachment_number=attachment_number or None,
        )
    except RECAPDocument.DoesNotExist:
        rd = RECAPDocument(
            docket_entry=docket_entry,
            pacer_doc_id=pacer_document_id,
            document_number=entry_number,
        )
    else:
        rd.pacer_doc_id = pacer_document_id or rd.pacer_doc_id

    rd.date_upload = self.get_datetime_from_node(doc_node, 'upload_date')
    rd.document_type = document_type or rd.document_type

    if not rd.is_available:
        # If we can't parse the availability node (it returns None),
        # default it to False.
        availability = self.get_bool_from_node(doc_node, 'available')
        rd.is_available = False if availability is None else availability
    if not rd.sha1:
        rd.sha1 = self.get_str_from_node(doc_node, 'sha1')
    rd.description = (self.get_str_from_node(doc_node, 'short_desc') or
                      rd.description)
    if rd.is_available:
        rd.filepath_ia = get_ia_document_url_from_path(
            self.path, entry_number, attachment_number)
        rd.filepath_local = os.path.join(
            'recap',
            get_local_document_url_from_path(self.path, entry_number,
                                             attachment_number),
        )
        if rd.page_count is None:
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    if document_type == RECAPDocument.ATTACHMENT:
        rd.attachment_number = attachment_number
    if not debug:
        rd.save(do_extraction=False, index=False)
    return rd
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """After querying PACER and downloading a document, save it to the DB.

    :param rd_pk: The primary key of the RECAPDocument to work on
    :param response: A requests.Response object containing the PDF data.
    :param court_id: A CourtListener court ID to use for file names.
    :param pacer_case_id: The pacer_case_id to use in error logs.
    :param pacer_doc_id: The pacer_doc_id to use in error logs.
    :param document_number: The docket entry number for use in file names.
    :param attachment_number: The attachment number (if applicable) for use
    in file names.
    :return: A two-tuple of a boolean indicating success and a corresponding
    error/success message string.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAP Document '%s' " \
              "at '%s' with doc id '%s'" % (rd_pk, court_id, pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id,
                                      document_number, attachment_number)
    cf = ContentFile(response.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.file_size = rd.filepath_local.size
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save the document. (Extraction and indexing are not triggered here.)
    rd.save()

    # Make sure we mark the docket as needing upload.
    changed = mark_ia_upload_needed(rd.docket_entry.docket)
    if changed:
        rd.docket_entry.docket.save()
    return True, 'Saved item successfully'
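# Hedged usage sketch for update_rd_metadata. It is a bound Celery task (note
# the `self` parameter), so a direct call lets Celery supply `self`. All
# concrete values below are illustrative, and download_pacer_pdf() is a
# hypothetical stand-in for whatever prior step produced the
# requests.Response:
response = download_pacer_pdf()  # hypothetical prior download step
success, msg = update_rd_metadata(
    rd_pk=1234,                  # hypothetical RECAPDocument PK
    response=response,
    court_id='dcd',              # example CourtListener court ID
    pacer_case_id='12345',       # hypothetical PACER case ID
    pacer_doc_id='04505578698',  # hypothetical PACER doc ID
    document_number='1',
    attachment_number=None,      # main document, not an attachment
)
if not success:
    logger.error(msg)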
def handle(self, *args, **options):
    raw_input(
        "This is a very primitive script that has serious performance "
        "issues with large datasets. Press any key to proceed anyway. "
        "Otherwise, press CTRL+C to exit.")
    cnt = Counter()
    for r in RECAPDocument.objects.all():
        try:
            path = r.filepath_local.path
        except ValueError:
            cnt['no_file'] += 1
        else:
            extension = path.split('.')[-1]
            count = get_page_count(path, extension)
            r.page_count = count
            r.save(do_extraction=False, index=False)
            cnt['successes'] += 1
            if count is not None:
                cnt['total_pages'] += count
    print cnt
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    input("This is a very primitive script that has serious performance "
          "issues with large datasets. Press any key to proceed anyway. "
          "Otherwise, press CTRL+C to exit.")
    cnt = Counter()
    for rd in RECAPDocument.objects.all():
        try:
            path = rd.filepath_local.path
        except ValueError:
            cnt["no_file"] += 1
        else:
            extension = path.split(".")[-1]
            count = get_page_count(path, extension)
            rd.page_count = count
            rd.save(do_extraction=False, index=False)
            cnt["successes"] += 1
            if count is not None:
                cnt["total_pages"] += count
    logger.info(cnt)
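# A tiny, self-contained illustration of the Counter bookkeeping these
# commands use (paths and page counts are made up):
from collections import Counter

cnt = Counter()
for path, pages in [('a.pdf', 12), (None, None), ('b.pdf', 3)]:
    if path is None:
        cnt['no_file'] += 1
    else:
        cnt['successes'] += 1
        if pages is not None:
            cnt['total_pages'] += pages
print(cnt)  # Counter({'total_pages': 15, 'successes': 2, 'no_file': 1})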
def make_recap_document(self, doc_node, docket_entry, entry_number,
                        attachment_number, document_type, debug):
    """Do nothing for items that don't start with zero. For ones that do,
    find the stripped version, fix it, download the correct item, extract
    it and finally save it to Solr.
    """
    if not entry_number.startswith('0'):
        # Only touch things where the new value leads with a zero.
        return None
    else:
        logger.info(" Doing docket_entry: %s, document_number: "
                    "%s and attachment number: %s" %
                    (docket_entry, entry_number, attachment_number))

    old_entry_number = int(entry_number)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry=docket_entry,
            document_number=old_entry_number,
            attachment_number=attachment_number or None,
        )
        logger.info(" Found item.")
    except RECAPDocument.DoesNotExist:
        logger.info(" Failed to find item.")
        return None

    rd.document_number = entry_number
    if rd.is_available:
        new_ia = get_ia_document_url_from_path(self.path, entry_number,
                                               attachment_number)
        logger.info(" Updating IA URL from %s to %s" %
                    (rd.filepath_ia, new_ia))
        rd.filepath_ia = new_ia

        if not os.path.isfile(rd.filepath_local.path):
            # Set the value correctly and get the file from IA if we don't
            # already have it.
            new_local_path = os.path.join(
                'recap',
                get_local_document_url_from_path(self.path, entry_number,
                                                 attachment_number),
            )
            logger.info(" Updating local path from %s to %s" %
                        (rd.filepath_local, new_local_path))
            rd.filepath_local = new_local_path
            filename = rd.filepath_ia.rsplit('/', 1)[-1]
            logger.info(" Downloading item with filename %s" % filename)
            if not debug:
                download_recap_item(rd.filepath_ia, filename)
        else:
            logger.info(" File already on disk. Punting.")

        if rd.page_count is None:
            logger.info(" Getting page count.")
            extension = rd.filepath_local.path.split('.')[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
    else:
        logger.info(" Item not available in RECAP. Punting.")
        return None

    if not debug:
        try:
            extract_recap_pdf(rd.pk, check_if_needed=False)
            rd.save(do_extraction=False, index=True)
            logger.info(" Item saved at https://www.courtlistener.com%s" %
                        rd.get_absolute_url())
        except IntegrityError:
            logger.info(" Integrity error while saving.")
            return None
    else:
        logger.info(" No save requested in debug mode.")

    return rd
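# Minimal illustration of the zero-padding bug the method above repairs: an
# entry number such as '035' was previously stored stripped as 35, so the old
# row is looked up via int(entry_number) and then renumbered to the padded
# string (the values here are made up):
entry_number = '035'
if entry_number.startswith('0'):
    old_entry_number = int(entry_number)  # 35, the stripped form
    print('renumbering %s -> %s' % (old_entry_number, entry_number))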
def process_recap_pdf(self, pk): """Process an uploaded PDF from the RECAP API endpoint. :param pk: The PK of the processing queue item you want to work on. :return: A RECAPDocument object that was created or updated. """ """Save a RECAP PDF to the database.""" pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) if pq.attachment_number is None: document_type = RECAPDocument.PACER_DOCUMENT else: document_type = RECAPDocument.ATTACHMENT logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq)) try: if pq.pacer_case_id: rd = RECAPDocument.objects.get( docket_entry__docket__pacer_case_id=pq.pacer_case_id, pacer_doc_id=pq.pacer_doc_id, ) else: # Sometimes we don't have the case ID from PACER. Try to make this # work anyway. rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id,) except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned): try: d = Docket.objects.get( pacer_case_id=pq.pacer_case_id, court_id=pq.court_id ) except Docket.DoesNotExist as exc: # No Docket and no RECAPDocument. Do a retry. Hopefully # the docket will be in place soon (it could be in a # different upload task that hasn't yet been processed). logger.warning( "Unable to find docket for processing queue '%s'. " "Retrying if max_retries is not exceeded." % pq ) error_message = "Unable to find docket for item." if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED) return None else: mark_pq_status( pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except Docket.MultipleObjectsReturned: msg = "Too many dockets found when trying to save '%s'" % pq mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None # Got the Docket, attempt to get/create the DocketEntry, and then # create the RECAPDocument try: de = DocketEntry.objects.get( docket=d, entry_number=pq.document_number ) except DocketEntry.DoesNotExist as exc: logger.warning( "Unable to find docket entry for processing " "queue '%s'." % pq ) msg = "Unable to find docket entry for item." if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None else: mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) else: # If we're here, we've got the docket and docket # entry, but were unable to find the document by # pacer_doc_id. This happens when pacer_doc_id is # missing, for example. ∴, try to get the document # from the docket entry. try: rd = RECAPDocument.objects.get( docket_entry=de, document_number=pq.document_number, attachment_number=pq.attachment_number, document_type=document_type, ) except ( RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned, ): # Unable to find it. Make a new item. rd = RECAPDocument( docket_entry=de, pacer_doc_id=pq.pacer_doc_id, document_type=document_type, ) rd.document_number = pq.document_number rd.attachment_number = pq.attachment_number # Do the file, finally. try: content = pq.filepath_local.read() except IOError as exc: msg = "Internal processing error (%s: %s)." 
% (exc.errno, exc.strerror) if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None else: mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) new_sha1 = sha1(content) existing_document = all( [ rd.sha1 == new_sha1, rd.is_available, rd.filepath_local and os.path.isfile(rd.filepath_local.path), ] ) if not existing_document: # Different sha1, it wasn't available, or it's missing from disk. Move # the new file over from the processing queue storage. cf = ContentFile(content) file_name = get_document_filename( rd.docket_entry.docket.court_id, rd.docket_entry.docket.pacer_case_id, rd.document_number, rd.attachment_number, ) if not pq.debug: rd.filepath_local.save(file_name, cf, save=False) # Do page count and extraction extension = rd.filepath_local.path.split(".")[-1] rd.page_count = get_page_count(rd.filepath_local.path, extension) rd.file_size = rd.filepath_local.size rd.ocr_status = None rd.is_available = True rd.sha1 = new_sha1 rd.date_upload = now() if not pq.debug: try: rd.save() except (IntegrityError, ValidationError): msg = "Duplicate key on unique_together constraint" mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) rd.filepath_local.delete(save=False) return None if not existing_document and not pq.debug: extract_recap_pdf(rd.pk) add_items_to_solr([rd.pk], "search.RECAPDocument") mark_pq_successful( pq, d_id=rd.docket_entry.docket_id, de_id=rd.docket_entry_id, rd_id=rd.pk, ) mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True) return rd
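# Self-contained sketch of the existing-document check above: an upload is
# treated as already present only when the stored SHA-1 matches, the row is
# marked available, and the file is really on disk; failing any of the three
# triggers a re-save of the file.
import os


def is_existing_document(stored_sha1, new_sha1, is_available, local_path):
    return all([
        stored_sha1 == new_sha1,
        is_available,
        local_path and os.path.isfile(local_path),
    ])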
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, session,
                                        fallback_to_main_doc=False,
                                        tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s" %
                        rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data; else fall back on
    # the old one.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" %
                       att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so force it all
    # to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these
    # async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
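# Hedged usage sketch for get_pacer_doc_by_rd_and_description. It is a bound
# Celery task, so a direct call lets Celery supply `self`. The PK, regex, and
# tag are illustrative, and get_pacer_session() is a hypothetical helper
# standing in for however an authenticated PACER session is obtained:
import re

pacer_session = get_pacer_session()  # hypothetical session factory
cover_sheet_re = re.compile(r'cover\s+sheet', re.IGNORECASE)
get_pacer_doc_by_rd_and_description(
    1234,                        # hypothetical RECAPDocument PK
    cover_sheet_re,
    pacer_session,
    fallback_to_main_doc=True,
    tag='civil-cover-sheets',    # hypothetical tag name
)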
def process_recap_pdf(self, pk): """Save a RECAP PDF to the database.""" pq = ProcessingQueue.objects.get(pk=pk) pq.status = pq.PROCESSING_IN_PROGRESS pq.save() logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq)) try: rd = RECAPDocument.objects.get( docket_entry__docket__pacer_case_id=pq.pacer_case_id, pacer_doc_id=pq.pacer_doc_id, ) except RECAPDocument.DoesNotExist: try: d = Docket.objects.get(pacer_case_id=pq.pacer_case_id, court_id=pq.court_id) except Docket.DoesNotExist as exc: # No Docket and no RECAPDocument. Do a retry. Hopefully the docket # will be in place soon (it could be in a different upload task that # hasn't yet been processed). logger.warning("Unable to find docket for processing queue '%s'. " "Retrying if max_retries is not exceeded." % pq) pq.error_message = "Unable to find docket for item." if (self.request.retries == self.max_retries) or pq.debug: pq.status = pq.PROCESSING_FAILED pq.save() return None else: pq.status = pq.QUEUED_FOR_RETRY pq.save() raise self.retry(exc=exc) except Docket.MultipleObjectsReturned: msg = "Too many dockets found when trying to save '%s'" % pq logger.error(msg) pq.error_message = msg pq.status = pq.PROCESSING_FAILED pq.save() return None else: # Got the Docket, attempt to get/create the DocketEntry, and then # create the RECAPDocument try: de = DocketEntry.objects.get(docket=d, entry_number=pq.document_number) except DocketEntry.DoesNotExist as exc: logger.warning("Unable to find docket entry for processing " "queue '%s'. Retrying if max_retries is not " "exceeded." % pq) pq.error_message = "Unable to find docket entry for item." if (self.request.retries == self.max_retries) or pq.debug: pq.status = pq.PROCESSING_FAILED pq.save() return None else: pq.status = pq.QUEUED_FOR_RETRY pq.save() raise self.retry(exc=exc) # All objects accounted for. Make some data. rd = RECAPDocument( docket_entry=de, pacer_doc_id=pq.pacer_doc_id, date_upload=timezone.now(), ) if pq.attachment_number is None: rd.document_type = RECAPDocument.PACER_DOCUMENT else: rd.document_type = RECAPDocument.ATTACHMENT rd.document_number = pq.document_number rd.attachment_number = pq.attachment_number # Do the file, finally. content = pq.filepath_local.read() new_sha1 = hashlib.sha1(content).hexdigest() existing_document = all([ rd.sha1 == new_sha1, rd.is_available, rd.filepath_local and os.path.isfile(rd.filepath_local.path) ]) if not existing_document: # Different sha1, it wasn't available, or it's missing from disk. Move # the new file over from the processing queue storage. cf = ContentFile(content) file_name = get_document_filename( rd.docket_entry.docket.court_id, rd.docket_entry.docket.pacer_case_id, rd.document_number, rd.attachment_number, ) rd.filepath_local.save(file_name, cf, save=False) rd.is_available = True rd.sha1 = new_sha1 # Do page count and extraction extension = rd.filepath_local.path.split('.')[-1] rd.page_count = get_page_count(rd.filepath_local.path, extension) rd.ocr_status = None if not pq.debug: rd.save() if not existing_document and not pq.debug: extract_recap_pdf(rd.pk) add_or_update_recap_document([rd.pk], force_commit=False) mark_pq_successful(pq, d_id=rd.docket_entry.docket_id, de_id=rd.docket_entry_id, rd_id=rd.pk) return rd
def process_recap_pdf(self, pk): """Save a RECAP PDF to the database.""" pq = ProcessingQueue.objects.get(pk=pk) pq.status = pq.PROCESSING_IN_PROGRESS pq.save() logger.info("Processing RECAP item: %s" % pq) try: rd = RECAPDocument.objects.get( docket_entry__docket__pacer_case_id=pq.pacer_case_id, pacer_doc_id=pq.pacer_doc_id, ) except RECAPDocument.DoesNotExist: try: d = Docket.objects.get(pacer_case_id=pq.pacer_case_id, court_id=pq.court_id) except Docket.DoesNotExist as exc: # No Docket and no RECAPDocument. Do a retry. Hopefully the docket # will be in place soon (it could be in a different upload task that # hasn't yet been processed). logger.warning("Unable to find docket for processing queue '%s'. " "Retrying if max_retries is not exceeded." % pq) pq.error_message = "Unable to find docket for item." if self.request.retries == self.max_retries: pq.status = pq.PROCESSING_FAILED else: pq.status = pq.QUEUED_FOR_RETRY pq.save() raise self.retry(exc=exc) except Docket.MultipleObjectsReturned: msg = "Too many dockets found when trying to save '%s'" % pq logger.error(msg) pq.error_message = msg pq.status = pq.PROCESSING_FAILED pq.save() return None else: # Got the Docket, attempt to get/create the DocketEntry, and then # create the RECAPDocument try: de = DocketEntry.objects.get(docket=d, entry_number=pq.document_number) except DocketEntry.DoesNotExist as exc: logger.warning("Unable to find docket entry for processing " "queue '%s'. Retrying if max_retries is not " "exceeded." % pq) pq.error_message = "Unable to find docket entry for item." if self.request.retries == self.max_retries: pq.status = pq.PROCESSING_FAILED else: pq.status = pq.QUEUED_FOR_RETRY pq.save() raise self.retry(exc=exc) # All objects accounted for. Make some data. rd = RECAPDocument( docket_entry=de, pacer_doc_id=pq.pacer_doc_id, date_upload=timezone.now(), ) if pq.attachment_number is None: rd.document_type = RECAPDocument.PACER_DOCUMENT else: rd.document_type = RECAPDocument.ATTACHMENT rd.document_number = pq.document_number rd.attachment_number = pq.attachment_number # Do the file, finally. content = pq.filepath_local.read() new_sha1 = hashlib.sha1(content).hexdigest() if all([rd.sha1 == new_sha1, rd.is_available, rd.filepath_local and os.path.isfile(rd.filepath_local.path)]): # All good. Press on. new_document = False else: # Different sha1, it wasn't available, or it's missing from disk. Move # the new file over from the processing queue storage. new_document = True cf = ContentFile(content) file_name = get_document_filename( rd.docket_entry.docket.court_id, rd.docket_entry.docket.pacer_case_id, rd.document_number, rd.attachment_number, ) rd.filepath_local.save(file_name, cf, save=False) rd.is_available = True rd.sha1 = new_sha1 # Do page count and extraction extension = rd.filepath_local.path.split('.')[-1] rd.page_count = get_page_count(rd.filepath_local.path, extension) rd.ocr_status = None # Ditch the original file pq.filepath_local.delete(save=False) pq.error_message = '' # Clear out errors b/c successful pq.status = pq.PROCESSING_SUCCESSFUL pq.save() rd.save() if new_document: extract_recap_pdf(rd.pk) add_or_update_recap_document([rd.pk], force_commit=False) return rd