def upload_pdf_to_ia(self, rd_pk):
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        rd.attachment_number or 0,
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    responses = upload_to_ia(
        self,
        identifier=bucket_name,
        files=rd.filepath_local.path,
        title=best_case_name(d),
        collection=settings.IA_COLLECTIONS,
        court_id=d.court_id,
        source_url='https://www.courtlistener.com%s' % rd.get_absolute_url(),
        media_type='texts',
        description="This item represents a case in PACER, the U.S. "
                    "Government's website for federal case data. If you wish "
                    "to see the entire case, please consult PACER directly.",
    )
    if responses is None:
        increment_failure_count(rd)
        return

    if all(r.ok for r in responses):
        rd.ia_upload_failure_count = None
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save()
    else:
        increment_failure_count(rd)
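# increment_failure_count() is referenced above but not defined in this
# excerpt. A minimal sketch of what such a helper might look like, assuming
# ia_upload_failure_count (used above) is a nullable integer field on
# RECAPDocument; this is an illustration, not the project's actual helper.
def increment_failure_count(rd):
    """Bump the IA upload failure counter on a RECAPDocument."""
    if rd.ia_upload_failure_count is None:
        rd.ia_upload_failure_count = 1
    else:
        rd.ia_upload_failure_count += 1
    rd.save()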
def upload_free_opinion_to_ia(self, rd_pk):
    rd = RECAPDocument.objects.get(pk=rd_pk)
    d = rd.docket_entry.docket
    file_name = get_document_filename(
        d.court_id,
        d.pacer_case_id,
        rd.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    bucket_name = get_bucket_name(d.court_id, d.pacer_case_id)
    try:
        responses = upload_to_ia(
            identifier=bucket_name,
            files=rd.filepath_local.path,
            metadata={
                'title': best_case_name(d),
                'collection': settings.IA_COLLECTIONS,
                'contributor': '<a href="https://free.law">Free Law Project</a>',
                'court': d.court_id,
                'language': 'eng',
                'mediatype': 'texts',
                'description': "This item represents a case in PACER, "
                               "the U.S. Government's website for "
                               "federal case data. If you wish to see "
                               "the entire case, please consult PACER "
                               "directly.",
                'licenseurl': 'https://www.usa.gov/government-works',
            },
        )
    except (OverloadedException, ExpatError) as exc:
        # Overloaded: IA wants us to slow down.
        # ExpatError: The syntax of the XML file that's supposed to be
        # returned by IA is bad (or something).
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_403_FORBIDDEN,    # Can't access bucket, typically.
            HTTP_400_BAD_REQUEST,  # Corrupt PDF, typically.
        ]:
            return [exc.response]
        if self.request.retries == self.max_retries:
            # This exception is also raised when the endpoint is overloaded,
            # but doesn't get caught in the OverloadedException handler due
            # to multiple processes running at the same time. Just give up
            # for now.
            return
        raise self.retry(exc=exc)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning("Timeout or unknown RequestException. Unable to "
                       "upload to IA. Trying again if retries not "
                       "exceeded: %s" % rd)
        if self.request.retries == self.max_retries:
            # Give up for now. It'll get done next time cron is run.
            return
        raise self.retry(exc=exc)

    if all(r.ok for r in responses):
        rd.filepath_ia = "https://archive.org/download/%s/%s" % (
            bucket_name, file_name)
        rd.save(do_extraction=False, index=False)
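# upload_to_ia() is not shown in this excerpt. A rough sketch of how it might
# wrap the internetarchive library for the metadata-dict call style used
# above. The settings names IA_ACCESS_KEY and IA_SECRET_KEY are assumptions,
# and the real wrapper likely does more (timeouts, derive options, retries).
import internetarchive as ia


def upload_to_ia(identifier, files, metadata=None):
    """Upload files to an Internet Archive item; return the HTTP responses."""
    return ia.upload(
        identifier,
        files,
        metadata=metadata or {},
        access_key=settings.IA_ACCESS_KEY,
        secret_key=settings.IA_SECRET_KEY,
    )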
def get_and_process_pdf(self, data, session, row_pk, index=False):
    if data is None:
        return
    result = data['result']
    rd = RECAPDocument.objects.get(pk=data['rd_pk'])
    report = FreeOpinionReport(data['pacer_court_id'], session)
    try:
        r = report.download_pdf(result.pacer_case_id, result.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % result)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            PACERFreeDocumentRow.objects.filter(pk=row_pk).update(
                error_msg=msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at %s with doc id %s" % \
              (result, result.court_id, result.pacer_doc_id)
        logger.error(msg)
        PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        result.court.pk,
        result.pacer_case_id,
        result.document_number,
        0,  # Attachment number is zero for all free opinions.
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.is_free_on_pacer = True
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save(do_extraction=False, index=index)
    extract_recap_pdf(rd.pk, skip_ocr=True, check_if_needed=False)
    return {'result': result, 'rd_pk': rd.pk}
def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
     ...
    ]

    Note that all values are strings. The idea is to iterate over all of
    these dicts, grabbing the docket, and adding any items that have
    is_available = 1.
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)
    tasks = []
    for prev, item, nxt in previous_and_next(items):
        if prev is None or item['case_number'] != prev['case_number']:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(item['court_id'], item['case_number'])
            logger.info("New docket found at: %s" % url)
            filename = get_docket_filename(item['court_id'],
                                           item['case_number'])
            tasks.append(download_recap_item.si(url, filename, clobber=True))

        # Get the document
        filename = get_document_filename(item['court_id'],
                                         item['case_number'],
                                         item['document_number'],
                                         item['attachment_number'])
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(item['court_id'], item['case_number'], filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or item['case_number'] != nxt['case_number']:
            # Last item in the case. Send for processing.
            if len(tasks) > 0:
                logger.info("Sending %s tasks for processing." % len(tasks))
                filename = get_docket_filename(item['court_id'],
                                               item['case_number'])
                chord(tasks)(chain(
                    parse_recap_docket.si(filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
                tasks = []
    logger.info("Finished queueing new cases.")
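# previous_and_next() is used above to peek at the neighboring rows so case
# boundaries can be detected. A minimal sketch of such a helper, based on the
# standard itertools windowing recipe; the project's own utility may differ.
# itertools.chain is aliased to avoid clashing with celery's chain used above.
from itertools import islice, tee, chain as ichain


def previous_and_next(iterable):
    """Yield (previous, current, next) triples, padded with None at the ends."""
    prevs, items, nexts = tee(iterable, 3)
    prevs = ichain([None], prevs)
    nexts = ichain(islice(nexts, 1, None), [None])
    return zip(prevs, items, nexts)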
def update_rd_metadata(self, rd_pk, response, court_id, pacer_case_id,
                       pacer_doc_id, document_number, attachment_number):
    """After querying PACER and downloading a document, save it to the DB.

    :param rd_pk: The primary key of the RECAPDocument to work on
    :param response: A requests.Response object containing the PDF data.
    :param court_id: A CourtListener court ID to use for file names.
    :param pacer_case_id: The pacer_case_id to use in error logs.
    :param pacer_doc_id: The pacer_doc_id to use in error logs.
    :param document_number: The docket entry number for use in file names.
    :param attachment_number: The attachment number (if applicable) for use
    in file names.
    :return: A two-tuple of a boolean indicating success and a corresponding
    error/success message string.
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if response is None:
        msg = "Unable to get PDF for RECAP Document '%s' " \
              "at '%s' with doc id '%s'" % (rd_pk, court_id, pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return False, msg

    file_name = get_document_filename(court_id, pacer_case_id,
                                      document_number, attachment_number)
    cf = ContentFile(response.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.file_size = rd.filepath_local.size
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so
    # force it all to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(response.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save and extract, skipping OCR.
    rd.save()

    # Make sure we mark the docket as needing upload
    changed = mark_ia_upload_needed(rd.docket_entry.docket)
    if changed:
        rd.docket_entry.docket.save()

    return True, 'Saved item successfully'
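# Hypothetical caller for update_rd_metadata(): `rd`, `d`, and `response` are
# placeholders standing in for a RECAPDocument, its docket, and the
# requests.Response from a PACER download. Bound Celery tasks can be invoked
# directly like this, or queued with .si(...) inside a chain.
success, msg = update_rd_metadata(
    rd.pk,
    response,
    d.court_id,
    d.pacer_case_id,
    rd.pacer_doc_id,
    rd.document_number,
    rd.attachment_number,
)
if not success:
    logger.error(msg)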
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_line is provided.
            completed += 1
            continue

        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items or on the last item, send the subtasks to Celery.
        last_item = (line_count == completed + 1)
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print(msg)
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []
        completed += 1
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_line is provided.
            completed += 1
            continue

        last_item = (line_count == completed + 1)
        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items, or on the last item, send the subtasks to Celery.
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print(msg)
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []
        completed += 1
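# TaskSet has been removed in newer Celery releases; the same batching could
# be expressed with celery.group. A sketch under that assumption, not part of
# the original scripts above:
from celery import group


def send_batch(subtasks):
    """Queue a batch of task signatures and block until they all finish."""
    return group(subtasks).apply_async().join()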
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd
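# Unlike the older versions below, which call hashlib.sha1(...).hexdigest()
# inline, the function above relies on a small sha1() helper. A minimal
# sketch of what it presumably does, assuming `content` is already a
# bytestring; illustration only, not the project's actual utility.
import hashlib


def sha1(content):
    """Return the hex SHA-1 digest of a bytestring."""
    return hashlib.sha1(content).hexdigest()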
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, session,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s" %
                        rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warn("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fallback on
    # old.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [HTTP_500_INTERNAL_SERVER_ERROR,
                                        HTTP_504_GATEWAY_TIMEOUT]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, force it all to
    # be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these
    # async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()

    logger.info("Processing RECAP item (debug is: %s): %s " % (pq.debug, pq))
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                pq.status = pq.PROCESSING_FAILED
                pq.save()
                return None
            else:
                pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if (self.request.retries == self.max_retries) or pq.debug:
                    pq.status = pq.PROCESSING_FAILED
                    pq.save()
                    return None
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                    pq.save()
                    raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT
            rd.document_number = pq.document_number
            rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    existing_document = all([
        rd.sha1 == new_sha1,
        rd.is_available,
        rd.filepath_local and os.path.isfile(rd.filepath_local.path)
    ])
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    if not pq.debug:
        rd.save()

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=rd.docket_entry.docket_id,
                       de_id=rd.docket_entry_id, rd_id=rd.pk)
    return rd
def process_recap_pdf(self, pk):
    """Save a RECAP PDF to the database."""
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()

    logger.info("Processing RECAP item: %s" % pq)
    try:
        rd = RECAPDocument.objects.get(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id,
            pacer_doc_id=pq.pacer_doc_id,
        )
    except RECAPDocument.DoesNotExist:
        try:
            d = Docket.objects.get(pacer_case_id=pq.pacer_case_id,
                                   court_id=pq.court_id)
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully the
            # docket will be in place soon (it could be in a different
            # upload task that hasn't yet been processed).
            logger.warning("Unable to find docket for processing queue '%s'. "
                           "Retrying if max_retries is not exceeded." % pq)
            pq.error_message = "Unable to find docket for item."
            if self.request.retries == self.max_retries:
                pq.status = pq.PROCESSING_FAILED
            else:
                pq.status = pq.QUEUED_FOR_RETRY
            pq.save()
            raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            logger.error(msg)
            pq.error_message = msg
            pq.status = pq.PROCESSING_FAILED
            pq.save()
            return None
        else:
            # Got the Docket, attempt to get/create the DocketEntry, and
            # then create the RECAPDocument
            try:
                de = DocketEntry.objects.get(docket=d,
                                             entry_number=pq.document_number)
            except DocketEntry.DoesNotExist as exc:
                logger.warning("Unable to find docket entry for processing "
                               "queue '%s'. Retrying if max_retries is not "
                               "exceeded." % pq)
                pq.error_message = "Unable to find docket entry for item."
                if self.request.retries == self.max_retries:
                    pq.status = pq.PROCESSING_FAILED
                else:
                    pq.status = pq.QUEUED_FOR_RETRY
                pq.save()
                raise self.retry(exc=exc)

            # All objects accounted for. Make some data.
            rd = RECAPDocument(
                docket_entry=de,
                pacer_doc_id=pq.pacer_doc_id,
                date_upload=timezone.now(),
            )
            if pq.attachment_number is None:
                rd.document_type = RECAPDocument.PACER_DOCUMENT
            else:
                rd.document_type = RECAPDocument.ATTACHMENT
            rd.document_number = pq.document_number
            rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    content = pq.filepath_local.read()
    new_sha1 = hashlib.sha1(content).hexdigest()
    if all([rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path)]):
        # All good. Press on.
        new_document = False
    else:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        new_document = True
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        rd.filepath_local.save(file_name, cf, save=False)
        rd.is_available = True
        rd.sha1 = new_sha1

        # Do page count and extraction
        extension = rd.filepath_local.path.split('.')[-1]
        rd.page_count = get_page_count(rd.filepath_local.path, extension)
        rd.ocr_status = None

    # Ditch the original file
    pq.filepath_local.delete(save=False)
    pq.error_message = ''  # Clear out errors b/c successful
    pq.status = pq.PROCESSING_SUCCESSFUL
    pq.save()

    rd.save()
    if new_document:
        extract_recap_pdf(rd.pk)
        add_or_update_recap_document([rd.pk], force_commit=False)

    return rd