def get_attachment_page_by_rd(self, rd_pk, cookies):
    """Get the attachment page for the item in PACER.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-on PACER user.
    :return: The attachment report populated with the results
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    s = PacerSession(cookies=cookies)
    pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id)
    att_report = AttachmentPage(pacer_court_id, s)
    try:
        att_report.query(rd.pacer_doc_id)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT,
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying.",
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting."
            logger.error(msg, exc.response.status_code)
            self.request.callbacks = None
            return
    except requests.RequestException as exc:
        logger.warning("Unable to get attachment page for %s", rd)
        raise self.retry(exc=exc)
    return att_report
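# Usage note: the references to self.request.callbacks and self.retry imply
# this runs as a bound Celery task. Below is a minimal invocation sketch, not
# the project's actual wiring; the import path, credentials, and rd_pk are
# assumptions for illustration.
from juriscraper.pacer import PacerSession

from cl.corpus_importer.tasks import get_attachment_page_by_rd  # assumed path

session = PacerSession(username="someuser", password="somepass")  # placeholders
session.login()

# Queue the lookup; the task returns the populated AttachmentPage report, or
# None for text-only docket entries that lack a pacer_doc_id.
result = get_attachment_page_by_rd.delay(rd_pk=42, cookies=session.cookies)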
def test_parsing_results(self):
    """Can we do a simple query and parse?"""
    paths = []
    path_root = os.path.join(TESTS_ROOT, "examples", "pacer",
                             "attachment_pages")
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = AttachmentPage(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        if not os.path.exists(json_path):
            with open(json_path, 'w') as f:
                print("Creating new file at %s" % json_path)
                json.dump(data, f, indent=2, sort_keys=True)
            continue

        with open(json_path) as f:
            j = json.load(f)
            self.assertEqual(j, data)
        sys.stdout.write("✓\n")
def test_parsing_results(self):
    """Can we do a simple query and parse?"""
    paths = []
    path_root = os.path.join(TESTS_ROOT, "examples", "pacer",
                             "attachment_pages")
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = AttachmentPage(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        with open(json_path) as f:
            j = json.load(f)
            self.assertEqual(j, data)
        sys.stdout.write("✓\n")
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if pq.pacer_case_id in ['undefined', 'null']:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get('pacer_case_id')
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            'pacer_doc_id': att_data['pacer_doc_id'],
            'docket_entry__docket__court': pq.court,
        }
        if pq.pacer_case_id:
            params['docket_entry__docket__pacer_case_id'] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = "Too many documents found when attempting to associate " \
              "attachment data"
        mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the main
        # doc.
        att_data['document_number'] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de,
                                    upload_type=ATTACHMENT_PAGE)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper).
                    attachment['page_count'] is not None,
                    attachment['description']]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    rd.save()

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(rds_created, pq.court_id,
                             main_rd.docket_entry.docket.date_filed)
def get_pacer_doc_by_rd_and_description(self, rd_pk, description_re, session,
                                        fallback_to_main_doc=False, tag=None):
    """Using a RECAPDocument object ID and a description of a document, get
    the document from PACER.

    This function was originally meant to get civil cover sheets, but can be
    repurposed as needed.

    :param rd_pk: The PK of a RECAPDocument object to use as a source.
    :param description_re: A compiled regular expression to search against
    the description provided by the attachment page.
    :param session: The PACER session object to use.
    :param fallback_to_main_doc: Should we grab the main doc if none of the
    attachments match the regex?
    :param tag: A tag name to apply to any downloaded content.
    :return: None
    """
    rd = RECAPDocument.objects.get(pk=rd_pk)
    if not rd.pacer_doc_id:
        # Some docket entries are just text/don't have a pacer_doc_id.
        self.request.callbacks = None
        return

    d = rd.docket_entry.docket
    pacer_court_id = map_cl_to_pacer_id(d.court_id)
    att_report = AttachmentPage(pacer_court_id, session)
    try:
        att_report.query(rd.pacer_doc_id)
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % rd)
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT,
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    att_found = None
    for attachment in att_report.data.get('attachments', []):
        if description_re.search(attachment['description']):
            att_found = attachment.copy()
            document_type = RECAPDocument.ATTACHMENT
            break

    if not att_found:
        if fallback_to_main_doc:
            logger.info("Falling back to main document for pacer_doc_id: %s" %
                        rd.pacer_doc_id)
            att_found = att_report.data
            document_type = RECAPDocument.PACER_DOCUMENT
        else:
            msg = "Aborting. Did not find civil cover sheet for %s." % rd
            logger.error(msg)
            self.request.callbacks = None
            return

    if not att_found.get('pacer_doc_id'):
        logger.warning("No pacer_doc_id for document (is it sealed?)")
        self.request.callbacks = None
        return

    # Try to find the attachment already in the collection.
    rd, _ = RECAPDocument.objects.get_or_create(
        docket_entry=rd.docket_entry,
        attachment_number=att_found.get('attachment_number'),
        document_number=rd.document_number,
        pacer_doc_id=att_found['pacer_doc_id'],
        document_type=document_type,
        defaults={
            'date_upload': now(),
        },
    )
    # Replace the description if we have description data. Else fall back on
    # the old description.
    rd.description = att_found.get('description', '') or rd.description
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        rd.tags.add(tag)

    if rd.is_available:
        # Great. Call it a day.
        rd.save(do_extraction=False, index=False)
        return

    # Not available. Go get it.
    try:
        pacer_case_id = rd.docket_entry.docket.pacer_case_id
        r = att_report.download_pdf(pacer_case_id, att_found['pacer_doc_id'])
    except (ConnectTimeout, ConnectionError, ReadTimeout, ReadTimeoutError,
            ChunkedEncodingError) as exc:
        logger.warning("Unable to get PDF for %s" % att_found['pacer_doc_id'])
        raise self.retry(exc=exc)
    except HTTPError as exc:
        if exc.response.status_code in [
            HTTP_500_INTERNAL_SERVER_ERROR,
            HTTP_504_GATEWAY_TIMEOUT,
        ]:
            logger.warning("Ran into HTTPError: %s. Retrying." %
                           exc.response.status_code)
            raise self.retry(exc=exc)
        else:
            msg = "Ran into unknown HTTPError. %s. Aborting." % \
                  exc.response.status_code
            logger.error(msg)
            self.request.callbacks = None
            return

    if r is None:
        msg = "Unable to get PDF for %s at PACER court '%s' with doc id %s" % \
              (rd, pacer_court_id, rd.pacer_doc_id)
        logger.error(msg)
        self.request.callbacks = None
        return

    file_name = get_document_filename(
        d.court_id,
        pacer_case_id,
        rd.document_number,
        rd.attachment_number,
    )
    cf = ContentFile(r.content)
    rd.filepath_local.save(file_name, cf, save=False)
    rd.is_available = True  # We've got the PDF.

    # request.content is sometimes a str, sometimes unicode, so force it all
    # to be bytes, pleasing hashlib.
    rd.sha1 = hashlib.sha1(force_bytes(r.content)).hexdigest()
    rd.page_count = get_page_count(rd.filepath_local.path, 'pdf')

    # Save, extract, then save to Solr. Skip OCR for now. Don't do these
    # async.
    rd.save(do_extraction=False, index=False)
    extract_recap_pdf(rd.pk, skip_ocr=True)
    add_or_update_recap_document([rd.pk])
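# Usage note: a minimal sketch of calling the task above for its original
# civil cover sheet use case. Assumes the task is registered with Celery and
# that `session` is a logged-in PacerSession; the tag name is a placeholder.
# Also note that passing a compiled regex through Celery presumes a task
# serializer (e.g. pickle) that can handle it.
import re

description_re = re.compile(r"civil cover sheet", re.IGNORECASE)
get_pacer_doc_by_rd_and_description.delay(
    rd.pk,
    description_re,
    session,
    fallback_to_main_doc=True,
    tag="civil-cover-sheets",  # placeholder tag name
)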
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the main
        # doc.
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper).
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in
                # case we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
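# Usage note: a hypothetical sketch of queueing the task above after an
# upload. Assumes process_recap_attachment is a registered bound Celery task
# and that pq is a saved ProcessingQueue row for an attachment page; the tag
# name is a placeholder.
process_recap_attachment.delay(pq.pk, tag_names=["my-import-tag"])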
def get_data_from_att_report(text: str, court_id: str) -> Dict[str, str]:
    """Parse the HTML of a PACER attachment page and return its data."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court_id))
    att_page._parse_text(text)
    att_data = att_page.data
    return att_data
def get_data_from_att_report(text, court_id):
    att_page = AttachmentPage(map_cl_to_pacer_id(court_id))
    att_page._parse_text(text)
    att_data = att_page.data
    return att_data
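# Usage note: a minimal sketch for get_data_from_att_report; the file path
# and court ID are placeholders. AttachmentPage.data exposes keys such as
# 'pacer_doc_id', 'document_number', and 'attachments', which the tasks
# above rely on.
with open("examples/pacer/attachment_pages/cand.html") as f:
    att_data = get_data_from_att_report(f.read(), "cand")
print(att_data["document_number"], len(att_data["attachments"]))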
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    # Merge the contents of the data into CL.
    try:
        rd = RECAPDocument.objects.get(
            pacer_doc_id=att_data['pacer_doc_id'],
            docket_entry__docket__court=pq.court,
        )
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the main
        # doc.
        att_data['document_number'] = rd.document_number

    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([
                attachment['attachment_number'],
                # Missing on sealed items.
                attachment.get('pacer_doc_id', False),
                # Missing on some restricted docs (see Juriscraper).
                attachment['page_count'] is not None,
                attachment['description'],
            ]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    try:
                        rd.save()
                    except IntegrityError:
                        # Happens when we hit courtlistener/issues#765, in
                        # which we violate the unique constraint on
                        # pacer_doc_id.
                        continue

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)