def test_parsing_results(self):
    """Can we do a simple query and parse?"""
    paths = []
    path_root = os.path.join(TESTS_ROOT, "examples", "pacer", "attachment_pages")
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = AttachmentPage(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        if not os.path.exists(json_path):
            with open(json_path, 'w') as f:
                print("Creating new file at %s" % json_path)
                json.dump(data, f, indent=2, sort_keys=True)
            continue

        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, data)
        sys.stdout.write("✓\n")
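The test above pairs each example HTML file with a JSON file holding the expected parser output, writing the JSON on the first run. A sketch of the fixture convention it implies (file names here are hypothetical):

# examples/pacer/attachment_pages/cand_123.html  -> parsed with AttachmentPage("cand")
# examples/pacer/attachment_pages/cand_123.json  -> expected report.data, created on first run
# The court ID is the portion of the file name before the first underscore.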
def test_parsing_results(self):
    """Can we do a simple query and parse?"""
    paths = []
    path_root = os.path.join(TESTS_ROOT, "examples", "pacer", "attachment_pages")
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = AttachmentPage(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, data)
        sys.stdout.write("✓\n")
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if pq.pacer_case_id in ['undefined', 'null']:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get('pacer_case_id')
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            'pacer_doc_id': att_data['pacer_doc_id'],
            'docket_entry__docket__court': pq.court,
        }
        if pq.pacer_case_id:
            params['docket_entry__docket__pacer_case_id'] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = "Too many documents found when attempting to associate " \
              "attachment data"
        mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data['document_number'] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de,
                                    upload_type=ATTACHMENT_PAGE)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([attachment['attachment_number'],
                    # Missing on sealed items.
                    attachment.get('pacer_doc_id', False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment['page_count'] is not None,
                    attachment['description']]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    rd.save()

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(rds_created, pq.court_id,
                             main_rd.docket_entry.docket.date_filed)
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data with
        # the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in case
                # we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
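A minimal usage sketch, not taken from the source: the `self` parameter, `self.request.retries`, and `self.retry(...)` above imply a bound Celery task, so callers would normally enqueue it with the primary key of an existing ProcessingQueue row. The import path, queue item id, and tag name below are assumptions.

# Hedged sketch: assumes process_recap_attachment is registered as a bound Celery task.
from cl.recap.tasks import process_recap_attachment  # hypothetical import path

pq = ProcessingQueue.objects.get(pk=some_pq_id)  # an already-uploaded attachment page
process_recap_attachment.delay(pq.pk, tag_names=["my-collection"])  # hypothetical tag name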
def get_data_from_att_report(text: str, court_id: str) -> Dict[str, str]:
    """Parse the HTML of a PACER attachment page and return the extracted data."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court_id))
    att_page._parse_text(text)
    att_data = att_page.data
    return att_data
def get_data_from_att_report(text, court_id):
    """Parse the HTML of a PACER attachment page and return the extracted data."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court_id))
    att_page._parse_text(text)
    att_data = att_page.data
    return att_data
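A minimal usage sketch for the helper above (the file name and court ID are hypothetical): read the raw HTML of an attachment page and inspect the parsed payload, whose keys match those consumed by process_recap_attachment.

with open("attachment_page.html") as f:  # hypothetical local copy of an attachment page
    text = f.read()
att_data = get_data_from_att_report(text, "cand")  # "cand" is a hypothetical court ID
print(att_data["pacer_doc_id"], att_data["document_number"])
for attachment in att_data["attachments"]:
    print(attachment["attachment_number"], attachment["description"])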
def process_recap_attachment(self, pk):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work on
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    # Merge the contents of the data into CL.
    try:
        rd = RECAPDocument.objects.get(
            pacer_doc_id=att_data['pacer_doc_id'],
            docket_entry__docket__court=pq.court,
        )
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None
        else:
            mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = rd.docket_entry
    if att_data['document_number'] is None:
        # Bankruptcy attachment page. Use the document number from the Main doc
        att_data['document_number'] = rd.document_number

    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(content_object=de)
        pacer_file.filepath.save(
            'attachment_page.html',  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        for attachment in att_data['attachments']:
            if all([
                attachment['attachment_number'],
                # Missing on sealed items.
                attachment.get('pacer_doc_id', False),
                # Missing on some restricted docs (see Juriscraper)
                attachment['page_count'] is not None,
                attachment['description']
            ]):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data['document_number'],
                    attachment_number=attachment['attachment_number'],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                needs_save = False
                for field in ['description', 'pacer_doc_id']:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True
                if needs_save:
                    try:
                        rd.save()
                    except IntegrityError:
                        # Happens when we hit courtlistener/issues#765, in which
                        # we violate the unique constraint on pacer_doc_id.
                        continue

                # Do *not* do this async — that can cause race conditions.
                add_or_update_recap_document([rd.pk], force_commit=False)

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)