def merge_pacer_docket_into_cl_docket(
    d,
    pacer_case_id,
    docket_data,
    report,
    appellate=False,
    tag_names=None,
):
    # Ensure that we set the case ID. This is needed on dockets that have
    # matching docket numbers, but that never got PACER data before. This was
    # previously rare, but since we added the FJC data to the dockets table,
    # this is now quite common.
    if not d.pacer_case_id:
        d.pacer_case_id = pacer_case_id

    d.add_recap_source()
    update_docket_metadata(d, docket_data)
    d.save()

    if appellate:
        d, og_info = update_docket_appellate_metadata(d, docket_data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info

    tags = add_tags_to_objs(tag_names, [d])

    # Add the HTML to the docket in case we need it someday.
    upload_type = (
        UPLOAD_TYPE.APPELLATE_DOCKET if appellate else UPLOAD_TYPE.DOCKET
    )
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=upload_type)
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data["docket_entries"], tags=tags
    )
    add_parties_and_attorneys(d, docket_data["parties"])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return rds_created, content_updated
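# A minimal, hypothetical usage sketch (not part of the original module)
# showing how merge_pacer_docket_into_cl_docket is typically driven from a
# scrape/crawl task. Only DocketReport, map_cl_to_pacer_id, find_docket_object
# (the two-value form used elsewhere in this file), and the merge function
# above come from this codebase; the helper name, its parameters, and the
# session handling are assumptions for illustration.
def _example_merge_docket(session, court_id, pacer_case_id, tag_names=None):
    """Query a district-court docket and merge it into CL (illustrative)."""
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    report.query(pacer_case_id)
    docket_data = report.data

    d, count = find_docket_object(court_id, pacer_case_id,
                                  docket_data["docket_number"])
    if count > 1:
        # Mirror the convention used by the tasks in this module.
        d = d.earliest("date_created")

    return merge_pacer_docket_into_cl_docket(
        d, pacer_case_id, docket_data, report,
        appellate=False, tag_names=tag_names,
    )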
def process_recap_docket_history_report(self, pk): """Process the docket history report. :param pk: The primary key of the processing queue item you want to work on :returns: A dict indicating whether the docket needs Solr re-indexing. """ pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS) logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id)) with open(pq.filepath_local.path) as f: text = f.read().decode('utf-8') report._parse_text(text) data = report.data logger.info("Parsing completed for item %s" % pq) # Merge the contents of the docket into CL. d, count = find_docket_object(pq.court_id, pq.pacer_case_id, data['docket_number']) if count > 1: logger.info("Found %s dockets during lookup. Choosing oldest." % count) d = d.earliest('date_created') add_recap_source(d) update_docket_metadata(d, data) if pq.debug: mark_pq_successful(pq, d_id=d.pk) self.request.callbacks = None return {'docket_pk': d.pk, 'needs_solr_update': False} try: d.save() except IntegrityError as exc: logger.warning("Race condition experienced while attempting docket " "save.") error_message = "Unable to save docket due to IntegrityError." if self.request.retries == self.max_retries: mark_pq_status(pq, error_message, pq.PROCESSING_FAILED) self.request.callbacks = None return None else: mark_pq_status(pq, error_message, pq.QUEUED_FOR_RETRY) raise self.retry(exc=exc) # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET_HISTORY_REPORT) pacer_file.filepath.save( 'docket_history.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(text), ) rds_created, needs_solr_update = add_docket_entries(d, data['docket_entries']) process_orphan_documents(rds_created, pq.court_id, d.date_filed) mark_pq_successful(pq, d_id=d.pk) return { 'docket_pk': d.pk, 'needs_solr_update': bool(rds_created or needs_solr_update), }
def process_recap_attachment(self, pk): """Process an uploaded attachment page from the RECAP API endpoint. :param pk: The primary key of the processing queue item you want to work on :return: None """ pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS) logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id)) with open(pq.filepath_local.path) as f: text = f.read().decode('utf-8') att_page._parse_text(text) att_data = att_page.data logger.info("Parsing completed for item %s" % pq) if pq.pacer_case_id in ['undefined', 'null']: # Bad data from the client. Fix it with parsed data. pq.pacer_case_id = att_data.get('pacer_case_id') pq.save() # Merge the contents of the data into CL. try: params = { 'pacer_doc_id': att_data['pacer_doc_id'], 'docket_entry__docket__court': pq.court, } if pq.pacer_case_id: params['docket_entry__docket__pacer_case_id'] = pq.pacer_case_id main_rd = RECAPDocument.objects.get(**params) except RECAPDocument.MultipleObjectsReturned: # Unclear how to proceed and we don't want to associate this data with # the wrong case. We must punt. msg = "Too many documents found when attempting to associate " \ "attachment data" mark_pq_status(pq, msg, pq.PROCESSING_FAILED) return None except RECAPDocument.DoesNotExist as exc: msg = "Could not find docket to associate with attachment metadata" if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, pq.PROCESSING_FAILED) return None else: mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY) raise self.retry(exc=exc) # We got the right item. Update/create all the attachments for # the docket entry. de = main_rd.docket_entry if att_data['document_number'] is None: # Bankruptcy attachment page. Use the document number from the Main doc att_data['document_number'] = main_rd.document_number rds_created = [] if not pq.debug: # Save the old HTML to the docket entry. pacer_file = PacerHtmlFiles(content_object=de, upload_type=ATTACHMENT_PAGE) pacer_file.filepath.save( 'attachment_page.html', # Irrelevant b/c UUIDFileSystemStorage ContentFile(text), ) # Create/update the attachment items. for attachment in att_data['attachments']: if all([attachment['attachment_number'], # Missing on sealed items. attachment.get('pacer_doc_id', False), # Missing on some restricted docs (see Juriscraper) attachment['page_count'] is not None, attachment['description']]): rd, created = RECAPDocument.objects.update_or_create( docket_entry=de, document_number=att_data['document_number'], attachment_number=attachment['attachment_number'], document_type=RECAPDocument.ATTACHMENT, ) if created: rds_created.append(rd) needs_save = False for field in ['description', 'pacer_doc_id']: if attachment[field]: setattr(rd, field, attachment[field]) needs_save = True if needs_save: rd.save() # Do *not* do this async — that can cause race conditions. add_or_update_recap_document([rd.pk], force_commit=False) mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk) process_orphan_documents(rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed)
def process_recap_docket(self, pk): """Process an uploaded docket from the RECAP API endpoint. :param pk: The primary key of the processing queue item you want to work on. :returns: A dict of the form: { // The PK of the docket that's created or updated 'docket_pk': 22, // A boolean indicating whether a new docket entry or recap document // was created (implying a Solr needs updating). 'needs_solr_update': True, } This value is a dict so that it can be ingested in a Celery chain. """ pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS) logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) report = DocketReport(map_cl_to_pacer_id(pq.court_id)) text = pq.filepath_local.read().decode('utf-8') if 'History/Documents' in text: # Prior to 1.1.8, we did not separate docket history reports into their # own upload_type. Alas, we still have some old clients around, so we # need to handle those clients here. pq.upload_type = DOCKET_HISTORY_REPORT pq.save() process_recap_docket_history_report(pk) self.request.callbacks = None return None report._parse_text(text) data = report.data logger.info("Parsing completed of item %s" % pq) if data == {}: # Not really a docket. Some sort of invalid document (see Juriscraper). msg = "Not a valid docket upload." mark_pq_status(pq, msg, pq.INVALID_CONTENT) self.request.callbacks = None return None # Merge the contents of the docket into CL. d, count = find_docket_object(pq.court_id, pq.pacer_case_id, data['docket_number']) if count > 1: logger.info("Found %s dockets during lookup. Choosing oldest." % count) d = d.earliest('date_created') add_recap_source(d) update_docket_metadata(d, data) if pq.debug: mark_pq_successful(pq, d_id=d.pk) self.request.callbacks = None return {'docket_pk': d.pk, 'needs_solr_update': False} d.save() # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET) pacer_file.filepath.save( 'docket.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(text), ) rds_created, needs_solr_update = add_docket_entries(d, data['docket_entries']) add_parties_and_attorneys(d, data['parties']) process_orphan_documents(rds_created, pq.court_id, d.date_filed) mark_pq_successful(pq, d_id=d.pk) return { 'docket_pk': d.pk, 'needs_solr_update': bool(rds_created or needs_solr_update), }
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session, tag=None, **kwargs): """Get a docket by PACER case id, CL court ID, and a collection of kwargs that can be passed to the DocketReport query. For details of acceptable parameters, see DocketReport.query() :param pacer_case_id: The internal case ID of the item in PACER. :param court_id: A courtlistener court ID. :param session: A valid PacerSession object. :param tag: The tag name that should be stored with the item in the DB. :param kwargs: A variety of keyword args to pass to DocketReport.query(). """ report = DocketReport(map_cl_to_pacer_id(court_id), session) logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id)) try: d = Docket.objects.get( pacer_case_id=pacer_case_id, court_id=court_id, ) except Docket.DoesNotExist: d = None except Docket.MultipleObjectsReturned: d = None if d is not None: first_missing_id = get_first_missing_de_number(d) if first_missing_id > 1: # We don't have to get the whole thing! kwargs.setdefault('doc_num_start', first_missing_id) report.query(pacer_case_id, **kwargs) docket_data = report.data logger.info("Querying and parsing complete for %s.%s" % (court_id, pacer_case_id)) # Merge the contents into CL. if d is None: d, count = find_docket_object(court_id, pacer_case_id, docket_data['docket_number']) if count > 1: d = d.earliest('date_created') add_recap_source(d) update_docket_metadata(d, docket_data) d.save() if tag is not None: tag, _ = Tag.objects.get_or_create(name=tag) d.tags.add(tag) # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET) pacer_file.filepath.save( 'docket.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(report.response.text), ) rds_created, needs_solr_update = add_docket_entries( d, docket_data['docket_entries'], tag=tag) add_parties_and_attorneys(d, docket_data['parties']) process_orphan_documents(rds_created, d.court_id, d.date_filed) logger.info("Created/updated docket: %s" % d) return { 'docket_pk': d.pk, 'needs_solr_update': bool(rds_created or needs_solr_update), }
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    with open(pq.filepath_local.path) as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for the
    # docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the main
        # document.
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in
                # case we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()
def process_recap_attachment(self, pk): """Process an uploaded attachment page from the RECAP API endpoint. :param pk: The primary key of the processing queue item you want to work on :return: None """ pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS) logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id)) text = pq.filepath_local.read().decode('utf-8') att_page._parse_text(text) att_data = att_page.data logger.info("Parsing completed for item %s" % pq) # Merge the contents of the data into CL. try: rd = RECAPDocument.objects.get( pacer_doc_id=att_data['pacer_doc_id'], docket_entry__docket__court=pq.court, ) except RECAPDocument.DoesNotExist as exc: msg = "Could not find docket to associate with attachment metadata" if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, pq.PROCESSING_FAILED) return None else: mark_pq_status(pq, msg, pq.QUEUED_FOR_RETRY) raise self.retry(exc=exc) # We got the right item. Update/create all the attachments for # the docket entry. de = rd.docket_entry if att_data['document_number'] is None: # Bankruptcy attachment page. Use the document number from the Main doc att_data['document_number'] = rd.document_number if not pq.debug: # Save the old HTML to the docket entry. pacer_file = PacerHtmlFiles(content_object=de) pacer_file.filepath.save( 'attachment_page.html', # Irrelevant b/c UUIDFileSystemStorage ContentFile(text), ) # Create/update the attachment items. for attachment in att_data['attachments']: if all([ attachment['attachment_number'], # Missing on sealed items. attachment.get('pacer_doc_id', False), # Missing on some restricted docs (see Juriscraper) attachment['page_count'] is not None, attachment['description'] ]): rd, created = RECAPDocument.objects.update_or_create( docket_entry=de, document_number=att_data['document_number'], attachment_number=attachment['attachment_number'], document_type=RECAPDocument.ATTACHMENT, ) needs_save = False for field in ['description', 'pacer_doc_id']: if attachment[field]: setattr(rd, field, attachment[field]) needs_save = True if needs_save: try: rd.save() except IntegrityError: # Happens when we hit courtlistener/issues#765, in which # we violate the unique constraint on pacer_doc_id. continue # Do *not* do this async — that can cause race conditions. add_or_update_recap_document([rd.pk], force_commit=False) mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
def process_recap_docket_history_report(self, pk): """Process the docket history report. :param pk: The primary key of the processing queue item you want to work on :returns: A dict indicating whether the docket needs Solr re-indexing. """ start_time = now() pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) try: text = pq.filepath_local.read().decode("utf-8") except IOError as exc: msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror) if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None else: mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id)) report._parse_text(text) data = report.data logger.info("Parsing completed for item %s" % pq) if data == {}: # Bad docket history page. msg = "Not a valid docket history page upload." mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT) self.request.chain = None return None # Merge the contents of the docket into CL. d, docket_count = find_docket_object( pq.court_id, pq.pacer_case_id, data["docket_number"] ) if docket_count > 1: logger.info( "Found %s dockets during lookup. Choosing oldest." % docket_count ) d = d.earliest("date_created") d.add_recap_source() update_docket_metadata(d, data) if pq.debug: mark_pq_successful(pq, d_id=d.pk) self.request.chain = None return {"docket_pk": d.pk, "content_updated": False} try: d.save() except IntegrityError as exc: logger.warning( "Race condition experienced while attempting docket save." ) error_message = "Unable to save docket due to IntegrityError." if self.request.retries == self.max_retries: mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED) self.request.chain = None return None else: mark_pq_status( pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles( content_object=d, upload_type=UPLOAD_TYPE.DOCKET_HISTORY_REPORT ) pacer_file.filepath.save( # We only care about the ext w/UUIDFileSystemStorage "docket_history.html", ContentFile(text), ) rds_created, content_updated = add_docket_entries( d, data["docket_entries"] ) process_orphan_documents(rds_created, pq.court_id, d.date_filed) if content_updated and docket_count > 0: newly_enqueued = enqueue_docket_alert(d.pk) if newly_enqueued: send_docket_alert(d.pk, start_time) mark_pq_successful(pq, d_id=d.pk) return { "docket_pk": d.pk, "content_updated": bool(rds_created or content_updated), }
def get_docket_by_pacer_case_id(self, data, court_id, cookies, tag_names=None, **kwargs): """Get a docket by PACER case id, CL court ID, and a collection of kwargs that can be passed to the DocketReport query. For details of acceptable parameters, see DocketReport.query() :param data: A dict containing: Required: 'pacer_case_id': The internal case ID of the item in PACER. Optional: 'docket_pk': The ID of the docket to work on to avoid lookups if it's known in advance. :param court_id: A courtlistener court ID. :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a logged-in PACER user. :param tag_names: A list of tag names that should be stored with the item in the DB. :param kwargs: A variety of keyword args to pass to DocketReport.query(). :return: A dict indicating if we need to update Solr. """ s = PacerSession(cookies=cookies) if data is None: logger.info("Empty data argument. Terminating " "chains and exiting.") self.request.callbacks = None return pacer_case_id = data.get('pacer_case_id') report = DocketReport(map_cl_to_pacer_id(court_id), s) logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id)) if data.get('docket_pk') is not None: d = Docket.objects.get(pk=data['docket_pk']) else: try: d = Docket.objects.get( pacer_case_id=pacer_case_id, court_id=court_id, ) except Docket.DoesNotExist: d = None except Docket.MultipleObjectsReturned: d = None if d is not None: first_missing_id = get_first_missing_de_number(d) if first_missing_id > 1: # We don't have to get the whole thing! kwargs.setdefault('doc_num_start', first_missing_id) report.query(pacer_case_id, **kwargs) docket_data = report.data logger.info("Querying and parsing complete for %s.%s" % (court_id, pacer_case_id)) if not docket_data: logger.info("No valid docket data for %s.%s", court_id, pacer_case_id) self.request.callbacks = None return # Merge the contents into CL. if d is None: d, count = find_docket_object(court_id, pacer_case_id, docket_data['docket_number']) if count > 1: d = d.earliest('date_created') add_recap_source(d) update_docket_metadata(d, docket_data) d.save() tags = [] if tag_names is not None: for tag_name in tag_names: tag, _ = Tag.objects.get_or_create(name=tag_name) tag.tag_object(d) tags.append(tag) # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=UPLOAD_TYPE.DOCKET) pacer_file.filepath.save( 'docket.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(report.response.text), ) rds_created, content_updated = add_docket_entries( d, docket_data['docket_entries'], tags=tags) add_parties_and_attorneys(d, docket_data['parties']) process_orphan_documents(rds_created, d.court_id, d.date_filed) logger.info("Created/updated docket: %s" % d) return { 'docket_pk': d.pk, 'content_updated': bool(rds_created or content_updated), }
def do_case_query_by_pacer_case_id(self, data, court_id, cookies, tag_names=None): """Run a case query (iquery.pl) query on a case and save the data :param data: A dict containing at least the following: { 'pacer_case_id': The internal pacer case ID for the item. } :param court_id: A courtlistener court ID :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a logged-in PACER user. :param tag_names: A list of tag names to associate with the docket when saving it in the DB. :return: A dict with the pacer_case_id and docket_pk values. """ s = PacerSession(cookies=cookies) if data is None: logger.info("Empty data argument. Terminating " "chains and exiting.") self.request.callbacks = None return pacer_case_id = data.get('pacer_case_id') report = CaseQuery(map_cl_to_pacer_id(court_id), s) logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id)) try: d = Docket.objects.get( pacer_case_id=pacer_case_id, court_id=court_id, ) except Docket.DoesNotExist: d = None except Docket.MultipleObjectsReturned: d = None report.query(pacer_case_id) docket_data = report.data logger.info("Querying and parsing complete for %s.%s" % (court_id, pacer_case_id)) if not docket_data: logger.info("No valid docket data for %s.%s", court_id, pacer_case_id) self.request.callbacks = None return # Merge the contents into CL. if d is None: d, count = find_docket_object(court_id, pacer_case_id, docket_data['docket_number']) if count > 1: d = d.earliest('date_created') add_recap_source(d) update_docket_metadata(d, docket_data) d.save() tags = [] if tag_names is not None: for tag_name in tag_names: tag, _ = Tag.objects.get_or_create(name=tag_name) tag.tag_object(d) tags.append(tag) # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=UPLOAD_TYPE.CASE_REPORT_PAGE) pacer_file.filepath.save( 'case_report.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(report.response.text), ) logger.info("Created/updated docket: %s" % d) return { 'pacer_case_id': pacer_case_id, 'docket_pk': d.pk, }
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of kwargs
    that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up "
                     "'%s.%s'" % (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s.%s'" %
                (docket_entry['document_number'], court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" %
                (docket_entry['document_number'], d)
            )
            continue

        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)
    return d
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap
            // document was created (implying Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id,
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do,
        # update the pacer_doc_id field if it's blank. If we can't find it,
        # create it or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def merge_attachment_page_data( court: Court, pacer_case_id: int, pacer_doc_id: int, document_number: int, text: str, attachment_dicts: List[Dict[str, Union[int, str]]], debug: bool = False, ) -> Tuple[List[RECAPDocument], DocketEntry]: """Merge attachment page data into the docket :param court: The court object we're working with :param pacer_case_id: A PACER case ID :param pacer_doc_id: A PACER document ID :param document_number: The docket entry number :param text: The text of the attachment page :param attachment_dicts: A list of Juriscraper-parsed dicts for each attachment. :param debug: Whether to do saves during this process. :return: A list of RECAPDocuments modified or created during the process, and the DocketEntry object associated with the RECAPDocuments :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist """ try: params = { "pacer_doc_id": pacer_doc_id, "docket_entry__docket__court": court, } if pacer_case_id: params["docket_entry__docket__pacer_case_id"] = pacer_case_id main_rd = RECAPDocument.objects.get(**params) except RECAPDocument.MultipleObjectsReturned as exc: # Unclear how to proceed and we don't want to associate this data with # the wrong case. We must punt. raise exc except RECAPDocument.DoesNotExist as exc: # Can't find the docket to associate with the attachment metadata # It may be possible to go look for orphaned documents at this stage # and to then add them here, as we do when adding dockets. This need is # particularly acute for those that get free look emails and then go to # the attachment page. raise exc # We got the right item. Update/create all the attachments for # the docket entry. de = main_rd.docket_entry if document_number is None: # Bankruptcy attachment page. Use the document number from the Main doc document_number = main_rd.document_number if debug: return [], de # Save the old HTML to the docket entry. pacer_file = PacerHtmlFiles( content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE ) pacer_file.filepath.save( "attachment_page.html", # Irrelevant b/c UUIDFileSystemStorage ContentFile(text), ) # Create/update the attachment items. rds_created = [] rds_affected = [] for attachment in attachment_dicts: sanity_checks = [ attachment["attachment_number"], # Missing on sealed items. attachment.get("pacer_doc_id", False), # Missing on some restricted docs (see Juriscraper) attachment["page_count"] is not None, attachment["description"], ] if not all(sanity_checks): continue rd, created = RECAPDocument.objects.update_or_create( docket_entry=de, document_number=document_number, attachment_number=attachment["attachment_number"], document_type=RECAPDocument.ATTACHMENT, ) if created: rds_created.append(rd) rds_affected.append(rd) for field in ["description", "pacer_doc_id"]: if attachment[field]: setattr(rd, field, attachment[field]) # Only set page_count and file_size if they're blank, in case # we got the real value by measuring. if rd.page_count is None: rd.page_count = attachment["page_count"] if rd.file_size is None and attachment["file_size_str"]: try: rd.file_size = convert_size_to_bytes( attachment["file_size_str"] ) except ValueError: pass rd.save() # Do *not* do this async — that can cause race conditions. add_items_to_solr([rd.pk], "search.RECAPDocument") mark_ia_upload_needed(de.docket, save_docket=True) process_orphan_documents( rds_created, court.pk, main_rd.docket_entry.docket.date_filed ) return rds_affected, de
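# A hedged usage sketch (not part of the original module) showing how
# merge_attachment_page_data might be driven by a caller that already has the
# attachment-page HTML. The AttachmentPage parse and the exception handling
# mirror process_recap_attachment above; the helper name, its parameters, and
# the assumption that court.pk is the CL court ID are illustrative only.
def _example_merge_attachment_page(court, pacer_case_id, text):
    """Parse attachment-page HTML and merge it into CL (illustrative)."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court.pk))
    att_page._parse_text(text)
    att_data = att_page.data

    try:
        rds_affected, de = merge_attachment_page_data(
            court,
            pacer_case_id,
            att_data["pacer_doc_id"],
            att_data["document_number"],
            text,
            att_data["attachments"],
            debug=False,
        )
    except (RECAPDocument.MultipleObjectsReturned,
            RECAPDocument.DoesNotExist):
        # The caller decides whether to fail, retry, or queue the upload for
        # later, as the tasks above do.
        return None, None
    return rds_affected, de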
def process_recap_docket(self, pk): """Process an uploaded docket from the RECAP API endpoint. :param pk: The primary key of the processing queue item you want to work on. :returns: A dict of the form: { // The PK of the docket that's created or updated 'docket_pk': 22, // A boolean indicating whether a new docket entry or // recap document was created (implying a Solr needs // updating). 'content_updated': True, } This value is a dict so that it can be ingested in a Celery chain. """ start_time = now() pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}") report = DocketReport(map_cl_to_pacer_id(pq.court_id)) try: text = pq.filepath_local.read().decode() except IOError as exc: msg = f"Internal processing error ({exc.errno}: {exc.strerror})." if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None else: mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) if "History/Documents" in text: # Prior to 1.1.8, we did not separate docket history reports into their # own upload_type. Alas, we still have some old clients around, so we # need to handle those clients here. pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT pq.save() process_recap_docket_history_report(pk) self.request.chain = None return None report._parse_text(text) data = report.data logger.info(f"Parsing completed of item {pq}") if data == {}: # Not really a docket. Some sort of invalid document (see Juriscraper). msg = "Not a valid docket upload." mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT) self.request.chain = None return None # Merge the contents of the docket into CL. d = find_docket_object(pq.court_id, pq.pacer_case_id, data["docket_number"]) d.add_recap_source() update_docket_metadata(d, data) if not d.pacer_case_id: d.pacer_case_id = pq.pacer_case_id if pq.debug: mark_pq_successful(pq, d_id=d.pk) self.request.chain = None return {"docket_pk": d.pk, "content_updated": False} d.save() # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=UPLOAD_TYPE.DOCKET) pacer_file.filepath.save( "docket.html", # We only care about the ext w/UUIDFileSystemStorage ContentFile(text), ) rds_created, content_updated = add_docket_entries(d, data["docket_entries"]) add_parties_and_attorneys(d, data["parties"]) process_orphan_documents(rds_created, pq.court_id, d.date_filed) if content_updated: newly_enqueued = enqueue_docket_alert(d.pk) if newly_enqueued: send_docket_alert(d.pk, start_time) mark_pq_successful(pq, d_id=d.pk) return { "docket_pk": d.pk, "content_updated": bool(rds_created or content_updated), }
def get_appellate_docket_by_docket_number(self, docket_number, court_id, cookies, tag_names=None, **kwargs): """Get a docket by docket number, CL court ID, and a collection of kwargs that can be passed to the DocketReport query. For details of acceptable parameters, see DocketReport.query() :param docket_number: The docket number of the case. :param court_id: A courtlistener/PACER appellate court ID. :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a logged-in PACER user. :param tag_names: The tag name that should be stored with the item in the DB, if desired. :param kwargs: A variety of keyword args to pass to DocketReport.query(). """ s = PacerSession(cookies=cookies) report = AppellateDocketReport(court_id, s) logging_id = "%s - %s" % (court_id, docket_number) logger.info("Querying docket report %s", logging_id) try: report.query(docket_number, **kwargs) except requests.RequestException as e: logger.warning("Problem getting docket %s", logging_id) if self.request.retries == self.max_retries: self.request.callbacks = None return None raise self.retry(exc=e) docket_data = report.data logger.info('Querying and parsing complete for %s', logging_id) if docket_data == {}: logger.info("Unable to find docket: %s", logging_id) self.request.callbacks = None return None try: d = Docket.objects.get( docket_number=docket_number, court_id=court_id, ) except Docket.DoesNotExist: d = None except Docket.MultipleObjectsReturned: d = None if d is None: d, count = find_docket_object(court_id, docket_number, docket_number) if count > 1: d = d.earliest('date_created') add_recap_source(d) update_docket_metadata(d, docket_data) d, og_info = update_docket_appellate_metadata(d, docket_data) if not d.pacer_case_id: d.pacer_case_id = docket_number if og_info is not None: og_info.save() d.originating_court_information = og_info d.save() tags = [] if tag_names is not None: for tag_name in tag_names: tag, _ = Tag.objects.get_or_create(name=tag_name) tag.tag_object(d) tags.append(tag) # Save the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET) pacer_file.filepath.save( 'docket.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(report.response.text), ) rds_created, content_updated = add_docket_entries( d, docket_data['docket_entries'], tags=tags) add_parties_and_attorneys(d, docket_data['parties']) process_orphan_documents(rds_created, d.court_id, d.date_filed) logger.info("Created/updated docket: %s" % d) return { 'docket_pk': d.pk, 'content_updated': bool(rds_created or content_updated), }
def process_recap_claims_register(self, pk): """Merge bankruptcy claims registry HTML into RECAP :param pk: The primary key of the processing queue item you want to work on :type pk: int :return: None :rtype: None """ pq = ProcessingQueue.objects.get(pk=pk) if pq.debug: # Proper debugging not supported on this endpoint. Just abort. mark_pq_successful(pq) self.request.chain = None return None mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) try: text = pq.filepath_local.read().decode("utf-8") except IOError as exc: msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror) if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None else: mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) report = ClaimsRegister(map_cl_to_pacer_id(pq.court_id)) report._parse_text(text) data = report.data logger.info("Parsing completed for item %s" % pq) if not data: # Bad HTML msg = "Not a valid claims registry page or other parsing failure" mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT) self.request.chain = None return None # Merge the contents of the docket into CL. d, docket_count = find_docket_object( pq.court_id, pq.pacer_case_id, data["docket_number"] ) if docket_count > 1: logger.info( "Found %s dockets during lookup. Choosing oldest." % docket_count ) d = d.earliest("date_created") # Merge the contents into CL d.add_recap_source() update_docket_metadata(d, data) try: d.save() except IntegrityError as exc: logger.warning( "Race condition experienced while attempting docket save." ) error_message = "Unable to save docket due to IntegrityError." if self.request.retries == self.max_retries: mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED) self.request.chain = None return None else: mark_pq_status( pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) add_bankruptcy_data_to_docket(d, data) add_claims_to_docket(d, data["claims"]) logger.info("Created/updated claims data for %s", pq) # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles( content_object=d, upload_type=UPLOAD_TYPE.CLAIMS_REGISTER ) pacer_file.filepath.save( # We only care about the ext w/UUIDFileSystemStorage "claims_registry.html", ContentFile(text), ) mark_pq_successful(pq, d_id=d.pk) return {"docket_pk": d.pk}
def process_recap_docket(pk): """Process an uploaded docket from the RECAP API endpoint. param pk: The primary key of the processing queue item you want to work on. """ pq = ProcessingQueue.objects.get(pk=pk) pq.status = pq.PROCESSING_IN_PROGRESS pq.save() logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq)) report = DocketReport(map_cl_to_pacer_id(pq.court_id)) text = pq.filepath_local.read().decode('utf-8') report._parse_text(text) docket_data = report.data logger.info("Parsing completed of item %s" % pq) # Merge the contents of the docket into CL try: d = Docket.objects.get( Q(pacer_case_id=pq.pacer_case_id) | Q(docket_number=docket_data['docket_number']), court_id=pq.court_id, ) # Add RECAP as a source if it's not already. if d.source in [Docket.DEFAULT, Docket.SCRAPER]: d.source = Docket.RECAP_AND_SCRAPER elif d.source == Docket.COLUMBIA: d.source = Docket.COLUMBIA_AND_RECAP elif d.source == Docket.COLUMBIA_AND_SCRAPER: d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER except Docket.DoesNotExist: d = Docket( source=Docket.RECAP, pacer_case_id=pq.pacer_case_id, court_id=pq.court_id ) except Docket.MultipleObjectsReturned: msg = "Too many dockets found when trying to look up '%s'" % pq logger.error(msg) pq.error_message = msg pq.status = pq.PROCESSING_FAILED pq.save() return None update_docket_metadata(d, docket_data) if pq.debug: mark_pq_successful(pq, d_id=d.pk) return d d.save() # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles(content_object=d) pacer_file.filepath.save( 'docket.html', # We only care about the ext w/UUIDFileSystemStorage ContentFile(text), ) # Docket entries for docket_entry in docket_data['docket_entries']: try: de, created = DocketEntry.objects.update_or_create( docket=d, entry_number=docket_entry['document_number'], defaults={ 'description': docket_entry['description'], 'date_filed': docket_entry['date_filed'], } ) except DocketEntry.MultipleObjectsReturned: logger.error( "Multiple docket entries found for document entry number '%s' " "while processing '%s'" % (docket_entry['document_number'], pq) ) continue # Then make the RECAPDocument object. Try to find it. If we do, update # the pacer_doc_id field if it's blank. If we can't find it, create it # or throw an error. try: rd = RECAPDocument.objects.get( docket_entry=de, # No attachments when uploading dockets. document_type=RECAPDocument.PACER_DOCUMENT, document_number=docket_entry['document_number'], ) except RECAPDocument.DoesNotExist: RECAPDocument.objects.create( docket_entry=de, # No attachments when uploading dockets. document_type=RECAPDocument.PACER_DOCUMENT, document_number=docket_entry['document_number'], pacer_doc_id=docket_entry['pacer_doc_id'], is_available=False, ) except RECAPDocument.MultipleObjectsReturned: logger.error( "Multiple recap documents found for document entry number'%s' " "while processing '%s'" % (docket_entry['document_number'], pq) ) continue else: rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id add_parties_and_attorneys(d, docket_data['parties']) mark_pq_successful(pq, d_id=d.pk) return d
def process_recap_appellate_docket(self, pk): """Process an uploaded appellate docket from the RECAP API endpoint. :param pk: The primary key of the processing queue item you want to work on. :returns: A dict of the form: { // The PK of the docket that's created or updated 'docket_pk': 22, // A boolean indicating whether a new docket entry or // recap document was created (implying a Solr needs // updating). 'content_updated': True, } This value is a dict so that it can be ingested in a Celery chain. """ start_time = now() pq = ProcessingQueue.objects.get(pk=pk) mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) logger.info( "Processing Appellate RECAP item" " (debug is: %s): %s" % (pq.debug, pq) ) report = AppellateDocketReport(map_cl_to_pacer_id(pq.court_id)) try: text = pq.filepath_local.read().decode("utf-8") except IOError as exc: msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror) if (self.request.retries == self.max_retries) or pq.debug: mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) return None else: mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) report._parse_text(text) data = report.data logger.info("Parsing completed of item %s" % pq) if data == {}: # Not really a docket. Some sort of invalid document (see Juriscraper). msg = "Not a valid docket upload." mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT) self.request.chain = None return None # Merge the contents of the docket into CL. d, docket_count = find_docket_object( pq.court_id, pq.pacer_case_id, data["docket_number"] ) if docket_count > 1: logger.info( "Found %s dockets during lookup. Choosing oldest." % docket_count ) d = d.earliest("date_created") d.add_recap_source() update_docket_metadata(d, data) d, og_info = update_docket_appellate_metadata(d, data) if not d.pacer_case_id: d.pacer_case_id = pq.pacer_case_id if pq.debug: mark_pq_successful(pq, d_id=d.pk) self.request.chain = None return {"docket_pk": d.pk, "content_updated": False} if og_info is not None: og_info.save() d.originating_court_information = og_info d.save() # Add the HTML to the docket in case we need it someday. pacer_file = PacerHtmlFiles( content_object=d, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET ) pacer_file.filepath.save( "docket.html", # We only care about the ext w/UUIDFileSystemStorage ContentFile(text), ) rds_created, content_updated = add_docket_entries( d, data["docket_entries"] ) add_parties_and_attorneys(d, data["parties"]) process_orphan_documents(rds_created, pq.court_id, d.date_filed) if content_updated and docket_count > 0: newly_enqueued = enqueue_docket_alert(d.pk) if newly_enqueued: send_docket_alert(d.pk, start_time) mark_pq_successful(pq, d_id=d.pk) return { "docket_pk": d.pk, "content_updated": bool(rds_created or content_updated), }
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :return: The docket that's created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number']}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do,
        # update the pacer_doc_id field if it's blank. If we can't find it,
        # create it or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warn(
                    "Creating new document with pacer_doc_id of '%s' "
                    "violates unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d