def fetch_docket_by_pacer_case_id(
    session,
    court_id,
    pacer_case_id,
    fq,
):
    """Download the docket from PACER and merge it into CL

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object
    :return: a dict with information about the docket and the new data
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    report.query(pacer_case_id, **get_fq_docket_kwargs(fq))

    docket_data = report.data
    if not docket_data:
        raise ParsingException("No data found in docket report.")

    if fq.docket_id:
        d = Docket.objects.get(pk=fq.docket_id)
    else:
        d, count = find_docket_object(
            court_id, pacer_case_id, docket_data["docket_number"]
        )
        if count > 1:
            d = d.earliest("date_created")

    rds_created, content_updated = merge_pacer_docket_into_cl_docket(
        d,
        pacer_case_id,
        docket_data,
        report,
        appellate=False,
    )
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }

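# A minimal usage sketch for fetch_docket_by_pacer_case_id, assuming a
# PacerFetchQueue row already exists and that PACER credentials are available.
# The credential values and the wrapper function name are illustrative, not
# part of the function above.
def fetch_docket_for_fq_sketch(fq_pk):
    fq = PacerFetchQueue.objects.get(pk=fq_pk)
    # Hypothetical credentials; real code would pull these from settings or
    # from the fetch queue's user.
    session = PacerSession(username="pacer_user", password="pacer_pass")
    session.login()
    result = fetch_docket_by_pacer_case_id(
        session,
        fq.court_id,
        fq.pacer_case_id,
        fq,
    )
    return result["docket_pk"], result["content_updated"]
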
def setUp(self):
    self.docket, count = find_docket_object("akd", "41664", "3:11-cv-00064")
    if count > 1:
        raise Exception(
            "Should not get more than one docket during this test!"
        )
    process_docket_data(self.docket, self.DOCKET_PATH, UPLOAD_TYPE.IA_XML_FILE)

def setUp(self) -> None:
    docket_number = "3:11-cv-00064"
    self.docket = find_docket_object("akd", "41664", docket_number)
    self.docket.filepath_local = (
        "/test/xml/gov.uscourts.akd.41664.docket.xml"
    )
    self.docket.docket_number = docket_number
    self.docket.save()
    process_docket_data(self.docket, UPLOAD_TYPE.IA_XML_FILE)

def merge_rss_feed_contents(feed_data, court_pk, feed_status_pk):
    """Merge the rss feed contents into CourtListener

    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns: A dict with two keys: d_pks_to_alert, a list of
    (docket_pk, start_time) tuples for sending alerts, and rds_for_solr, a
    list of the RECAPDocument PKs created during processing.
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in feed_data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket["pacer_case_id"], docket["docket_number"]
            )
            if docket_count > 1:
                logger.info(
                    "Found %s dockets during lookup. Choosing oldest."
                    % docket_count
                )
                d = d.earliest("date_created")

            d.add_recap_source()
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket["pacer_case_id"]
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket["docket_entries"]
            )

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))

        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info(
        "%s: Sending %s new RECAP documents to Solr for indexing and "
        "sending %s dockets for alerts.",
        feed_status.court_id,
        len(all_rds_created),
        len(d_pks_to_alert),
    )
    return {"d_pks_to_alert": d_pks_to_alert, "rds_for_solr": all_rds_created}

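# A hedged sketch of driving merge_rss_feed_contents for one court, assuming
# an RssFeedStatus row was created for this scrape. PacerRssFeed.query() and
# .parse() come from Juriscraper; the wrapper function name is illustrative.
def scrape_and_merge_rss_sketch(court_pk, feed_status_pk):
    rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk))
    rss_feed.query()
    rss_feed.parse()
    result = merge_rss_feed_contents(rss_feed.data, court_pk, feed_status_pk)
    # result["rds_for_solr"] would then go to a Solr indexing task and
    # result["d_pks_to_alert"] to the docket alert sender.
    return result
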
def test_rss_feed_ingestion(self):
    """Can we ingest RSS feeds without creating duplicates?"""
    court_id = 'scotus'
    rss_feed = PacerRssFeed(court_id)
    rss_feed.is_bankruptcy = True  # Needed because we say SCOTUS above.
    with open(self.make_path('rss_sample_unnumbered_mdb.xml')) as f:
        text = f.read().decode('utf-8')
    rss_feed._parse_text(text)
    docket = rss_feed.data[0]
    d, docket_count = find_docket_object(court_id, docket['pacer_case_id'],
                                         docket['docket_number'])
    update_docket_metadata(d, docket)
    d.save()
    self.assertTrue(docket_count == 0)

    expected_count = 1
    add_docket_entries(d, docket['docket_entries'])
    self.assertEqual(d.docket_entries.count(), expected_count)
    add_docket_entries(d, docket['docket_entries'])
    self.assertEqual(d.docket_entries.count(), expected_count)

def process_recap_appellate_docket(self, pk):
    """Process an uploaded appellate docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying that Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing Appellate RECAP item (debug is: %s): %s" % (pq.debug, pq)
    )

    report = AppellateDocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }

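# An illustrative Celery chain consuming the dict returned above; the
# downstream indexing task name (add_or_update_recap_docket) is an assumption
# about how "content_updated" might trigger Solr re-indexing.
def enqueue_appellate_docket_sketch(pq_pk):
    from celery import chain  # local import to keep the sketch self-contained

    chain(
        process_recap_appellate_docket.s(pq_pk),
        add_or_update_recap_docket.s(),  # hypothetical Solr indexing step
    ).apply_async()
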
def process_recap_docket_history_report(self, pk):
    """Process the docket history report.

    :param pk: The primary key of the processing queue item you want to work
    on
    :returns: A dict indicating whether the docket needs Solr re-indexing.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id))
    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if data == {}:
        # Bad docket history page.
        msg = "Not a valid docket history page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    try:
        d.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        else:
            mark_pq_status(
                pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
            )
            raise self.retry(exc=exc)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET_HISTORY_REPORT
    )
    pacer_file.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        "docket_history.html",
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }

def process_recap_claims_register(self, pk):
    """Merge bankruptcy claims registry HTML into RECAP

    :param pk: The primary key of the processing queue item you want to work
    on
    :type pk: int
    :return: A dict with the docket PK, or None on failure or debug runs
    :rtype: dict or None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    if pq.debug:
        # Proper debugging not supported on this endpoint. Just abort.
        mark_pq_successful(pq)
        self.request.chain = None
        return None

    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    report = ClaimsRegister(map_cl_to_pacer_id(pq.court_id))
    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if not data:
        # Bad HTML
        msg = "Not a valid claims registry page or other parsing failure"
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    # Merge the contents into CL
    d.add_recap_source()
    update_docket_metadata(d, data)

    try:
        d.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        else:
            mark_pq_status(
                pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
            )
            raise self.retry(exc=exc)

    add_bankruptcy_data_to_docket(d, data)
    add_claims_to_docket(d, data["claims"])
    logger.info("Created/updated claims data for %s", pq)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.CLAIMS_REGISTER
    )
    pacer_file.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        "claims_registry.html",
        ContentFile(text),
    )

    mark_pq_successful(pq, d_id=d.pk)
    return {"docket_pk": d.pk}

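# A hedged sketch of creating a ProcessingQueue row for a claims register
# upload and kicking off the task above. court_id, pacer_case_id, upload_type
# and filepath_local mirror how pq is used in that function; the uploader
# field and the ContentFile-based file handling are assumptions.
def upload_claims_register_sketch(court_id, pacer_case_id, html_path, user):
    with open(html_path, "rb") as f:
        pq = ProcessingQueue.objects.create(
            court_id=court_id,
            pacer_case_id=pacer_case_id,
            uploader=user,  # assumed field name for the uploading user
            upload_type=UPLOAD_TYPE.CLAIMS_REGISTER,
            filepath_local=ContentFile(f.read(), name="claims_registry.html"),
        )
    return process_recap_claims_register.delay(pq.pk)
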
def merge_rss_feed_contents(self, feed_data, court_pk, metadata_only=False):
    """Merge the rss feed contents into CourtListener

    :param self: The Celery task
    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param metadata_only: Whether to only do metadata and skip docket entries.
    :returns: A dict containing keys:
      d_pks_to_alert: A list of (docket, alert_time) tuples for sending alerts
      rds_for_solr: A list of RECAPDocument PKs for updating in Solr
    """
    start_time = now()

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in feed_data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket["pacer_case_id"], docket["docket_number"]
            )
            if docket_count > 1:
                logger.info(
                    "Found %s dockets during lookup. Choosing oldest."
                    % docket_count
                )
                d = d.earliest("date_created")

            d.add_recap_source()
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket["pacer_case_id"]
            try:
                d.save()
                add_bankruptcy_data_to_docket(d, docket)
            except IntegrityError as exc:
                # The docket was created while we looked it up. Retry and it
                # should associate with the new one instead.
                raise self.retry(exc=exc)
            if metadata_only:
                continue
            rds_created, content_updated = add_docket_entries(
                d, docket["docket_entries"]
            )

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))

        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info(
        "%s: Sending %s new RECAP documents to Solr for indexing and "
        "sending %s dockets for alerts.",
        court_pk,
        len(all_rds_created),
        len(d_pks_to_alert),
    )
    return {"d_pks_to_alert": d_pks_to_alert, "rds_for_solr": all_rds_created}

def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying that Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode()
    except IOError as exc:
        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info(f"Parsing completed for item {pq}")

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }

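# A hedged sketch of a downstream chain step consuming the dict returned by
# process_recap_docket. "data" is the value passed along the Celery chain;
# the @app.task decorator and the add_items_to_solr call are assumptions
# about how indexing is wired up, not part of the function above.
@app.task
def index_docket_if_updated_sketch(data):
    if data is None or not data.get("content_updated"):
        return
    # Re-index the docket only when new content was actually merged.
    add_items_to_solr([data["docket_pk"]], "search.Docket")
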
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying that Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                         data['docket_number'])
    if docket_count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." %
                    docket_count)
        d = d.earliest('date_created')

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {'docket_pk': d.pk, 'content_updated': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }

def setUp(self) -> None:
    self.docket = find_docket_object("akd", "41664", "3:11-cv-00064")
    process_docket_data(self.docket, self.DOCKET_PATH, UPLOAD_TYPE.IA_XML_FILE)