def merge_rss_feed_contents(rss_feed, court_pk, feed_status_pk):
    """Merge the rss feed contents into CourtListener

    :param rss_feed: A PacerRssFeed object that has already queried the feed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns all_rds_created: A list of all the RDs created during the
    processing.
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed.parse()
    logger.info("%s: Got %s results to merge." % (feed_status.court_id,
                                                  len(rss_feed.data)))

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    for docket in rss_feed.data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket['pacer_case_id'], docket['docket_number'])
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest('date_created')

            add_recap_source(d)
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket['pacer_case_id']
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket['docket_entries'])

        if content_updated and docket_count > 0:
            enqueue_docket_alert(d.pk, start_time)
        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info("%s: Sending %s new RECAP documents to Solr for indexing." %
                (feed_status.court_id, len(all_rds_created)))
    return all_rds_created
def merge_rss_feed_contents(feed_data, court_pk, feed_status_pk):
    """Merge the rss feed contents into CourtListener

    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns: A dict with the keys d_pks_to_alert (a list of (docket PK,
    start_time) tuples for sending alerts) and rds_for_solr (a list of the
    RECAPDocument PKs created during the processing).
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in feed_data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket["pacer_case_id"], docket["docket_number"]
            )
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest("date_created")

            d.add_recap_source()
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket["pacer_case_id"]
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket["docket_entries"])

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))
        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info(
        "%s: Sending %s new RECAP documents to Solr for indexing and "
        "sending %s dockets for alerts.",
        feed_status.court_id,
        len(all_rds_created),
        len(d_pks_to_alert),
    )
    return {"d_pks_to_alert": d_pks_to_alert, "rds_for_solr": all_rds_created}
def merge_rss_feed_contents(feed_data, court_pk, feed_status_pk):
    """Merge the rss feed contents into CourtListener

    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns: A dict with the keys d_pks_to_alert (a list of (docket PK,
    start_time) tuples for sending alerts) and rds_for_solr (a list of the
    RECAPDocument PKs created during the processing).
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in feed_data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket['pacer_case_id'], docket['docket_number'])
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest('date_created')

            d.add_recap_source()
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket['pacer_case_id']
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket['docket_entries'])

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))
        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info("%s: Sending %s new RECAP documents to Solr for indexing and "
                "sending %s dockets for alerts.",
                feed_status.court_id, len(all_rds_created),
                len(d_pks_to_alert))
    return {'d_pks_to_alert': d_pks_to_alert, 'rds_for_solr': all_rds_created}
def process_recap_appellate_docket(self, pk):
    """Process an uploaded appellate docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing Appellate RECAP item"
        " (debug is: %s): %s" % (pq.debug, pq)
    )

    report = AppellateDocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno,
                                                       exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket_history_report(self, pk):
    """Process the docket history report.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict indicating whether the docket needs Solr re-indexing.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno,
                                                       exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id))
    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed for item %s" % pq)

    if data == {}:
        # Bad docket history page.
        msg = "Not a valid docket history page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    try:
        d.save()
    except IntegrityError as exc:
        logger.warning(
            "Race condition experienced while attempting docket save."
        )
        error_message = "Unable to save docket due to IntegrityError."
        if self.request.retries == self.max_retries:
            mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
            self.request.chain = None
            return None
        else:
            mark_pq_status(
                pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
            )
            raise self.retry(exc=exc)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET_HISTORY_REPORT
    )
    pacer_file.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        "docket_history.html",
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def merge_rss_feed_contents(self, feed_data, court_pk, metadata_only=False):
    """Merge the rss feed contents into CourtListener

    :param self: The Celery task
    :param feed_data: The data parameter of a PacerRssFeed object that has
    already queried the feed and been parsed.
    :param court_pk: The CourtListener court ID.
    :param metadata_only: Whether to only do metadata and skip docket
    entries.
    :returns Dict containing keys:
      d_pks_to_alert: A list of (docket, alert_time) tuples for sending
      alerts
      rds_for_solr: A list of RECAPDocument PKs for updating in Solr
    """
    start_time = now()

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in feed_data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue
            d, docket_count = find_docket_object(
                court_pk, docket["pacer_case_id"], docket["docket_number"]
            )
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest("date_created")

            d.add_recap_source()
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket["pacer_case_id"]
            try:
                d.save()
                add_bankruptcy_data_to_docket(d, docket)
            except IntegrityError as exc:
                # The docket was created while we looked it up. Retry and it
                # should associate with the new one instead.
                raise self.retry(exc=exc)

            if metadata_only:
                continue

            rds_created, content_updated = add_docket_entries(
                d, docket["docket_entries"])

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))
        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info(
        "%s: Sending %s new RECAP documents to Solr for indexing and "
        "sending %s dockets for alerts.",
        court_pk,
        len(all_rds_created),
        len(d_pks_to_alert),
    )
    return {"d_pks_to_alert": d_pks_to_alert, "rds_for_solr": all_rds_created}
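# A minimal sketch (an assumption, not part of the code above) of how a
# caller might consume the dict returned by merge_rss_feed_contents. The
# helper name add_items_to_solr is a hypothetical placeholder; only
# send_docket_alert appears elsewhere in these snippets, where it is called
# with a docket PK and a start time.
def handle_rss_merge_result(result):
    if result is None:
        return
    # Each entry pairs a docket PK with the timestamp the merge began, so
    # alerts can cover everything added since that moment.
    for d_pk, since in result["d_pks_to_alert"]:
        send_docket_alert(d_pk, since)
    # The newly created RECAPDocument PKs still need search indexing.
    add_items_to_solr(result["rds_for_solr"])  # hypothetical indexing helper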
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode()
    except IOError as exc:
        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients
        # around, so we need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info(f"Parsing completed of item {pq}")

    if data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients
        # around, so we need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                         data['docket_number'])
    if docket_count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." %
                    docket_count)
        d = d.earliest('date_created')

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {'docket_pk': d.pk, 'content_updated': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
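# A minimal sketch (an assumption, not taken from the code above) of why
# process_recap_docket returns a dict: the docstring says the value is meant
# to be ingested in a Celery chain, and the bound `self` parameter implies
# the function is registered as a Celery task (e.g. @app.task(bind=True)).
# The downstream task name add_or_update_recap_docket is a hypothetical
# placeholder for whatever task reads data["docket_pk"] / "content_updated".
from celery import chain


def process_upload_example(pq_pk):
    chain(
        process_recap_docket.s(pq_pk),
        add_or_update_recap_docket.s(),  # hypothetical downstream task
    ).apply_async()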
def test_nothing_happens_for_timers_after_de_creation(self):
    """Do we avoid sending alerts for timers after the de was created?"""
    enqueue_docket_alert(self.docket.pk, self.after)

    # Do zero emails go out? None should.
    self.assertEqual(len(mail.outbox), 0)
def test_triggering_docket_alert(self):
    """Does the alert trigger when it should?"""
    enqueue_docket_alert(self.docket.pk, self.before)

    # Does the alert go out? It should.
    self.assertEqual(len(mail.outbox), 1)