def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or docket
    history report data.
    :param report_type: Whether it's a docket or a docket history report.
    :return: The pk of the docket, or None if the report parsed to nothing.
    """
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError("The report type with id '%s' is not yet "
                                  "supported. Perhaps you need to add it?"
                                  % report_type)

    # Open in binary mode and decode explicitly: in Python 3, a text-mode
    # read() returns str, which has no .decode() method.
    with open(filepath, "rb") as f:
        text = f.read().decode("utf-8")
    report._parse_text(text)
    data = report.data
    if data == {}:
        # The report parsed to nothing; bail without touching the docket.
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        # Appellate metadata may come with originating-court info that must
        # be saved before the docket can reference it.
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])

    # Only these report types carry party/attorney data.
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
def test_new_has_terminated_entities(self):
    """Do we update all existing data when scraped data has terminated
    entities?
    """
    add_parties_and_attorneys(self.d, self.new_party_data)

    # Only Powell and McCarthy should remain on the docket, which means
    # the extraneous party was disassociated.
    self.assertEqual(self.d.parties.count(), 2)

    # Exactly one attorney role survives (Powell's); everyone else is
    # extraneous or has no attorneys.
    self.assertEqual(Role.objects.filter(docket=self.d).count(), 1)
def test_new_lacks_terminated_entities_old_lacks_too(self):
    """Do we update all existing data when there aren't terminated
    entities at play?
    """
    # Strip the termination date so neither side has terminated entities.
    self.new_mccarthy_data["date_terminated"] = None
    add_parties_and_attorneys(self.d, self.new_party_data)

    # Only Powell and McCarthy should remain on the docket, which means
    # the extraneous party was disassociated.
    self.assertEqual(self.d.parties.count(), 2)

    # Exactly one attorney role survives (Powell's); everyone else is
    # extraneous or has no attorneys.
    self.assertEqual(Role.objects.filter(docket=self.d).count(), 1)
def test_new_lacks_terminated_entities_old_has_them(self):
    """Do we update things properly when old has terminated parties, but
    new lacks them?

    Do we disassociate extraneous parties that aren't in the new data and
    aren't terminated?
    """
    # Seed the docket with a terminated attorney absent from the new data.
    terminated_atty = Attorney.objects.create(name="Robert Mueller")
    Role.objects.create(
        docket=self.d,
        attorney=terminated_atty,
        party=self.p,
        role=Role.TERMINATED,
        date_action=date(2018, 3, 16),
    )

    # Seed the docket with a terminated party absent from the new data.
    terminated_party = Party.objects.create(name="Zainab Ahmad")
    PartyType.objects.create(
        docket=self.d,
        party=terminated_party,
        name="plaintiff",
        date_terminated=date(2018, 11, 4),
    )

    # Strip termination data from the incoming scrape.
    self.new_mccarthy_data["date_terminated"] = None
    add_parties_and_attorneys(self.d, self.new_party_data)

    # Three parties should remain: Powell and McCarthy from the new data
    # plus Ahmad from the old. Extraneous parties are dropped; terminated
    # ones are kept.
    self.assertEqual(self.d.parties.count(), 3)

    # Powell now carries two attorneys: Robert Mueller and self.a. The
    # rest are extraneous or have no attorneys.
    self.assertEqual(Role.objects.filter(docket=self.d).count(), 2)
def process_recap_appellate_docket(self, pk):
    """Process an uploaded appellate docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(
        "Processing Appellate RECAP item"
        " (debug is: %s): %s" % (pq.debug, pq)
    )

    report = AppellateDocketReport(map_cl_to_pacer_id(pq.court_id))
    # Read the uploaded HTML. Transient storage errors are retried via
    # Celery; on the final retry (or in debug mode) the item is failed.
    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        # Abort any chained tasks; there is nothing to pass along.
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        # Multiple candidates matched; prefer the oldest existing docket.
        logger.info(
            "Found %s dockets during lookup. Choosing oldest."
            % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug mode: report success but make no DB writes beyond PQ status.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    # Originating-court info must be saved before the docket references it.
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.APPELLATE_DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # Only alert on dockets that already existed (docket_count > 0) and
    # actually changed — presumably to avoid alerting on brand-new dockets;
    # TODO confirm against find_docket_object's count semantics.
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    # Read the uploaded HTML. Transient storage errors are retried via
    # Celery; on the final retry (or in debug mode) the item is failed.
    try:
        text = pq.filepath_local.read().decode()
    except IOError as exc:
        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info(f"Parsing completed of item {pq}")

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        # Abort any chained tasks; there is nothing to pass along.
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d = find_docket_object(pq.court_id, pq.pacer_case_id, data["docket_number"])
    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug mode: report success but make no DB writes beyond PQ status.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(d, data["docket_entries"])
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # Fire docket alerts only when content actually changed, and only once
    # per enqueue window.
    if content_updated:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    # Read the uploaded HTML, retrying transient storage errors via Celery
    # instead of crashing with an unhandled IOError. This mirrors the error
    # handling used by process_recap_appellate_docket.
    try:
        text = pq.filepath_local.read().decode('utf-8')
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into their
        # own upload_type. Alas, we still have some old clients around, so we
        # need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        # Abort any chained tasks; there is nothing to pass along.
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                         data['docket_number'])
    if docket_count > 1:
        # Multiple candidates matched; prefer the oldest existing docket.
        logger.info("Found %s dockets during lookup. Choosing oldest."
                    % docket_count)
        d = d.earliest('date_created')

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        # Debug mode: report success but make no DB writes beyond PQ status.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {'docket_pk': d.pk, 'content_updated': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    # Alert only on pre-existing dockets whose content actually changed.
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
def process_docket_data(
    d: Docket,
    report_type: int,
    filepath: Optional[str] = None,
) -> Optional[int]:
    """Process docket data file.

    :param d: A docket object to work on.
    :param report_type: Whether it's a docket or a docket history report.
    :param filepath: A local path where the item can be found. If not
    provided, the filepath_local field of the docket object will be attempted.
    :return: The pk of the docket, or None if the report parsed to nothing.
    """
    # Deferred import to avoid a circular dependency with cl.recap.mergers.
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    # Pick the Juriscraper report class matching the upload type.
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    if filepath:
        with open(filepath, "r") as f:
            text = f.read()
    else:
        # This is an S3 path, so get it remotely.
        text = d.filepath_local.read().decode()

    report._parse_text(text)
    data = report.data
    if data == {}:
        # The report parsed to nothing; bail without touching the docket.
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        # Originating-court info must be saved before the docket can
        # reference it.
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])

    # Only these report types carry party/attorney data.
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk