def reprocess_docket_data(d, filepath, report_type):
    """Reprocess docket data that we already have.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys
    if report_type == DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type == DOCKET:
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.mergers import (
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    # Read bytes and decode explicitly so this also works on Python 3.
    with open(filepath, "rb") as f:
        text = f.read().decode("utf-8")
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def process_docket_data(d, filepath, report_type):
    """Process docket data file.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: Whether it's a docket or a docket history report.
    """
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(map_cl_to_pacer_id(d.court_id))
    with open(filepath, 'r') as f:
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if data == {}:
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    if data.get('docket_entries'):
        add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or
            // recap document was created (implying a Solr needs
            // updating).
            'content_updated': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    start_time = now()
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))

    try:
        text = pq.filepath_local.read().decode("utf-8")
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    if "History/Documents" in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = UPLOAD_TYPE.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.chain = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    # Merge the contents of the docket into CL.
    d, docket_count = find_docket_object(
        pq.court_id, pq.pacer_case_id, data["docket_number"]
    )
    if docket_count > 1:
        logger.info(
            "Found %s dockets during lookup. Choosing oldest." % docket_count
        )
        d = d.earliest("date_created")

    d.add_recap_source()
    update_docket_metadata(d, data)
    if not d.pacer_case_id:
        d.pacer_case_id = pq.pacer_case_id

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.chain = None
        return {"docket_pk": d.pk, "content_updated": False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(
        content_object=d, upload_type=UPLOAD_TYPE.DOCKET
    )
    pacer_file.filepath.save(
        "docket.html",  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, content_updated = add_docket_entries(
        d, data["docket_entries"]
    )
    add_parties_and_attorneys(d, data["parties"])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    if content_updated and docket_count > 0:
        newly_enqueued = enqueue_docket_alert(d.pk)
        if newly_enqueued:
            send_docket_alert(d.pk, start_time)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        "docket_pk": d.pk,
        "content_updated": bool(rds_created or content_updated),
    }
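# The dict returned above is shaped for Celery chaining: Celery passes it as
# the first positional argument to the next task in the chain. A minimal
# sketch of how a caller might wire that up, assuming Celery is configured and
# that `do_something_with_docket` is a hypothetical follow-up task accepting
# that dict (it is not defined in the source).
from celery import chain


def handle_docket_upload(pq_pk):
    chain(
        process_recap_docket.s(pq_pk),
        do_something_with_docket.s(),
    ).apply_async()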
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL.
    d, count = find_docket_object(pq.court_id, pq.pacer_case_id,
                                  data['docket_number'])
    if count > 1:
        logger.info("Found %s dockets during lookup. Choosing oldest." % count)
        d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, data['docket_entries'])
    add_parties_and_attorneys(d, data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    pq.status = pq.PROCESSING_IN_PROGRESS
    pq.save()
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    # Merge the contents of the docket into CL
    try:
        d = Docket.objects.get(
            Q(pacer_case_id=pq.pacer_case_id) |
            Q(docket_number=docket_data['docket_number']),
            court_id=pq.court_id,
        )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )
    except Docket.MultipleObjectsReturned:
        msg = "Too many dockets found when trying to look up '%s'" % pq
        logger.error(msg)
        pq.error_message = msg
        pq.status = pq.PROCESSING_FAILED
        pq.save()
        return None

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            RECAPDocument.objects.create(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
            )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d
def run_parsers_on_path(
    self,
    path_root,
    required_fields=["date_filed", "case_name", "docket_number"],
):
    """Test all the parsers, faking the network query."""
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, "*.html"):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split(".")[0]
        json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)
        court = filename_sans_ext.split("_")[0]

        report = DocketReport(court)
        with open(path, "rb") as f:
            report._parse_text(f.read().decode("utf-8"))
        data = report.data

        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )

            self.assertEqual(data["court_id"], court)

            # Party-specific tests...
            for party in data["parties"]:
                self.assertTrue(
                    party.get("name", False),
                    msg="Every party must have a name attribute. Did not "
                    "get a value for:\n\n%s" % party,
                )
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn("----", party["name"])

        if not os.path.isfile(json_path):
            bar = "*" * 50
            print(
                "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                "\n\n %s\n\n"
                "Please test the data in this file before assuming "
                "everything worked.\n%s\n" % (bar, json_path, bar)
            )
            with open(json_path, "w") as f:
                json.dump(data, f, indent=2, sort_keys=True)
            # self.assertFalse(True)
            continue

        with open(json_path) as f:
            j = json.load(f)
            if j != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(
                    j["docket_entries"], data["docket_entries"]
                )
                self.assertEqual(j["parties"], data["parties"])
            self.assertEqual(j, data)

        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=1)

        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
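# A minimal sketch of how the helper above might be invoked, assuming it is
# defined on a unittest.TestCase subclass (as its `self` parameter suggests).
# The class name and the fixture directory are illustrative assumptions, not
# taken from the source.
import os
import unittest


class DocketParseTest(unittest.TestCase):
    # run_parsers_on_path(...) from above would be defined here or mixed in.

    def test_district_court_dockets(self):
        fixture_root = os.path.join("tests", "examples", "pacer", "dockets")
        self.run_parsers_on_path(fixture_root)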
def run_parsers_on_path(self, path_root, required_fields=[
        'date_filed', 'case_name', 'docket_number']):
    """Test all the parsers, faking the network query."""
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = DocketReport(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )

            self.assertEqual(data['court_id'], court)

            # Party-specific tests...
            for party in data['parties']:
                self.assertTrue(
                    party.get('name', False),
                    msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party
                )
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn('----', party['name'])

        if not os.path.isfile(json_path):
            bar = "*" * 50
            print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                  "\n\n %s\n\n"
                  "Please test the data in this file before assuming "
                  "everything worked.\n%s\n" % (bar, json_path, bar))
            with open(json_path, 'w') as f:
                json.dump(data, f, indent=2, sort_keys=True)
            # self.assertFalse(True)
            continue

        with open(json_path) as f:
            j = json.load(f)
            if j != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(j['docket_entries'], data['docket_entries'])
                self.assertEqual(j['parties'], data['parties'])
            self.assertEqual(j, data)

        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=1)

        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap document
            // was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue
        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def run_parsers_on_path(
        self, path_root,
        required_fields=['date_filed', 'case_name', 'docket_number']):
    """Test all the parsers, faking the network query."""
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, '*.html'):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        court = filename_sans_ext.split('_')[0]

        report = DocketReport(court)
        with open(path, 'r') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data

        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )

            self.assertEqual(data['court_id'], court)

            # Party-specific tests...
            for party in data['parties']:
                self.assertTrue(
                    party.get('name', False),
                    msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party)
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn('----', party['name'])

        if not os.path.isfile(json_path):
            bar = "*" * 50
            print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                  "\n\n %s\n\n"
                  "Please test the data in this file before assuming "
                  "everything worked.\n%s\n" % (bar, json_path, bar))
            with open(json_path, 'w') as f:
                json.dump(data, f, indent=2, sort_keys=True)
            # self.assertFalse(True)
            continue

        with open(json_path) as f:
            j = json.load(f)
            if j != {}:
                # Compare docket entries and parties first, for easier
                # debugging, then compare whole objects to be sure.
                self.assertEqual(j['docket_entries'], data['docket_entries'])
                self.assertEqual(j['parties'], data['parties'])
            self.assertEqual(j, data)

        t2 = time.time()
        max_duration = 1
        duration = t2 - t1
        if duration > max_duration:
            if sys.gettrace() is None and not IS_TRAVIS:
                # Don't do this if we're debugging.
                raise SlownessException(
                    "The parser for '{fn}' took {duration}s to test, "
                    "which is more than the maximum allowed duration of "
                    "{max_duration}s.".format(
                        fn=filename,
                        duration=duration,
                        max_duration=max_duration,
                    ))

        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
def process_docket_data(
    d: Docket,
    report_type: int,
    filepath: Optional[str] = None,
) -> Optional[int]:
    """Process docket data file.

    :param d: A docket object to work on.
    :param report_type: Whether it's a docket or a docket history report.
    :param filepath: A local path where the item can be found. If not
    provided, the filepath_local field of the docket object will be attempted.
    """
    from cl.recap.mergers import (
        add_bankruptcy_data_to_docket,
        add_claims_to_docket,
        add_docket_entries,
        add_parties_and_attorneys,
        update_docket_appellate_metadata,
        update_docket_metadata,
    )

    court_id = map_cl_to_pacer_id(d.court_id)
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(court_id)
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(court_id)
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(court_id)
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive(court_id)
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(court_id)
    elif report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        report = ClaimsRegister(court_id)
    else:
        raise NotImplementedError(
            "The report type with id '%s' is not yet "
            "supported. Perhaps you need to add it?" % report_type
        )

    if filepath:
        with open(filepath, "r") as f:
            text = f.read()
    else:
        # This is an S3 path, so get it remotely.
        text = d.filepath_local.read().decode()

    report._parse_text(text)
    data = report.data
    if data == {}:
        return None

    if report_type == UPLOAD_TYPE.CLAIMS_REGISTER:
        add_bankruptcy_data_to_docket(d, data)
        add_claims_to_docket(d, data["claims"])
    else:
        update_docket_metadata(d, data)
        d, og_info = update_docket_appellate_metadata(d, data)
        if og_info is not None:
            og_info.save()
            d.originating_court_information = og_info
        d.save()
        if data.get("docket_entries"):
            add_docket_entries(d, data["docket_entries"])
    if report_type in (
        UPLOAD_TYPE.DOCKET,
        UPLOAD_TYPE.APPELLATE_DOCKET,
        UPLOAD_TYPE.IA_XML_FILE,
    ):
        add_parties_and_attorneys(d, data["parties"])
    return d.pk
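# A minimal usage sketch for the function above, assuming a Docket with pk=1
# already exists and that the HTML path points at a previously saved docket
# report; both values are illustrative, not taken from the source.
d = Docket.objects.get(pk=1)
docket_pk = process_docket_data(
    d,
    UPLOAD_TYPE.DOCKET,
    filepath="/tmp/gov.uscourts.cand.12345.docket.html",
)
if docket_pk is None:
    print("The saved HTML did not parse as a valid docket.")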
#!/usr/bin/env python
#
# Takes an .html file on the command line, parses it using the PACER
# Docket Report parser, and outputs json to stdout.

import jsondate3 as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username="******", password="******")
report = DocketReport("psc", pacer_session)

for path in sys.argv[1:]:
    # Read as bytes and decode explicitly so this also works on Python 3.
    with open(path, "rb") as f:
        report._parse_text(f.read().decode("utf-8"))
    data = report.data
    print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": ")))
#!/usr/bin/env python
#
# Takes an .html file on the command line, parses it using the PACER
# Docket Report parser, and outputs json to stdout.

import jsondate as json
import sys

from juriscraper.pacer.http import PacerSession
from juriscraper.pacer import DocketReport

pacer_session = PacerSession(username='******', password='******')
report = DocketReport('psc', pacer_session)

for path in sys.argv[1:]:
    with open(path, 'r') as f:
        report._parse_text(f.read().decode('utf-8'))
    data = report.data
    print json.dumps(data, indent=2, sort_keys=True, separators=(',', ': '))
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :return: The docket that's created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{
        'pacer_case_id': pq.pacer_case_id,
        'docket_number': docket_data['docket_number'],
    }, {
        'pacer_case_id': pq.pacer_case_id,
    }, {
        'docket_number': docket_data['docket_number'],
    }]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    update_docket_metadata(d, docket_data)

    if pq.debug:
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do, update
        # the pacer_doc_id field if it's blank. If we can't find it, create it
        # or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                logger.warn(
                    "Creating new document with pacer_doc_id of '%s' violates "
                    "unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d