def fetch_docket_by_pacer_case_id(
    session,
    court_id,
    pacer_case_id,
    fq,
):
    """Download the docket from PACER and merge it into CL

    :param session: A PacerSession object to work with
    :param court_id: The CL ID of the court
    :param pacer_case_id: The pacer_case_id of the docket, if known
    :param fq: The PacerFetchQueue object
    :return: a dict with information about the docket and the new data
    """
    docket_report = DocketReport(map_cl_to_pacer_id(court_id), session)
    docket_report.query(pacer_case_id, **get_fq_docket_kwargs(fq))

    data = docket_report.data
    if not data:
        raise ParsingException("No data found in docket report.")

    # Prefer the docket the fetch queue already points at; otherwise look
    # one up (or create one) from the court/case-id/docket-number triple.
    if fq.docket_id:
        docket = Docket.objects.get(pk=fq.docket_id)
    else:
        docket, count = find_docket_object(
            court_id, pacer_case_id, data["docket_number"]
        )
        if count > 1:
            # Multiple candidates: keep the oldest one.
            docket = docket.earliest("date_created")

    rds_created, content_updated = merge_pacer_docket_into_cl_docket(
        docket,
        pacer_case_id,
        data,
        docket_report,
        appellate=False,
    )
    return {
        "docket_pk": docket.pk,
        "content_updated": bool(rds_created or content_updated),
    }
def get_docket_json(self):
    """Download docket to disk from Pacer as JSON and HTML.

    Iterates the TinyDB ``fjc`` table for rows that have a PACER case ID
    but no JSON yet, queries the docket report for each, writes the parsed
    data and raw HTML under ``downloads/``, then marks the row as done.

    :return: None
    """
    q = Query()
    db = TinyDB("db/master.json")
    fjc_table = db.table("fjc")
    # Only rows with a known PACER case ID that haven't been fetched yet.
    for row in fjc_table.search(~(q.PACER_CASE_ID == "") & (q.JSON == "False")):
        rep = DocketReport(row["COURT"], self.s)
        rep.query(
            row["PACER_CASE_ID"],
            show_parties_and_counsel=True,
            show_terminated_parties=True,
            show_list_of_member_cases=True,
            include_pdf_headers=True,
            show_multiple_docs=False,
        )
        data = rep.data
        if not data:
            # Nothing parseable came back. Skip so one bad case doesn't
            # abort the whole batch; the row stays JSON == "False" so it
            # can be retried on the next run.
            logging.warning(
                "No docket data for pacer_case_id %s; skipping.",
                row["PACER_CASE_ID"],
            )
            continue
        with open(
            "downloads/json/pacer_docket_%s.json" % row["PACER_CASE_ID"], "w"
        ) as write_file:
            json.dump(data, write_file, indent=4, sort_keys=True, default=str)
        with open(
            "downloads/html/pacer_docket_%s.html" % row["PACER_CASE_ID"], "w"
        ) as file:
            file.write(rep.response.text)
        # Guard: a docket can have zero entries; indexing [0] unconditionally
        # raised IndexError here and killed the loop mid-batch.
        entries = data.get("docket_entries") or []
        fjc_table.update(
            {
                "JSON": "True",
                "pacer_doc_id": entries[0]["pacer_doc_id"] if entries else "",
            },
            doc_ids=[row.doc_id],
        )
    logging.info("Finished collecting JSON and HTML")
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket's pk and a flag telling whether it needs
        a Solr update.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    # Look for an existing docket for this court/case-id pair so we can
    # avoid re-downloading entries we already have.
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        # Ambiguous match: treat as unknown; the find_docket_object call
        # below will pick one deterministically (earliest by date_created).
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            # Multiple candidate dockets: keep the oldest.
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # `tag` arrives as a name (string) and is rebound to a Tag object.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def get_docket_by_pacer_case_id(self, data, court_id, cookies,
                                tag_names=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in
                  PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid
                  lookups if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the
    item in the DB.
    :param kwargs: A variety of keyword args to pass to
    DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        # Nothing to do; also stop any downstream task chain (per the
        # "Terminating chains" message below).
        logger.info("Empty data argument. Terminating "
                    "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    if data.get('docket_pk') is not None:
        # Caller already knows the docket; skip the lookup entirely.
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            # Ambiguous match: treat as unknown; find_docket_object below
            # disambiguates (earliest by date_created).
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    if not docket_data:
        # Empty report: bail out and terminate the rest of the chain.
        logger.info("No valid docket data for %s.%s", court_id,
                    pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: The created/updated Docket, or None when the lookup was
        ambiguous.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    # Look for an existing docket so we can fetch only the missing entries.
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        # NOTE(review): the `d is not None` re-check below is redundant —
        # we are already inside an `if d is not None` block.
        if d is not None and first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            # Second-chance lookup: match on either the PACER case ID or
            # the parsed docket number within this court.
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        # Brand new docket; unsaved until d.save() below.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up '%s.%s'" %
                     (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # `tag` arrives as a name (string) and is rebound to a Tag object.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    # Upsert each docket entry and its main (non-attachment) RECAP document.
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            # Ambiguous entry number: skip this entry rather than guess.
            logger.error(
                "Multiple docket entries found for document entry number '%s' "
                "while processing '%s.%s'" % (docket_entry['document_number'],
                                              court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get failed.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" % (docket_entry['document_number'], d)
            )
            continue

        # Backfill the PACER doc ID when we didn't have one.
        # NOTE(review): rd is mutated here but no rd.save() is visible on
        # this path — the pacer_doc_id update appears to be lost unless it
        # is persisted elsewhere; verify.
        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)
    return d
class PacerDocketReportTest(unittest.TestCase):
    """A variety of tests for the docket report"""

    def setUp(self):
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        self.report = DocketReport('cand', session)
        self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The count of the number of rows.
        """
        parsed = get_html_parsed_text(html)
        # Subtract one so the header row isn't counted.
        return len(parsed.xpath('//table[./tr/td[3]]/tr')) - 1

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        # A bare query should at least contain the case name.
        self.report.query(self.pacer_case_id)
        self.assertIn('Foley v. Bates', self.report.response.text,
                      msg="Super basic query failed")

        # Filtering by start date only.
        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1))
        n = self._count_rows(self.report.response.text)
        self.assertEqual(
            2, n,
            msg="Didn't get expected number of rows when filtering by "
                "start date. Got %s." % n)

        # Filtering by a start and an end date.
        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1),
                          date_end=date(2007, 11, 28))
        n = self._count_rows(self.report.response.text)
        self.assertEqual(
            1, n,
            msg="Didn't get expected number of rows when filtering by "
                "start and end dates. Got %s." % n)

        # Filtering by document number.
        self.report.query(self.pacer_case_id, doc_num_start=5, doc_num_end=5)
        n = self._count_rows(self.report.response.text)
        self.assertEqual(
            1, n,
            msg="Didn't get expected number of rows when filtering by doc "
                "number. Got %s" % n)

        # Filtering by date range with the "Entered" range type.
        self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1),
                          date_end=date(2007, 11, 28),
                          date_range_type="Entered")
        n = self._count_rows(self.report.response.text)
        self.assertEqual(
            1, n,
            msg="Didn't get expected number of rows when filtering by "
                "start and end dates and date_range_type of Entered. "
                "Got %s" % n)

        # Party info should appear only when requested.
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=True)
        self.assertIn(
            'Cheema', self.report.response.text,
            msg="Didn't find party info when it was explicitly requested.")
        self.report.query(self.pacer_case_id, doc_num_start=500,
                          show_parties_and_counsel=False)
        self.assertNotIn(
            'Cheema', self.report.response.text,
            msg="Got party info but it was not requested.")

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        # Query the first one...
        self.report.query(self.pacer_case_id)
        first_data = self.report.data.copy()

        # Then the second one...
        second_pacer_case_id = '63111'  # 1:07-cv-00035-RJA-HKS Anson v. USA
        self.report.query(second_pacer_case_id)
        second_data = self.report.data.copy()

        self.assertNotEqual(
            first_data, second_data,
            msg="Got same values for docket data of two different queries. "
                "Is there a problem with the caches on the DocketReport?")
class PacerDocketReportTest(unittest.TestCase):
    """A variety of tests for the docket report"""

    def setUp(self):
        self.session = get_pacer_session()
        self.session.login()
        self.report = DocketReport("cand", self.session)
        self.pacer_case_id = "186730"  # 4:06-cv-07294 Foley v. Bates

    @staticmethod
    def _count_rows(html):
        """Count the rows in the docket report.

        :param html: The HTML of the docket report.
        :return: The count of the number of rows.
        """
        tree = get_html_parsed_text(html)
        # One row of the matched table is the header; don't count it.
        return len(tree.xpath("//table[./tr/td[3]]/tr")) - 1

    def _query_row_count(self, **query_kwargs):
        """Run a docket query and return the resulting row count."""
        self.report.query(self.pacer_case_id, **query_kwargs)
        return self._count_rows(self.report.response.text)

    @SKIP_IF_NO_PACER_LOGIN
    def test_queries(self):
        """Do a variety of queries work?"""
        self.report.query(self.pacer_case_id)
        self.assertIn(
            "Foley v. Bates",
            self.report.response.text,
            msg="Super basic query failed",
        )

        n_rows = self._query_row_count(date_start=date(2007, 11, 1))
        self.assertEqual(
            2,
            n_rows,
            msg="Didn't get expected number of rows when filtering by "
            "start date. Got %s." % n_rows,
        )

        n_rows = self._query_row_count(
            date_start=date(2007, 11, 1),
            date_end=date(2007, 11, 28),
        )
        self.assertEqual(
            1,
            n_rows,
            msg="Didn't get expected number of rows when filtering by "
            "start and end dates. Got %s." % n_rows,
        )

        n_rows = self._query_row_count(doc_num_start=5, doc_num_end=5)
        self.assertEqual(
            1,
            n_rows,
            msg="Didn't get expected number of rows when filtering by doc "
            "number. Got %s" % n_rows,
        )

        n_rows = self._query_row_count(
            date_start=date(2007, 11, 1),
            date_end=date(2007, 11, 28),
            date_range_type="Entered",
        )
        self.assertEqual(
            1,
            n_rows,
            msg="Didn't get expected number of rows when filtering by "
            "start and end dates and date_range_type of Entered. "
            "Got %s" % n_rows,
        )

        # Party info appears only when explicitly requested.
        self.report.query(
            self.pacer_case_id,
            doc_num_start=500,
            show_parties_and_counsel=True,
        )
        self.assertIn(
            "Cheema",
            self.report.response.text,
            msg="Didn't find party info when it was explicitly requested.",
        )
        self.report.query(
            self.pacer_case_id,
            doc_num_start=500,
            show_parties_and_counsel=False,
        )
        self.assertNotIn(
            "Cheema",
            self.report.response.text,
            msg="Got party info but it was not requested.",
        )

    @SKIP_IF_NO_PACER_LOGIN
    def test_using_same_report_twice(self):
        """Do the caches get properly nuked between runs?

        See issue #187.
        """
        # Query the first one...
        self.report.query(self.pacer_case_id)
        first_data = self.report.data.copy()

        # Then the second one...
        second_pacer_case_id = "63111"  # 1:07-cv-00035-RJA-HKS Anson v. USA
        self.report.query(second_pacer_case_id)
        second_data = self.report.data.copy()

        self.assertNotEqual(
            first_data,
            second_data,
            msg="Got same values for docket data of two different queries. "
            "Is there a problem with the caches on the DocketReport?",
        )