def test_logging_short_username(self):
    """A username shorter than six characters must be rejected with a
    PacerLoginException."""
    bad_session = PacerSession(username='******', password='******')
    with self.assertRaises(PacerLoginException):
        bad_session.login()
def get_pacer_dockets(options, docket_pks, tags):
    """Get the PACER dockets identified by the FJC IDB rows.

    :param options: dict with 'queue', 'offset' and 'limit' keys controlling
        where tasks are queued and which slice of docket_pks is processed.
    :param docket_pks: iterable of Docket primary keys to fetch.
    :param tags: tag names to apply to the downloaded content.
    :return: None
    """
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    pacer_session = None
    for i, docket_pk in enumerate(docket_pks):
        if i < options['offset']:
            continue
        # Chained comparison: stop once i reaches a positive limit; a
        # limit of 0 means "no limit".
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        # Log in on the first processed item, then refresh the session
        # every 1,000 items so the PACER cookies don't expire mid-run.
        # (None check first so we never take i % 1000 on a skipped start.)
        if pacer_session is None or i % 1000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
            # Lazy %-args defer formatting until the record is emitted.
            logger.info("Sent %s tasks to celery so far.", i)
        d = Docket.objects.get(pk=docket_pk)
        chain(
            get_docket_by_pacer_case_id.s(
                {'pacer_case_id': d.pacer_case_id,
                 'docket_pk': d.pk},
                d.court_id,
                cookies=pacer_session.cookies,
                tag_names=tags,
                **{'show_parties_and_counsel': True,
                   'show_terminated_parties': True,
                   'show_list_of_member_cases': False}
            ).set(queue=q),
            add_or_update_recap_docket.s().set(queue=q),
        ).apply_async()
def test_logging_short_password(self):
    """Logging in with a too-short password must raise a
    PacerLoginException."""
    short_pw_session = PacerSession(username='******', password='******')
    with self.assertRaises(PacerLoginException):
        short_pw_session.login()
def test_logging_into_test_site(self):
    """Check that the PACER training site accepts the test credentials and
    sets the expected session cookie."""
    try:
        training_session = PacerSession(username='******',
                                        password='******')
        training_session.login_training()
    except PacerLoginException:
        self.fail('Could not log into PACER test site!')
    # Assertions raise AssertionError, not PacerLoginException, so they can
    # live outside the try block without changing behavior.
    self.assertIsNotNone(training_session)
    self.assertIsNotNone(training_session.cookies.get(
        'PacerSession', None, domain='.uscourts.gov', path='/'))
def test_logging_into_pacer(self):
    """Check that the live PACER site accepts the configured credentials
    and sets the expected session cookie."""
    try:
        live_session = PacerSession(username=PACER_USERNAME,
                                    password=PACER_PASSWORD)
        live_session.login()
    except PacerLoginException:
        self.fail('Could not log into PACER')
    # Assertions raise AssertionError, not PacerLoginException, so they can
    # live outside the try block without changing behavior.
    self.assertIsNotNone(live_session)
    self.assertIsNotNone(live_session.cookies.get(
        'PacerSession', None, domain='.uscourts.gov', path='/'))
def setUp(self):
    """Load court and date fixtures and build one FreeOpinionReport per
    court, sharing a single PACER session."""
    session = PacerSession()
    if PACER_USERNAME and PACER_PASSWORD:
        # CAND chosen at random
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
    courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
    with open(courts_path) as fh:
        self.courts = get_courts_from_json(json.load(fh))
    dates_path = os.path.join(TESTS_ROOT,
                              'fixtures/valid_free_opinion_dates.json')
    with open(dates_path) as fh:
        self.valid_dates = json.load(fh)
    self.reports = {}
    for court in self.courts:
        cid = get_court_id_from_url(court['court_link'])
        self.reports[cid] = FreeOpinionReport(cid, session)
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument.

    In this function, we iterate over the entire table of results, merge it
    into our normal tables, and then download and extract the PDF.

    :param options: dict with 'queue' (celery queue name) and 'index'
        (truthy when items should also be sent to Solr) keys.
    :return: None
    """
    q = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    # Lazy %-args defer string formatting until the record is emitted.
    logger.info("%s %s items from PACER.", task_name, count)
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        # Log in on the first item (completed == 0), then refresh the
        # session every 30,000 items so the cookies don't expire mid-run.
        if completed % 30000 == 0:
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies,
                                  row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            c |= add_items_to_solr.s('search.RECAPDocument').set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so far.",
                        completed, count, task_name)
def setUp(self):
    """Log into PACER and point a DocketReport at the N.D. Cal. court."""
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    self.report = DocketReport('cand', session)
    self.pacer_case_id = '186730'  # 4:06-cv-07294 Foley v. Bates
def setUp(self):
    # Build a session with the configured credentials; login() is not
    # called here, so individual tests decide when to authenticate.
    self.session = PacerSession(username=PACER_USERNAME,
                                password=PACER_PASSWORD)
def setUpClass(cls):
    # Only build the session and report when credentials are configured;
    # otherwise cls.pacer_session / cls.report remain unset.
    # NOTE(review): login() is never called here -- presumably the session
    # authenticates on demand; confirm against PacerSession's auto-login
    # behavior.
    if PACER_USERNAME and PACER_PASSWORD:
        cls.pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
        cls.report = ShowCaseDocApi('dcd', cls.pacer_session)
class PacerSessionTest(unittest.TestCase):
    """Test the PacerSession wrapper class"""

    def setUp(self):
        self.session = PacerSession()

    def test_data_transformation(self):
        """The multipart transform should wrap every value in a
        (None, value) tuple, as PACER's form handler expects."""
        payload = {'case_id': 123, 'case_type': 'something'}
        want = {'case_id': (None, 123), 'case_type': (None, 'something')}
        got = self.session._prepare_multipart_form_data(payload)
        self.assertEqual(got, want)

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_ignores_non_data_posts(self, mock_post):
        """POSTs made via the files= parameter must pass through unchanged.

        :param mock_post: mocked Session.post method
        """
        files = {'name': ('filename', 'junk')}
        self.session.post('https://free.law', files=files)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertEqual(files, mock_post.call_args[1]['files'],
                         'the data should not be changed if using a files call')

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_transforms_data_on_post(self, mock_post):
        """POSTs made via data= must be rewritten into PACER's multi-part
        form shape and sent through the files= parameter instead.

        :param mock_post: mocked Session.post method
        """
        payload = {'name': 'dave', 'age': 33}
        want = {'name': (None, 'dave'), 'age': (None, 33)}
        self.session.post('https://free.law', data=payload)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertNotIn('data', mock_post.call_args[1],
                         'we should intercept data arguments')
        self.assertEqual(want, mock_post.call_args[1]['files'],
                         'we should transform and populate the files argument')

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_sets_default_timeout(self, mock_post):
        """Every POST should carry the 300-second default timeout."""
        self.session.post('https://free.law', data={})
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertIn('timeout', mock_post.call_args[1],
                      'we should add a default timeout automatically')
        self.assertEqual(300, mock_post.call_args[1]['timeout'],
                         'default should be 300')
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later.

    :param options: command options (unused in the visible body).
    :return: None
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    twelve_hrs_ago = now() - timedelta(hours=12)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=twelve_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(
        status=PACERFreeDocumentLog.SCRAPE_FAILED,
    )
    # Per-court scheduling state: 'until' is the earliest time we may poll
    # the court again, 'count' drives the backoff, and 'result' holds the
    # AsyncResult of the in-flight celery chain (None until one is queued).
    # The excluded pks are hard-coded problem courts.
    pacer_court_ids = {
        map_cl_to_pacer_id(v): {'until': now(), 'count': 1, 'result': None}
        for v in Court.objects.filter(
            jurisdiction__in=['FD', 'FB'],
            in_use=True,
            end_date=None,
        ).exclude(
            pk__in=['casb', 'ganb', 'gub', 'innb', 'mieb', 'miwb', 'nmib',
                    'nvb', 'ohsb', 'prb', 'tnwb', 'vib']
        ).values_list(
            'pk', flat=True
        )
    }
    pacer_session = PacerSession(username=PACER_USERNAME,
                                 password=PACER_PASSWORD)
    pacer_session.login()

    # Iterate over every court, X days at a time. As courts are completed,
    # remove them from the list of courts to process until none are left
    tomorrow = now() + timedelta(days=1)
    while len(pacer_court_ids) > 0:
        # Iterate over a snapshot so courts can be popped from the real
        # dict while looping.
        court_ids_copy = pacer_court_ids.copy()  # Make a copy of the list.
        for pacer_court_id, delay in court_ids_copy.items():
            if now() < delay['until']:
                # Do other courts until the delay is up. Do not print/log
                # anything since at the end there will only be one court left.
                continue
            next_start_date, next_end_date = get_next_date_range(
                pacer_court_id)
            if delay['result'] is not None:
                # A chain is already queued for this court; inspect it
                # before scheduling another.
                if delay['result'].ready():
                    result = delay['result'].get()
                    if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                        if next_start_date >= tomorrow.date():
                            # Caught up to the present: this court is done.
                            logger.info("Finished '%s'. Marking it "
                                        "complete." % pacer_court_id)
                            pacer_court_ids.pop(pacer_court_id, None)
                            continue
                    elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                        # Hard failure: drop the court rather than retry.
                        logger.error("Encountered critical error on %s "
                                     "(network error?). Marking as failed and "
                                     "pressing on."
                                     % pacer_court_id)
                        pacer_court_ids.pop(pacer_court_id, None)
                        continue
                else:
                    # Still running: back off linearly (5s per attempt),
                    # capped at 30s, before checking this court again.
                    next_delay = min(delay['count'] * 5, 30)  # backoff w/cap
                    logger.info("Court %s still in progress. Delaying at "
                                "least %ss." % (pacer_court_id, next_delay))
                    pacer_court_ids[pacer_court_id]['until'] = \
                        now() + timedelta(seconds=next_delay)
                    pacer_court_ids[pacer_court_id]['count'] += 1
                    continue
            # Either no chain has run yet or the last one succeeded with
            # more dates remaining: queue the next date range.
            mark_court_in_progress(pacer_court_id, next_end_date)
            pacer_court_ids[pacer_court_id]['count'] = 1  # Reset
            delay['result'] = chain(
                get_and_save_free_document_report.si(
                    pacer_court_id,
                    next_start_date,
                    next_end_date,
                    pacer_session
                ),
                mark_court_done_on_date.s(pacer_court_id, next_end_date),
            ).apply_async()
def get_pacer_session():
    """Return a new PacerSession built from the configured PACER
    credentials. login() is not called here; callers authenticate when
    needed."""
    return PacerSession(username=PACER_USERNAME, password=PACER_PASSWORD)
#!/usr/bin/env python
#
# Takes an .html file on the command line, parses it using the PACER
# Docket Report parser, and outputs json to stdout.
import io
import sys

import jsondate3 as json

from juriscraper.pacer import DocketReport
from juriscraper.pacer.http import PacerSession

# Parsing local files never makes a request, so dummy credentials suffice.
pacer_session = PacerSession(username="******", password="******")
report = DocketReport("psc", pacer_session)

for path in sys.argv[1:]:
    # Bug fix: the original used the Python 2 `print` statement and called
    # .decode("utf-8") on the result of a text-mode read, but jsondate3 is
    # a Python 3 library -- the script could not run under Python 3.
    # io.open with an explicit encoding yields unicode text on both 2 and 3.
    with io.open(path, "r", encoding="utf-8") as f:
        report._parse_text(f.read())
    data = report.data
    print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": ")))
def setUpClass(cls):
    """Point a DocketReport at the PACER training court ('psc')."""
    training_session = PacerSession(username='******', password='******')
    cls.report = DocketReport('psc', training_session)
    cls.pacer_case_id = '62866'  # 1:07-cr-00001-RJA-HKS USA v. Green
def setUp(self):
    # A session with no credentials supplied; suitable for tests that
    # exercise request/data handling without authenticating.
    self.session = PacerSession()
def test_logging_in_bad_credentials(self):
    """Bogus credentials must be rejected with a PacerLoginException."""
    # Make sure password is more than eight characters.
    bogus_session = PacerSession(username='******', password='******')
    with self.assertRaises(PacerLoginException):
        bogus_session.login()
def fetch(ctx, overwrite=False):
    """Download ILND criminal case reports for a list of gun-charge
    citations and save each result as a TSV under data/.

    :param ctx: task context (unused in the body; required by the caller).
    :param overwrite: when True, re-download files that already exist.
    :return: None
    """
    print('fetch')
    session = PacerSession(username=os.environ.get('PACER_USERNAME'),
                           password=os.environ.get('PACER_PASSWORD'))
    today = date.today().strftime('%m/%d/%Y')
    citations = [
        '18:922A.F',
        '18:922C.F',
        '18:922E.F',
        '18:922G.F',
        '18:924A.F',
        '18:924C.F',
    ]
    for citation in citations:
        outputfile = 'data/{0}.tsv'.format(citation)
        # Guard clause: skip work when the file exists and we may not
        # overwrite it.
        if not overwrite and os.path.exists(outputfile):
            print('skipped {0}'.format(citation))
            continue
        body = {
            "office": (None, ""),
            "case_type": (None, ""),
            "case_flags": (None, ""),
            "citation": (None, citation),
            "pending_citations": (None, "1"),
            "terminated_citations": (None, "1"),
            "cvbcases": (None, "No"),
            "filed_from": (None, "1/1/2007"),
            "filed_to": (None, today),
            "terminal_digit": (None, ""),
            "pending_defendants": (None, "on"),
            "terminated_defendants": (None, "on"),
            "fugitive_defendants": (None, ""),
            "nonfugitive_defendants": (None, "1"),
            "reportable_cases": (None, "1"),
            "non_reportable_cases": (None, "1"),
            "sort1": (None, "case number"),
            "sort2": (None, ""),
            "sort3": (None, ""),
            "format": (None, "data")
        }
        # Bug fix: the original called .format(randint(...)) on this URL,
        # but the string contains no '{}' placeholder, so the random value
        # was silently discarded. The literal URL below is exactly what
        # was sent; the no-op call is removed.
        intermediate_resp = session.post(
            'https://ecf.ilnd.uscourts.gov/cgi-bin/CrCaseFiled-Rpt.pl?1-L_1_0-1',
            files=body)
        intermediate_doc = BeautifulSoup(intermediate_resp.content, 'lxml')
        # The first response is a redirect form; POST to its action URL to
        # get the actual report.
        form = intermediate_doc.find('form')
        action = form.attrs.get('action')
        action_path = action.split('/')[-1]
        url = 'https://ecf.ilnd.uscourts.gov/cgi-bin/' + action_path
        resp = session.post(url)
        print('-' * 50)
        print(citation)
        print('-' * 50)
        print(resp.content)
        # Bug fix: resp.content is bytes; write in binary mode so this
        # works on Python 3 (text mode raised TypeError), and use `with`
        # so the file is always closed.
        with open(outputfile, 'wb') as f:
            f.write(resp.content)
def get_and_save_free_document_reports(options):
    """Query the Free Doc Reports on PACER and get a list of all the free
    documents. Do not download those items, as that step is done later. For
    now just get the list.

    Note that this uses synchronous celery chains. A previous version was
    more complex and did not use synchronous chains. Unfortunately in Celery
    4.2.0, or more accurately in redis-py 3.x.x, doing it that way failed
    nearly every time.

    This is a simpler version, though a slower one, but it should get the
    job done.

    :param options: command options (unused in the visible body).
    :return: None
    """
    # Kill any *old* logs that report they're in progress. (They've failed.)
    three_hrs_ago = now() - timedelta(hours=3)
    PACERFreeDocumentLog.objects.filter(
        date_started__lt=three_hrs_ago,
        status=PACERFreeDocumentLog.SCRAPE_IN_PROGRESS,
    ).update(status=PACERFreeDocumentLog.SCRAPE_FAILED,)

    # All in-use federal district and bankruptcy courts, minus a hard-coded
    # list of problem courts.
    cl_court_ids = (
        Court.objects.filter(
            jurisdiction__in=[
                Court.FEDERAL_DISTRICT,
                Court.FEDERAL_BANKRUPTCY,
            ],
            in_use=True,
            end_date=None,
        )
        .exclude(pk__in=["casb", "gub", "innb", "miwb", "ohsb", "prb"],)
        .values_list("pk", flat=True,)
    )
    pacer_court_ids = [map_cl_to_pacer_id(v) for v in cl_court_ids]
    pacer_session = PacerSession(
        username=PACER_USERNAME, password=PACER_PASSWORD
    )
    pacer_session.login()
    today = now()
    for pacer_court_id in pacer_court_ids:
        # Walk this court forward one date range at a time until we reach
        # today or hit an error; each iteration runs synchronously.
        while True:
            next_start_d, next_end_d = get_next_date_range(pacer_court_id)
            logger.info(
                "Attempting to get latest document references for "
                "%s between %s and %s",
                pacer_court_id,
                next_start_d,
                next_end_d,
            )
            mark_court_in_progress(pacer_court_id, next_end_d)
            try:
                status = get_and_save_free_document_report(
                    pacer_court_id, next_start_d, next_end_d,
                    pacer_session.cookies,
                )
            except RequestException:
                # Network failure: mark the range failed and move on to the
                # next court.
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to network error.",
                    pacer_court_id,
                    next_start_d,
                    next_end_d,
                )
                mark_court_done_on_date(
                    PACERFreeDocumentLog.SCRAPE_FAILED,
                    pacer_court_id,
                    next_end_d,
                )
                break
            except IndexError:
                # Parsing failure attributed to a PACER 6.3 bug: treat like
                # a scrape failure and move on to the next court.
                logger.error(
                    "Failed to get document references for %s "
                    "between %s and %s due to PACER 6.3 bug.",
                    pacer_court_id,
                    next_start_d,
                    next_end_d,
                )
                mark_court_done_on_date(
                    PACERFreeDocumentLog.SCRAPE_FAILED,
                    pacer_court_id,
                    next_end_d,
                )
                break
            else:
                # Scrape completed; record the outcome and decide whether
                # to continue with the next date range.
                result = mark_court_done_on_date(
                    status, pacer_court_id, next_end_d
                )
                if result == PACERFreeDocumentLog.SCRAPE_SUCCESSFUL:
                    if next_end_d >= today.date():
                        logger.info(
                            "Got all document references for '%s'.",
                            pacer_court_id
                        )
                        # Break from while loop, onwards to next court
                        break
                    else:
                        # More dates to do; let it continue
                        continue
                elif result == PACERFreeDocumentLog.SCRAPE_FAILED:
                    logger.error(
                        "Encountered critical error on %s "
                        "(network error?). Marking as failed and "
                        "pressing on." % pacer_court_id
                    )
                    # Break from while loop, onwards to next court
                    break
class PacerSessionTest(unittest.TestCase):
    """ Test the PacerSession wrapper class """

    def setUp(self):
        # A session with real credentials; tests that must stay offline
        # pass auto_login=False on each request.
        self.session = PacerSession(username=PACER_USERNAME,
                                    password=PACER_PASSWORD)

    def test_data_transformation(self):
        """ Test our data transformation routine for building out PACER-compliant
        multi-part form data. Each value must be wrapped in a (None, value)
        tuple. """
        data = {'case_id': 123, 'case_type': 'something'}
        expected = {'case_id': (None, 123), 'case_type': (None, 'something')}
        output = self.session._prepare_multipart_form_data(data)
        self.assertEqual(output, expected)

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_ignores_non_data_posts(self, mock_post):
        """ Test that POSTs without a data parameter just pass through as
        normal.

        :param mock_post: mocked Session.post method
        """
        data = {'name': ('filename', 'junk')}
        # auto_login=False keeps this test offline.
        self.session.post('https://free.law', files=data, auto_login=False)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertEqual(data, mock_post.call_args[1]['files'],
                         'the data should not be changed if using a files call')

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_transforms_data_on_post(self, mock_post):
        """ Test that POSTs using the data parameter get transformed into
        PACER's delightfully odd multi-part form data.

        :param mock_post: mocked Session.post method
        """
        data = {'name': 'dave', 'age': 33}
        expected = {'name': (None, 'dave'), 'age': (None, 33)}
        self.session.post('https://free.law', data=data, auto_login=False)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertNotIn('data', mock_post.call_args[1],
                         'we should intercept data arguments')
        self.assertEqual(expected, mock_post.call_args[1]['files'],
                         'we should transform and populate the files argument')

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_sets_default_timeout(self, mock_post):
        """Every POST should carry the 300-second default timeout."""
        self.session.post('https://free.law', data={}, auto_login=False)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertIn('timeout', mock_post.call_args[1],
                      'we should add a default timeout automatically')
        self.assertEqual(300, mock_post.call_args[1]['timeout'],
                         'default should be 300')

    @mock.patch('juriscraper.pacer.http.PacerSession.login')
    @SKIP_IF_NO_PACER_LOGIN
    def test_auto_login(self, mock_login):
        """Do we automatically log in if needed?"""
        court_id = 'ksd'
        pacer_doc_id = '07902639735'
        url = make_doc1_url(court_id, pacer_doc_id, True)
        pacer_case_id = '81531'
        # This triggers an auto-login because we aren't logged in yet.
        self.session.username = PACER_USERNAME
        self.session.password = PACER_PASSWORD
        _ = self.session.get(url, params={
            'case_id': pacer_case_id,
            'got_receipt': '1',
        }, allow_redirects=True)
        self.assertTrue(mock_login.called,
                        'PacerSession.login() should be called.')
class PacerSessionTest(unittest.TestCase):
    """Test the PacerSession wrapper class"""

    def setUp(self):
        self.session = PacerSession(username=PACER_USERNAME,
                                    password=PACER_PASSWORD)

    def test_data_transformation(self):
        """The multipart transform should wrap every value in a
        (None, value) tuple, as PACER's form handler expects."""
        payload = {'case_id': 123, 'case_type': 'something'}
        want = {'case_id': (None, 123), 'case_type': (None, 'something')}
        got = self.session._prepare_multipart_form_data(payload)
        self.assertEqual(got, want)

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_ignores_non_data_posts(self, mock_post):
        """POSTs made via the files= parameter must pass through unchanged.

        :param mock_post: mocked Session.post method
        """
        files = {'name': ('filename', 'junk')}
        self.session.post('https://free.law', files=files, auto_login=False)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertEqual(
            files, mock_post.call_args[1]['files'],
            'the data should not be changed if using a files call')

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_transforms_data_on_post(self, mock_post):
        """POSTs made via data= must be rewritten into PACER's multi-part
        form shape and sent through the files= parameter instead.

        :param mock_post: mocked Session.post method
        """
        payload = {'name': 'dave', 'age': 33}
        want = {'name': (None, 'dave'), 'age': (None, 33)}
        self.session.post('https://free.law', data=payload, auto_login=False)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertNotIn('data', mock_post.call_args[1],
                         'we should intercept data arguments')
        self.assertEqual(
            want, mock_post.call_args[1]['files'],
            'we should transform and populate the files argument')

    @mock.patch('juriscraper.pacer.http.requests.Session.post')
    def test_sets_default_timeout(self, mock_post):
        """Every POST should carry the 300-second default timeout."""
        self.session.post('https://free.law', data={}, auto_login=False)
        self.assertTrue(mock_post.called,
                        'request.Session.post should be called')
        self.assertIn('timeout', mock_post.call_args[1],
                      'we should add a default timeout automatically')
        self.assertEqual(300, mock_post.call_args[1]['timeout'],
                         'default should be 300')

    @mock.patch('juriscraper.pacer.http.PacerSession.login')
    def test_auto_login(self, mock_login):
        """Do we automatically log in if needed?"""
        court_id = 'ksd'
        pacer_doc_id = '07902639735'
        url = make_doc1_url(court_id, pacer_doc_id, True)
        pacer_case_id = '81531'
        # We aren't logged in yet, so this GET should trigger an auto-login.
        self.session.username = PACER_USERNAME
        self.session.password = PACER_PASSWORD
        _ = self.session.get(url, params={
            'case_id': pacer_case_id,
            'got_receipt': '1',
        }, allow_redirects=True)
        self.assertTrue(mock_login.called,
                        'PacerSession.login() should be called.')