def parse(responses):
    """Parse the written-opinions report out of a list of HTTP responses.

    :param responses: iterable of requests ``Response`` objects, one per
        queried court page.
    :return: list of ``FreeOpinionRow`` objects, one per table row across
        all responses.
    :raises requests.HTTPError: if any response has an error status.
    """
    results = []
    court_id = "Court not yet set."
    for response in responses:
        response.raise_for_status()
        court_id = get_court_id_from_url(response.url)
        set_response_encoding(response)
        text = clean_html(response.text)
        tree = get_html_parsed_text(text)
        tree.rewrite_links(fix_links_in_lxml_tree, base_href=response.url)
        # The count lives in the tail text of a <b> label; int() tolerates
        # surrounding whitespace.
        opinion_count = int(
            tree.xpath('//b[contains(text(), "Total number of '
                       'opinions reported")]')[0].tail)
        if opinion_count == 0:
            continue
        # Skip the header row of the first table.
        rows = tree.xpath('(//table)[1]//tr[position() > 1]')
        for row in rows:
            # Each row gets the previous parsed row (or an empty dict for
            # the first), so values omitted on repeated rows can be
            # inherited from the row above.
            prior = results[-1] if results else {}
            results.append(FreeOpinionRow(row, prior, court_id))
    # Lazy %-style args: formatting is deferred until the record is emitted.
    logger.info("Parsed %s results from written opinions report at %s",
                len(results), court_id)
    return results
def test_extract_written_documents_report(self):
    """Do all the written reports work?"""
    for court in self.courts:
        # Appellate courts don't support this report.
        if court['type'] == "U.S. Courts of Appeals":
            continue
        court_id = get_court_id_from_url(court['court_link'])
        # Only query courts that have a known-good date fixture.
        if court_id not in self.valid_dates:
            continue
        results = []
        report = self.reports[court_id]
        some_date = convert_date_string(self.valid_dates[court_id])
        retry_count = 1
        max_retries = 5  # We'll try five times total
        # NOTE(review): ``results`` is never reassigned from ``report``
        # inside this loop, so the loop condition always stays truthy and
        # the else-branch below would hit IndexError at ``results[0]`` —
        # presumably a line like ``results = report.data`` is missing
        # after the query; confirm against the report API.
        while not results and retry_count <= max_retries:
            # This loop is sometimes needed to find a date with documents.
            # In general the valid dates json object should suffice,
            # however.
            if some_date > date.today():
                raise ValueError("Runaway date query for %s: %s" %
                                 (court_id, some_date))
            try:
                report.query(some_date, some_date, sort='case_number')
            except ConnectionError as e:
                # Redundant guard: the loop condition already guarantees
                # retry_count <= max_retries here.
                if retry_count <= max_retries:
                    print("%s. Trying again (%s of %s)" %
                          (e, retry_count, max_retries))
                    time.sleep(10)  # Give the server a moment of rest.
                    retry_count += 1
                    continue
                else:
                    print("%s: Repeated errors at this court." % e)
                    raise e
            if not report.responses:
                break  # Not a supported court.
            # Walk forward one day at a time looking for documents.
            some_date += timedelta(days=1)
        else:
            # While loop ended normally (without hitting break)
            for result in results:
                for k, v in result.items():
                    # These two fields are legitimately blank sometimes.
                    if k in ['nature_of_suit', 'cause']:
                        continue
                    self.assertIsNotNone(
                        v,
                        msg="Value of key %s is None in court %s" %
                            (k, court_id)
                    )
            # Can we download one item from each court?
            r = report.download_pdf(results[0]['pacer_case_id'],
                                    results[0]['pacer_doc_id'])
            if r is None:
                # Extremely messed up download.
                continue
            self.assertEqual(r.headers['Content-Type'], 'application/pdf')
def test_extract_written_documents_report(self):
    """Do all the written reports work?"""
    for court in self.courts:
        # Appellate courts don't support this report.
        if court["type"] == "U.S. Courts of Appeals":
            continue
        court_id = get_court_id_from_url(court["court_link"])
        # Only query courts that have a known-good date fixture.
        if court_id not in self.valid_dates:
            continue
        results = []
        report = self.reports[court_id]
        some_date = convert_date_string(self.valid_dates[court_id])
        retry_count = 1
        max_retries = 5  # We'll try five times total
        # NOTE(review): ``results`` is never reassigned from ``report``
        # inside this loop, so the loop condition always stays truthy and
        # the else-branch below would hit IndexError at ``results[0]`` —
        # presumably a line like ``results = report.data`` is missing
        # after the query; confirm against the report API.
        while not results and retry_count <= max_retries:
            # This loop is sometimes needed to find a date with documents.
            # In general the valid dates json object should suffice,
            # however.
            if some_date > date.today():
                raise ValueError("Runaway date query for %s: %s" % (court_id, some_date))
            try:
                report.query(some_date, some_date, sort="case_number")
            except ConnectionError as e:
                # Redundant guard: the loop condition already guarantees
                # retry_count <= max_retries here.
                if retry_count <= max_retries:
                    print("%s. Trying again (%s of %s)" % (e, retry_count, max_retries))
                    time.sleep(10)  # Give the server a moment of rest.
                    retry_count += 1
                    continue
                else:
                    print("%s: Repeated errors at this court." % e)
                    raise e
            if not report.responses:
                break  # Not a supported court.
            # Walk forward one day at a time looking for documents.
            some_date += timedelta(days=1)
        else:
            # While loop ended normally (without hitting break)
            for result in results:
                for k, v in result.items():
                    # These two fields are legitimately blank sometimes.
                    if k in ["nature_of_suit", "cause"]:
                        continue
                    self.assertIsNotNone(
                        v,
                        msg="Value of key %s is None in court %s" % (k, court_id),
                    )
            # Can we download one item from each court?
            r = report.download_pdf(results[0]["pacer_case_id"], results[0]["pacer_doc_id"])
            if r is None:
                # Extremely messed up download.
                continue
            self.assertEqual(r.headers["Content-Type"], "application/pdf")
def setUpClass(cls):
    """Build one FreeOpinionReport per court, logged in when possible."""
    session = PacerSession()
    if PACER_USERNAME and PACER_PASSWORD:
        # CAND chosen at random
        session = login('cand', PACER_USERNAME, PACER_PASSWORD)
    courts_path = os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')
    with open(courts_path) as fp:
        cls.courts = get_courts_from_json(json.load(fp))
    dates_path = os.path.join(TESTS_ROOT,
                              'fixtures/valid_free_opinion_dates.json')
    with open(dates_path) as fp:
        cls.valid_dates = json.load(fp)
    cls.reports = {}
    for court in cls.courts:
        cid = get_court_id_from_url(court['court_link'])
        cls.reports[cid] = FreeOpinionReport(cid, session)
def setUpClass(cls):
    """Build one FreeOpinionReport per court, authenticated when possible."""
    session = PacerSession()
    if PACER_USERNAME and PACER_PASSWORD:
        # CAND chosen at random
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
    with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as fp:
        cls.courts = get_courts_from_json(json.load(fp))
    dates_path = os.path.join(TESTS_ROOT,
                              'fixtures/valid_free_opinion_dates.json')
    with open(dates_path) as fp:
        cls.valid_dates = json.load(fp)
    cls.reports = {}
    for court in cls.courts:
        cid = get_court_id_from_url(court['court_link'])
        cls.reports[cid] = FreeOpinionReport(cid, session)
def setUp(self):
    """Build one FreeOpinionReport per court, logging in when credentials exist."""
    session = PacerSession()
    if pacer_credentials_are_defined():
        # CAND chosen at random
        session = get_pacer_session()
        session.login()
    with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as fp:
        self.courts = get_courts_from_json(json.load(fp))
    dates_path = os.path.join(TESTS_ROOT_EXAMPLES_PACER,
                              'dates/valid_free_opinion_dates.json')
    with open(dates_path) as fp:
        self.valid_dates = json.load(fp)
    self.reports = {}
    for court in self.courts:
        cid = get_court_id_from_url(court['court_link'])
        self.reports[cid] = FreeOpinionReport(cid, session)
def setUp(self):
    """Build one FreeOpinionReport per court, logging in when credentials exist."""
    session = PacerSession()
    if PACER_USERNAME and PACER_PASSWORD:
        # CAND chosen at random
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
    with open(os.path.join(JURISCRAPER_ROOT, 'pacer/courts.json')) as fp:
        self.courts = get_courts_from_json(json.load(fp))
    dates_path = os.path.join(TESTS_ROOT,
                              'fixtures/valid_free_opinion_dates.json')
    with open(dates_path) as fp:
        self.valid_dates = json.load(fp)
    self.reports = {}
    for court in self.courts:
        cid = get_court_id_from_url(court['court_link'])
        self.reports[cid] = FreeOpinionReport(cid, session)
def test_getting_court_id_from_url(self):
    """Does a PACER URL map to the expected court id?"""
    cases = (
        ('https://ecf.almd.uscourts.gov/cgi-bin/DktRpt.pl?56120', 'almd'),
    )
    for url, expected in cases:
        self.assertEqual(get_court_id_from_url(url), expected)
def test_getting_court_id_from_url(self):
    """Does a PACER URL map to the expected court id?"""
    url = 'https://ecf.almd.uscourts.gov/cgi-bin/DktRpt.pl?56120'
    expected = 'almd'
    self.assertEqual(get_court_id_from_url(url), expected)