def parse(responses):
    """Parse out useful information from a list of responses.

    :param responses: A list of HTTP response objects from the written
        opinions report, one per court page fetched.
    :return: A list of FreeOpinionRow objects, one per result row found
        across all responses.
    """
    results = []
    court_id = "Court not yet set."
    for response in responses:
        response.raise_for_status()
        court_id = get_court_id_from_url(response.url)
        set_response_encoding(response)
        text = clean_html(response.text)
        tree = get_html_parsed_text(text)
        tree.rewrite_links(fix_links_in_lxml_tree, base_href=response.url)
        opinion_count = int(
            tree.xpath('//b[contains(text(), "Total number of '
                       'opinions reported")]')[0].tail)
        if opinion_count == 0:
            continue
        rows = tree.xpath('(//table)[1]//tr[position() > 1]')
        for row in rows:
            # Pass the previous result (if any) so each row can inherit
            # values from the one before it.
            previous = results[-1] if results else {}
            results.append(FreeOpinionRow(row, previous, court_id))
    # Lazy %-style args: the message is only formatted when INFO is enabled.
    logger.info("Parsed %s results from written opinions report at %s",
                len(results), court_id)
    return results
def _count_rows(html):
    """Count the rows in the docket report.

    :param html: The HTML of the docket report.
    :return: The count of the number of rows.
    """
    tree = get_html_parsed_text(html)
    row_count = len(tree.xpath('//table[./tr/td[3]]/tr'))
    # One of the matched rows is the header; don't count it.
    return row_count - 1
def _make_html_tree(self, text):
    """Hook for custom HTML parsers.

    By default the etree.html parser is used, but subclasses may override
    this method to plug in another parser -- the html5parser or even
    BeautifulSoup, if it's called for (example: return
    get_html5_parsed_text(text)) -- or to run custom parsing logic.
    """
    parsed = get_html_parsed_text(text)
    return parsed
def download_pdf(self, pacer_case_id, pacer_document_number):
    """Download a PDF from PACER.

    Note that this doesn't support attachments yet.

    :param pacer_case_id: The PACER-internal id of the case.
    :param pacer_document_number: The document number to download.
    :return: The response object containing PDF data, or None when the
        PDF could not be retrieved.
    """
    timeout = (60, 300)
    url = make_doc1_url(self.court_id, pacer_document_number, True)
    data = {
        'caseid': pacer_case_id,
        'got_receipt': '1',
    }
    # Lazy %-args: only formatted when INFO is enabled.
    logger.info("GETting PDF at URL: %s with params: %s", url, data)
    r = self.session.get(url, params=data, timeout=timeout)
    r.raise_for_status()

    # The request above sometimes generates an HTML page with an iframe
    # containing the PDF, and other times returns the PDF directly. Our
    # task is thus to either get the src of the iframe and download the
    # PDF or just return the PDF.
    if is_pdf(r):
        # Bug fix: the original passed (url, data) here, logging the URL
        # as the "case" and the params dict as the location.
        logger.info('Got PDF binary data for case %s at: %s',
                    pacer_case_id, url)
        return r

    text = clean_html(r.text)
    tree = get_html_parsed_text(text)
    tree.rewrite_links(fix_links_in_lxml_tree, base_href=r.url)
    try:
        iframe_src = tree.xpath('//iframe/@src')[0]
    except IndexError:
        if 'pdf:Producer' in text:
            # The PDF bytes were embedded directly in the HTML; we can't
            # recover a usable document from that.
            logger.error("Unable to download PDF. PDF content was placed "
                         "directly in HTML. URL: %s, caseid: %s",
                         url, pacer_case_id)
        else:
            logger.error("Unable to download PDF. PDF not served as binary "
                         "data and unable to find iframe src attribute. "
                         "URL: %s, caseid: %s", url, pacer_case_id)
        return None

    r = self.session.get(iframe_src, timeout=timeout)
    if is_pdf(r):
        logger.info('Got iframed PDF data for case %s at: %s',
                    pacer_case_id, iframe_src)
        return r
    # Iframe fetch didn't yield a PDF either; make the failure explicit.
    return None
def _get_subpage_html_by_page(self, page):
    """Extract and parse the HTML stored in the page's PostContent textarea."""
    textarea = page.xpath(".//textarea[@id='PostContent']")[0]
    return get_html_parsed_text(textarea.text_content())
def get_page(self):
    """Return the current browser page as a parsed lxml tree.

    NOTE(review): the original annotation said ``-> WebElement``, but the
    value returned is the lxml tree produced by get_html_parsed_text(),
    not a Selenium WebElement, so the misleading annotation was removed.

    :return: The parsed HTML tree, with links rewritten relative to
        self.url (anchors preserved).
    """
    text = clean_html(self.webdriver.page_source)
    html = get_html_parsed_text(text)
    html.rewrite_links(fix_links_but_keep_anchors, base_href=self.url)
    return html