def load_bill_text_xml(docinfo):
    """Return the bill text for *docinfo* as a parsed document tree.

    Prefers XML when available, pre-rendering it into HTML via congressxml.
    Falls back to the legacy HTML that was scraped from THOMAS. Raises
    IOError when neither form of text exists.
    """
    if "xml_file" in docinfo:
        # Pre-render the XML into an HTML tree.
        import congressxml
        return congressxml.convert_xml(docinfo["xml_file"])

    if "html_file" in docinfo:
        # Legacy HTML scraped from THOMAS.
        return lxml.etree.parse(docinfo["html_file"])

    raise IOError("Bill text is not available for one of the bills.")
def load_bill_text(bill, version, plain_text=False, mods_only=False, with_citations=False):
    """Load bill text info from the Congress project data directory.

    Returns a dict with bill metadata and (unless mods_only) the rendered
    text. When plain_text is True, returns the plain text as a string
    instead ("" when no text is available at all).

    Raises IOError when no text metadata exists and plain_text is False.
    """
    dat = get_bill_text_metadata(bill, version)
    if not dat:
        # No text is available.
        if plain_text:
            return ""  # for indexing, just return empty string if no text is available
        raise IOError("Bill text is not available for this bill.")

    ret = {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "has_displayable_text": dat.get("has_displayable_text"),
    }

    # Load basic metadata from a MODS file if one exists. Otherwise fall back
    # on the text-versions data.json file -- we may have that for historical
    # bills that we don't have a MODS file for.
    if "mods_file" in dat:
        ret.update(load_bill_mods_metadata(dat["mods_file"]))
    else:
        gpo_url = dat["urls"]["pdf"]
        m = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", gpo_url)
        if m:
            # TODO (but not needed right now): Docs from the BILLS collection.
            gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

        ret.update({
            "docdate": dat["issued_on"],
            "gpo_url": gpo_url,
            "gpo_pdf_url": dat["urls"]["pdf"],
            "doc_version": dat["version_code"],
            "doc_version_name": get_gpo_status_code_name(dat["version_code"]),
        })

    # Pass through some fields.
    for f in ('html_file', 'xml_file', 'pdf_file', 'has_thumbnail', 'thumbnail_path'):
        if f in dat:
            ret[f] = dat[f]

    if with_citations:  # and not settings.DEBUG:
        load_citation_info(ret)

    # If the caller only wants metadata, return it.
    if mods_only:
        return ret

    if "xml_file" in dat and not plain_text:
        # convert XML on the fly to HTML
        import lxml.html, congressxml
        ret.update({
            "text_html": lxml.html.tostring(congressxml.convert_xml(dat["xml_file"]), encoding=str),
            "source": dat.get("xml_file_source"),
        })

    elif "html_file" in dat and not plain_text:
        # This will be for bills around the 103rd-108th Congresses when
        # bill text is available from GPO but not in XML.
        with open(dat["html_file"]) as f:  # context manager so the handle is closed
            ret.update({
                "text_html": f.read(),
            })

    elif "text_file" in dat:
        # bill text from the Statutes at Large, or when plain_text is True then from GPO
        with open(dat["text_file"]) as f:
            bill_text_content = f.read()

        # In the GPO BILLS collection, there's gunk at the top and bottom that we'd
        # rather just remove: metadata in brackets at the top, and <all> at the end.
        # We remove it because it's not really useful when indexing.
        if bill_text_content:
            bill_text_content = re.sub(r"^\s*(\[[^\n]+\]\s*)*", "", bill_text_content)
            bill_text_content = re.sub(r"\s*<all>\s*$", "", bill_text_content)

        # Caller just wants the plain text?
        if plain_text:
            # replace form feeds (OCR'd layers only) with an indication of the page break
            return bill_text_content.replace(
                "\u000C", "\n=============================================\n")

        # Return the text wrapped in <pre>, and replace form feeds with an <hr>.
        # cgi.escape was removed in Python 3.8; html.escape with quote=False is
        # the exact drop-in replacement (same &/</> escaping, quotes untouched).
        import html
        bill_text_content = "<pre>" + html.escape(bill_text_content, quote=False) + "</pre>"
        bill_text_content = bill_text_content.replace("\u000C", "<hr>")  # (OCR'd layers only)

        ret.update({
            "text_html": bill_text_content,
            "source": dat.get("text_file_source"),
        })

    return ret
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load bill text info from the Congress project data directory.

    We have JSON files for metadata and plain text files mirrored from GPO
    containing bill text (either from the Statutes at Large OCR'ed text
    layers, or from GPO FDSys's BILLS collection).
    """
    meta = get_bill_text_metadata(bill, version)
    if not meta:
        # No text is available.
        if plain_text:
            return ""  # for indexing, just return empty string if no text is available
        raise IOError("Bill text is not available for this bill.")

    info = {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "has_displayable_text": meta.get("has_displayable_text"),
    }

    if "mods_file" in meta:
        # Basic metadata comes from the MODS file when one exists.
        info.update(load_bill_mods_metadata(meta["mods_file"]))
    else:
        # Otherwise fall back on the text-versions data.json file. We may
        # have this for historical bills that lack a MODS file.
        pdf_url = meta["urls"]["pdf"]
        gpo_url = pdf_url
        statute = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", pdf_url)
        if statute:
            # TODO (but not needed right now): Docs from the BILLS collection.
            gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % statute.groups()

        info.update({
            "docdate": datetime.date(*(int(part) for part in meta["issued_on"].split("-"))),
            "gpo_url": gpo_url,
            "gpo_pdf_url": pdf_url,
            "doc_version": meta["version_code"],
            "doc_version_name": get_gpo_status_code_name(meta["version_code"]),
        })

    # Pass through some fields.
    for key in ('html_file', 'thumbnail_path'):
        if key in meta:
            info[key] = meta[key]

    # If the caller only wants metadata, return it.
    if mods_only:
        return info

    if "xml_file" in meta and not plain_text:
        # convert XML on the fly to HTML
        import lxml.html, congressxml
        info["text_html"] = lxml.html.tostring(congressxml.convert_xml(meta["xml_file"]))
        info["source"] = meta.get("xml_file_source")
    elif "html_file" in meta and not plain_text:
        # This will be for bills around the 103rd-108th Congresses when
        # bill text is available from GPO but not in XML.
        info["text_html"] = open(meta["html_file"]).read().decode("utf8")
    elif "text_file" in meta:
        # bill text from the Statutes at Large, or when plain_text is True then from GPO
        text = open(meta["text_file"]).read().decode("utf8")

        # In the GPO BILLS collection, there's gunk at the top and bottom we'd
        # rather remove: bracketed metadata at the top and <all> at the end --
        # neither is useful when indexing.
        if text:
            text = re.sub(r"^\s*(\[[^\n]+\]\s*)*", "", text)
            text = re.sub(r"\s*<all>\s*$", "", text)

        # Caller just wants the plain text?
        if plain_text:
            # replace form feeds (OCR'd layers only) with an indication of the page break
            return text.replace(u"\u000C", "\n=============================================\n")

        # Wrap the text in <pre>, turning form feeds (OCR'd layers only) into <hr>.
        import cgi
        text = "<pre>" + cgi.escape(text) + "</pre>"
        text = text.replace(u"\u000C", "<hr>")

        info["text_html"] = text
        info["source"] = meta.get("text_file_source")

    return info