Beispiel #1
0
 def load_bill_text_xml(docinfo):
     """Return a parsed document for the bill described by *docinfo*.

     Sources are tried in order of preference:

     * ``docinfo["xml_file"]`` -- handed to ``congressxml.convert_xml`` so
       the XML is pre-rendered into HTML.
     * ``docinfo["html_file"]`` -- the legacy HTML scraped from THOMAS,
       parsed with ``lxml.etree.parse``.

     Raises IOError when neither key is present in *docinfo*.
     """
     # Preferred source: XML text.
     if "xml_file" in docinfo:
         import congressxml
         return congressxml.convert_xml(docinfo["xml_file"])

     # Fallback source: legacy HTML.
     if "html_file" in docinfo:
         return lxml.etree.parse(docinfo["html_file"])

     # Nothing usable was supplied.
     raise IOError("Bill text is not available for one of the bills.")
Beispiel #2
0
 def load_bill_text_xml(docinfo):
     """Load one bill's text as a parsed document tree.

     *docinfo* is checked for an ``"xml_file"`` entry first; when present
     the XML is pre-rendered into HTML by ``congressxml.convert_xml``.
     Otherwise the ``"html_file"`` entry (legacy HTML scraped from THOMAS)
     is parsed with ``lxml.etree.parse``.

     Raises IOError if *docinfo* has neither entry.
     """
     has_xml = "xml_file" in docinfo

     # Bail out first when there is no source at all to read from.
     if not has_xml and "html_file" not in docinfo:
         raise IOError("Bill text is not available for one of the bills.")

     if has_xml:
         import congressxml
         return congressxml.convert_xml(docinfo["xml_file"])

     # Only the legacy HTML is left.
     return lxml.etree.parse(docinfo["html_file"])
Beispiel #3
0
def load_bill_text(bill,
                   version,
                   plain_text=False,
                   mods_only=False,
                   with_citations=False):
    """Load metadata and text for one version of a bill.

    The metadata comes from ``get_bill_text_metadata(bill, version)``,
    which describes files in the Congress project data directory.

    Args:
        bill: Object whose ``id`` and ``title`` attributes are copied into
            the result as ``"bill_id"`` and ``"bill_name"``.
        version: Passed through to ``get_bill_text_metadata``.
        plain_text: If true, return the bill text as a plain string rather
            than a dict (used for indexing).
        mods_only: If true, return the metadata dict without loading any
            text. When metadata exists this takes precedence over
            ``plain_text``.
        with_citations: If true, ``load_citation_info`` is called on the
            metadata dict before anything is returned.

    Returns:
        * ``""`` when no metadata is available and ``plain_text`` is true.
        * The metadata dict when ``mods_only`` is true.
        * A string when ``plain_text`` is true and the metadata has a
          ``"text_file"``: leading bracketed lines and a trailing
          ``<all>`` are stripped and form feeds become a separator line.
        * Otherwise the metadata dict, extended with ``"text_html"`` (and
          ``"source"`` for XML and plain-text sources) when an XML, HTML
          or text file is listed in the metadata.

        NOTE(review): with ``plain_text`` true and no ``"text_file"`` in
        the metadata, the dict (without text) is returned, not a string --
        confirm callers expect that.

    Raises:
        IOError: No metadata is available and ``plain_text`` is false.
    """
    dat = get_bill_text_metadata(bill, version)
    if not dat:
        # No text is available.
        if plain_text:
            # For indexing, just return an empty string.
            return ""
        raise IOError("Bill text is not available for this bill.")

    ret = {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "has_displayable_text": dat.get("has_displayable_text"),
    }

    if "mods_file" in dat:
        # Load basic metadata from a MODS file if one exists.
        ret.update(load_bill_mods_metadata(dat["mods_file"]))
    else:
        # Otherwise fall back on the text-versions data.json file. We may
        # have this for historical bills that we don't have a MODS file for.
        gpo_url = dat["urls"]["pdf"]

        m = re.match(
            r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf",
            gpo_url)
        if m:
            # Statutes at Large PDFs get a link to the granule's detail
            # page instead of the PDF itself.
            # TODO (but not needed right now): Docs from the BILLS collection.
            gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

        ret.update({
            "docdate": dat["issued_on"],
            "gpo_url": gpo_url,
            "gpo_pdf_url": dat["urls"]["pdf"],
            "doc_version": dat["version_code"],
            "doc_version_name": get_gpo_status_code_name(dat["version_code"]),
        })

    # Pass through some fields.
    for field in ('html_file', 'xml_file', 'pdf_file', 'has_thumbnail',
                  'thumbnail_path'):
        if field in dat:
            ret[field] = dat[field]

    if with_citations:
        load_citation_info(ret)

    # If the caller only wants metadata, return it.
    if mods_only:
        return ret

    if "xml_file" in dat and not plain_text:
        # Convert XML on the fly to HTML.
        import lxml.html
        import congressxml
        ret.update({
            "text_html":
            lxml.html.tostring(congressxml.convert_xml(dat["xml_file"]),
                               encoding=str),
            "source":
            dat.get("xml_file_source"),
        })

    elif "html_file" in dat and not plain_text:
        # This will be for bills around the 103rd-108th Congresses when
        # bill text is available from GPO but not in XML.
        # Read as UTF-8 explicitly (not the locale default) and close the
        # handle deterministically.
        with open(dat["html_file"], encoding="utf8") as fp:
            ret["text_html"] = fp.read()

    elif "text_file" in dat:
        # Bill text from the Statutes at Large, or when plain_text is True
        # then from GPO.
        with open(dat["text_file"], encoding="utf8") as fp:
            bill_text_content = fp.read()

        # In the GPO BILLS collection, there's gunk at the top and bottom
        # that we'd rather just remove: metadata in brackets at the top, and
        # <all> at the end. It's not really useful when indexing.
        if bill_text_content:
            bill_text_content = re.sub(r"^\s*(\[[^\n]+\]\s*)*", "",
                                       bill_text_content)
            bill_text_content = re.sub(r"\s*<all>\s*$", "", bill_text_content)

        # Caller just wants the plain text?
        if plain_text:
            # Replace form feeds (OCR'd layers only) with an indication of
            # the page break.
            return bill_text_content.replace(
                "\u000C", "\n=============================================\n")

        # Return the text wrapped in <pre>, and replace form feeds with an
        # <hr>. html.escape(quote=False) escapes &, < and > exactly as the
        # removed cgi.escape() did by default.
        import html
        bill_text_content = ("<pre>"
                             + html.escape(bill_text_content, quote=False)
                             + "</pre>")
        bill_text_content = bill_text_content.replace(
            "\u000C", "<hr>")  # (OCR'd layers only)

        ret.update({
            "text_html": bill_text_content,
            "source": dat.get("text_file_source"),
        })

    return ret
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load metadata and text for one version of a bill.

    Reads from the Congress project data directory: JSON files for
    metadata and plain text files mirrored from GPO containing bill text
    (either from the Statutes at Large OCR'ed text layers, or from GPO
    FDSys's BILLS collection). The file locations come from
    ``get_bill_text_metadata(bill, version)``.

    Args:
        bill: Object whose ``id`` and ``title`` attributes are copied into
            the result as ``"bill_id"`` and ``"bill_name"``.
        version: Passed through to ``get_bill_text_metadata``.
        plain_text: If true, return the bill text as a plain string rather
            than a dict (used for indexing).
        mods_only: If true, return the metadata dict without loading any
            text. When metadata exists this takes precedence over
            ``plain_text``.

    Returns:
        * ``""`` when no metadata is available and ``plain_text`` is true.
        * The metadata dict when ``mods_only`` is true.
        * A string when ``plain_text`` is true and the metadata has a
          ``"text_file"``: leading bracketed lines and a trailing
          ``<all>`` are stripped and form feeds become a separator line.
        * Otherwise the metadata dict, extended with ``"text_html"`` (and
          ``"source"`` for XML and plain-text sources) when an XML, HTML
          or text file is listed in the metadata.

        NOTE(review): with ``plain_text`` true and no ``"text_file"`` in
        the metadata, the dict (without text) is returned, not a string --
        confirm callers expect that.

    Raises:
        IOError: No metadata is available and ``plain_text`` is false.
    """
    dat = get_bill_text_metadata(bill, version)
    if not dat:
        # No text is available.
        if plain_text:
            # For indexing, just return an empty string.
            return ""
        raise IOError("Bill text is not available for this bill.")

    ret = {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "has_displayable_text": dat.get("has_displayable_text"),
    }

    if "mods_file" in dat:
        # Load basic metadata from a MODS file if one exists.
        ret.update(load_bill_mods_metadata(dat["mods_file"]))
    else:
        # Otherwise fall back on the text-versions data.json file. We may
        # have this for historical bills that we don't have a MODS file for.
        gpo_url = dat["urls"]["pdf"]

        m = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", gpo_url)
        if m:
            # Statutes at Large PDFs get a link to the granule's detail
            # page instead of the PDF itself.
            # TODO (but not needed right now): Docs from the BILLS collection.
            gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

        ret.update({
            # "issued_on" is split on "-" and its integer parts are passed
            # to datetime.date in order.
            "docdate": datetime.date(*(int(d) for d in dat["issued_on"].split("-"))),
            "gpo_url": gpo_url,
            "gpo_pdf_url": dat["urls"]["pdf"],
            "doc_version": dat["version_code"],
            "doc_version_name": get_gpo_status_code_name(dat["version_code"]),
        })

    # Pass through some fields.
    for field in ('html_file', 'thumbnail_path'):
        if field in dat:
            ret[field] = dat[field]

    # If the caller only wants metadata, return it.
    if mods_only:
        return ret

    if "xml_file" in dat and not plain_text:
        # Convert XML on the fly to HTML.
        import lxml.html
        import congressxml
        ret.update({
            "text_html": lxml.html.tostring(congressxml.convert_xml(dat["xml_file"])),
            "source": dat.get("xml_file_source"),
        })

    elif "html_file" in dat and not plain_text:
        # This will be for bills around the 103rd-108th Congresses when
        # bill text is available from GPO but not in XML.
        # Read raw bytes and decode explicitly: this closes the handle
        # deterministically and works on both Python 2 and Python 3.
        with open(dat["html_file"], "rb") as fp:
            ret["text_html"] = fp.read().decode("utf8")

    elif "text_file" in dat:
        # Bill text from the Statutes at Large, or when plain_text is True
        # then from GPO.
        with open(dat["text_file"], "rb") as fp:
            bill_text_content = fp.read().decode("utf8")

        # In the GPO BILLS collection, there's gunk at the top and bottom
        # that we'd rather just remove: metadata in brackets at the top, and
        # <all> at the end. It's not really useful when indexing.
        if bill_text_content:
            bill_text_content = re.sub(r"^\s*(\[[^\n]+\]\s*)*", "", bill_text_content)
            bill_text_content = re.sub(r"\s*<all>\s*$", "", bill_text_content)

        # Caller just wants the plain text?
        if plain_text:
            # Replace form feeds (OCR'd layers only) with an indication of
            # the page break.
            return bill_text_content.replace(u"\u000C", "\n=============================================\n")

        # Return the text wrapped in <pre>, and replace form feeds with an
        # <hr>. cgi.escape() no longer exists on Python 3.8+, so prefer
        # html.escape(); with quote=False both escape only &, < and >.
        try:
            from html import escape
        except ImportError:  # Python 2
            from cgi import escape
        bill_text_content = "<pre>" + escape(bill_text_content, quote=False) + "</pre>"
        bill_text_content = bill_text_content.replace(u"\u000C", "<hr>")  # (OCR'd layers only)

        ret.update({
            "text_html": bill_text_content,
            "source": dat.get("text_file_source"),
        })

    return ret