def bill_text(request, congress, type_slug, number, version=None):
    """Build the template context for a bill's full-text page.

    The bill is resolved from the URL components; an unknown type slug or
    a missing bill raises ``Http404``.  The requested text version is
    loaded (``textdata`` is ``None`` when loading raises ``IOError``),
    the other available versions are listed in document-date order, and
    related bills are collected as ``(bill, version)`` pairs.
    """
    # An empty version string means that no particular version was requested.
    version = None if version == "" else version

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, get_bill_text_versions
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Metadata for every text version of this bill, oldest first.  Only
    # gathered when the requested text itself could be loaded.
    alternates = None
    is_latest = True
    if textdata:
        alternates = []
        for version_code in get_bill_text_versions(bill):
            try:
                mods = load_bill_text(bill, version_code, mods_only=True)
            except IOError:
                continue
            alternates.append(mods)
        alternates.sort(key=lambda mods: mods["docdate"])
        if alternates:
            # The displayed text is the latest one only if it matches the
            # most recently dated alternate.
            is_latest = (textdata["doc_version"] == alternates[-1]["doc_version"])

    # Related bills, de-duplicated while preserving first-seen order.
    from billtext import get_current_version
    related_bills = []

    def remember(pair):
        if pair not in related_bills:
            related_bills.append(pair)

    candidates = list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]
    for other in candidates:
        try:
            remember((other, get_current_version(other)))
        except IOError:
            pass  # text not available
    for comparison in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        remember((comparison.bill2, comparison.ver2))
    for comparison in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        remember((comparison.bill1, comparison.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "is_latest": is_latest,
        "alternates": alternates,
        "related_bills": related_bills,
        "days_old": (datetime.datetime.now().date() - bill.current_status_date).days,
        "is_on_bill_text_page": True,  # for the header tabs
    }
Beispiel #2
0
def bill_text(request, congress, type_slug, number, version=None):
    """Assemble the context dictionary for the bill text page.

    Raises ``Http404`` when the type slug is not recognised or no bill
    matches the congress/type/number.  ``textdata`` and ``alternates``
    are ``None`` when the requested text cannot be loaded (``IOError``).
    """
    # Treat an empty version string as "no version given".
    version = None if version == "" else version

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, get_bill_text_versions
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Collect the metadata of all loadable text versions, sorted by date.
    alternates = None
    is_latest = True
    if textdata:
        alternates = []
        for version_code in get_bill_text_versions(bill):
            try:
                mods = load_bill_text(bill, version_code, mods_only=True)
            except IOError:
                continue
            alternates.append(mods)
        alternates.sort(key=lambda mods: mods["docdate"])
        if alternates:
            # Compare against the most recently dated version.
            is_latest = (textdata["doc_version"] == alternates[-1]["doc_version"])

    # Gather (bill, version) pairs for related bills without duplicates.
    from billtext import get_current_version
    related_bills = []

    def remember(pair):
        if pair not in related_bills:
            related_bills.append(pair)

    candidates = list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]
    for other in candidates:
        try:
            remember((other, get_current_version(other)))
        except IOError:
            pass  # text not available
    for comparison in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        remember((comparison.bill2, comparison.ver2))
    for comparison in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        remember((comparison.bill1, comparison.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "is_latest": is_latest,
        "alternates": alternates,
        "related_bills": related_bills,
        "days_old": (datetime.datetime.now().date() - bill.current_status_date).days,
        "is_on_bill_text_page": True,  # for the header tabs
    }
Beispiel #3
0
def bill_text(request, congress, type_slug, number, version=None):
    """Build the template context for a bill's full-text page.

    Raises ``Http404`` when ``type_slug`` is not a known bill type or no
    bill matches the congress/type/number.

    The returned dict holds the bill, its congress dates, the loaded
    text data (``None`` if loading raised ``IOError``), the requested
    version, the metadata of every text version found on disk sorted by
    ``docdate`` (``None`` when no text was loaded), and a de-duplicated
    list of ``(bill, version)`` pairs for related bills.
    """
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        # The bill-type code and the path prefix are the same for every
        # status code, so compute them once rather than on each iteration.
        xml_code = BillType.by_value(bill.bill_type).xml_code
        prefix = "data/us/bills.text/%s/%s/%s%d" % (bill.congress, xml_code, xml_code, bill.number)
        for v in bill_gpo_status_codes:
            if not os.path.exists("%s%s.mods.xml" % (prefix, v)):
                continue
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                # The file vanished or is unreadable after the existence
                # check; skip this version instead of failing the page.
                pass
        alternates.sort(key=lambda mods: mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if (rb, rbv) not in related_bills:
                related_bills.append((rb, rbv))
        except IOError:
            pass  # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if (btc.bill2, btc.ver2) not in related_bills:
            related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if (btc.bill1, btc.ver1) not in related_bills:
            related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
Beispiel #4
0
def bill_text(request, congress, type_slug, number, version=None):
    """Assemble the context dictionary for the bill text page.

    Raises ``Http404`` for an unknown ``type_slug`` or a bill that does
    not exist.

    ``textdata`` is ``None`` when the requested text cannot be loaded
    (``IOError``); in that case ``alternates`` is ``None`` as well.
    Otherwise ``alternates`` lists the metadata of each text version
    whose ``.mods.xml`` file exists on disk, ordered by ``docdate``.
    ``related_bills`` is a list of unique ``(bill, version)`` pairs.
    """
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        alternates = []
        # Loop-invariant: look up the bill-type code and build the common
        # path prefix once instead of twice per status code.
        xml_code = BillType.by_value(bill.bill_type).xml_code
        prefix = "data/us/bills.text/%s/%s/%s%d" % (bill.congress, xml_code, xml_code, bill.number)
        for v in bill_gpo_status_codes:
            if not os.path.exists("%s%s.mods.xml" % (prefix, v)):
                continue
            try:
                alternates.append(load_bill_text(bill, v, mods_only=True))
            except IOError:
                # Existence was checked a moment ago, but the load can
                # still fail; leave this version out of the list.
                pass
        alternates.sort(key=lambda mods: mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []
    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
            if (rb, rbv) not in related_bills:
                related_bills.append((rb, rbv))
        except IOError:
            pass  # text not available
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        if (btc.bill2, btc.ver2) not in related_bills:
            related_bills.append((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        if (btc.bill1, btc.ver1) not in related_bills:
            related_bills.append((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
Beispiel #5
0
def get_text_info(bill):
    """Return the bill's text metadata with citations, or None.

    Calls ``load_bill_text`` for the default version with
    ``mods_only=True`` and ``with_citations=True``; an ``IOError`` from
    that call yields ``None``.
    """
    from billtext import load_bill_text
    try:
        info = load_bill_text(bill, None, mods_only=True, with_citations=True)
    except IOError:
        info = None
    return info
Beispiel #6
0
def get_text_info(bill):
    """Load text metadata (including citations) for ``bill``.

    Returns whatever ``load_bill_text`` produces for the default version
    with ``mods_only=True, with_citations=True``, or ``None`` if that
    call raises ``IOError``.
    """
    from billtext import load_bill_text
    try:
        info = load_bill_text(bill, None, mods_only=True, with_citations=True)
    except IOError:
        info = None
    return info
def get_bill_paragraphs(bill):
    """Return a histogram of normalized-paragraph hashes for a bill's text.

    The bill's ``text_html`` is parsed and every ``<p>`` element is
    reduced to its text, lower-cased, stripped of a leading parenthesized
    list marker, and has runs of non-word characters collapsed to single
    spaces.  Paragraphs that normalize to the empty string are ignored.

    Returns a dict mapping the MD5 hex digest of each normalized
    paragraph to the number of paragraphs with that digest, or ``None``
    when loading the text raises ``IOError``.

    NOTE(review): the text is serialized with ``encoding="utf8"`` and fed
    to ``str`` regex patterns and ``md5`` directly, which matches
    Python 2 byte-string semantics -- confirm before running on Python 3.
    """
    from billtext import load_bill_text
    from hashlib import md5

    try:
        dom = lxml.etree.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        return None

    # Raw-string patterns (no invalid escape sequences), compiled once
    # instead of being looked up again for every paragraph.
    list_numbering = re.compile(r"^\(.*?\)\s*")
    non_word_run = re.compile(r"\W+")

    hashes = {}
    for node in dom.xpath("//p"):
        text = lxml.etree.tostring(node, method="text", encoding="utf8")
        text = text.lower()  # normalize case
        text = list_numbering.sub("", text)  # remove initial list numbering
        text = non_word_run.sub(" ", text).strip()  # normalize spaces and other non-word characters
        if text == "":
            continue
        digest = md5(text).hexdigest()
        hashes[digest] = hashes.get(digest, 0) + 1

    return hashes
Beispiel #8
0
def get_bill_paragraphs(bill):
    """Count the bill's paragraphs by the hash of their normalized text.

    Each ``<p>`` element of the bill's ``text_html`` is converted to
    text, lower-cased, stripped of a leading ``(...)`` list marker, and
    has every run of non-word characters replaced by one space.  Empty
    results are skipped.

    Returns ``{md5_hexdigest: occurrence_count}``, or ``None`` if the
    bill text cannot be loaded (``IOError``).

    NOTE(review): passing the ``encoding="utf8"`` serialization straight
    to ``str`` regexes and ``md5`` relies on Python 2 byte strings --
    confirm before porting to Python 3.
    """
    from billtext import load_bill_text
    from hashlib import md5

    try:
        dom = lxml.etree.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        return None

    # Use raw strings so the backslashes are not (invalid) string escapes,
    # and compile both patterns once outside the loop.
    list_numbering = re.compile(r"^\(.*?\)\s*")
    non_word_run = re.compile(r"\W+")

    hashes = {}
    for node in dom.xpath("//p"):
        text = lxml.etree.tostring(node, method="text", encoding="utf8")
        text = text.lower()  # normalize case
        text = list_numbering.sub("", text)  # remove initial list numbering
        text = non_word_run.sub(" ", text).strip()  # normalize spaces and other non-word characters
        if text == "":
            continue
        digest = md5(text).hexdigest()
        hashes[digest] = hashes.get(digest, 0) + 1

    return hashes
Beispiel #9
0
def bill_details(request, congress, type_slug, number):
    """Build the template context for a bill's overview page.

    The context contains the bill, its congress dates, a secondary
    title, flags for whether the bill belongs to the current Congress or
    is dead, its feed, its text metadata (``None`` on ``IOError``), and
    a list of related-bill entries with display notes.
    """
    bill = load_bill_from_url(congress, type_slug, number)

    # Of the reintroductions, keep the last one seen from an earlier
    # Congress and the first one seen from a later Congress.
    reintro_prev = None
    reintro_next = None
    for reintro in bill.find_reintroductions():
        if reintro.congress < bill.congress:
            reintro_prev = reintro
        if reintro.congress > bill.congress and not reintro_next:
            reintro_next = reintro

    related_bills = []
    if reintro_prev:
        related_bills.append({
            "bill": reintro_prev,
            "note": "was a previous version of this bill.",
            "show_title": False,
        })
    if reintro_next:
        related_bills.append({
            "bill": reintro_next,
            "note": "was a re-introduction of this bill in a later Congress.",
            "show_title": False,
        })

    # Explicitly related bills: the note depends on the kind of relation.
    for rb in bill.get_related_bills():
        if rb.relation == "ruled-by":
            entry = {
                "bill": rb.related_bill,
                "prenote": "Debate on",
                "note": " is governed by these rules.",
                "show_title": False,
            }
        elif rb.relation in ("identical", "rule"):
            entry = {
                "bill": rb.related_bill,
                "note": "(%s)" % rb.relation,
                "show_title": False,
            }
        else:
            label = "Related" if rb.relation == "unknown" else rb.relation.title()
            entry = {
                "bill": rb.related_bill,
                "note": "(%s)" % label,
                "show_title": True,
            }
        related_bills.append(entry)

    # bill text info and areas of law affected
    from billtext import load_bill_text
    try:
        text_info = load_bill_text(bill, None, mods_only=True, with_citations=True)
    except IOError:
        text_info = None

    is_current = bill.congress == CURRENT_CONGRESS
    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "subtitle": get_secondary_bill_title(bill, bill.titles),
        "current": is_current,
        "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious,
        "feed": bill.get_feed(),
        "text_info": text_info,
        "related": related_bills,
    }
Beispiel #10
0
def bill_details(request, congress, type_slug, number):
    """Assemble the context dictionary for the bill overview page.

    Besides the bill itself this gathers its congress dates, secondary
    title, current/dead flags, feed, text metadata (``None`` when loading
    raises ``IOError``) and annotated related bills.
    """
    bill = load_bill_from_url(congress, type_slug, number)

    # Pick the neighbouring reintroductions: the last one from an earlier
    # Congress and the first one from a later Congress.
    reintro_prev = None
    reintro_next = None
    for reintro in bill.find_reintroductions():
        if reintro.congress < bill.congress:
            reintro_prev = reintro
        if reintro.congress > bill.congress and not reintro_next:
            reintro_next = reintro

    related_bills = []
    if reintro_prev:
        related_bills.append({
            "bill": reintro_prev,
            "note": "was a previous version of this bill.",
            "show_title": False,
        })
    if reintro_next:
        related_bills.append({
            "bill": reintro_next,
            "note": "was a re-introduction of this bill in a later Congress.",
            "show_title": False,
        })

    # One entry per explicitly related bill, worded by relation type.
    for rb in bill.get_related_bills():
        if rb.relation == "ruled-by":
            entry = {
                "bill": rb.related_bill,
                "prenote": "Debate on",
                "note": " is governed by these rules.",
                "show_title": False,
            }
        elif rb.relation in ("identical", "rule"):
            entry = {
                "bill": rb.related_bill,
                "note": "(%s)" % rb.relation,
                "show_title": False,
            }
        else:
            label = "Related" if rb.relation == "unknown" else rb.relation.title()
            entry = {
                "bill": rb.related_bill,
                "note": "(%s)" % label,
                "show_title": True,
            }
        related_bills.append(entry)

    # bill text info and areas of law affected
    from billtext import load_bill_text
    try:
        text_info = load_bill_text(bill, None, mods_only=True, with_citations=True)
    except IOError:
        text_info = None

    is_current = bill.congress == CURRENT_CONGRESS
    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "subtitle": get_secondary_bill_title(bill, bill.titles),
        "current": is_current,
        "dead": bill.congress != CURRENT_CONGRESS and bill.current_status not in BillStatus.final_status_obvious,
        "feed": bill.get_feed(),
        "text_info": text_info,
        "related": related_bills,
    }
Beispiel #11
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, force=False):
    """Return a side-by-side text comparison of two bill versions.

    ``left_bill`` and ``right_bill`` are ``Bill`` primary keys; an empty
    version string selects the bill's current version.

    A stored ``BillTextComparison`` for the same pair is returned as-is
    unless ``force`` is true.  A stored comparison with the two sides
    swapped is returned with its left/right entries exchanged.  Otherwise
    the comparison is computed from the bills' HTML files, stored (or the
    existing record updated), and returned.

    The result is a dict with ``left_meta``, ``right_meta``,
    ``left_text`` and ``right_text``.

    NOTE(review): a stored swapped comparison is returned even when
    ``force`` is true -- confirm that is intended.
    """
    from billtext import load_bill_text, compare_xml_text, get_current_version
    # Import the submodule explicitly: a bare ``import lxml`` does not
    # guarantee that ``lxml.etree`` has been loaded.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "":
        left_version = get_current_version(left_bill)
    if right_version == "":
        right_version = get_current_version(right_bill)

    btc = None
    try:
        btc = BillTextComparison.objects.get(
            bill1=left_bill,
            ver1=left_version,
            bill2=right_bill,
            ver2=right_version)
        btc.decompress()
        if not force:
            return btc.data
    except BillTextComparison.DoesNotExist:
        pass

    # Try with the bills swapped.
    try:
        btc2 = BillTextComparison.objects.get(
            bill2=left_bill,
            ver2=left_version,
            bill1=right_bill,
            ver1=right_version)
        btc2.decompress()
        data = btc2.data
        return {
            "left_meta": data["right_meta"],
            "right_meta": data["left_meta"],
            "left_text": data["right_text"],
            "right_text": data["left_text"],
        }
    except BillTextComparison.DoesNotExist:
        pass

    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    doc1 = lxml.etree.parse(left["basename"] + ".html")
    doc2 = lxml.etree.parse(right["basename"] + ".html")
    compare_xml_text(doc1, doc2, timelimit=timelimit)  # revises DOMs in-place

    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")

    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if btc is None:
        btc = BillTextComparison(
            bill1=left_bill,
            ver1=left_version,
            bill2=right_bill,
            ver2=right_version,
            data=dict(ret))  # clone before compress()
    else:
        # Reached only with force=True: overwrite the stored comparison.
        btc.data = dict(ret)  # clone before compress()

    btc.compress()
    btc.save()

    return ret
Beispiel #12
0
 def get_text_info():
     """Load the bill's text metadata and regroup its citations for display.

     Uses ``bill`` from the enclosing scope.  Returns the metadata dict
     from ``load_bill_text`` (``mods_only=True``), or ``None`` when that
     call raises ``IOError``.

     When the metadata has a ``"citations"`` list it is replaced by a
     dict with the keys ``slip_laws`` (sorted by congress and number),
     ``statutes``, ``usc`` (a flat, indented list of U.S. Code
     sections), ``other`` and ``count``.
     """
     from models import USCSection
     from billtext import load_bill_text
     from search import parse_slip_law_number
     import re
     try:
         metadata = load_bill_text(bill, None, mods_only=True)

         # do interesting stuff with citations
         if "citations" in metadata:
             slip_laws = []
             statutes = []
             usc = { }
             other = []
             # Unsaved placeholder parent for citations with no matching USCSection.
             usc_other = USCSection(name="Other Citations", ordering=99999)
             for cite in metadata["citations"]:
                 if cite["type"] == "slip_law":
                     slip_laws.append(cite)
                     cite["bill"] = parse_slip_law_number(cite["text"])
                 elif cite["type"] == "statutes_at_large":
                     statutes.append(cite)
                 elif cite["type"] == "usc":
                     # build a normalized citation and a link to LII
                     cite_norm = "usc/" + cite["title"]
                     cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"]
                     if cite["section"]:
                         cite_link += "/" + cite["section"]
                         cite_norm += "/" + cite["section"]
                     if cite["paragraph"]: cite_link += "#" + "_".join(re.findall(r"\(([^)]+)\)", cite["paragraph"]))

                     # Build a tree of title-chapter-...-section nodes so we can
                     # display the citations in context.
                     try:
                         sec_obj = USCSection.objects.get(citation=cite_norm)
                     except Exception:
                         # USCSection.DoesNotExist and MultipleObjectsReturned both
                         # possible.  Catch Exception rather than using a bare
                         # except so KeyboardInterrupt/SystemExit still propagate.
                         # the 'id' field is set to make these objects properly hashable
                         sec_obj = USCSection(id=cite["text"], name=cite["text"], parent_section=usc_other)

                     sec_obj.link = cite_link

                     if "range_to_section" in cite:
                         sec_obj.range_to_section = cite["range_to_section"]

                     # recursively go up to the title
                     path = [sec_obj]
                     while sec_obj.parent_section:
                         sec_obj = sec_obj.parent_section
                         path.append(sec_obj)

                     # now pop off from the path to put the node at the right point in a tree
                     container = usc
                     while path:
                         p = path.pop(-1)
                         if p not in container: container[p] = { }
                         container = container[p]

                 else:
                     other.append(cite)

             slip_laws.sort(key = lambda x : (x["congress"], x["number"]))

             # restructure data format
             def ucfirst(s): return s[0].upper() + s[1:]
             def rebuild_usc_sec(seclist, indent=0):
                 # Flatten the nested dict into a list of rows, depth-first,
                 # with siblings ordered by their 'ordering' attribute.
                 ret = []
                 seclist = sorted(seclist.items(), key=lambda x : x[0].ordering)
                 for sec, subparts in seclist:
                     ret.append({
                         "text": (ucfirst(sec.level_type + ((" " + sec.number) if sec.number else "") + (": " if sec.name else "")) if sec.level_type else "") + (sec.name if sec.name else ""),
                         "link": getattr(sec, "link", None),
                         "range_to_section": getattr(sec, "range_to_section", None),
                         "indent": indent,
                     })
                     ret.extend(rebuild_usc_sec(subparts, indent=indent+1))
                 return ret
             usc = rebuild_usc_sec(usc)

             metadata["citations"] = {
                 "slip_laws": slip_laws, "statutes": statutes, "usc": usc, "other": other,
                 "count": len(slip_laws)+len(statutes)+len(usc)+len(other) }
         return metadata
     except IOError:
         return None
Beispiel #13
0
def bill_text_image(request, congress, type_slug, number, image_type):
    """Render a PNG preview image of the start of a bill's PDF text.

    The first two pages of the bill's PDF are rasterized, stacked
    vertically, cropped to their content and to an aspect ratio (the
    ``aspect`` query parameter, default 240/200), and optionally scaled
    to the ``width`` query parameter.  For ``image_type == "thumbnail"``
    a party-coloured banner, the sponsor's photo and a border are added.

    Raises ``Http404`` when the bill's text metadata cannot be loaded or
    no PDF is available.  Returns an ``HttpResponse`` with content type
    ``image/png``.
    """
    bill = load_bill_from_url(congress, type_slug, number)
    from billtext import load_bill_text

    # Rasterizes a page of a PDF to a greyscale PIL.Image.
    # Crop out the GPO seal & the vertical margins.
    def pdftopng(pdffile, pagenumber, width=900):
        from PIL import Image
        import subprocess, StringIO
        pngbytes = subprocess.check_output([
            "/usr/bin/pdftoppm", "-f",
            str(pagenumber), "-l",
            str(pagenumber), "-scale-to",
            str(width), "-png", pdffile
        ])
        im = Image.open(StringIO.StringIO(pngbytes))
        im = im.convert("L")

        # crop out the GPO seal:
        im = im.crop((0, int((.06 if pagenumber == 1 else 0) * im.size[0]),
                      im.size[0], im.size[1]))

        # zealous-crop the vertical margins, but at least leaving a little
        # at the bottom so that when we paste the two pages of the two images
        # together they don't get totally scruntched, and put in some padding
        # at the top.
        # (.getbbox() crops out zeroes, so we'll invert the image to make it work with white)
        from PIL import ImageOps
        bbox = ImageOps.invert(im).getbbox()
        vpad = int(.02 * im.size[1])
        im = im.crop((0, max(0, bbox[1] - vpad), im.size[0],
                      min(im.size[1], bbox[3] + vpad)))

        return im

    # Rasterizes the first two pages of a PDF.  If the second page cannot
    # be rendered, a zero-height image stands in for it.
    def first_two_pages(pdffile):
        pg1 = pdftopng(pdffile, 1)
        try:
            pg2 = pdftopng(pdffile, 2)
        except Exception:
            # Not a bare except, so KeyboardInterrupt/SystemExit propagate.
            pg2 = pg1.crop((0, 0, pg1.size[0], 0))  # may only be one page!
        return pg1, pg2

    # Find the PDF file and rasterize the first two pages.

    try:
        metadata = load_bill_text(bill, None, mods_only=True)
    except IOError:
        # if bill text metadata isn't available, trap the error
        # and just 404 it
        raise Http404()

    if metadata.get("pdf_file"):
        # Use the PDF files on disk.
        pg1, pg2 = first_two_pages(metadata.get("pdf_file"))
    elif settings.DEBUG:
        # When debugging in a local environment we may not have bill text available
        # so download the PDF from GPO.
        import os, tempfile, subprocess
        # Create the temp file before entering the try block so that the
        # cleanup below never runs with fn1 unbound.
        (fd1, fn1) = tempfile.mkstemp(suffix=".pdf")
        try:
            os.close(fd1)
            subprocess.check_call(
                ["/usr/bin/wget", "-O", fn1, "-q", metadata["gpo_pdf_url"]])
            pg1, pg2 = first_two_pages(fn1)
        finally:
            os.unlink(fn1)
    else:
        # No PDF is available.
        raise Http404()

    # Since some bills have big white space at the top of the first page,
    # we'll combine the first two pages and then shift the window down
    # until the real start of the bill.

    from PIL import Image
    img = Image.new(pg1.mode, (pg1.size[0], int(pg1.size[1] + pg2.size[1])))
    img.paste(pg1, (0, 0))
    img.paste(pg2, (0, pg1.size[1]))

    # Zealous crop the (horizontal) margins. We do this only after the two
    # pages have been combined so that we don't mess up their alignment.
    # Add some padding.
    from PIL import ImageOps
    hpad = int(.02 * img.size[0])
    bbox = ImageOps.invert(img).getbbox()
    img = img.crop((max(0, bbox[0] - hpad), 0,
                    min(img.size[0], bbox[2] + hpad), img.size[1]))

    # Now take a window from the top matching a particular aspect ratio.
    # We're going to display this next to photos of members of congress,
    # so use that aspect ratio.
    try:
        aspect = float(request.GET["aspect"])
    except (KeyError, ValueError):
        # Parameter missing or not a number: use the default ratio.
        aspect = 240.0 / 200.0
    img = img.crop((0, 0, img.size[0], int(aspect * img.size[0])))

    # Resize to requested width.
    # NOTE(review): a non-numeric "width" raises ValueError here -- confirm
    # whether that should be handled like a bad "aspect".
    if "width" in request.GET:
        img.thumbnail(
            (int(request.GET["width"]),
             int(aspect * float(request.GET["width"]))),
            Image.ANTIALIAS)

    # Add symbology.
    if image_type == "thumbnail":
        img = img.convert("RGBA")

        banner_color = None
        party_colors = {
            "Republican": (230, 14, 19, 150),
            "Democrat": (0, 65, 161, 150)
        }
        if bill.sponsor_role:
            banner_color = party_colors.get(bill.sponsor_role.party)
        if banner_color:
            # Semi-transparent banner over the bottom 15% of the image.
            from PIL import ImageDraw
            im = Image.new("RGBA", img.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(im)
            draw.rectangle(((0, int(.85 * im.size[1])), im.size),
                           outline=None,
                           fill=banner_color)
            del draw
            img = Image.alpha_composite(img, im)

        if bill.sponsor and bill.sponsor.has_photo():
            im = Image.open("." + bill.sponsor.get_photo_url(200))
            im.thumbnail([int(x / 2.5) for x in img.size])
            img.paste(
                im,
                (int(.05 * img.size[1]), int(.95 * img.size[1]) - im.size[1]))

        # One-pixel grey border around the whole thumbnail.
        from PIL import ImageDraw
        draw = ImageDraw.Draw(img)
        draw.rectangle(((0, 0), (img.size[0] - 1, img.size[1] - 1)),
                       outline=(100, 100, 100, 255),
                       fill=None)
        del draw

    # Serialize & return.
    import StringIO
    imgbytesbuf = StringIO.StringIO()
    img.save(imgbytesbuf, "PNG")
    imgbytes = imgbytesbuf.getvalue()
    imgbytesbuf.close()
    return HttpResponse(imgbytes, content_type="image/png")
Beispiel #14
0
    def get_text_info():
        """Load the bill's text metadata and regroup its citations for display.

        Uses ``bill`` from the enclosing scope.  Returns the metadata dict
        from ``load_bill_text`` (``mods_only=True``), or ``None`` when
        that call raises ``IOError``.

        Unless ``settings.DEBUG`` is set, a ``"citations"`` list in the
        metadata is replaced by a dict with the keys ``slip_laws``
        (sorted by congress and number), ``statutes``, ``usc`` (a flat,
        indented list of U.S. Code sections), ``other`` and ``count``.
        """
        from models import USCSection
        from billtext import load_bill_text
        from search import parse_slip_law_number
        import re
        try:
            metadata = load_bill_text(bill, None, mods_only=True)

            # do interesting stuff with citations
            if "citations" in metadata and not settings.DEBUG:
                slip_laws = []
                statutes = []
                usc = { }
                other = []
                # Unsaved placeholder parent for citations with no matching USCSection.
                usc_other = USCSection(name="Other Citations", ordering=99999)
                for cite in metadata["citations"]:
                    if cite["type"] == "slip_law":
                        slip_laws.append(cite)
                        cite["bill"] = parse_slip_law_number(cite["text"])
                    elif cite["type"] == "statutes_at_large":
                        statutes.append(cite)
                    elif cite["type"] in ("usc-section", "usc-chapter"):
                        # Build a tree of title-chapter-...-section nodes so we can
                        # display the citations in context.
                        try:
                            sec_obj = USCSection.objects.get(citation=cite["key"])
                        except Exception:
                            # USCSection.DoesNotExist and MultipleObjectsReturned both
                            # possible.  Catch Exception rather than using a bare
                            # except so KeyboardInterrupt/SystemExit still propagate.
                            # create a fake entry for the sake of output
                            # the 'id' field is set to make these objects properly hashable
                            sec_obj = USCSection(id=cite["text"], name=cite["text"], parent_section=usc_other)

                        if "range_to_section" in cite:
                            sec_obj.range_to_section = cite["range_to_section"]

                        # recursively go up to the title
                        path = [sec_obj]
                        so = sec_obj
                        while so.parent_section:
                            so = so.parent_section
                            path.append(so)

                        # build a link to LII
                        if cite["type"] == "usc-section":
                            cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"]
                            if cite["section"]:
                                cite_link += "/" + cite["section"]
                            if cite["paragraph"]: cite_link += "#" + "_".join(re.findall(r"\(([^)]+)\)", cite["paragraph"]))
                        elif cite["type"] == "usc-chapter":
                            # Chapter links list every level below the top of the path.
                            cite_link = "http://www.law.cornell.edu/uscode/text/" + cite["title"] + "/" + "/".join(
                                (so.level_type + "-" + so.number) for so in reversed(path[:-1])
                                )
                        sec_obj.link = cite_link

                        # now pop off from the path to put the node at the right point in a tree
                        container = usc
                        while path:
                            p = path.pop(-1)
                            if p not in container: container[p] = { }
                            container = container[p]

                    else:
                        other.append(cite)

                slip_laws.sort(key = lambda x : (x["congress"], x["number"]))

                # restructure data format
                def ucfirst(s): return s[0].upper() + s[1:]
                def rebuild_usc_sec(seclist, indent=0):
                    # Flatten the nested dict into a list of rows, depth-first,
                    # with siblings ordered by their 'ordering' attribute.
                    ret = []
                    seclist = sorted(seclist.items(), key=lambda x : x[0].ordering)
                    for sec, subparts in seclist:
                        ret.append({
                            "text": (ucfirst(sec.level_type + ((" " + sec.number) if sec.number else "") + (": " if sec.name else "")) if sec.level_type else "") + (sec.name_recased if sec.name else ""),
                            "link": getattr(sec, "link", None),
                            "range_to_section": getattr(sec, "range_to_section", None),
                            "indent": indent,
                        })
                        ret.extend(rebuild_usc_sec(subparts, indent=indent+1))
                    return ret
                usc = rebuild_usc_sec(usc)

                metadata["citations"] = {
                    "slip_laws": slip_laws, "statutes": statutes, "usc": usc, "other": other,
                    "count": len(slip_laws)+len(statutes)+len(usc)+len(other) }
            return metadata
        except IOError:
            return None
Beispiel #15
0
 def get_text_info():
     """Return the text metadata for ``bill`` from the enclosing scope.

     The metadata is loaded with ``mods_only=True``; ``None`` is returned
     when ``load_bill_text`` raises ``IOError``.
     """
     from billtext import load_bill_text
     try:
         metadata = load_bill_text(bill, None, mods_only=True)
     except IOError:
         metadata = None
     return metadata
Beispiel #16
0
def load_comparison(left_bill,
                    left_version,
                    right_bill,
                    right_version,
                    timelimit=10,
                    force=False):
    """Compare the text of two bill versions, caching the result.

    Results are looked up in, and stored to, BillTextComparison records
    keyed by (bill1, ver1, bill2, ver2).

    Args:
        left_bill: id of the Bill shown on the left.
        left_version: version code for the left bill; "" means use
            get_current_version(left_bill).
        right_bill: id of the Bill shown on the right.
        right_version: version code for the right bill; "" means use
            get_current_version(right_bill).
        timelimit: forwarded to compare_xml_text.
        force: if true, ignore cached results (in either bill order),
            recompute the comparison and write it to the record for this
            (left, right) order.

    Returns:
        A dict with the keys "left_meta", "right_meta", "left_text" and
        "right_text".

    Raises:
        IOError: if either bill's metadata has no "html_file" entry.
            Errors from Bill.objects.get and load_bill_text propagate.
    """
    from billtext import load_bill_text, compare_xml_text, get_current_version
    # "import lxml" alone does not load the etree submodule; import it
    # explicitly rather than relying on another module having done so.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    # Look for a cached comparison in the requested order. When forcing,
    # keep hold of the record so it is overwritten below instead of
    # creating a second row for the same key.
    btc = None
    try:
        btc = BillTextComparison.objects.get(bill1=left_bill,
                                             ver1=left_version,
                                             bill2=right_bill,
                                             ver2=right_version)
        btc.decompress()
        if not force: return btc.data
    except BillTextComparison.DoesNotExist:
        pass

    if not force:
        # Try with the bills swapped. Skipped when forcing, otherwise a
        # stale swapped record would be returned without recomputing.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill,
                                                  ver2=left_version,
                                                  bill1=right_bill,
                                                  ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap so the caller sees its own left/right orientation
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Only the dictionary lookups are guarded, so a KeyError raised while
    # parsing is not misreported as missing bill text.
    try:
        left_html = left["html_file"]
        right_html = right["html_file"]
    except KeyError:
        raise IOError(
            "The HTML bill text format is not available for one of the bills.")
    doc1 = lxml.etree.parse(left_html)
    doc2 = lxml.etree.parse(right_html)

    compare_xml_text(doc1, doc2, timelimit=timelimit)  # revises DOMs in-place

    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")

    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if not btc:
        btc = BillTextComparison(bill1=left_bill,
                                 ver1=left_version,
                                 bill2=right_bill,
                                 ver2=right_version,
                                 data=dict(ret))  # clone before compress()
    else:
        btc.data = dict(ret)  # clone before compress()

    btc.compress()
    btc.save()

    return ret
Beispiel #17
0
def load_comparison(left_bill,
                    left_version,
                    right_bill,
                    right_version,
                    timelimit=10):
    """Compare the text of two bill versions, caching the result.

    Results are looked up in, and stored to, BillTextComparison records
    keyed by (bill1, ver1, bill2, ver2); a record stored with the bills
    in the opposite order is reused with its sides un-swapped.

    Args:
        left_bill: id of the Bill shown on the left.
        left_version: version code for the left bill; "" means use
            get_current_version(left_bill).
        right_bill: id of the Bill shown on the right.
        right_version: version code for the right bill; "" means use
            get_current_version(right_bill).
        timelimit: accepted for interface compatibility; this
            implementation does not use it.

    Returns:
        A dict with the keys "left_meta", "right_meta", "left_text" and
        "right_text".

    Raises:
        IOError: if a bill's metadata has neither an "xml_file" nor an
            "html_file" entry. Errors from Bill.objects.get and
            load_bill_text propagate.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # "import lxml" alone does not load the etree submodule; import it
    # explicitly rather than relying on another module having done so.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    # Local switch for turning the database cache off while debugging.
    use_cache = True

    if use_cache:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(bill1=left_bill,
                                                 ver1=left_version,
                                                 bill2=right_bill,
                                                 ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill,
                                                  ver2=left_version,
                                                  bill1=right_bill,
                                                  ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ins_del.
        return lxml.etree.Element("comparison-change")

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache:
        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(bill1=left_bill,
                                 ver1=left_version,
                                 bill2=right_bill,
                                 ver2=right_version,
                                 data=dict(ret))  # clone before compress()
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
Beispiel #18
0
def load_comparison(left_bill, left_version, right_bill, right_version, timelimit=10, use_cache=True, force_update=False):
    """Compare the text of two bill versions, optionally using a cache.

    Cached results live in BillTextComparison records keyed by
    (bill1, ver1, bill2, ver2); a record stored with the bills in the
    opposite order is reused with its sides un-swapped.

    Args:
        left_bill: id of the Bill shown on the left.
        left_version: version code for the left bill; "" means use
            get_current_version(left_bill).
        right_bill: id of the Bill shown on the right.
        right_version: version code for the right bill; "" means use
            get_current_version(right_bill).
        timelimit: time limit handed to diff_match_patch.diff_unicode.
        use_cache: read a cached comparison if one exists, and store a
            newly computed one.
        force_update: recompute even if a cached comparison exists and
            replace the stored record for this (left, right) order.

    Returns:
        A dict with the keys "left_meta", "right_meta", "left_text" and
        "right_text".

    Raises:
        IOError: if a bill's metadata has neither an "xml_file" nor an
            "html_file" entry. Errors from Bill.objects.get and
            load_bill_text propagate.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # "import lxml" alone does not load the etree submodule; import it
    # explicitly rather than relying on another module having done so.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    # A forced update must not be short-circuited by a cache hit, so the
    # cache is only read when no update is being forced.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(
                bill1=left_bill,
                ver1=left_version,
                bill2=right_bill,
                ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(
                bill2=left_bill,
                ver2=left_version,
                bill1=right_bill,
                ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # The same wrapper element is produced regardless of ins_del.
        return lxml.etree.Element("comparison-change")

    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1, text2, timelimit=timelimit):
            yield x

    compare(doc1.getroot(), doc2.getroot(), make_tag_func=make_tag_func, differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = {
            "bill1": left_bill,
            "ver1": left_version,
            "bill2": right_bill,
            "ver2": right_version,
        }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret),  # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
Beispiel #19
0
def load_comparison(left_bill,
                    left_version,
                    right_bill,
                    right_version,
                    timelimit=10,
                    use_cache=True,
                    force_update=False):
    """Build (or fetch from cache) a comparison of two bill text versions.

    Cached comparisons are BillTextComparison records keyed by
    (bill1, ver1, bill2, ver2). A record saved with the two bills in the
    opposite order is also accepted and returned with its sides swapped
    back.

    Args:
        left_bill: id of the Bill shown on the left.
        left_version: version code for the left bill; "" means use
            get_current_version(left_bill).
        right_bill: id of the Bill shown on the right.
        right_version: version code for the right bill; "" means use
            get_current_version(right_bill).
        timelimit: time limit handed to diff_match_patch.diff_unicode.
        use_cache: read a cached comparison if one exists, and store a
            newly computed one.
        force_update: recompute even if a cached comparison exists and
            replace the stored record for this (left, right) order.

    Returns:
        A dict with the keys "left_meta", "right_meta", "left_text" and
        "right_text".

    Raises:
        IOError: if a bill's metadata has neither an "xml_file" nor an
            "html_file" entry. Errors from Bill.objects.get and
            load_bill_text propagate.
    """
    from billtext import load_bill_text, get_current_version
    from xml_diff import compare
    # A bare "import lxml" leaves lxml.etree unloaded unless some other
    # module imported it first, so name the submodule explicitly.
    import lxml.etree

    left_bill = Bill.objects.get(id=left_bill)
    right_bill = Bill.objects.get(id=right_bill)

    if left_version == "": left_version = get_current_version(left_bill)
    if right_version == "": right_version = get_current_version(right_bill)

    # Reading the cache would return before anything is recomputed, so
    # it is bypassed whenever an update is being forced.
    if use_cache and not force_update:
        # Load from cache.
        try:
            btc = BillTextComparison.objects.get(bill1=left_bill,
                                                 ver1=left_version,
                                                 bill2=right_bill,
                                                 ver2=right_version)
            btc.decompress()
            return btc.data
        except BillTextComparison.DoesNotExist:
            pass

        # Load from cache - Try with the bills swapped.
        try:
            btc2 = BillTextComparison.objects.get(bill2=left_bill,
                                                  ver2=left_version,
                                                  bill1=right_bill,
                                                  ver1=right_version)
            btc2.decompress()
            data = btc2.data
            # un-swap
            return {
                "left_meta": data["right_meta"],
                "right_meta": data["left_meta"],
                "left_text": data["right_text"],
                "right_text": data["left_text"],
            }
        except BillTextComparison.DoesNotExist:
            pass

    # Load bill text metadata.
    left = load_bill_text(left_bill, left_version, mods_only=True)
    right = load_bill_text(right_bill, right_version, mods_only=True)

    # Load XML DOMs for each document and perform the comparison.
    def load_bill_text_xml(docinfo):
        # If XML text is available, use it, but pre-render it
        # into HTML. Otherwise use the legacy HTML that we
        # scraped from THOMAS.
        if "xml_file" in docinfo:
            import congressxml
            return congressxml.convert_xml(docinfo["xml_file"])
        elif "html_file" in docinfo:
            return lxml.etree.parse(docinfo["html_file"])
        else:
            raise IOError("Bill text is not available for one of the bills.")

    doc1 = load_bill_text_xml(left)
    doc2 = load_bill_text_xml(right)

    def make_tag_func(ins_del):
        # Every change gets the same wrapper element; ins_del is unused.
        return lxml.etree.Element("comparison-change")

    def differ(text1, text2):
        # ensure we use the C++ Google DMP and can specify the time limit
        import diff_match_patch
        for x in diff_match_patch.diff_unicode(text1,
                                               text2,
                                               timelimit=timelimit):
            yield x

    compare(doc1.getroot(),
            doc2.getroot(),
            make_tag_func=make_tag_func,
            differ=differ)

    # Prepare JSON response data.
    # dates aren't JSON serializable
    left["docdate"] = left["docdate"].strftime("%x")
    right["docdate"] = right["docdate"].strftime("%x")
    ret = {
        "left_meta": left,
        "right_meta": right,
        "left_text": lxml.etree.tostring(doc1),
        "right_text": lxml.etree.tostring(doc2),
    }

    if use_cache or force_update:
        # For force_update, or race conditions, delete any existing record.
        fltr = {
            "bill1": left_bill,
            "ver1": left_version,
            "bill2": right_bill,
            "ver2": right_version
        }
        BillTextComparison.objects.filter(**fltr).delete()

        # Cache in database so we don't have to re-do the comparison
        # computation again.
        btc = BillTextComparison(
            data=dict(ret),  # clone before compress()
            **fltr)
        btc.compress()
        btc.save()

    # Return JSON comparison data.
    return ret
Beispiel #20
0
def bill_text_image(request, congress, type_slug, number):
    """Serve a PNG preview image of the top of a bill's PDF text.

    The first two pages of the PDF are rasterized with pdftoppm, stacked
    vertically, cropped, and cut to a fixed aspect ratio.

    Args:
        request: the HTTP request; an optional "width" query parameter
            requests a thumbnail at most that many pixels wide.
        congress, type_slug, number: identify the bill; passed unchanged
            to load_bill_from_url.

    Returns:
        An HttpResponse whose body is the PNG image.

    Raises:
        Http404: if the bill text metadata cannot be loaded, if no PDF is
            available, or if "width" is not a positive integer.
    """
    bill = load_bill_from_url(congress, type_slug, number)
    from billtext import load_bill_text

    # Rasterizes a page of a PDF to a greyscale PIL.Image.
    # Crop out the GPO seal & the vertical margins.
    def pdftopng(pdffile, pagenumber, width=900):
        from PIL import Image, ImageOps
        import io
        import subprocess
        pngbytes = subprocess.check_output(["/usr/bin/pdftoppm", "-f", str(pagenumber), "-l", str(pagenumber), "-scale-to", str(width), "-png", pdffile])
        # pdftoppm emits binary PNG data, so wrap it in a bytes buffer.
        im = Image.open(io.BytesIO(pngbytes))
        im = im.convert("L")

        # crop out the GPO seal:
        im = im.crop((0, int((.06 if pagenumber==1 else 0) * im.size[0]), im.size[0], im.size[1]))

        # zealous-crop the vertical margins, but at least leaving a little
        # at the bottom so that when we paste the two pages of the two images
        # together they don't get totally scruntched, and put in some padding
        # at the top.
        # (.getbbox() crops out zeroes, so we'll invert the image to make it work with white)
        bbox = ImageOps.invert(im).getbbox()
        if bbox is not None:  # None means the page is entirely white
            vpad = int(.02*im.size[1])
            im = im.crop( (0, max(0, bbox[1]-vpad), im.size[0], min(im.size[1], bbox[3]+vpad) ) )

        return im

    # Rasterizes the first two pages of a PDF, tolerating one-page files.
    def first_two_pages(pdffile):
        first = pdftopng(pdffile, 1)
        try:
            second = pdftopng(pdffile, 2)
        except Exception:
            # may only be one page! Use a zero-height strip of the same
            # width so the pasting below still works.
            second = first.crop((0, 0, first.size[0], 0))
        return first, second

    # Find the PDF file and rasterize the first two pages.

    try:
        metadata = load_bill_text(bill, None, mods_only=True)
    except IOError:
        # if bill text metadata isn't available, trap the error
        # and just 404 it
        raise Http404()

    if metadata.get("pdf_file"):
        # Use the PDF files on disk.
        pg1, pg2 = first_two_pages(metadata.get("pdf_file"))
    elif settings.DEBUG:
        # When debugging in a local environment we may not have bill text available
        # so download the PDF from GPO.
        import os
        import subprocess
        import tempfile
        # Create the temp file before entering the try block so that the
        # cleanup in "finally" never refers to an unassigned name.
        (fd1, fn1) = tempfile.mkstemp(suffix=".pdf")
        try:
            os.close(fd1)
            subprocess.check_call(["/usr/bin/wget", "-O", fn1, "-q", metadata["gpo_pdf_url"]])
            pg1, pg2 = first_two_pages(fn1)
        finally:
            os.unlink(fn1)
    else:
        # No PDF is available.
        raise Http404()

    # Since some bills have big white space at the top of the first page,
    # we'll combine the first two pages and then shift the window down
    # until the real start of the bill.

    from PIL import Image, ImageOps
    img = Image.new(pg1.mode, (pg1.size[0], int(pg1.size[1]+pg2.size[1])))
    img.paste(pg1, (0,0))
    img.paste(pg2, (0,pg1.size[1]))

    # Zealous crop the (horizontal) margins. We do this only after the two
    # pages have been combined so that we don't mess up their alignment.
    # Add some padding.
    hpad = int(.02*img.size[0])
    bbox = ImageOps.invert(img).getbbox()
    if bbox is not None:  # None means there is nothing but white to crop to
        img = img.crop( (max(0, bbox[0]-hpad), 0, min(img.size[0], bbox[2]+hpad), img.size[1]) )

    # Now take a window from the top matching a particular aspect ratio.
    # We're going to display this next to photos of members of congress,
    # so use that aspect ratio.
    img = img.crop((0,0, img.size[0], int(240.0/200.0*img.size[0])))

    # Resize to requested width.
    if "width" in request.GET:
        # The width comes straight from the query string, so reject
        # anything that is not a positive integer instead of crashing.
        try:
            width = int(request.GET["width"])
        except ValueError:
            raise Http404("Invalid width.")
        if width <= 0:
            raise Http404("Invalid width.")
        img.thumbnail((width, 11.0/8.0*width), Image.ANTIALIAS)

    import io
    imgbytesbuf = io.BytesIO()
    img.save(imgbytesbuf, "PNG")
    imgbytes = imgbytesbuf.getvalue()
    imgbytesbuf.close()
    # content_type is the supported keyword; mimetype was deprecated
    # and later removed from HttpResponse.
    return HttpResponse(imgbytes, content_type="image/png")
Beispiel #21
0
 def get_text_info():
     """Return what ``load_bill_text`` yields for ``bill``, else None.

     The loader is invoked with version ``None`` and ``mods_only=True``.
     ``bill`` is not defined in this function and must come from the
     surrounding scope. An IOError from the loader results in ``None``.
     """
     from billtext import load_bill_text
     result = None
     try:
         result = load_bill_text(bill, None, mods_only=True)
     except IOError:
         pass  # leave result as None
     return result