Beispiel #1
0
def was_bill_enacted_2013(b, startdate, enddate):
    """Return True if bill ``b`` was enacted between ``startdate`` and ``enddate``.

    Both bounds are inclusive. A bill counts as enacted when either

    * ``b.current_status`` is in ``BillStatus.final_status_passed_bill`` and
      ``b.current_status_date`` falls inside the window, or
    * the bill's ``data.json`` file lists an action of type ``"signed"``
      whose ``acted_at`` value falls inside the window.

    Errors raised by ``open`` or ``json.load`` for a missing or malformed
    data file are not handled here and propagate to the caller.
    """
    # Our status code is currently tied to the assignment of a slip
    # law number, which isn't what we mean exactly.
    #
    # (Additionally, we should count a bill as enacted if any identified
    # companion bill is enacted.)

    # TODO: See new function in the Bill model.

    # If it *was* assigned a slip law number, which in the future might
    # be useful for veto overrides, then OK.
    if (b.current_status in BillStatus.final_status_passed_bill
            and startdate <= b.current_status_date <= enddate):
        return True

    # Otherwise, check the actions for a <signed> action.
    slug = BillType.by_value(b.bill_type).slug
    fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
        b.congress, slug, slug, b.number)
    # Close the file deterministically instead of leaking the handle.
    with open(fn) as f:
        bj = json.load(f)

    # The comparison below is a plain string comparison against ISO dates.
    # NOTE(review): an acted_at value that carries a time component on the
    # end date would sort after end_iso -- confirm signed actions are
    # date-only.
    start_iso = startdate.isoformat()
    end_iso = enddate.isoformat()
    for axn in bj["actions"]:
        if axn["type"] == "signed" and start_iso <= axn["acted_at"] <= end_iso:
            return True

    return False
def was_bill_enacted_2013(b, startdate, enddate):
    """Tell whether bill ``b`` was enacted within ``[startdate, enddate]``.

    Returns True when the bill's current status is one of
    ``BillStatus.final_status_passed_bill`` with a status date inside the
    (inclusive) window, or when the bill's ``data.json`` contains a
    ``"signed"`` action whose ``acted_at`` lies inside the window.
    Returns False otherwise.

    A missing or unparsable ``data.json`` raises from ``open``/``json.load``.
    """
    # Our status code is currently tied to the assignment of a slip
    # law number, which isn't what we mean exactly.
    #
    # (Additionally, we should count a bill as enacted if any identified
    # companion bill is enacted.)

    # TODO: See new function in the Bill model.

    # If it *was* assigned a slip law number, which in the future might
    # be useful for veto overrides, then OK.
    status_is_final = b.current_status in BillStatus.final_status_passed_bill
    if status_is_final and startdate <= b.current_status_date <= enddate:
        return True

    # Otherwise, check the actions for a <signed> action.
    slug = BillType.by_value(b.bill_type).slug
    fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
        b.congress,
        slug,
        slug,
        b.number)
    # A context manager guarantees the handle is closed once parsed.
    with open(fn) as data_file:
        bj = json.load(data_file)

    # ISO strings are computed once; acted_at is compared as a string.
    window_start = startdate.isoformat()
    window_end = enddate.isoformat()
    for axn in bj["actions"]:
        if axn["type"] != "signed":
            continue
        if window_start <= axn["acted_at"] <= window_end:
            return True

    return False
def was_bill_enacted(b, startdate, enddate, recurse=True):
    """Return True if bill ``b`` was enacted between ``startdate`` and ``enddate``.

    The window is inclusive at both ends. The bill is considered enacted
    when its current status is in ``BillStatus.final_status_passed_bill``
    with ``b.current_status_date`` inside the window, or when its
    ``data.json`` lists a ``"signed"`` action dated inside the window.

    ``recurse`` is accepted for the companion-bill check, which is currently
    disabled (see the commented-out code at the end); it has no effect.

    Errors from ``open``/``json.load`` on the bill's data file propagate.
    """
    # Our status code is currently tied to the assignment of a slip
    # law number, which isn't what we mean exactly.
    #
    # (Additionally, we should count a bill as enacted if any identified
    # companion bill is enacted.)

    # If it *was* assigned a slip law number, which in the future might
    # be useful for veto overrides, then OK.
    if (b.current_status in BillStatus.final_status_passed_bill
            and startdate <= b.current_status_date <= enddate):
        return True

    # Otherwise, check the actions for a <signed> action.
    slug = BillType.by_value(b.bill_type).slug
    fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
        b.congress,
        slug,
        slug,
        b.number)
    # Read through a context manager so the file handle is not leaked.
    with open(fn) as f:
        bj = json.load(f)

    start_iso = startdate.isoformat()
    end_iso = enddate.isoformat()
    for axn in bj["actions"]:
        if axn["type"] == "signed" and start_iso <= axn["acted_at"] <= end_iso:
            return True

    # Otherwise check companion bills. (Disabled.)
    #if recurse:
    #    for rb in RelatedBill.objects.filter(bill=b, relation="identical").select_related("related_bill"):
    #        if was_bill_enacted(rb.related_bill, startdate, enddate, recurse=False):
    #            return True

    return False
Beispiel #4
0
def get_bill_text_metadata(bill, version):
    """Load the metadata dict for one text version of ``bill``.

    ``version`` names a sub-directory of the bill's ``text-versions``
    directory. When it is None, every ``*/data.json`` below that directory
    is read and the one with the greatest ``issued_on`` value is used.

    Returns the parsed ``data.json`` dict, extended with path and flag keys
    for each companion file that exists on disk (``mods_file``,
    ``text_file``, ``html_file``, ``pdf_file``, ``xml_file`` and the related
    ``has_displayable_text`` / ``has_thumbnail`` / ``thumbnail_path`` /
    ``*_source`` keys). Returns None when ``version`` is None and no version
    metadata is found. A missing ``data.json`` for an explicit ``version``
    raises from ``open``.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bill_type = BillType.by_value(bill.bill_type)
    bt = bill_type.slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    basename += "/" + dat["version_code"]

    bt2 = bill_type.xml_code
    html_fn = "data/us/bills.text/%s/%s/%s%d%s.html" % (bill.congress, bt2, bt2, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        # Flag text whose source list mentions "statutes".
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = "data/us/bills.text/%s/%s/%s%d%s.pdf" % (bill.congress, bt2, bt2, bill.number, dat["version_code"])
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists; catoxml.xml takes precedence over
    # document.xml
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    return dat
Beispiel #5
0
def get_bill_text_metadata(bill, version):
    """Load the metadata dict for one text version of ``bill``.

    ``version`` names a sub-directory of the bill's ``text-versions``
    directory; None selects the version whose ``data.json`` has the greatest
    ``issued_on`` value.

    Returns the parsed ``data.json`` dict, extended with keys for each
    companion file found on disk (``mods_file``, ``text_file``,
    ``html_file``, ``xml_file``, ``thumbnail_path`` and the related
    ``has_displayable_text`` / ``*_source`` keys), or None when ``version``
    is None and no version metadata exists. A missing ``data.json`` for an
    explicit ``version`` raises from ``open``.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bill_type = BillType.by_value(bill.bill_type)
    bt = bill_type.slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    basename += "/" + dat["version_code"]

    bt2 = bill_type.xml_code
    html_fn = "data/us/bills.text/%s/%s/%s%d%s.html" % (
        bill.congress, bt2, bt2, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        # Flag text whose source list mentions "statutes".
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get an XML file if one exists; catoxml.xml takes precedence over
    # document.xml
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # record the thumbnail image if one exists
    thumb_fn = "data/us/bills.text/%s/%s/%s%d%s-thumb200.png" % (
        bill.congress, bt2, bt2, bill.number, dat["version_code"])
    if os.path.exists(thumb_fn):
        dat["thumbnail_path"] = thumb_fn

    return dat
def get_transparency_stats(person, role, stats, congress, startdate, enddate):
    """Store transparency-bill statistics for ``person`` in ``stats``.

    The list of transparency bills is read from
    ``analysis/transparency-bills.txt`` (first whitespace-delimited token of
    each line is a bill id) and cached in the module-level
    ``transparency_bills``. The function writes
    ``stats["transparency-bills"]`` with:

    * ``value``: 3 points per sponsored bill plus 1 per cosponsored bill,
    * ``sponsored`` / ``cosponsored``: entries built by ``make_bill_entries``,
    * ``num_bills``: number of cached bills in the chamber of ``role``,
    * ``chamber``: that chamber.

    Returns None; the result is delivered by mutating ``stats``.
    """
    global transparency_bills
    if not transparency_bills:
        # Build into a local list and publish it only once the whole file
        # has been read, so a failure midway cannot leave a partial cache.
        # NOTE(review): the cache keeps only bills of the ``congress`` passed
        # on the call that filled it; later calls with a different congress
        # reuse that list -- confirm callers use a single congress.
        loaded = []
        with open("analysis/transparency-bills.txt") as f:
            for line in f:
                bill = Bill.from_congressproject_id(re.split(r"\s", line)[0])
                if bill.congress != congress:
                    continue
                loaded.append(bill)
        transparency_bills = loaded

    chamber = RoleType.by_value(role.role_type).congress_chamber

    # which bills are in the right chamber?
    plausible_bills = [
        bill for bill in transparency_bills
        if BillType.by_value(bill.bill_type).chamber == chamber
    ]

    # did person sponsor any of these within this session?
    sponsored = [
        bill for bill in transparency_bills
        if startdate <= bill.introduced_date <= enddate and bill.sponsor == person
    ]

    # did person cosponsor any of these within this session?
    cosponsored = [
        cosp.bill
        for cosp in Cosponsor.objects.filter(
            person=person, bill__in=transparency_bills,
            joined__gte=startdate, joined__lte=enddate)
    ]

    stats["transparency-bills"] = {
        "value": len(sponsored)*3 + len(cosponsored),
        "sponsored": make_bill_entries(sponsored),
        "cosponsored": make_bill_entries(cosponsored),
        "num_bills": len(plausible_bills),
        "chamber": chamber,
    }
Beispiel #7
0
def get_bill_number(bill, show_congress_number="ARCHIVAL"):
    """Compute display form of bill number.

    The result is the bill type's label followed by the bill number. The
    ordinal of the bill's congress is appended in parentheses when
    ``show_congress_number`` is "ALL", or when it is "ARCHIVAL" and the bill
    is not from ``settings.CURRENT_CONGRESS``.
    """
    from bill.models import BillType

    type_label = BillType.by_value(bill.bill_type).label
    display = '%s %s' % (type_label, bill.number)

    from_other_congress = bill.congress != settings.CURRENT_CONGRESS
    wants_archival = show_congress_number == "ARCHIVAL"
    if (from_other_congress and wants_archival) or show_congress_number == "ALL":
        display = display + ' (%s)' % ordinal(bill.congress)

    return display
Beispiel #8
0
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load the HTML text and MODS metadata for one text version of a bill.

    Bills before the 103rd Congress, and any ``plain_text`` request, are
    delegated to ``load_bill_text_alt``. Otherwise ``<basename>.html`` is
    read (skipped when ``mods_only`` is true, in which case ``text_html`` is
    None) and ``<basename>.mods.xml`` is parsed.

    Returns a dict with the bill id and title, the file basename, the HTML
    text, the document date as a ``datetime.date``, GPO URLs, the version
    code and its display name, the page count, and the list of citations
    found in the MODS identifiers.

    Errors from ``open``, from parsing the MODS file, from converting the
    issue date, and from the ``bill_gpo_status_codes`` lookup propagate.
    """
    if bill.congress < 103 or plain_text:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        # Close the HTML file deterministically instead of leaking the handle.
        with open(basename + ".html") as f:
            bill_text_content = f.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = {"mods": "http://www.loc.gov/mods/v3"}

    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages:
        # Spell out a trailing " p." as " pages".
        numpages = re.sub(r" p\.$", " pages", numpages)

    # dateIssued is split on "-" into the integer arguments of datetime.date.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))

    doc_version_name = bill_gpo_status_codes[doc_version]

    # load a list of citations as marked up by GPO
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        cite_type = cite.get("type")
        if cite_type == "USC citation":
            citations.append(parse_usc_citation(cite))
        elif cite_type == "Statute citation":
            citations.append({"type": "statutes_at_large", "text": cite.text})
        elif cite_type == "public law citation":
            # Test the match explicitly rather than relying on a bare
            # ``except`` to catch the failure of a non-matching citation.
            m = re.match(r"Public Law (\d+)-(\d+)$", cite.text or "")
            if m:
                congress_cite, slip_law_num = m.groups()
                citations.append({"type": "slip_law", "text": cite.text, "congress": int(congress_cite), "number": int(slip_law_num)})
            else:
                citations.append({"type": "unknown", "text": cite.text})

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
Beispiel #9
0
def get_bill_text_version_regular(bill, version):
    """Load the metadata dict for the given text ``version`` of ``bill``.

    Reads ``<bill.data_dir_path>/text-versions/<version>/data.json`` and
    returns the parsed dict with these additions:

    * ``status_name`` and ``corresponding_status_codes`` derived from the
      version code,
    * ``issued_on`` converted from a "-"-separated string to ``datetime.date``,
    * path and flag keys for each companion file that exists on disk
      (``mods_file``, ``text_file``, ``html_file``, ``pdf_file``,
      ``xml_file``, ``has_displayable_text``, ``has_thumbnail``,
      ``thumbnail_path``, ``*_source``).

    ``has_thumbnail`` is forced to True when ``settings.DEBUG`` is set.
    A missing ``data.json`` raises from ``open``.
    """
    basename = bill.data_dir_path + "/text-versions"
    # Read through a context manager so the file handle is not leaked.
    with open(basename + "/%s/data.json" % version) as f:
        dat = json.load(f)

    version_code = dat["version_code"]
    dat["status_name"] = get_gpo_status_code_name(version_code)
    dat["corresponding_status_codes"] = get_gpo_status_code_corresponding_status(
        version_code)
    dat["issued_on"] = datetime.date(
        *(int(d) for d in dat["issued_on"].split("-")))

    # find content files

    basename += "/" + version_code

    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    bt = BillType.by_value(bill.bill_type).slug
    html_fn = "data/congress-bill-text-legacy/%d/%s/%s%d/%s.html" % (
        bill.congress, bt, bt, bill.number, version_code)

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        # Flag text whose source list mentions "statutes".
        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = basename + "/document.pdf"
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists; catoxml.xml takes precedence over
    # document.xml
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    if settings.DEBUG:
        dat["has_thumbnail"] = True

    return dat
Beispiel #10
0
def get_bill_number(bill, show_congress_number="ARCHIVAL"):
    """Compute display form of bill number.

    For congresses up to and including the 42nd the number is taken from
    the part of ``bill.title`` before the first colon; otherwise it is the
    bill type's label followed by the bill number. The ordinal of the bill's
    congress is appended in parentheses when ``show_congress_number`` is
    "ALL", or when it is "ARCHIVAL" and the bill is not from
    ``settings.CURRENT_CONGRESS``.
    """
    if bill.congress > 42:
        from bill.models import BillType
        type_label = BillType.by_value(bill.bill_type).label
        display = '%s %s' % (type_label, bill.number)
    else:
        # This is an American Memory bill. Its number is stored.
        display = bill.title.split(":")[0]

    from_other_congress = bill.congress != settings.CURRENT_CONGRESS
    wants_archival = show_congress_number == "ARCHIVAL"
    if (from_other_congress and wants_archival) or show_congress_number == "ALL":
        display = display + ' (%s)' % ordinal(bill.congress)

    return display
Beispiel #11
0
def bill_text(request, congress, type_slug, number, version=None):
    """Collect the data needed to display the text of one bill.

    Looks up the bill identified by ``congress``, ``type_slug`` and
    ``number`` (raising ``Http404`` for an unknown type slug or bill), loads
    the requested text ``version`` (an empty string is treated as None), and
    returns a dict with the bill, its congress dates, the loaded text data
    (None when loading raised ``IOError``), the requested version, the list
    of alternate versions sorted by document date (None when there is no
    text data), and a de-duplicated list of ``(bill, version)`` pairs for
    related bills.
    """
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        # The type code does not change inside the loop; look it up once.
        xml_code = BillType.by_value(bill.bill_type).xml_code
        alternates = []
        for v in bill_gpo_status_codes:
            fn = "data/us/bills.text/%s/%s/%s%d%s.mods.xml" % (bill.congress, xml_code, xml_code, bill.number, v)
            if os.path.exists(fn):
                alternates.append(load_bill_text(bill, v, mods_only=True))
        alternates.sort(key=lambda mods: mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []

    def add_related(entry):
        # Append a (bill, version) pair unless it is already listed.
        if entry not in related_bills:
            related_bills.append(entry)

    for rb in list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]:
        try:
            rbv = get_current_version(rb)
        except IOError:
            continue  # text not available
        add_related((rb, rbv))
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        add_related((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        add_related((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
Beispiel #12
0
def bill_text(request, congress, type_slug, number, version=None):
    """Collect the data needed to display the text of one bill.

    The bill is identified by ``congress``, ``type_slug`` and ``number``;
    an unknown type slug or bill raises ``Http404``. An empty ``version``
    string is treated as None.

    Returns a dict with keys ``bill``, ``congressdates``, ``textdata``
    (None when loading the text raised ``IOError``), ``version``,
    ``alternates`` (other versions sorted by document date, or None when
    there is no text data) and ``related_bills`` (a de-duplicated list of
    ``(bill, version)`` pairs).
    """
    if version == "":
        version = None

    try:
        bill_type = BillType.by_slug(type_slug)
    except BillType.NotFound:
        raise Http404("Invalid bill type: " + type_slug)
    bill = get_object_or_404(Bill, congress=congress, bill_type=bill_type, number=number)

    from billtext import load_bill_text, bill_gpo_status_codes
    try:
        textdata = load_bill_text(bill, version)
    except IOError:
        textdata = None

    # Get a list of the alternate versions of this bill.
    alternates = None
    if textdata:
        # Loop-invariant: resolve the type's XML code a single time.
        xml_code = BillType.by_value(bill.bill_type).xml_code
        alternates = []
        for v in bill_gpo_status_codes:
            fn = "data/us/bills.text/%s/%s/%s%d%s.mods.xml" % (bill.congress, xml_code, xml_code, bill.number, v)
            if os.path.exists(fn):
                alternates.append(load_bill_text(bill, v, mods_only=True))
        alternates.sort(key=lambda mods: mods["docdate"])

    # Get a list of related bills.
    from billtext import get_current_version
    related_bills = []

    def remember(pair):
        # Keep the first occurrence of each (bill, version) pair.
        if pair not in related_bills:
            related_bills.append(pair)

    candidates = list(bill.find_reintroductions()) + [r.related_bill for r in bill.get_related_bills()]
    for rb in candidates:
        try:
            rbv = get_current_version(rb)
        except IOError:
            continue  # text not available
        remember((rb, rbv))
    for btc in BillTextComparison.objects.filter(bill1=bill).exclude(bill2=bill):
        remember((btc.bill2, btc.ver2))
    for btc in BillTextComparison.objects.filter(bill2=bill).exclude(bill1=bill):
        remember((btc.bill1, btc.ver1))

    return {
        'bill': bill,
        "congressdates": get_congress_dates(bill.congress),
        "textdata": textdata,
        "version": version,
        "alternates": alternates,
        "related_bills": related_bills,
    }
Beispiel #13
0
def get_bill_text_metadata(bill, version):
    """Load the ``data.json`` metadata for one text version of ``bill``.

    ``version`` names a sub-directory of the bill's ``text-versions``
    directory; None selects the version with the greatest ``issued_on``
    value. The returned dict gains a ``plain_text_file`` key holding the
    path of that version's ``document.txt`` (the path is not checked for
    existence).

    Returns None when ``version`` is None and no version metadata is found.
    A missing ``data.json`` for an explicit ``version`` raises from ``open``.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    dat["plain_text_file"] = basename + "/" + dat["version_code"] + "/document.txt"

    return dat
Beispiel #14
0
def load_bill_text_alt(bill, version, plain_text=False, mods_only=False):
    """Load bill text info from the Congress project JSON files.

    ``version`` names a ``<version>.json`` file in the bill's
    ``text-versions`` directory; None selects the file with the greatest
    ``issued_on`` value. ``plain_text`` is accepted but not used here.

    Only metadata is available from this source: unless ``mods_only`` is
    true the function raises ``Exception("Bill text not available.")`` after
    reading the metadata. With ``mods_only`` it returns a dict with the bill
    id and title, the directory basename, the document date, GPO URLs, and
    the version code and display name, with ``has_html_text`` set to False.

    Raises ``IOError`` when the version file cannot be opened or, for
    ``version`` None, when no version file exists.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*.json"):
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
    else:
        with open(basename + "/%s.json" % version) as f:
            dat = json.load(f)

    if not mods_only:
        raise Exception("Bill text not available.")

    if dat is None:
        # No version file was found. Report it the same way a missing file
        # for an explicit version is reported, rather than failing below
        # with a TypeError on ``dat["urls"]``.
        raise IOError("Bill text not available.")

    gpo_url = dat["urls"]["pdf"]
    # Rewrite Statutes-at-Large PDF links to the matching content-detail page.
    m = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", gpo_url)
    if m:
        gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "docdate": datetime.date(*(int(d) for d in dat["issued_on"].split("-"))),
        "gpo_url": gpo_url,
        "gpo_pdf_url": dat["urls"]["pdf"],
        "doc_version": dat["version_code"],
        "doc_version_name": bill_gpo_status_codes[dat["version_code"]],
        "has_html_text": False,
    }
Beispiel #15
0
def get_bill_text_metadata(bill, version):
    """Load the ``data.json`` metadata for one text version of ``bill``.

    ``version`` names a sub-directory of the bill's ``text-versions``
    directory; when it is None the version with the greatest ``issued_on``
    value is chosen. A ``plain_text_file`` key with the path of that
    version's ``document.txt`` is added to the returned dict (the path is
    not checked for existence).

    Returns None when ``version`` is None and no version metadata is found.
    A missing ``data.json`` for an explicit ``version`` raises from ``open``.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            # The context manager closes each file as soon as it is parsed.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat:
            return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    dat["plain_text_file"] = "%s/%s/document.txt" % (
        basename, dat["version_code"])

    return dat
#!script

import os.path
from bill.models import Bill, BillType

# Ids of every Bill, materialized as a list up front so the loop below can
# walk them in fixed-size batches.
all_bill_ids = list(Bill.objects.all().values_list('id', flat=True))


def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable``, each at most ``n`` items long.

    ``iterable`` must support ``len()`` and slicing. The last slice holds
    whatever items remain and may be shorter than ``n``.
    """
    total = len(iterable)
    start = 0
    for start in range(0, total, n):
        # Slicing clamps the upper bound to the sequence length.
        yield iterable[start:start + n]


# Report every bill whose data.json file is absent on disk, processing the
# bills 2000 ids at a time.
for idset in batch(all_bill_ids, n=2000):
    print("...")  # one marker per batch
    bills_by_id = Bill.objects.only('congress', 'bill_type',
                                    'number').in_bulk(idset)
    for bill in bills_by_id.values():
        slug = BillType.by_value(bill.bill_type).slug
        fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
            bill.congress, slug, slug, bill.number)

        if os.path.exists(fn):
            continue
        print(bill.id, bill)
#!script

import os.path
from bill.models import Bill, BillType

# Ids of every Bill, materialized as a list up front so the loop below can
# walk them in fixed-size batches.
all_bill_ids = list(Bill.objects.all().values_list('id', flat=True))

def batch(iterable, n = 1):
    """Split ``iterable`` into consecutive chunks of at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing; chunks are yielded in
    order and the final chunk may be shorter than ``n``.
    """
    size = len(iterable)
    offsets = range(0, size, n)
    for offset in offsets:
        upper = min(offset + n, size)
        yield iterable[offset:upper]

# Report every bill whose data.json file is absent on disk, processing the
# bills 2000 ids at a time.
for idset in batch(all_bill_ids, n=2000):
    # Single-argument, parenthesized print calls produce the same output on
    # Python 2 and Python 3; the bare print statement is a syntax error on
    # Python 3.
    print("...")
    for bill in Bill.objects.only('congress', 'bill_type', 'number').in_bulk(idset).values():
        slug = BillType.by_value(bill.bill_type).slug
        fn = "data/congress/%s/bills/%s/%s%d/data.json" % (
            bill.congress,
            slug,
            slug,
            bill.number)

        if not os.path.exists(fn):
            # Id and bill separated by one space, as the print statement
            # printed them.
            print("%s %s" % (bill.id, bill))


Beispiel #18
0
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load the HTML text and MODS metadata for one text version of a bill.

    Bills from before the 103rd Congress, and any plain_text request, are
    delegated to load_bill_text_alt. Otherwise the function reads
    "<basename>.html" (skipped when mods_only is true) and parses
    "<basename>.mods.xml", returning a dict with the document date, GPO
    URLs, version code and name, page count and the citations found in the
    MODS identifiers.

    Errors from open(), from parsing the MODS file, from the date
    conversion and from the bill_gpo_status_codes lookup are not handled
    here and propagate to the caller.
    """
    if bill.congress < 103 or plain_text:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)
    
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    # Common path prefix of this version's files; a None version contributes
    # an empty suffix.
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version != None else "")
    
    if mods_only:
        bill_text_content = None
    else:
        bill_text_content = open(basename + ".html").read()
    
    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = { "mods": "http://www.loc.gov/mods/v3" }
    
    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    # Spell out a trailing " p." as " pages".
    if numpages: numpages = re.sub(r" p\.$", " pages", numpages)
    
    # dateIssued is split on "-" into the integer arguments of datetime.date.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))
    
    doc_version_name = bill_gpo_status_codes[doc_version]
    
    # citations
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        if cite.get("type") == "USC citation":
            # Any exception in this branch -- including a non-matching regex,
            # where .groups() is called on None -- is caught below and the
            # citation is recorded with type "unknown".
            try:
                # Groups: title, optional " App." marker, section, and a
                # trailing part (parenthesized paragraph, "et seq" or "note").
                title_cite, title_app_cite, sec_cite, para_cite = re.match(r"(\d+\S*)\s*U.S.C.(\s*App.)?\s*([^\s(]+?)?\s*(\(.*|et ?seq\.?|note)?$", cite.text).groups()
                # An appendix title is marked by suffixing "a" to the title.
                if title_app_cite: title_cite += "a"
                if para_cite and para_cite.strip() == "": para_cite = None
                
                if not para_cite and "-" in sec_cite:
                    # This dash may indicate a range of sections, or it may just be
                    # a dash that occurs within section names. Be smart and try to
                    # figure it out.
                    found_range = False
                    sec_dash_parts = sec_cite.split("-")
                    # NOTE: xrange exists only on Python 2.
                    for i in xrange(1, len(sec_dash_parts)):
                        # Split the citation around each particular dash, and if both
                        # halves are valid citations with the same parent then assume
                        # this is a range. (A nice case is 16 U.S.C. 3839aa-8, where
                        # both 3839aa and 8 are valid sections but are far apart.)
                        sec_parts = ["-".join(sec_dash_parts[:i]),
                                     "-".join(sec_dash_parts[i:])]
                        from models import USCSection
                        sec_parent = None
                        for sec_part in sec_parts:
                            matched_sec = list(USCSection.objects.filter(citation="usc/" + title_cite + "/" + sec_part))
                            if len(matched_sec) == 0:
                                break # part doesn't exist, skip the else block below and fall through to assume this is not a range
                            if sec_parent == None:
                                sec_parent = matched_sec[0].parent_section_id
                            else:
                                if sec_parent != matched_sec[0].parent_section_id:
                                    break # likewise, parents dont match so not a range
                        else:
                            # This else belongs to the inner for loop and runs
                            # only when neither break above fired.
                            # Both parts exist. Treat as a USC citation range.
                            citations.append({ "type": "usc", "text": cite.text, "title": title_cite, "section": sec_parts[0], "paragraph": None, "range_to_section": sec_parts[1] })
                            found_range = True
                            break
                    # A range was recorded; skip the single-section entry below.
                    if found_range: continue
                    
                citations.append({ "type": "usc", "text": cite.text, "title": title_cite, "section": sec_cite, "paragraph" : para_cite })
            except:
                citations.append({ "type": "unknown", "text": cite.text })
        elif cite.get("type") == "Statute citation":
            citations.append({ "type": "statutes_at_large", "text": cite.text })
        elif cite.get("type") == "public law citation":
            # As above, a non-matching citation ends up in the except branch.
            try:
                congress_cite, slip_law_num = re.match(r"Public Law (\d+)-(\d+)$", cite.text).groups()
                citations.append({ "type": "slip_law", "text": cite.text, "congress": int(congress_cite), "number": int(slip_law_num) })
            except:
                citations.append({ "type": "unknown", "text": cite.text })
        else:
            # Identifiers of any other type are ignored.
            continue
            
    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
Beispiel #19
0
def analysis_methodology(request):
    """Build the context dict for the analysis-methodology page.

    The returned dict contains:

    * ``ideology``: a zero-argument callable (deferred until a cache miss)
      producing the House and Senate chart data from the sponsorship
      analysis of the current congress,
    * ``prognosis_factors``: one dict per prognosis model with its factors
      sorted by descending ``regression_beta``,
    * ``prognosis_test``: one dict per model test result,
    * the current congress and the training/testing congress values.

    Both prognosis lists are built from shallow copies of the per-model
    dicts, so the module-level data in ``bill.prognosis_model`` and
    ``bill.prognosis_model_test`` is not modified.
    """
    from settings import CURRENT_CONGRESS
    from person.models import RoleType
    from bill.models import BillType
    from us import get_congress_dates
    import json

    from person.analysis import load_sponsorship_analysis2

    def make_chart_series(role_type):
        # Group the analysis rows into one chart series per party.
        data = load_sponsorship_analysis2(CURRENT_CONGRESS, role_type, None)
        if not data:
            return None

        series_by_party = {}
        for p in data["all"]:
            series = series_by_party.setdefault(p["party"], {
                "type": "party",
                "party": p["party"],
                "data": [],
            })
            series["data"].append({
                "x": float(p["ideology"]),
                "y": float(p["leadership"]),
                "name": p["name"],
            })
        # Largest party first.
        series_list = sorted(series_by_party.values(), key=lambda s: len(s["data"]), reverse=True)

        data = dict(data)  # clone before modifying, just in case
        data["series"] = json.dumps(series_list)

        return data

    import bill.prognosis_model
    import bill.prognosis_model_test

    major_types = (BillType.house_bill, BillType.senate_bill)

    # Keys of ``factors`` are (bill type value, is_introduced_model) pairs.
    prognosis_factors = []
    for k, v in bill.prognosis_model.factors.items():
        v = dict(v)  # clone
        v["bill_type"] = BillType.by_value(k[0])
        v["is_introduced_model"] = k[1]
        v["factors"] = sorted(v["factors"].values(), key=lambda f: f["regression_beta"], reverse=True)
        prognosis_factors.append(v)
    prognosis_factors.sort(key=lambda m: (m["bill_type"] in major_types, m["count"]), reverse=True)

    # Copy each result dict as well: the original code only copied the list
    # of items, so the assignments below modified the module-level dicts.
    prognosis_test = [(k, dict(v)) for k, v in bill.prognosis_model_test.model_test_results.items()]
    for k, v in prognosis_test:
        v["bill_type"] = BillType.by_value(k[0])
        v["is_introduced_model"] = (k[1] == 0)
        v["success_name"] = bill.prognosis_model.factors[(k[0], (k[1] == 0))]["success_name"]
    # NOTE(review): the lookup above keys ``factors`` by (k[0], k[1] == 0)
    # while the sort key below uses the raw test-result key -- confirm both
    # are meant to select the same model.
    prognosis_test.sort(key=lambda kv: (kv[0][0] in major_types, bill.prognosis_model.factors[kv[0]]["count"]), reverse=True)
    prognosis_test = [kv[1] for kv in prognosis_test]

    return {
        "ideology": lambda: {  # defer until cache miss
            "house": make_chart_series(RoleType.representative),
            "senate": make_chart_series(RoleType.senator),
        },
        "current_congress": CURRENT_CONGRESS,
        "prognosis_training_congress": bill.prognosis_model.congress,
        "prognosis_training_congress_dates": get_congress_dates(bill.prognosis_model.congress),
        "prognosis_factors": prognosis_factors,
        "prognosis_test": prognosis_test,
        "prognosis_testing_traincongress": bill.prognosis_model_test.train_congress,
        "prognosis_testing_testcongress": bill.prognosis_model_test.test_congress,
    }
Beispiel #20
0
 def reference(bill):
     """Return the bill type's XML code, the congress, "-", and the bill number joined into one string."""
     type_info = BillType.by_value(bill.bill_type)
     pieces = [type_info.xml_code, str(bill.congress), "-", str(bill.number)]
     return "".join(pieces)
Beispiel #21
0
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load the text and MODS metadata for one text version of a bill.

    Bills from before the 103rd Congress are delegated to
    ``load_bill_text_alt``.

    Args:
        bill: object providing ``congress``, ``bill_type``, ``number``,
            ``id`` and ``title``.
        version: version code appended to the file base name; ``None``
            means no suffix is appended.
        plain_text: when true (and ``mods_only`` is false), return only the
            decoded contents of the ``.txt`` file, or ``""`` if that file
            cannot be opened.
        mods_only: when true, the bill text is not read and ``text_html``
            in the result is ``None``.

    Returns:
        The plain text when ``plain_text`` is requested; otherwise a dict
        with the keys ``bill_id``, ``bill_name``, ``basename``,
        ``text_html``, ``docdate``, ``gpo_url``, ``gpo_pdf_url``,
        ``doc_version``, ``doc_version_name``, ``numpages``,
        ``has_html_text`` and ``citations``.

    Raises:
        IOError: if the ``.html`` file cannot be opened.
    """
    if bill.congress < 103:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    elif plain_text:
        # plain_text never raises an IOError
        try:
            with open(basename + ".txt") as text_file:
                # Decode leniently, otherwise we get 'Chuck failed' in the
                # xapian_backend apparently due to decoding issue.
                return text_file.read().decode("utf8", "ignore")
        except IOError:
            return ""
    elif os.path.exists(basename + ".xml") and False:
        # NOTE: the "and False" keeps this XSLT rendering path disabled;
        # it is retained as-is so it can be re-enabled later.
        dom = lxml.etree.parse(basename + ".xml")
        transform = lxml.etree.parse(os.path.join(os.path.dirname(os.path.realpath(__file__)), "textxsl/billres.xsl"))
        transform = lxml.etree.XSLT(transform)
        result = transform(dom)

        # empty nodes cause HTML parsing problems, so remove them.
        # iterate in reverse document order so that we hit parents after
        # their children, since if we remove all of the children then we may
        # want to remove the parent too.
        for node in reversed(list(result.getiterator())):
            if node.xpath("string(.)") == "":
                node.getparent().remove(node)

        bill_text_content = lxml.etree.tostring(result.xpath("head/style")[0]) + lxml.etree.tostring(result.xpath("body")[0])
    else:
        # Close the file promptly instead of relying on garbage collection.
        with open(basename + ".html") as html_file:
            bill_text_content = html_file.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = { "mods": "http://www.loc.gov/mods/v3" }

    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages: numpages = re.sub(r" p\.$", " pages", numpages)

    # The issue date is split on "-" and passed to datetime.date as integers.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))

    doc_version_name = bill_gpo_status_codes[doc_version]

    # citations
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        cite_type = cite.get("type")
        if cite_type == "USC citation":
            try:
                title_cite, title_app_cite, sec_cite, para_cite = re.match(r"(\d+\S*)\s*U.S.C.(\s*App.)?\s*([^\s(]+?)?\s*(\(.*|et ?seq\.?|note)?$", cite.text).groups()
                if title_app_cite: title_cite += "a"
                citations.append({ "type": "usc", "text": cite.text, "title": title_cite, "section": sec_cite, "paragraph" : para_cite })
            except (AttributeError, TypeError):
                # AttributeError: the pattern did not match (re.match gave None).
                # TypeError: the element had no text to match against.
                citations.append({ "type": "unknown", "text": cite.text })
        elif cite_type == "Statute citation":
            citations.append({ "type": "statutes_at_large", "text": cite.text })
        elif cite_type == "public law citation":
            try:
                congress_cite, slip_law_num = re.match(r"Public Law (\d+)-(\d+)$", cite.text).groups()
                citations.append({ "type": "slip_law", "text": cite.text, "congress": int(congress_cite), "number": int(slip_law_num) })
            except (AttributeError, TypeError):
                # Unparseable citation text; record it without structure.
                citations.append({ "type": "unknown", "text": cite.text })

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
Beispiel #22
0
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load the HTML text and MODS metadata for one text version of a bill.

    Requests for bills before the 103rd Congress, and all plain-text
    requests, are delegated to ``load_bill_text_alt``.

    Args:
        bill: object providing ``congress``, ``bill_type``, ``number``,
            ``id`` and ``title``.
        version: version code appended to the file base name; ``None``
            means no suffix is appended.
        plain_text: when true, the call is forwarded to
            ``load_bill_text_alt``.
        mods_only: when true, the ``.html`` file is not read and
            ``text_html`` in the result is ``None``.

    Returns:
        Whatever ``load_bill_text_alt`` returns for delegated calls;
        otherwise a dict with the keys ``bill_id``, ``bill_name``,
        ``basename``, ``text_html``, ``docdate``, ``gpo_url``,
        ``gpo_pdf_url``, ``doc_version``, ``doc_version_name``,
        ``numpages``, ``has_html_text`` and ``citations``.

    Raises:
        IOError: if the ``.html`` file cannot be opened.
    """
    if bill.congress < 103 or plain_text:
        return load_bill_text_alt(bill,
                                  version,
                                  plain_text=plain_text,
                                  mods_only=mods_only)

    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (
        bill.congress, bt, bt, bill.number,
        version if version is not None else "")

    if mods_only:
        bill_text_content = None
    else:
        # Close the file promptly instead of relying on garbage collection.
        with open(basename + ".html") as html_file:
            bill_text_content = html_file.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = {"mods": "http://www.loc.gov/mods/v3"}

    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)",
                         namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath(
        "string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])",
        namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath(
        "string(mods:location/mods:url[@displayLabel='PDF rendition'])",
        namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)",
                             namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)",
                          namespaces=ns)
    if numpages: numpages = re.sub(r" p\.$", " pages", numpages)

    # The issue date is split on "-" and passed to datetime.date as integers.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))

    doc_version_name = bill_gpo_status_codes[doc_version]

    # load a list of citations as marked up by GPO
    citations = []
    for cite in mods.xpath("//mods:identifier", namespaces=ns):
        cite_type = cite.get("type")
        if cite_type == "USC citation":
            citations.append(parse_usc_citation(cite))
        elif cite_type == "Statute citation":
            citations.append({"type": "statutes_at_large", "text": cite.text})
        elif cite_type == "public law citation":
            try:
                congress_cite, slip_law_num = re.match(
                    r"Public Law (\d+)-(\d+)$", cite.text).groups()
                citations.append({
                    "type": "slip_law",
                    "text": cite.text,
                    "congress": int(congress_cite),
                    "number": int(slip_law_num)
                })
            except (AttributeError, TypeError):
                # AttributeError: the pattern did not match (re.match gave None).
                # TypeError: the element had no text to match against.
                citations.append({"type": "unknown", "text": cite.text})

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
        "citations": citations,
    }
Beispiel #23
0
def get_bill_text_metadata(bill, version):
    """Collect metadata about one text version of a bill from its data.json.

    Args:
        bill: object providing ``data_dir_path``, ``bill_type``,
            ``congress``, ``number`` and ``get_absolute_url()``.
        version: version code naming the sub-directory to read; ``None``
            selects the version whose ``issued_on`` value is greatest.

    Returns:
        The dict loaded from ``data.json``, augmented with ``status_name``,
        ``corresponding_status_codes``, a ``datetime.date`` in
        ``issued_on``, and entries for whichever content files exist on
        disk. ``None`` when ``version`` is ``None`` and no version files
        are found.

    Raises:
        IOError: if ``version`` is given and its ``data.json`` cannot be
            opened.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    basename = bill.data_dir_path + "/text-versions"

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            # Close each file promptly instead of relying on garbage collection.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat: return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    # human readable status name

    dat["status_name"] = get_gpo_status_code_name(dat["version_code"])
    dat["corresponding_status_codes"] = get_gpo_status_code_corresponding_status(dat["version_code"])

    # parse date

    dat["issued_on"] = datetime.date(*(int(d) for d in dat["issued_on"].split("-")))

    # find content files

    basename += "/" + dat["version_code"]

    bt = BillType.by_value(bill.bill_type).slug
    html_fn = "data/congress-bill-text-legacy/%d/%s/%s%d/%s.html" % (bill.congress, bt, bt, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = basename + "/document.pdf"
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists; catoxml.xml takes precedence over document.xml
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # In DEBUG mode a thumbnail is always reported, even without a PDF.
    if settings.DEBUG:
        dat["has_thumbnail"] = True

    return dat
Beispiel #24
0
def load_bill_text(bill, version, plain_text=False, mods_only=False):
    """Load the text and MODS metadata for one text version of a bill.

    Bills from before the 103rd Congress are delegated to
    ``load_bill_text_alt``.

    Args:
        bill: object providing ``congress``, ``bill_type``, ``number``,
            ``id`` and ``title``.
        version: version code appended to the file base name; ``None``
            means no suffix is appended.
        plain_text: when true (and ``mods_only`` is false), return only the
            decoded contents of the ``.txt`` file, or ``""`` if that file
            cannot be opened.
        mods_only: when true, the bill text is not read and ``text_html``
            in the result is ``None``.

    Returns:
        The plain text when ``plain_text`` is requested; otherwise a dict
        with the keys ``bill_id``, ``bill_name``, ``basename``,
        ``text_html``, ``docdate``, ``gpo_url``, ``gpo_pdf_url``,
        ``doc_version``, ``doc_version_name``, ``numpages`` and
        ``has_html_text``.

    Raises:
        IOError: if the ``.html`` file cannot be opened.
    """
    if bill.congress < 103:
        return load_bill_text_alt(bill, version, plain_text=plain_text, mods_only=mods_only)

    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency

    bt = BillType.by_value(bill.bill_type).xml_code
    basename = "data/us/bills.text/%s/%s/%s%d%s" % (bill.congress, bt, bt, bill.number, version if version is not None else "")

    if mods_only:
        bill_text_content = None
    elif plain_text:
        try:
            with open(basename + ".txt") as text_file:
                # Decode leniently, otherwise we get 'Chuck failed' in the
                # xapian_backend apparently due to decoding issue.
                return text_file.read().decode("utf8", "ignore")
        except IOError:
            return ""
    elif os.path.exists(basename + ".xml") and False:
        # NOTE: the "and False" keeps this XSLT rendering path disabled;
        # it is retained as-is so it can be re-enabled later.
        dom = lxml.etree.parse(basename + ".xml")
        transform = lxml.etree.parse(os.path.join(os.path.dirname(os.path.realpath(__file__)), "textxsl/billres.xsl"))
        transform = lxml.etree.XSLT(transform)
        result = transform(dom)

        # empty nodes cause HTML parsing problems, so remove them.
        # iterate in reverse document order so that we hit parents after
        # their children, since if we remove all of the children then we may
        # want to remove the parent too.
        for node in reversed(list(result.getiterator())):
            if node.xpath("string(.)") == "":
                node.getparent().remove(node)

        bill_text_content = lxml.etree.tostring(result.xpath("head/style")[0]) + lxml.etree.tostring(result.xpath("body")[0])
    else:
        # Close the file promptly instead of relying on garbage collection.
        with open(basename + ".html") as html_file:
            bill_text_content = html_file.read()

    mods = lxml.etree.parse(basename + ".mods.xml")
    ns = { "mods": "http://www.loc.gov/mods/v3" }
    docdate = mods.xpath("string(mods:originInfo/mods:dateIssued)", namespaces=ns)
    gpo_url = "http://www.gpo.gov/fdsys/search/pagedetails.action?packageId=" + mods.xpath("string(mods:recordInfo/mods:recordIdentifier[@source='DGPO'])", namespaces=ns)
    #gpo_url = mods.xpath("string(mods:identifier[@type='uri'])", namespaces=ns)
    gpo_pdf_url = mods.xpath("string(mods:location/mods:url[@displayLabel='PDF rendition'])", namespaces=ns)
    doc_version = mods.xpath("string(mods:extension/mods:billVersion)", namespaces=ns)
    numpages = mods.xpath("string(mods:physicalDescription/mods:extent)", namespaces=ns)
    if numpages: numpages = re.sub(r" p\.$", " pages", numpages)

    # The issue date is split on "-" and passed to datetime.date as integers.
    docdate = datetime.date(*(int(d) for d in docdate.split("-")))

    doc_version_name = bill_gpo_status_codes[doc_version]

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "basename": basename,
        "text_html": bill_text_content,
        "docdate": docdate,
        "gpo_url": gpo_url,
        "gpo_pdf_url": gpo_pdf_url,
        "doc_version": doc_version,
        "doc_version_name": doc_version_name,
        "numpages": numpages,
        "has_html_text": True,
    }
Beispiel #25
0
def get_bill_text_metadata(bill, version):
    """Collect metadata about one text version of a bill from its data.json.

    Args:
        bill: object providing ``bill_type``, ``congress``, ``number`` and
            ``get_absolute_url()``.
        version: version code naming the sub-directory to read; ``None``
            selects the version whose ``issued_on`` value is greatest.

    Returns:
        The dict loaded from ``data.json``, augmented with ``status_name``,
        ``corresponding_status_codes``, a ``datetime.date`` in
        ``issued_on``, and entries for whichever content files exist on
        disk. ``None`` when ``version`` is ``None`` and no version files
        are found.

    Raises:
        IOError: if ``version`` is given and its ``data.json`` cannot be
            opened.
    """
    from bill.models import BillType  # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (
        bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*/data.json"):
            # Close each file promptly instead of relying on garbage collection.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
        if not dat: return None
    else:
        with open(basename + "/%s/data.json" % version) as f:
            dat = json.load(f)

    # human readable status name

    dat["status_name"] = get_gpo_status_code_name(dat["version_code"])
    dat["corresponding_status_codes"] = get_gpo_status_code_corresponding_status(
        dat["version_code"])

    # parse date

    dat["issued_on"] = datetime.date(*(int(d)
                                       for d in dat["issued_on"].split("-")))

    # find content files

    basename += "/" + dat["version_code"]

    # The legacy HTML path is built from the bill type's xml_code, not its slug.
    bt2 = BillType.by_value(bill.bill_type).xml_code
    html_fn = "data/congress-bill-text-legacy/%s/%s/%s%d%s.html" % (
        bill.congress, bt2, bt2, bill.number, dat["version_code"])

    if os.path.exists(basename + "/mods.xml"):
        dat["mods_file"] = basename + "/mods.xml"

    # get a plain text file if one exists
    if os.path.exists(basename + "/document.txt"):
        dat["text_file"] = basename + "/document.txt"
        dat["has_displayable_text"] = True

        for source in dat.get("sources", []):
            if source["source"] == "statutes":
                dat["text_file_source"] = "statutes"

    # get an HTML file if one exists
    if os.path.exists(html_fn):
        dat["html_file"] = html_fn
        dat["has_displayable_text"] = True

    # get a PDF file if one exists
    pdf_fn = basename + "/document.pdf"
    if os.path.exists(pdf_fn):
        dat["pdf_file"] = pdf_fn
        dat["has_thumbnail"] = True
        dat["thumbnail_path"] = bill.get_absolute_url() + "/_text_image"

    # get an XML file if one exists; catoxml.xml takes precedence over document.xml
    if os.path.exists(basename + "/catoxml.xml"):
        dat["xml_file"] = basename + "/catoxml.xml"
        dat["has_displayable_text"] = True
        dat["xml_file_source"] = "cato-deepbills"
    elif os.path.exists(basename + "/document.xml"):
        dat["xml_file"] = basename + "/document.xml"
        dat["has_displayable_text"] = True

    # In DEBUG mode a thumbnail is always reported, even without a PDF.
    if settings.DEBUG:
        dat["has_thumbnail"] = True

    return dat
Beispiel #26
0
def load_bill_text_alt(bill, version, plain_text=False, mods_only=False):
    """Load bill text info from the Congress project JSON files.

    Args:
        bill: object providing ``congress``, ``bill_type``, ``number``,
            ``id`` and ``title``.
        version: version code naming the ``.json`` file to read; ``None``
            selects the version whose ``issued_on`` value is greatest.
        plain_text: when true (and ``mods_only`` is false), return only the
            text, with form feeds replaced by a row of ``=`` characters;
            ``""`` is returned if the text is unavailable.
        mods_only: when true, the text file is not read and ``text_html``
            in the result is ``None``.

    Returns:
        The plain text when ``plain_text`` is requested; otherwise a dict
        with the keys ``bill_id``, ``bill_name``, ``text_html``,
        ``basename``, ``docdate``, ``gpo_url``, ``gpo_pdf_url``,
        ``doc_version``, ``doc_version_name`` and ``has_html_text``.

    Raises:
        IOError: if the text is unavailable and the caller did not ask for
            plain text, or if the metadata has no ``urls``/``pdf`` entry.
    """
    from bill.models import BillType # has to be here and not module-level to avoid cyclic dependency
    import glob
    import json

    bt = BillType.by_value(bill.bill_type).slug
    basename = "data/congress/%d/bills/%s/%s%d/text-versions" % (bill.congress, bt, bt, bill.number)

    if version is None:
        # Cycle through files to find most recent version by date.
        dat = None
        for versionfile in glob.glob(basename + "/*.json"):
            # Close each file promptly instead of relying on garbage collection.
            with open(versionfile) as f:
                d = json.load(f)
            if not dat or d["issued_on"] > dat["issued_on"]:
                dat = d
    else:
        with open(basename + "/%s.json" % version) as f:
            dat = json.load(f)

    # Load the text content (unless mods_only is set).
    bill_text_content = None
    try:
        if not dat: raise IOError("Bill text is not available for this bill.")
        if not mods_only:
            with open(basename + "/" + dat["version_code"] + "/document.txt") as f:
                bill_text_content = f.read().decode("utf8")
    except IOError:
        # text not available
        if mods_only or not plain_text: raise # these calls require raising
        bill_text_content = "" # plain_text gets "" returned instead

    # Caller just wants the plain text?
    if not mods_only and plain_text:
        # replace form feeds with an indication of the page break
        return bill_text_content.replace(u"\u000C", "=============================================")

    # Caller wants HTML.
    if not mods_only:
        # Return the text wrapped in <pre>, and replace form feeds with an <hr>.
        import cgi
        bill_text_content = "<pre>" + cgi.escape(bill_text_content) + "</pre>"
        bill_text_content = bill_text_content.replace(u"\u000C", "<hr>")

    # Returning metadata?
    try:
        gpo_url = dat["urls"]["pdf"]
    except (KeyError, TypeError):
        # hmm, data format problem: the expected "urls"/"pdf" entry is missing
        # or the metadata does not have the expected structure.
        raise IOError("Bill metadata not available.")

    # PDF links matching the STATUTE pattern are rewritten to the
    # corresponding content-detail page.
    m = re.match(r"http://www.gpo.gov/fdsys/pkg/(STATUTE-\d+)/pdf/(STATUTE-\d+-.*).pdf", gpo_url)
    if m:
        gpo_url = "http://www.gpo.gov/fdsys/granule/%s/%s/content-detail.html" % m.groups()

    return {
        "bill_id": bill.id,
        "bill_name": bill.title,
        "text_html": bill_text_content,
        "basename": basename,
        "docdate": datetime.date(*(int(d) for d in dat["issued_on"].split("-"))),
        "gpo_url": gpo_url,
        "gpo_pdf_url": dat["urls"]["pdf"],
        "doc_version": dat["version_code"],
        "doc_version_name": bill_gpo_status_codes[dat["version_code"]],
        "has_html_text": True,
    }
Beispiel #27
0
 def reference(bill):
     """Return the bill's reference string.

     Concatenates the ``xml_code`` of the bill's type, the congress number,
     a hyphen, and the bill number.
     """
     type_code = BillType.by_value(bill.bill_type).xml_code
     congress_part = str(bill.congress)
     number_part = str(bill.number)
     return type_code + congress_part + "-" + number_part