def doit(congress):
    """Print page and word totals for the bills of one Congress.

    Two sets of totals are accumulated: one over every bill whose text
    and page count could be loaded, and one restricted to bills whose
    ``current_status`` is in ``BillStatus.final_status_enacted_bill``.
    The totals are printed on one line; the number of bills that had to
    be skipped is printed on a second line.

    congress -- value matched against ``Bill.congress``.
    """
    # Named ``totals`` rather than ``all`` so the builtin is not shadowed.
    totals = defaultdict(int)
    enacted = defaultdict(int)
    missing_text = 0

    qs = Bill.objects.filter(congress=congress)
    for b in tqdm.tqdm(qs, total=qs.count()):
        try:
            pp = load_bill_text(b, None, mods_only=True).get("numpages")
        except IOError:
            pp = None
        if pp is None:
            # Either the metadata could not be read or it carries no page
            # count. Adding None to the running total would raise
            # TypeError, so count the bill as missing instead.
            missing_text += 1
            continue
        wds = len(load_bill_text(b, None, plain_text=True).split(" "))

        # Every counted bill goes into the overall totals; enacted bills
        # are additionally tallied on their own.
        buckets = [totals]
        if b.current_status in BillStatus.final_status_enacted_bill:
            buckets.append(enacted)
        for bucket in buckets:
            bucket["count"] += 1
            bucket["pages"] += pp
            bucket["words"] += wds

    print(congress, totals["count"], totals["pages"], totals["words"],
          enacted["count"], enacted["pages"], enacted["words"])
    print("\t", missing_text, "missing text")
Beispiel #2
0
def compute_productivity(congress, date_range):
	"""Write one row of productivity statistics for a Congress to ``W``.

	The row holds the Congress number, its year span, the ISO form of both
	ends of ``date_range``, the number of bills of that Congress whose
	current status is in ``BillStatus.final_status_passed_bill``, the total
	pages and words of those bills, and the number of House and Senate
	votes created within ``date_range``.

	congress -- Congress number. Below 103 the page and word columns are
		written as the string "(no data)".
	date_range -- two-item sequence (start, end); each item must provide
		``isoformat()`` and is used to bound the vote queries.
	"""
	# laws

	enacted_bills = Bill.objects.filter(
		congress=congress, # if we're measuring presidential activity, the date of signing could be outside of the Congress, so change this
		current_status__in=BillStatus.final_status_passed_bill,
		#current_status_date__gte=date_range[0],
		#current_status_date__lte=date_range[1]
		)

	#enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct()

	enacted_bills = list(enacted_bills)
	enacted_bills_count = len(enacted_bills)

	enacted_bill_pages = 0
	enacted_bill_words = 0
	# Tallied for inspection while debugging; not written to the row.
	enacted_bill_pages_missing = 0
	for b in enacted_bills:
		try:
			pp = load_bill_text(b, None, mods_only=True).get("numpages")
		except IOError:
			pp = None
		if pp is None:
			enacted_bill_pages_missing += 1
			continue
		# The page count arrives as text such as "12 pages".
		enacted_bill_pages += int(pp.replace(" pages", ""))
		enacted_bill_words += len(load_bill_text(b, None, plain_text=True).split(" "))

	if congress < 103:
		# Page and word totals are not reported before the 103rd Congress.
		enacted_bill_pages = "(no data)"
		enacted_bill_words = "(no data)"

	# votes

	house_votes = Vote.objects.filter(
		congress=congress,
		created__gte=date_range[0],
		created__lte=date_range[1],
		chamber=CongressChamber.house).count()
	senate_votes = Vote.objects.filter(
		congress=congress,
		created__gte=date_range[0],
		created__lte=date_range[1],
		chamber=CongressChamber.senate).count()

	## power
	#congress_same_party = party_control[congress][0] == party_control[congress][1]
	#branches_same_party = (party_control[congress][0] == party_control[congress][1]) and (party_control[congress][0] == party_control[congress][2])

	# A Congress whose end date falls in January is labelled with the
	# previous calendar year as its last year.
	congress_dates = get_congress_dates(congress)
	first_year = congress_dates[0].year
	last_year = congress_dates[1].year
	if congress_dates[1].month == 1:
		last_year -= 1
	timespan = "%d-%d" % (first_year, last_year)

	row = [congress, timespan, date_range[0].isoformat(), date_range[1].isoformat(),
		enacted_bills_count, enacted_bill_pages, enacted_bill_words, house_votes, senate_votes]
	W.writerow(row)
def compute_productivity(congress, days_in):
	"""Write one row of statistics for a Congress, cut off ``days_in`` after its start.

	Counts the bills of ``congress`` whose current status is in
	``BillStatus.final_status_passed_bill`` and whose status date is on or
	before the cutoff, totals their pages and words, counts House and
	Senate votes created up to the cutoff, and records whether the parties
	listed in ``party_control[congress]`` match. The row is written to ``W``.

	congress -- Congress number. Below 103 the page and word columns are
		written as the string "(no data)".
	days_in -- offset added to the first date returned by
		``get_congress_dates(congress)`` (presumably a ``timedelta`` --
		confirm against the caller).
	"""
	congress_dates = get_congress_dates(congress)
	corresponding_day = congress_dates[0] + days_in

	# laws

	enacted_bills = Bill.objects.filter(
		congress=congress,
		current_status__in=BillStatus.final_status_passed_bill,
		current_status_date__lte=corresponding_day)

	#enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct()

	enacted_bills = list(enacted_bills)
	enacted_bills_count = len(enacted_bills)

	enacted_bill_pages = 0
	enacted_bill_words = 0
	# Tallied for inspection while debugging; not written to the row.
	enacted_bill_pages_missing = 0
	for b in enacted_bills:
		try:
			pp = load_bill_text(b, None, mods_only=True).get("numpages")
		except IOError:
			pp = None
		if pp is None:
			enacted_bill_pages_missing += 1
			continue
		# The page count arrives as text such as "12 pages".
		enacted_bill_pages += int(pp.replace(" pages", ""))
		enacted_bill_words += len(load_bill_text(b, None, plain_text=True).split(" "))

	if congress < 103:
		# Page and word totals are not reported before the 103rd Congress.
		enacted_bill_pages = "(no data)"
		enacted_bill_words = "(no data)"

	# votes

	house_votes = Vote.objects.filter(
		congress=congress,
		created__lte=corresponding_day,
		chamber=CongressChamber.house).count()
	senate_votes = Vote.objects.filter(
		congress=congress,
		created__lte=corresponding_day,
		chamber=CongressChamber.senate).count()

	# power

	control = party_control[congress]
	congress_same_party = control[0] == control[1]
	branches_same_party = congress_same_party and (control[0] == control[2])

	#

	timespan = "%d (%d-%d)" % (congress, congress_dates[0].year, congress_dates[1].year-1)
	row = [timespan, enacted_bills_count, enacted_bill_pages, enacted_bill_words, house_votes, senate_votes, "Yes" if congress_same_party else "No", "Yes" if branches_same_party else "No"]
	W.writerow(row)
Beispiel #4
0
def get_bill_paragraphs(bill):
	"""Count the paragraphs of a bill's text, keyed by MD5 digest.

	Every ``<p>`` element of the bill's HTML text is reduced to its text
	content, lower-cased, stripped of one leading parenthesized list marker,
	and has each run of non-word characters collapsed to a single space.
	Paragraphs that end up empty are ignored.

	Returns a dict mapping the hex MD5 digest of each normalized paragraph
	to the number of times it occurs, or None (after printing a message)
	when the text cannot be loaded or parsed.
	"""
	import lxml.etree
	import lxml.html
	from bill.billtext import load_bill_text
	from hashlib import md5

	try:
		dom = lxml.html.fromstring(load_bill_text(bill, None)["text_html"])
	except IOError:
		print("no bill text", bill.id, bill)
		return None
	except Exception as e:
		print("error in bill text", bill.id, bill, e)
		return None

	hashes = { }

	for node in dom.xpath("//p"):
		# tostring() with a byte encoding returns bytes, and md5() needs
		# bytes too, so the whole normalization is done with bytes patterns.
		# (str patterns applied to bytes raise TypeError on Python 3.)
		text = lxml.etree.tostring(node, method="text", encoding="utf8")
		text = text.lower() # normalize case
		text = re.sub(br"^\(.*?\)\s*", b"", text) # remove initial list numbering
		text = re.sub(br"\W+", b" ", text).strip() # normalize spaces and other non-word characters
		if not text: continue
		digest = md5(text).hexdigest()
		hashes[digest] = hashes.get(digest, 0) + 1

	return hashes
Beispiel #5
0
 def get_index_text(self):
     """Return the text blob used to index this bill.

     The blob is a newline-separated list made of the title, the display
     number without periods, the same number with spaces removed as well,
     and the third item of every entry in ``self.titles``; a blank line
     and the bill's plain text follow.
     """
     number = self.display_number_no_congress_number
     header_lines = [
         self.title,
         number.replace(".", ""),
         number.replace(".", "").replace(" ", ""),
     ]
     for entry in self.titles:
         header_lines.append(entry[2])
     header = "\n".join(header_lines)
     return header + "\n\n" + load_bill_text(self, None, plain_text=True)
Beispiel #6
0
def get_bill_paragraphs(bill):
    """Count the paragraphs of a bill's text, keyed by MD5 digest.

    Every ``<p>`` element of the bill's HTML text is reduced to its text
    content, lower-cased, stripped of one leading parenthesized list
    marker, and has each run of non-word characters collapsed to a single
    space. Paragraphs that end up empty are ignored.

    Returns a dict mapping the hex MD5 digest of each normalized paragraph
    to the number of times it occurs, or None (after printing a message)
    when the text cannot be loaded or parsed.
    """
    import lxml.etree
    import lxml.html
    from bill.billtext import load_bill_text
    from hashlib import md5

    try:
        dom = lxml.html.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        print("no bill text", bill.id, bill)
        return None
    except Exception as e:
        print("error in bill text", bill.id, bill, e)
        return None

    hashes = {}

    for node in dom.xpath("//p"):
        # tostring() with a byte encoding returns bytes, and md5() needs
        # bytes too, so the whole normalization is done with bytes
        # patterns. (str patterns applied to bytes raise TypeError on
        # Python 3.)
        text = lxml.etree.tostring(node, method="text", encoding="utf8")
        text = text.lower()  # normalize case
        text = re.sub(br"^\(.*?\)\s*", b"", text)  # remove initial list numbering
        # normalize spaces and other non-word characters
        text = re.sub(br"\W+", b" ", text).strip()
        if not text:
            continue
        digest = md5(text).hexdigest()
        hashes[digest] = hashes.get(digest, 0) + 1

    return hashes
def count_pages_of_bills(congress):
    """Write a page/word row to ``W`` for every enacted bill of a Congress.

    Only bills whose current status is in
    ``BillStatus.final_status_enacted_bill`` are considered. Each row
    holds the Congress number, its year span, the bill's URL, its page
    count and its word count; the same two counts are also collected in
    local per-metric lists. Bills from the 103rd Congress on that have no
    readable page count are skipped.
    """
    # Marker that stands in for a page break in text extracted from PDFs.
    page_separator = "\n=============================================\n"

    counters = defaultdict(list)
    missing_text = 0

    enacted = Bill.objects.filter(congress=congress).filter(
        current_status__in=BillStatus.final_status_enacted_bill)
    for bill in enacted:
        body = load_bill_text(bill, None, plain_text=True)

        if congress < 103:
            # For historical statutes we only have plain text from the
            # Statutes at Large, extracted from PDFs, so pages are counted
            # as the non-blank segments between page-break markers. This
            # does not work on modern bills, whose text came from the GPO
            # plain text format.
            pp = sum(1 for page in body.split(page_separator)
                     if page.strip() != "")
        else:
            # Bills since 1993 have GPO MODS XML metadata with page counts.
            try:
                pp = load_bill_text(bill, None, mods_only=True).get("numpages")
            except IOError:
                pp = None
            if pp is None:
                missing_text += 1
                continue

        wds = len(body.split(" "))

        years = "{}-{}".format(*get_congress_years(congress))
        W.writerow([congress, years, bill.get_absolute_url(), pp, wds])

        counters["pages"].append(pp)
        counters["words"].append(wds)
Beispiel #8
0
 def render_event_text(self, ev_code, feeds):
     """Build the event dict announcing that a text version of this bill exists.

     Raises a bare ``Exception`` when ``ev_code`` is not a key of
     ``bill_gpo_status_codes`` or when the PDF for that version is not on
     disk. The event's date is the PDF's modification time. When the
     version metadata cannot be read, placeholder date and version names
     are used. ``feeds`` is not used here.
     """
     from billtext import bill_gpo_status_codes, load_bill_text

     if ev_code not in bill_gpo_status_codes:
         raise Exception()

     type_code = BillType.by_value(self.bill_type).xml_code
     # use pdf since we don't modify it once we download it, and hopefully
     # we actually have a displayable format like HTML
     pdf_path = "data/us/bills.text/%s/%s/%s%d%s.pdf" % (
         self.congress, type_code, type_code, self.number, ev_code)
     if not os.path.exists(pdf_path):
         raise Exception()

     try:
         modsinfo = load_bill_text(self, ev_code, mods_only=True)
     except IOError:
         modsinfo = {
             "docdate": "Unknown Date",
             "doc_version_name": "Unknown Version",
         }

     text_template = """This {{noun}}'s text {% if doc_version_name != "Introduced" %}for status <{{doc_version_name}}> ({{doc_date}}) {% endif %}is now available."""
     html_template = """<p>This {{noun}}&rsquo;s text {% if doc_version_name != "Introduced" %}for status <i>{{doc_version_name}}</i> ({{doc_date}}) {% endif %}is now available.</p>"""

     return {
         "type": "Bill Text",
         "date": datetime.datetime.fromtimestamp(os.path.getmtime(pdf_path)),
         "date_has_no_time": False,
         "title": self.title,
         "url": self.get_absolute_url() + "/text",
         "body_text_template": text_template,
         "body_html_template": html_template,
         "context": {
             "noun": self.noun,
             "doc_date": modsinfo["docdate"],
             "doc_version_name": modsinfo["doc_version_name"],
         },
     }
Beispiel #9
0
])

# Write one row to C for every bill introduced during each session
# returned by get_all_sessions().
for congress, session, startdate, enddate in get_all_sessions():
    # Sessions of Congresses before the 103rd are skipped.
    if congress < 103: continue

    # Bills introduced within the session's dates, oldest first.
    bills = Bill.objects.filter(
        congress=congress,
        introduced_date__gte=startdate,
        introduced_date__lte=enddate,
    ).order_by('introduced_date')

    for b in bills:
        status = BillStatus.by_value(b.current_status)

        try:
            # numpages is parsed from text such as "12 pages".
            # NOTE(review): a None numpages raises AttributeError here,
            # which the handler below does not catch.
            pp = load_bill_text(b, None, mods_only=True).get("numpages")
            pp = int(pp.replace(" pages", ""))

            # Word count: number of space-separated pieces of the plain text.
            t = load_bill_text(b, None, plain_text=True)
            wds = len(t.split(" "))
        except IOError:
            # An IOError from either load marks both columns as "NA".
            wds = "NA"
            pp = "NA"

        # Row: congress, session, bill type slug, number, whether the noun
        # is "bill", status key, whether the status is in
        # final_status_passed_bill, pages, words.
        C.writerow([
            b.congress, session,
            BillType.by_value(b.bill_type).slug,
            str(b.number), b.noun == "bill", status.key, status
            in BillStatus.final_status_passed_bill,
            str(pp),
            str(wds)
def run_analysis_for_president(president, date_range):
	"""Record cumulative counts of enacted bills and pages for one president.

	Bills whose current status is in ``BillStatus.final_status_enacted_bill``
	and whose status date falls in the analysed window are grouped by the
	number of days since the window's start. For every day of the window,
	``stats[day][president]`` is set to the running ``(bills, pages)``
	totals, and ``president`` is appended to ``columns``.

	The PAGES environment variable filters the bills: "1" keeps only bills
	of at most one page, ">1" keeps only bills of more than one page.

	president -- label used as the progress-bar description, the column
		name and the key inside each ``stats`` entry.
	date_range -- pair of "YYYY-MM-DD" strings (start, end). The end is
		capped at today and at roughly 1.5 years after the start.

	Raises ValueError when a bill of the 103rd Congress or later has no
	usable page count.
	"""
	global stats
	global columns

	start_date = datetime.strptime(date_range[0], "%Y-%m-%d").date()
	end_date = datetime.strptime(date_range[1], "%Y-%m-%d").date()

	# limit to now, for the current president
	end_date = min(end_date, datetime.now().date())

	# limit to a shorter period than a whole presidency so this computes faster
	end_date = min(start_date+timedelta(days=365*1.5), end_date)

	# if we're measuring presidential activity, the date of signing could be outside of the Congress
	enacted_bills = Bill.objects.filter(
		current_status__in=BillStatus.final_status_enacted_bill,
		#sliplawpubpriv="PUB", # questionable
		current_status_date__gte=start_date, # only use this if looking at a final status
		current_status_date__lte=end_date
		)\
		.order_by('current_status_date')

	# last bill Obama signed was a rare Jan 20th morning
	enacted_bills = enacted_bills.exclude(id=347731)

	# The environment does not change while we loop, so read the filter once.
	pages_filter = os.environ.get("PAGES")

	by_day = { }
	for b in tqdm.tqdm(enacted_bills, desc=president):
		# Load plain text.
		text = load_bill_text(b, None, plain_text=True)

		# Bills since 1993 have GPO MODS XML metadata with page counts.
		try:
			mods = load_bill_text(b, None, mods_only=True)
			# A missing "numpages" yields None, whose .replace raises
			# AttributeError and lands in the handler below.
			pages = int(mods.get("numpages").replace(" pages", ""))
		except (IOError, AttributeError) as e:
			# For historical statutes we only have plain text from the
			# Statutes at Large, extracted from PDFs. We can get page
			# counts by looking for our replacement of the form feed
			# character put in by pdftotext. We only have that when
			# we extracted text from PDFs, which we only did for
			# the Statutes at Large. We can't do this on modern bills
			# where the text came from GPO plain text format.
			if b.congress < 103:
				pages = len([pgtext for pgtext in text.split("\n=============================================\n") if pgtext.strip() != ""])
			else:
				# print() call form, so the module also parses on Python 3.
				print(b.id, b, e)
				raise ValueError("page date missing")

		#words = len(re.split(r"\s+", text)) # not very good for pre-GPO bills because Statutes at Large pages may have multiple statutes on them

		if pages_filter == "1" and pages > 1: continue
		if pages_filter == ">1" and pages <= 1: continue

		rel_date = (b.current_status_date - start_date).days
		rec = by_day.setdefault(rel_date, { "bills": 0, "pages": 0 } )
		rec["bills"] += 1
		rec["pages"] += pages
		#rec["words"] += words

	# Compute cumulative counts starting on day 0 and for every day till the
	# end of the analysed window.
	columns.append(president)
	bills = 0
	pages = 0
	#words = 0
	for rel_date in range((end_date-start_date).days+1):
		if rel_date in by_day:
			bills += by_day[rel_date]["bills"]
			pages += by_day[rel_date]["pages"]
			#words += by_day[rel_date]["words"]
		stats.setdefault(rel_date, {})[president] = (bills, pages)
Beispiel #11
0
def compute_productivity(congress, date_range, label=None):
	"""Write one row of enacted-bill statistics for a Congress and date range to ``W``.

	Counts the bills of ``congress`` whose current status is in
	``BillStatus.final_status_enacted_bill`` and whose status date lies
	within ``date_range``, and totals their pages and words. If any bill
	of the 103rd Congress or later lacks a page count, both totals are
	written as the string "(missing)".

	congress -- Congress number used to filter the bills.
	date_range -- two-item sequence (start, end) of date objects; ``month``,
		``day`` and ``isoformat()`` are read from them.
	label -- value for the first column; falls back to ``congress`` when
		empty or None.
	"""
	# Marker that stands in for a page break in text extracted from PDFs.
	page_separator = "\n=============================================\n"

	# laws

	enacted_bills = Bill.objects.filter(
		congress=congress, # if we're measuring presidential activity, the date of signing could be outside of the Congress, so change this

		current_status__in=BillStatus.final_status_enacted_bill,
		#current_status=BillStatus.enacted_signed,
		current_status_date__gte=date_range[0],
		current_status_date__lte=date_range[1]

		#introduced_date__gte=date_range[0],
		#introduced_date__lte=date_range[1]

		)\
		.order_by('current_status_date')

	if date_range[0].month == 1 and date_range[0].day == 20:
		# last bill Obama signed was a rare Jan 20th morning
		enacted_bills = enacted_bills.exclude(id=347731)

	#enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct()

	enacted_bills = list(enacted_bills)
	enacted_bills_count = len(enacted_bills)

	enacted_bill_pages = 0
	enacted_bill_words = 0
	enacted_bill_pages_missing = 0
	for b in enacted_bills:
		# Load plain text.
		text = load_bill_text(b, None, plain_text=True)

		# Bills since 1993 have GPO MODS XML metadata with page counts.
		try:
			mods = load_bill_text(b, None, mods_only=True)
			# A missing "numpages" yields None, whose .replace raises
			# AttributeError and lands in the handler below.
			enacted_bill_pages += int(mods.get("numpages").replace(" pages", ""))
		except (IOError, AttributeError):
			# For historical statutes we only have plain text from the
			# Statutes at Large, extracted from PDFs. We can get page
			# counts by looking for our replacement of the form feed
			# character put in by pdftotext. We only have that when
			# we extracted text from PDFs, which we only did for
			# the Statutes at Large. We can't do this on modern bills
			# where the text came from GPO plain text format.
			if congress < 103:
				# Count only non-blank segments so that a leading or
				# trailing page-break marker does not add a phantom page.
				enacted_bill_pages += len([pgtext for pgtext in text.split(page_separator) if pgtext.strip() != ""])
			else:
				enacted_bill_pages_missing += 1

		enacted_bill_words += len(re.split(r"\s+", text))

	if enacted_bill_pages_missing:
		# A partial total would be misleading, so report neither figure.
		enacted_bill_pages = "(missing)"
		enacted_bill_words = "(missing)"

	row = [label or congress, date_range[0].isoformat(), date_range[1].isoformat(),
		enacted_bills_count, enacted_bill_pages, enacted_bill_words]
	W.writerow(row)
# Walk the Congresses from CURRENT_CONGRESS down to the 103rd and count how
# many passed House/Senate bills cite each kind of legal source.
for congress in range(CURRENT_CONGRESS, 103-1, -1):
	# Only bill types senate_bill and house_bill whose current status is in
	# final_status_passed_bill.
	enacted_bills = Bill.objects.filter(
		bill_type__in=(BillType.senate_bill, BillType.house_bill),
		congress=congress,
		current_status__in=BillStatus.final_status_passed_bill)

	#enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct()

	enacted_bills = list(enacted_bills)
	enacted_bills_count = len(enacted_bills)

	# Number of bills with at least one citation of each kind.
	enacted_bill_cites_usc = 0
	enacted_bill_cites_cfr = 0
	enacted_bill_cites_pl = 0
	for b in enacted_bills:
		# Citation types listed in the bill's metadata.
		metadata = load_bill_text(b, None, mods_only=True)
		cite_types = set()
		for cite in metadata["citations"]:
			cite_types.add(cite["type"])

		# A "cfr" citation type is added when the plain text matches either
		# pattern below, case-insensitively.
		# NOTE(review): these patterns are not raw strings, so "\s" and "\d"
		# are invalid string escapes on recent Python versions.
		text = load_bill_text(b, None, plain_text=True)
		m = re.search("code\s+of\s+federal\s+regulations", text, re.I)
		if m: cite_types.add("cfr")
		m = re.search("\d+ c\.?f\.?r\.?", text, re.I)
		if m: cite_types.add("cfr")

		# Each bill counts at most once per category.
		if "usc-chapter" in cite_types or "usc-section" in cite_types:
			enacted_bill_cites_usc += 1
		if "statutes_at_large" in cite_types or "slip_law" in cite_types:
			enacted_bill_cites_pl += 1
		if "cfr" in cite_types:
	# Body of a loop whose header is not part of this snippet; it reads
	# congress, session, startdate and enddate.
	# before the 103rd, there's no MODS data
	if congress < 103: continue

	# to get bills in the first session, look for bills that were
	# introduced before the end of the session.
	bills = Bill.objects.filter(
		congress=congress,
		introduced_date__gte=startdate,
		introduced_date__lte=enddate,
		)

	# get page counts by GPO version code
	page_counts = { }
	for b in bills:
		try:
			mods = load_bill_text(b, None, mods_only=True)
		except IOError:
			# Bills whose metadata cannot be read are counted (one each,
			# not by page) under the pseudo-status "MISSING_TEXT".
			status = "MISSING_TEXT"
			page_counts[status] = page_counts.get(status, 0) + 1
			continue

		# A missing or blank version code is grouped under "UNKNOWN".
		status = mods["doc_version"]
		if status is None or status.strip() == "": status = "UNKNOWN"
		# Remember the display name of the version code for the output row.
		status_names[status] = mods["doc_version_name"]
		# numpages is parsed from text such as "12 pages".
		pp = int(mods.get("numpages").replace(" pages", ""))
		page_counts[status] = page_counts.get(status, 0) + pp

	# One output row per version code, largest count first.
	# NOTE(review): status_names is only filled for statuses read from
	# metadata, so looking up "MISSING_TEXT" depends on an entry set up
	# outside this snippet -- confirm.
	for status, page_count in sorted(page_counts.items(), key = lambda kv : -kv[1]):
		w.writerow([str(congress), session, status, status_names[status], str(page_count)])

# Header row for the per-bill table written below.
C.writerow([ "congress", "session", "billtype", "billnumber", "isbill", "finalstatus", "islaw", "pages", "words"])

# Write one row per bill introduced during each session returned by
# get_all_sessions().
for congress, session, startdate, enddate in get_all_sessions():
	# Sessions of Congresses before the 103rd are skipped.
	if congress < 103: continue

	bills = Bill.objects.filter(
		congress=congress,
		introduced_date__gte=startdate,
		introduced_date__lte=enddate,
		).order_by('introduced_date')

	for b in bills:
		status = BillStatus.by_value(b.current_status)

		try:
			# numpages is parsed from text such as "12 pages".
			pp = load_bill_text(b, None, mods_only=True).get("numpages")
			pp = int(pp.replace(" pages", ""))

			# Word count: number of space-separated pieces of the plain text.
			t = load_bill_text(b, None, plain_text=True)
			wds = len(t.split(" "))
		except (IOError, AttributeError):
			# IOError: the text or metadata could not be read.
			# AttributeError: the metadata has no "numpages", so
			# .replace was called on None. Either way the bill still gets
			# a row, with both measurements marked as unavailable.
			wds = "NA"
			pp = "NA"

		C.writerow([
			b.congress, session, BillType.by_value(b.bill_type).slug, str(b.number),
			b.noun == "bill",
			status.key, status in BillStatus.final_status_passed_bill,
			str(pp), str(wds)])

def compute_productivity(congress, days_in):
    """Write one row of statistics for a Congress, cut off ``days_in`` after its start.

    The row, written to ``W``, holds a "<congress> (<first year>-<last
    year>)" label, the number of bills with a status in
    ``BillStatus.final_status_passed_bill`` dated on or before the cutoff,
    their total pages and words ("(no data)" below the 103rd Congress),
    the House and Senate vote counts up to the cutoff, and two Yes/No
    flags derived from ``party_control[congress]``.
    """
    congress_dates = get_congress_dates(congress)
    cutoff = congress_dates[0] + days_in

    # laws

    passed = list(Bill.objects.filter(
        congress=congress,
        current_status__in=BillStatus.final_status_passed_bill,
        current_status_date__lte=cutoff))
    passed_count = len(passed)

    total_pages = 0
    total_words = 0
    skipped = 0  # bills without a readable page count
    for bill in passed:
        try:
            numpages = load_bill_text(bill, None, mods_only=True).get("numpages")
        except IOError:
            numpages = None
        if numpages is None:
            skipped += 1
            continue
        # The page count arrives as text such as "12 pages".
        total_pages += int(numpages.replace(" pages", ""))
        total_words += len(
            load_bill_text(bill, None, plain_text=True).split(" "))

    if congress < 103:
        total_pages = "(no data)"
        total_words = "(no data)"

    # votes

    def votes_in(chamber):
        # Votes of this Congress in one chamber, created up to the cutoff.
        return Vote.objects.filter(congress=congress,
                                   created__lte=cutoff,
                                   chamber=chamber).count()

    house_votes = votes_in(CongressChamber.house)
    senate_votes = votes_in(CongressChamber.senate)

    # power

    control = party_control[congress]
    congress_same_party = control[0] == control[1]
    branches_same_party = congress_same_party and (control[0] == control[2])

    def yes_no(flag):
        return "Yes" if flag else "No"

    label = "%d (%d-%d)" % (congress, congress_dates[0].year,
                            congress_dates[1].year - 1)
    W.writerow([
        label, passed_count, total_pages, total_words, house_votes,
        senate_votes, yes_no(congress_same_party), yes_no(branches_same_party)
    ])
def run_analysis_for_president(president, date_range):
	"""Record cumulative counts of enacted bills and pages for one president.

	Bills whose current status is in ``BillStatus.final_status_enacted_bill``
	and whose status date falls in the analysed window are grouped by the
	number of days since the window's start. For every day of the window,
	``stats[day][president]`` is set to the running ``(bills, pages)``
	totals, and ``president`` is appended to ``columns``.

	``date_range`` is a pair of "YYYY-MM-DD" strings; the end is capped at
	today and at roughly 2.5 years after the start. The PAGES environment
	variable ("1" or ">1") restricts the bills by page count. ValueError is
	raised when a bill of the 103rd Congress or later has no usable page
	count.
	"""
	global stats, columns

	# Marker that stands in for a page break in text extracted from PDFs.
	page_separator = "\n=============================================\n"

	first_day = datetime.strptime(date_range[0], "%Y-%m-%d").date()
	last_day = datetime.strptime(date_range[1], "%Y-%m-%d").date()

	# limit to now, for the current president
	last_day = min(last_day, datetime.now().date())

	# limit to a shorter period than a whole presidency so this computes faster
	last_day = min(first_day+timedelta(days=365*2.5), last_day)

	# if we're measuring presidential activity, the date of signing could be outside of the Congress
	signed = Bill.objects.filter(
		current_status__in=BillStatus.final_status_enacted_bill,
		current_status_date__gte=first_day, # only use this if looking at a final status
		current_status_date__lte=last_day
		).order_by('current_status_date')

	# last bill Obama signed was a rare Jan 20th morning
	signed = signed.exclude(id=347731)

	per_day = { }
	for bill in tqdm.tqdm(signed, desc=president):
		body = load_bill_text(bill, None, plain_text=True)

		# Bills since 1993 have GPO MODS XML metadata with page counts.
		try:
			metadata = load_bill_text(bill, None, mods_only=True)
			page_count = int(metadata.get("numpages").replace(" pages", ""))
		except (IOError, AttributeError) as err:
			if bill.congress >= 103:
				# Modern bills come from the GPO plain text format, which
				# carries no page-break markers to count.
				print(bill.id, bill, err)
				raise ValueError("page date missing")
			# Historical statutes only have plain text extracted from the
			# Statutes at Large PDFs; count the non-blank segments between
			# the markers that replaced pdftotext's form feeds.
			page_count = len([segment for segment in body.split(page_separator) if segment.strip() != ""])

		mode = os.environ.get("PAGES")
		if mode == "1" and page_count > 1:
			continue
		if os.environ.get("PAGES") == ">1" and page_count <= 1:
			continue

		day = (bill.current_status_date - first_day).days
		if day not in per_day:
			per_day[day] = { "bills": 0, "pages": 0 }
		per_day[day]["bills"] += 1
		per_day[day]["pages"] += page_count

	# Running totals from day 0 through the last day of the window.
	columns.append(president)
	total_bills = 0
	total_pages = 0
	for day in range((last_day-first_day).days+1):
		day_totals = per_day.get(day)
		if day_totals is not None:
			total_bills += day_totals["bills"]
			total_pages += day_totals["pages"]
		stats.setdefault(day, {})[president] = (total_bills, total_pages)
# Walk the Congresses from CURRENT_CONGRESS down to the 103rd and count how
# many passed House/Senate bills cite each kind of legal source.
for congress in range(CURRENT_CONGRESS, 103 - 1, -1):
    # Only bill types senate_bill and house_bill whose current status is
    # in final_status_passed_bill.
    enacted_bills = Bill.objects.filter(
        bill_type__in=(BillType.senate_bill, BillType.house_bill),
        congress=congress,
        current_status__in=BillStatus.final_status_passed_bill)

    #enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct()

    enacted_bills = list(enacted_bills)
    enacted_bills_count = len(enacted_bills)

    # Number of bills with at least one citation of each kind.
    enacted_bill_cites_usc = 0
    enacted_bill_cites_cfr = 0
    enacted_bill_cites_pl = 0
    for b in enacted_bills:
        # Citation types listed in the bill's metadata.
        metadata = load_bill_text(b, None, mods_only=True)
        cite_types = set()
        for cite in metadata["citations"]:
            cite_types.add(cite["type"])

        # A "cfr" citation type is added when the plain text matches
        # either pattern below, case-insensitively.
        # NOTE(review): these patterns are not raw strings, so "\s" and
        # "\d" are invalid string escapes on recent Python versions.
        text = load_bill_text(b, None, plain_text=True)
        m = re.search("code\s+of\s+federal\s+regulations", text, re.I)
        if m: cite_types.add("cfr")
        m = re.search("\d+ c\.?f\.?r\.?", text, re.I)
        if m: cite_types.add("cfr")

        # Each bill counts at most once per category.
        if "usc-chapter" in cite_types or "usc-section" in cite_types:
            enacted_bill_cites_usc += 1
        if "statutes_at_large" in cite_types or "slip_law" in cite_types:
            enacted_bill_cites_pl += 1
        if "cfr" in cite_types:
def run_analysis(label, congress):
	"""Record cumulative counts of enacted bills and pages for one Congress.

	Looks at the bills and joint resolutions of ``congress`` introduced on or
	before the end of the window, finds for each the first major action whose
	state is in ``BillStatus.final_status_enacted_bill``, and groups the
	bills by the number of days between January 1 of the Congress's first
	year and that action. The window ends on September 11 of that year or
	today, whichever is earlier. For every day of the window,
	``stats[day][label]`` is set to the running ``(bills, pages)`` totals,
	and ``label`` is appended to ``columns``.

	label -- progress-bar description, column name and key inside each
		``stats`` entry.
	congress -- Congress number.

	Raises ValueError when a bill of the 103rd Congress or later has no
	page count.
	"""
	global stats
	global columns

	congress_dates = get_congress_dates(congress)

	start_date = datetime.date(congress_dates[0].year, 1, 1)

	# limit to now, for the current congress
	end_date = datetime.date(congress_dates[0].year, 9, 11)
	end_date = min(end_date, datetime.datetime.now().date())

	bills = Bill.objects.filter(congress=congress, bill_type__in=(BillType.house_bill, BillType.senate_bill, BillType.house_joint_resolution, BillType.senate_joint_resolution), introduced_date__lte=end_date)\
		.order_by('current_status_date')
	by_day = { }
	for b in tqdm.tqdm(bills, desc=label):
		match_date = None
		# The action text and source XML are not needed here; the underscore
		# names keep them from clashing with the bill text loaded below.
		for datestr, state, _action_text, _srcxml in b.major_actions:
			#if state in (BillStatus.pass_over_house, BillStatus.pass_back_house, BillStatus.passed_bill):
			if state in BillStatus.final_status_enacted_bill:
				# NOTE(review): eval() executes whatever the stored string
				# contains. It is kept because the stored format is not
				# visible here, but it should be replaced by a real date
				# parser. It now runs only for the matching action.
				match_date = eval(datestr).date()
				break
		else:
			# No event matched.
			continue

		# Load plain text.
		text = load_bill_text(b, None, plain_text=True)

		# Bills since 1993 have GPO MODS XML metadata with page counts.
		mods_error = None
		try:
			pages = load_bill_text(b, None, mods_only=True).get("numpages")
		except (IOError, AttributeError) as e:
			pages = None
			mods_error = e

		if pages is None:
			# Reached when the metadata could not be read, and also when it
			# was read but has no "numpages" entry; the latter would
			# otherwise fail later when None is added to the day's total.
			#
			# For historical statutes we only have plain text from the
			# Statutes at Large, extracted from PDFs. We can get page
			# counts by looking for our replacement of the form feed
			# character put in by pdftotext. We only have that when
			# we extracted text from PDFs, which we only did for
			# the Statutes at Large. We can't do this on modern bills
			# where the text came from GPO plain text format.
			if b.congress < 103:
				pages = len([pgtext for pgtext in text.split("\n=============================================\n") if pgtext.strip() != ""])
			else:
				# print() call form, so the module also parses on Python 3.
				print(b.id, b, mods_error)
				raise ValueError("page date missing")

		#words = len(re.split(r"\s+", text)) # not very good for pre-GPO bills because Statutes at Large pages may have multiple statutes on them

		rel_date = (match_date - start_date).days
		rec = by_day.setdefault(rel_date, { "bills": 0, "pages": 0 } )
		rec["bills"] += 1
		rec["pages"] += pages
		#rec["words"] += words

	# Compute cumulative counts starting on day 0 and for every day till the
	# end of the analysed window.
	columns.append(label)
	bills = 0
	pages = 0
	#words = 0
	for rel_date in range((end_date-start_date).days+1):
		if rel_date in by_day:
			bills += by_day[rel_date]["bills"]
			pages += by_day[rel_date]["pages"]
			#words += by_day[rel_date]["words"]
		stats.setdefault(rel_date, {})[label] = (bills, pages)
Beispiel #19
0
def compute_productivity(congress, date_range):
    """Write one row of productivity statistics for a date range to ``W``.

    Bills are selected by status date alone (status in
    ``BillStatus.final_status_passed_bill``, dated within ``date_range``),
    not by Congress, because the date of signing can fall outside the
    Congress. Votes are counted for ``congress`` within the same range.
    The row holds the Congress number, its year span, both ISO dates, the
    bill count, total pages and words ("(no data)" below the 103rd
    Congress), and the House and Senate vote counts.
    """
    range_start, range_end = date_range[0], date_range[1]

    # laws

    passed = list(Bill.objects.filter(
        current_status__in=BillStatus.final_status_passed_bill,
        current_status_date__gte=range_start,
        current_status_date__lte=range_end))
    passed_count = len(passed)

    total_pages = 0
    total_words = 0
    skipped = 0  # bills without a readable page count
    for bill in passed:
        try:
            numpages = load_bill_text(bill, None, mods_only=True).get("numpages")
        except IOError:
            numpages = None
        if numpages is None:
            skipped += 1
            continue
        # The page count arrives as text such as "12 pages".
        total_pages += int(numpages.replace(" pages", ""))
        total_words += len(
            load_bill_text(bill, None, plain_text=True).split(" "))

    if congress < 103:
        total_pages = "(no data)"
        total_words = "(no data)"

    # votes

    def votes_in(chamber):
        # Votes of this Congress in one chamber, created within the range.
        return Vote.objects.filter(congress=congress,
                                   created__gte=range_start,
                                   created__lte=range_end,
                                   chamber=chamber).count()

    house_votes = votes_in(CongressChamber.house)
    senate_votes = votes_in(CongressChamber.senate)

    # A Congress whose end date falls in January is labelled with the
    # previous calendar year as its last year.
    congress_dates = get_congress_dates(congress)
    if congress_dates[1].month == 1:
        last_year = congress_dates[1].year - 1
    else:
        last_year = congress_dates[1].year
    timespan = "%d-%d" % (congress_dates[0].year, last_year)

    W.writerow([
        congress, timespan, range_start.isoformat(), range_end.isoformat(),
        passed_count, total_pages, total_words, house_votes, senate_votes
    ])
    # Body of a loop whose header is not part of this snippet; it reads
    # congress, session, startdate and enddate.
    # before the 103rd, there's no MODS data
    if congress < 103: continue

    # to get bills in the first session, look for bills that were
    # introduced before the end of the session.
    bills = Bill.objects.filter(
        congress=congress,
        introduced_date__gte=startdate,
        introduced_date__lte=enddate,
    )

    # get page counts by GPO version code
    page_counts = {}
    for b in bills:
        try:
            mods = load_bill_text(b, None, mods_only=True)
        except IOError:
            # Bills whose metadata cannot be read are counted (one each,
            # not by page) under the pseudo-status "MISSING_TEXT".
            status = "MISSING_TEXT"
            page_counts[status] = page_counts.get(status, 0) + 1
            continue

        # A missing or blank version code is grouped under "UNKNOWN".
        status = mods["doc_version"]
        if status is None or status.strip() == "": status = "UNKNOWN"
        # Remember the display name of the version code for the output row.
        status_names[status] = mods["doc_version_name"]
        # NOTE(review): numpages is added as-is, so it must already be a
        # number here; a None value would raise TypeError -- confirm.
        pp = mods.get("numpages")
        page_counts[status] = page_counts.get(status, 0) + pp

    # One output row per version code, largest count first.
    for status, page_count in sorted(page_counts.items(),
                                     key=lambda kv: -kv[1]):
        w.writerow([
            str(congress), session, status, status_names[status],
    # Body of a loop over terms whose header is not part of this snippet;
    # t, k, w1, w2, w3 and seen_bills are set up outside it.
    # Progress line for the term being exported.
    print(t, t.term_type, t.is_top_term())

    # Bills from the 108th Congress on that carry term t, loading only the
    # columns listed in only().
    for bill in Bill.objects.filter(congress__gte=108, terms=t).only(
            "id", "congress", "bill_type", "number", "title", "titles",
            "introduced_date",
            "sponsor_role__party").prefetch_related("sponsor_role"):
        # w1: one (term name, bill id) row per term/bill pair.
        w1.writerow([
            t.name.encode("utf8"),
            bill.id,
        ])

        # Bill details and citations are written once per bill for each
        # key k, however many terms the bill appears under.
        if not bill.id in seen_bills[k]:
            seen_bills[k].add(bill.id)

            text = load_bill_text(bill, None, plain_text=True)
            # w2: bill metadata plus the first 4096 characters of its text.
            # NOTE(review): on Python 3 .encode("utf8") produces bytes;
            # confirm the writer behind w2 handles that as intended.
            w2.writerow([
                bill.id,
                bill.title_no_number.encode("utf8"),
                get_secondary_bill_title_2(bill).encode("utf8"),
                bill.introduced_date.isoformat(),
                bill.sponsor_role.party if bill.sponsor_role else "N/A",
                "https://www.govtrack.us" + bill.get_absolute_url(),
                text[0:4096].encode("utf8"),
            ])

            # w3: one (bill id, citation id) row per value returned by
            # usc_citations_uptree(), in sorted order.
            for cite_id in sorted(bill.usc_citations_uptree()):
                w3.writerow([
                    bill.id,
                    cite_id,
                ])
	# Body of a loop over terms whose header is not part of this snippet;
	# t, files and seen_bills are set up outside it.
	# Output files are selected by the term's type and top-term flag.
	k = (t.term_type, t.is_top_term())

	w1, w2, w3 = files[k]

	# Progress line for the term being exported.
	# NOTE(review): this is a Python 2 print statement and a syntax error
	# on Python 3.
	print t, t.term_type, t.is_top_term()

	# Bills from the 108th Congress on that carry term t, loading only the
	# columns listed in only().
	for bill in Bill.objects.filter(congress__gte=108, terms=t).only("id", "congress", "bill_type", "number", "title", "titles", "introduced_date", "sponsor_role__party").prefetch_related("sponsor_role"):
		# w1: one (term name, bill id) row per term/bill pair.
		w1.writerow([
			t.name.encode("utf8"),
			bill.id,
			])

		# Bill details and citations are written once per bill for each
		# key k, however many terms the bill appears under.
		if not bill.id in seen_bills[k]:
			seen_bills[k].add(bill.id)

			text = load_bill_text(bill, None, plain_text=True)
			# w2: bill metadata plus the first 4096 characters of its text.
			w2.writerow([
				bill.id,
				bill.title_no_number.encode("utf8"),
				get_secondary_bill_title_2(bill).encode("utf8"),
				bill.introduced_date.isoformat(),
				bill.sponsor_role.party if bill.sponsor_role else "N/A",
				"https://www.govtrack.us" + bill.get_absolute_url(),
				text[0:4096].encode("utf8"),
				])

			# w3: one (bill id, citation id) row per value returned by
			# usc_citations_uptree(), in sorted order.
			for cite_id in sorted(bill.usc_citations_uptree()):
				w3.writerow([
					bill.id,
					cite_id,
					])