def doit(congress):
    """Print page and word totals for the bills of one Congress.

    Every ``Bill`` of ``congress`` is visited (with a tqdm progress bar). The
    page count comes from the bill's MODS metadata and the word count from
    splitting its plain text on single spaces. Totals are kept for all bills
    and, separately, for bills whose status is in
    ``BillStatus.final_status_enacted_bill``.

    Two lines are printed: the Congress followed by count/pages/words for all
    bills and for enacted bills, then the number of bills skipped because no
    page count could be obtained.
    """
    # Named ``totals`` rather than ``all`` so the builtin is not shadowed.
    totals = defaultdict(int)
    enacted = defaultdict(int)
    missing_text = 0

    qs = Bill.objects.filter(congress=congress)
    for b in tqdm.tqdm(qs, total=qs.count()):
        try:
            pp = load_bill_text(b, None, mods_only=True).get("numpages")
        except IOError:
            pp = None
        if pp is None:
            # Either the text is not on disk or the metadata carries no page
            # count; adding None to the running totals would raise TypeError.
            missing_text += 1
            continue

        wds = len(load_bill_text(b, None, plain_text=True).split(" "))

        totals["count"] += 1
        totals["pages"] += pp
        totals["words"] += wds

        if b.current_status in BillStatus.final_status_enacted_bill:
            enacted["count"] += 1
            enacted["pages"] += pp
            enacted["words"] += wds

    print(congress,
          totals["count"], totals["pages"], totals["words"],
          enacted["count"], enacted["pages"], enacted["words"])
    print("\t", missing_text, "missing text")
def compute_productivity(congress, date_range):
    """Write one CSV row of productivity figures for a Congress.

    The row goes to the module-level writer ``W`` and contains, in order: the
    Congress number, its year span, the ISO form of both ``date_range``
    endpoints, the number of bills of that Congress whose status is in
    ``BillStatus.final_status_passed_bill``, their summed page and word
    counts, and the number of House and Senate votes created inside
    ``date_range``.

    Page and word totals are replaced by ``"(no data)"`` for Congresses
    before the 103rd.
    """
    # Laws. The query is keyed on the Congress only; if presidential activity
    # is being measured, the date of signing could be outside of the Congress,
    # so this filter would need to change.
    laws = list(Bill.objects.filter(
        congress=congress,
        current_status__in=BillStatus.final_status_passed_bill,
    ))
    law_count = len(laws)

    total_pages = 0
    total_words = 0
    for law in laws:
        try:
            numpages = load_bill_text(law, None, mods_only=True).get("numpages")
        except IOError:
            numpages = None
        if numpages is None:
            # No page metadata: the bill contributes to neither total.
            continue
        total_pages += int(numpages.replace(" pages", ""))
        total_words += len(load_bill_text(law, None, plain_text=True).split(" "))

    if congress < 103:
        total_pages = "(no data)"
        total_words = "(no data)"

    # Votes created inside the requested window, per chamber.
    def votes_in(chamber):
        return Vote.objects.filter(
            congress=congress,
            created__gte=date_range[0],
            created__lte=date_range[1],
            chamber=chamber).count()

    house_votes = votes_in(CongressChamber.house)
    senate_votes = votes_in(CongressChamber.senate)

    # A Congress that ends in January is labelled with the preceding year.
    congress_dates = get_congress_dates(congress)
    last_year = congress_dates[1].year
    if congress_dates[1].month == 1:
        last_year = last_year - 1
    timespan = "%d-%d" % (congress_dates[0].year, last_year)

    W.writerow([
        congress,
        timespan,
        date_range[0].isoformat(),
        date_range[1].isoformat(),
        law_count,
        total_pages,
        total_words,
        house_votes,
        senate_votes,
    ])
def compute_productivity(congress, days_in):
    """Write one CSV row of productivity figures up to a given point in a Congress.

    ``days_in`` is added to the first date returned by
    ``get_congress_dates(congress)`` to obtain a cut-off day. Bills (status in
    ``BillStatus.final_status_passed_bill``) and votes are counted up to and
    including that day. The row written to the module-level writer ``W``
    holds: a "congress (year-year)" label, the bill count, summed pages and
    words, House votes, Senate votes, and two "Yes"/"No" flags derived from
    ``party_control[congress]``.
    """
    cutoff = get_congress_dates(congress)[0] + days_in

    # Laws.
    laws = list(Bill.objects.filter(
        congress=congress,
        current_status__in=BillStatus.final_status_passed_bill,
        current_status_date__lte=cutoff))
    law_count = len(laws)

    page_total = 0
    word_total = 0
    for law in laws:
        try:
            numpages = load_bill_text(law, None, mods_only=True).get("numpages")
        except IOError:
            numpages = None
        if numpages is None:
            # Nothing to add for bills without page metadata.
            continue
        page_total += int(numpages.replace(" pages", ""))
        word_total += len(load_bill_text(law, None, plain_text=True).split(" "))

    if congress < 103:
        page_total = "(no data)"
        word_total = "(no data)"

    # Votes.
    def votes_in(chamber):
        return Vote.objects.filter(
            congress=congress,
            created__lte=cutoff,
            chamber=chamber).count()

    house_votes = votes_in(CongressChamber.house)
    senate_votes = votes_in(CongressChamber.senate)

    # Power: compare the entries of party_control for this Congress.
    control = party_control[congress]
    congress_same_party = control[0] == control[1]
    branches_same_party = congress_same_party and (control[0] == control[2])

    congress_dates = get_congress_dates(congress)
    timespan = "%d (%d-%d)" % (
        congress, congress_dates[0].year, congress_dates[1].year - 1)

    W.writerow([
        timespan,
        law_count,
        page_total,
        word_total,
        house_votes,
        senate_votes,
        "Yes" if congress_same_party else "No",
        "Yes" if branches_same_party else "No",
    ])
def get_bill_paragraphs(bill):
    """Return a histogram of normalized paragraph hashes for a bill's text.

    The bill's HTML text is parsed and every ``<p>`` element is reduced to
    its text content, lower-cased, stripped of a leading parenthesized list
    marker, and has each run of non-word characters collapsed to one space.
    The MD5 hex digest of the result is used as the key.

    Returns a dict mapping digest -> number of paragraphs with that digest,
    or ``None`` when the text cannot be loaded or parsed (a message is
    printed in that case).
    """
    import lxml.html
    from bill.billtext import load_bill_text
    from hashlib import md5

    try:
        dom = lxml.html.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        print("no bill text", bill.id, bill)
        return None
    except Exception as e:
        # Deliberately broad: any parse failure just skips this bill.
        print("error in bill text", bill.id, bill, e)
        return None

    # tostring(..., encoding="utf8") yields a byte string, so the patterns
    # must be bytes patterns as well; mixing str patterns with bytes input
    # raises TypeError on Python 3. Raw literals avoid invalid-escape warnings.
    list_marker = re.compile(br"^\(.*?\)\s*")
    non_word_run = re.compile(br"\W+")

    hashes = {}
    for node in dom.xpath("//p"):
        text = lxml.etree.tostring(node, method="text", encoding="utf8")
        text = text.lower()  # normalize case
        text = list_marker.sub(b"", text)  # remove initial list numbering
        # normalize spaces and other non-word characters
        text = non_word_run.sub(b" ", text).strip()
        if not text:
            # A comparison with "" would never match a byte string on Python 3.
            continue
        digest = md5(text).hexdigest()
        hashes[digest] = hashes.get(digest, 0) + 1
    return hashes
def get_index_text(self):
    """Build the text blob used to index this bill.

    The result is, one per line: the title, the display number with periods
    removed, the same number with spaces removed as well, and the third
    element of every entry in ``self.titles``. After a blank line comes the
    bill's plain text as returned by ``load_bill_text``.
    """
    title = self.title
    number = self.display_number_no_congress_number.replace(".", "")

    lines = [title, number, number.replace(" ", "")]
    lines.extend(entry[2] for entry in self.titles)
    heading = "\n".join(lines)

    return heading + "\n\n" + load_bill_text(self, None, plain_text=True)
def get_bill_paragraphs(bill):
    """Hash every paragraph of a bill's HTML text and count the duplicates.

    Each ``<p>`` element's text content is lower-cased, a leading
    parenthesized list marker is removed, and runs of non-word characters
    become a single space. Paragraphs that are empty after that are ignored.
    The rest are keyed by the MD5 hex digest of the normalized bytes.

    Returns a dict of digest -> occurrence count, or ``None`` (after printing
    a message) when the bill text is missing or cannot be parsed.
    """
    import lxml.html
    from bill.billtext import load_bill_text
    from hashlib import md5

    try:
        dom = lxml.html.fromstring(load_bill_text(bill, None)["text_html"])
    except IOError:
        print("no bill text", bill.id, bill)
        return None
    except Exception as e:
        # Best effort by design: an unparseable bill is reported and skipped.
        print("error in bill text", bill.id, bill, e)
        return None

    # The serialized paragraph is a byte string (encoding="utf8"), so bytes
    # patterns are required; str patterns on bytes input raise TypeError on
    # Python 3. Raw literals keep the backslash escapes valid.
    leading_marker = re.compile(br"^\(.*?\)\s*")
    non_word = re.compile(br"\W+")

    hashes = {}
    for node in dom.xpath("//p"):
        raw = lxml.etree.tostring(node, method="text", encoding="utf8")
        normalized = raw.lower()  # normalize case
        # remove initial list numbering
        normalized = leading_marker.sub(b"", normalized)
        # normalize spaces and other non-word characters
        normalized = non_word.sub(b" ", normalized).strip()
        if not normalized:
            # Truthiness works for bytes; comparing against "" does not.
            continue
        key = md5(normalized).hexdigest()
        hashes[key] = hashes.get(key, 0) + 1
    return hashes
def count_pages_of_bills(congress):
    """Write page and word counts for each enacted bill of a Congress.

    For every bill of ``congress`` whose status is in
    ``BillStatus.final_status_enacted_bill`` one row is written to the
    module-level writer ``W``: Congress, its year span, the bill URL, the
    page count and the word count. The per-bill values are also collected in
    ``counters`` and bills without usable text are tallied in
    ``missing_text``.

    NOTE(review): within the visible code ``counters`` and ``missing_text``
    are accumulated but not returned or printed -- confirm whether later
    code consumes them.
    """
    page_break = "\n=============================================\n"
    counters = defaultdict(list)
    missing_text = 0

    qs = Bill.objects.filter(congress=congress)\
        .filter(current_status__in=BillStatus.final_status_enacted_bill)
    for b in qs:
        try:
            # Loaded inside the try so that a bill with no text on disk is
            # counted as missing instead of aborting the whole run.
            plain_text = load_bill_text(b, None, plain_text=True)
            if congress >= 103:
                # Bills since 1993 have GPO MODS XML metadata with page counts.
                pp = load_bill_text(b, None, mods_only=True).get("numpages")
            else:
                # For historical statutes we only have plain text from the
                # Statutes at Large, extracted from PDFs. Page counts come
                # from our replacement of the form feed character put in by
                # pdftotext. That marker only exists where text was extracted
                # from PDFs, so this cannot be done for modern bills whose
                # text came from the GPO plain text format.
                pp = len([
                    pgtext for pgtext in plain_text.split(page_break)
                    if pgtext.strip() != ""
                ])
        except IOError:
            missing_text += 1
            continue
        if pp is None:
            # Metadata present but without a page count.
            missing_text += 1
            continue

        wds = len(plain_text.split(" "))

        W.writerow([
            congress,
            "{}-{}".format(*get_congress_years(congress)),
            b.get_absolute_url(),
            pp,
            wds,
        ])
        counters["pages"].append(pp)
        counters["words"].append(wds)
def render_event_text(self, ev_code, feeds):
    """Describe the "bill text is now available" event for one text version.

    ``ev_code`` must be a key of ``bill_gpo_status_codes`` and the matching
    PDF must exist on disk, otherwise a bare ``Exception`` is raised. The
    event date is the PDF's modification time. Version metadata comes from
    the MODS record and falls back to "Unknown" placeholders when that
    record cannot be read. ``feeds`` is not used.

    Returns a dict with the event type, date, title, URL, text and HTML body
    templates, and the template context.
    """
    from billtext import bill_gpo_status_codes, load_bill_text

    if ev_code not in bill_gpo_status_codes:
        raise Exception()

    type_code = BillType.by_value(self.bill_type).xml_code
    # Use the PDF since it is not modified once downloaded, and hopefully a
    # displayable format like HTML is actually available as well.
    textfn = "data/us/bills.text/%s/%s/%s%d%s.pdf" % (
        self.congress, type_code, type_code, self.number, ev_code)
    if not os.path.exists(textfn):
        raise Exception()

    try:
        modsinfo = load_bill_text(self, ev_code, mods_only=True)
    except IOError:
        modsinfo = {
            "docdate": "Unknown Date",
            "doc_version_name": "Unknown Version",
        }

    event = {}
    event["type"] = "Bill Text"
    event["date"] = datetime.datetime.fromtimestamp(os.path.getmtime(textfn))
    event["date_has_no_time"] = False
    event["title"] = self.title
    event["url"] = self.get_absolute_url() + "/text"
    event["body_text_template"] = """This {{noun}}'s text {% if doc_version_name != "Introduced" %}for status <{{doc_version_name}}> ({{doc_date}}) {% endif %}is now available."""
    event["body_html_template"] = """<p>This {{noun}}’s text {% if doc_version_name != "Introduced" %}for status <i>{{doc_version_name}}</i> ({{doc_date}}) {% endif %}is now available.</p>"""
    event["context"] = {
        "noun": self.noun,
        "doc_date": modsinfo["docdate"],
        "doc_version_name": modsinfo["doc_version_name"],
    }
    return event
]) for congress, session, startdate, enddate in get_all_sessions(): if congress < 103: continue bills = Bill.objects.filter( congress=congress, introduced_date__gte=startdate, introduced_date__lte=enddate, ).order_by('introduced_date') for b in bills: status = BillStatus.by_value(b.current_status) try: pp = load_bill_text(b, None, mods_only=True).get("numpages") pp = int(pp.replace(" pages", "")) t = load_bill_text(b, None, plain_text=True) wds = len(t.split(" ")) except IOError: wds = "NA" pp = "NA" C.writerow([ b.congress, session, BillType.by_value(b.bill_type).slug, str(b.number), b.noun == "bill", status.key, status in BillStatus.final_status_passed_bill, str(pp), str(wds)
def run_analysis_for_president(president, date_range):
    """Accumulate cumulative enacted-bill and page counts per day for a president.

    ``date_range`` is a pair of "YYYY-MM-DD" strings. The end date is clamped
    to today and to 1.5 years after the start so the computation stays fast.
    Bills whose status is in ``BillStatus.final_status_enacted_bill`` and
    whose status date falls in the range are grouped by the number of days
    since the start date.

    Side effects: appends ``president`` to the global ``columns`` and stores,
    for each day offset in the range, ``stats[offset][president] =
    (cumulative_bills, cumulative_pages)``.

    The ``PAGES`` environment variable optionally restricts the bills
    counted: ``"1"`` keeps only one-page bills, ``">1"`` only longer ones.

    Raises ValueError when a bill from the 103rd Congress or later has no
    page count.
    """
    global stats
    global columns

    page_break = "\n=============================================\n"

    start_date = datetime.strptime(date_range[0], "%Y-%m-%d").date()
    end_date = datetime.strptime(date_range[1], "%Y-%m-%d").date()

    # limit to now, for the current president
    end_date = min(end_date, datetime.now().date())

    # limit to a shorter period than a whole presidency so this computes faster
    end_date = min(start_date+timedelta(days=365*1.5), end_date)

    # if we're measuring presidential activity, the date of signing could be
    # outside of the Congress, so the query is keyed on the status date only
    enacted_bills = Bill.objects.filter(
        current_status__in=BillStatus.final_status_enacted_bill,
        current_status_date__gte=start_date,  # only use this if looking at a final status
        current_status_date__lte=end_date,
    ).order_by('current_status_date')

    # last bill Obama signed was a rare Jan 20th morning
    enacted_bills = enacted_bills.exclude(id=347731)

    # The environment does not change while looping; read the filter once.
    page_filter = os.environ.get("PAGES")

    by_day = {}
    for b in tqdm.tqdm(enacted_bills, desc=president):
        # Load plain text.
        text = load_bill_text(b, None, plain_text=True)

        # Bills since 1993 have GPO MODS XML metadata with page counts.
        try:
            mods = load_bill_text(b, None, mods_only=True)
            pages = int(mods.get("numpages").replace(" pages", ""))
        except (IOError, AttributeError) as e:
            # For historical statutes we only have plain text from the
            # Statutes at Large, extracted from PDFs. Page counts come from
            # our replacement of the form feed character put in by pdftotext.
            # That marker only exists where text was extracted from PDFs, so
            # it cannot be used for modern bills whose text came from the GPO
            # plain text format.
            if b.congress < 103:
                pages = len([pgtext for pgtext in text.split(page_break)
                             if pgtext.strip() != ""])
            else:
                # A single formatted string prints identically whether or not
                # print is a function, unlike the old print statement, which
                # is a SyntaxError on Python 3.
                print("%s %s %s" % (b.id, b, e))
                raise ValueError("page date missing")

        # Page filtering is not very good for pre-GPO bills because Statutes
        # at Large pages may have multiple statutes on them.
        if page_filter == "1" and pages > 1:
            continue
        if page_filter == ">1" and pages <= 1:
            continue

        rel_date = (b.current_status_date - start_date).days
        rec = by_day.setdefault(rel_date, {"bills": 0, "pages": 0})
        rec["bills"] += 1
        rec["pages"] += pages

    # Compute cumulative counts starting on day 0 and for every day till the
    # last day a bill was signed.
    columns.append(president)
    bills = 0
    pages = 0
    for rel_date in range((end_date-start_date).days+1):
        if rel_date in by_day:
            bills += by_day[rel_date]["bills"]
            pages += by_day[rel_date]["pages"]
        stats.setdefault(rel_date, {})[president] = (bills, pages)
def compute_productivity(congress, date_range, label=None):
    """Write one CSV row of enacted-law counts for a Congress and date window.

    Counts bills of ``congress`` whose status is in
    ``BillStatus.final_status_enacted_bill`` and whose status date lies in
    ``date_range`` (a pair of dates), and sums their pages and words. The row
    written to the module-level writer ``W`` is: ``label`` (or the Congress
    number when no label is given), both range endpoints in ISO form, the
    bill count, total pages and total words.

    If a bill from the 103rd Congress or later has no page count, both
    totals are replaced by ``"(missing)"``.
    """
    page_break = "\n=============================================\n"

    # Laws. The query is keyed on the Congress; if presidential activity is
    # being measured, the date of signing could be outside of the Congress,
    # so that filter would need to change.
    enacted_bills = Bill.objects.filter(
        congress=congress,
        current_status__in=BillStatus.final_status_enacted_bill,
        current_status_date__gte=date_range[0],
        current_status_date__lte=date_range[1],
    ).order_by('current_status_date')

    if date_range[0].month == 1 and date_range[0].day == 20:
        # last bill Obama signed was a rare Jan 20th morning
        enacted_bills = enacted_bills.exclude(id=347731)

    enacted_bills = list(enacted_bills)
    enacted_bills_count = len(enacted_bills)
    enacted_bill_pages = 0
    enacted_bill_words = 0
    enacted_bill_pages_missing = 0

    for b in enacted_bills:
        # Load plain text.
        text = load_bill_text(b, None, plain_text=True)

        # Bills since 1993 have GPO MODS XML metadata with page counts.
        try:
            mods = load_bill_text(b, None, mods_only=True)
            enacted_bill_pages += int(mods.get("numpages").replace(" pages", ""))
        except (IOError, AttributeError):
            # For historical statutes we only have plain text from the
            # Statutes at Large, extracted from PDFs. Page counts come from
            # our replacement of the form feed character put in by pdftotext.
            # That marker only exists where text was extracted from PDFs, so
            # it cannot be used for modern bills whose text came from the GPO
            # plain text format.
            if congress < 103:
                # Ignore blank chunks (e.g. after a trailing page break) so a
                # bill is not credited with empty pages.
                enacted_bill_pages += len([
                    pgtext for pgtext in text.split(page_break)
                    if pgtext.strip() != ""
                ])
            else:
                enacted_bill_pages_missing += 1

        enacted_bill_words += len(re.split(r"\s+", text))

    if enacted_bill_pages_missing:
        # Partial totals would be misleading, so report neither.
        enacted_bill_pages = "(missing)"
        enacted_bill_words = "(missing)"

    row = [
        label or congress,
        date_range[0].isoformat(),
        date_range[1].isoformat(),
        enacted_bills_count,
        enacted_bill_pages,
        enacted_bill_words,
    ]
    W.writerow(row)
for congress in range(CURRENT_CONGRESS, 103-1, -1): enacted_bills = Bill.objects.filter( bill_type__in=(BillType.senate_bill, BillType.house_bill), congress=congress, current_status__in=BillStatus.final_status_passed_bill) #enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct() enacted_bills = list(enacted_bills) enacted_bills_count = len(enacted_bills) enacted_bill_cites_usc = 0 enacted_bill_cites_cfr = 0 enacted_bill_cites_pl = 0 for b in enacted_bills: metadata = load_bill_text(b, None, mods_only=True) cite_types = set() for cite in metadata["citations"]: cite_types.add(cite["type"]) text = load_bill_text(b, None, plain_text=True) m = re.search("code\s+of\s+federal\s+regulations", text, re.I) if m: cite_types.add("cfr") m = re.search("\d+ c\.?f\.?r\.?", text, re.I) if m: cite_types.add("cfr") if "usc-chapter" in cite_types or "usc-section" in cite_types: enacted_bill_cites_usc += 1 if "statutes_at_large" in cite_types or "slip_law" in cite_types: enacted_bill_cites_pl += 1 if "cfr" in cite_types:
# before the 103rd, there's no MODS data if congress < 103: continue # to get bills in the first session, look for bills that were # introduced before the end of the session. bills = Bill.objects.filter( congress=congress, introduced_date__gte=startdate, introduced_date__lte=enddate, ) # get page counts by GPO version code page_counts = { } for b in bills: try: mods = load_bill_text(b, None, mods_only=True) except IOError: status = "MISSING_TEXT" page_counts[status] = page_counts.get(status, 0) + 1 continue status = mods["doc_version"] if status is None or status.strip() == "": status = "UNKNOWN" status_names[status] = mods["doc_version_name"] pp = int(mods.get("numpages").replace(" pages", "")) page_counts[status] = page_counts.get(status, 0) + pp for status, page_count in sorted(page_counts.items(), key = lambda kv : -kv[1]): w.writerow([str(congress), session, status, status_names[status], str(page_count)])
# Export one CSV row per bill, for every session from the 103rd Congress on,
# with its page and word counts ("NA" when they cannot be determined).
C.writerow([
    "congress", "session", "billtype", "billnumber", "isbill",
    "finalstatus", "islaw", "pages", "words"])

for congress, session, startdate, enddate in get_all_sessions():
    # Earlier Congresses are skipped entirely.
    if congress < 103:
        continue

    # Bills introduced during this session, oldest first.
    bills = Bill.objects.filter(
        congress=congress,
        introduced_date__gte=startdate,
        introduced_date__lte=enddate,
    ).order_by('introduced_date')

    for b in bills:
        status = BillStatus.by_value(b.current_status)

        try:
            pp = load_bill_text(b, None, mods_only=True).get("numpages")
            pp = int(pp.replace(" pages", ""))
            t = load_bill_text(b, None, plain_text=True)
            wds = len(t.split(" "))
        except (IOError, AttributeError):
            # IOError: no text on disk. AttributeError: the metadata has no
            # "numpages" entry, so .replace() was called on None. Either way
            # the bill is still exported, just without sizes.
            wds = "NA"
            pp = "NA"

        C.writerow([
            b.congress,
            session,
            BillType.by_value(b.bill_type).slug,
            str(b.number),
            b.noun == "bill",
            status.key,
            status in BillStatus.final_status_passed_bill,
            str(pp),
            str(wds)])
def compute_productivity(congress, days_in):
    """Write a CSV row summarizing a Congress's output ``days_in`` after it began.

    The cut-off day is the first date from ``get_congress_dates(congress)``
    plus ``days_in``. The row written to the module-level writer ``W``
    contains: a "congress (year-year)" label, the number of bills with a
    status in ``BillStatus.final_status_passed_bill`` dated on or before the
    cut-off, their summed pages and words ("(no data)" before the 103rd
    Congress), the House and Senate vote counts up to the cut-off, and two
    "Yes"/"No" flags computed from ``party_control[congress]``.
    """
    corresponding_day = get_congress_dates(congress)[0] + days_in

    # Laws.
    passed = list(Bill.objects.filter(
        congress=congress,
        current_status__in=BillStatus.final_status_passed_bill,
        current_status_date__lte=corresponding_day))
    passed_count = len(passed)

    pages_sum = 0
    words_sum = 0
    for bill in passed:
        try:
            page_label = load_bill_text(bill, None, mods_only=True).get("numpages")
        except IOError:
            page_label = None
        if page_label is not None:
            pages_sum += int(page_label.replace(" pages", ""))
            plain = load_bill_text(bill, None, plain_text=True)
            words_sum += len(plain.split(" "))

    if congress < 103:
        pages_sum = "(no data)"
        words_sum = "(no data)"

    # Votes, one count per chamber, in House-then-Senate order.
    vote_counts = []
    for chamber in (CongressChamber.house, CongressChamber.senate):
        vote_counts.append(Vote.objects.filter(
            congress=congress,
            created__lte=corresponding_day,
            chamber=chamber).count())
    house_votes, senate_votes = vote_counts

    # Power: equality checks between the entries of party_control.
    parties = party_control[congress]
    if parties[0] == parties[1]:
        same_congress = "Yes"
        same_branches = "Yes" if parties[0] == parties[2] else "No"
    else:
        same_congress = "No"
        same_branches = "No"

    dates = get_congress_dates(congress)
    timespan = "%d (%d-%d)" % (congress, dates[0].year, dates[1].year - 1)

    W.writerow([
        timespan, passed_count, pages_sum, words_sum,
        house_votes, senate_votes, same_congress, same_branches,
    ])
def run_analysis_for_president(president, date_range):
    """Record cumulative enacted bills and pages per day of a presidency.

    ``date_range`` holds two "YYYY-MM-DD" strings. The window ends at the
    earliest of: the given end date, today, and 2.5 years after the start
    (kept short so this computes faster). Enacted bills are bucketed by days
    elapsed since the start. ``president`` is then appended to the global
    ``columns``, and ``stats[day][president]`` receives the running
    ``(bills, pages)`` totals for every day of the window.

    Setting the ``PAGES`` environment variable to ``"1"`` or ``">1"``
    restricts counting to one-page or multi-page bills respectively.

    Raises ValueError if a bill from the 103rd Congress onward lacks a page
    count.
    """
    global stats
    global columns

    first_day = datetime.strptime(date_range[0], "%Y-%m-%d").date()
    requested_end = datetime.strptime(date_range[1], "%Y-%m-%d").date()
    # Never look past today (current president) ...
    last_day = min(requested_end, datetime.now().date())
    # ... nor past the shortened analysis window.
    last_day = min(first_day + timedelta(days=365*2.5), last_day)

    # The signing date can fall outside of the Congress that passed the bill,
    # so selection is by status date only. Only meaningful for a final status.
    signed = Bill.objects.filter(
        current_status__in=BillStatus.final_status_enacted_bill,
        current_status_date__gte=first_day,
        current_status_date__lte=last_day,
    )
    signed = signed.order_by('current_status_date')
    # last bill Obama signed was a rare Jan 20th morning
    signed = signed.exclude(id=347731)

    daily = {}  # day offset -> [bill count, page count]
    for law in tqdm.tqdm(signed, desc=president):
        body = load_bill_text(law, None, plain_text=True)

        try:
            # Bills since 1993 have GPO MODS XML metadata with page counts.
            metadata = load_bill_text(law, None, mods_only=True)
            page_count = int(metadata.get("numpages").replace(" pages", ""))
        except (IOError, AttributeError) as e:
            if not law.congress < 103:
                print(law.id, law, e)
                raise ValueError("page date missing")
            # Historical statutes only have plain text extracted from the
            # Statutes at Large PDFs; pdftotext's form feeds were replaced by
            # this marker line, so non-blank chunks between markers are pages.
            page_count = 0
            for chunk in body.split("\n=============================================\n"):
                if chunk.strip() != "":
                    page_count += 1

        # Optional filter; not very good for pre-GPO bills because Statutes
        # at Large pages may have multiple statutes on them.
        mode = os.environ.get("PAGES")
        if mode == "1" and page_count > 1:
            continue
        if mode == ">1" and page_count <= 1:
            continue

        offset = (law.current_status_date - first_day).days
        tally = daily.setdefault(offset, [0, 0])
        tally[0] += 1
        tally[1] += page_count

    # Running totals from day 0 through the last day of the window.
    columns.append(president)
    running_bills = 0
    running_pages = 0
    for offset in range((last_day - first_day).days + 1):
        day_bills, day_pages = daily.get(offset, (0, 0))
        running_bills += day_bills
        running_pages += day_pages
        stats.setdefault(offset, {})[president] = (running_bills, running_pages)
for congress in range(CURRENT_CONGRESS, 103 - 1, -1): enacted_bills = Bill.objects.filter( bill_type__in=(BillType.senate_bill, BillType.house_bill), congress=congress, current_status__in=BillStatus.final_status_passed_bill) #enacted_bills = (enacted_bills.filter(title__contains="Appropriations") | enacted_bills.filter(title__contains="Authorization")).distinct() enacted_bills = list(enacted_bills) enacted_bills_count = len(enacted_bills) enacted_bill_cites_usc = 0 enacted_bill_cites_cfr = 0 enacted_bill_cites_pl = 0 for b in enacted_bills: metadata = load_bill_text(b, None, mods_only=True) cite_types = set() for cite in metadata["citations"]: cite_types.add(cite["type"]) text = load_bill_text(b, None, plain_text=True) m = re.search("code\s+of\s+federal\s+regulations", text, re.I) if m: cite_types.add("cfr") m = re.search("\d+ c\.?f\.?r\.?", text, re.I) if m: cite_types.add("cfr") if "usc-chapter" in cite_types or "usc-section" in cite_types: enacted_bill_cites_usc += 1 if "statutes_at_large" in cite_types or "slip_law" in cite_types: enacted_bill_cites_pl += 1 if "cfr" in cite_types:
def run_analysis(label, congress):
    """Accumulate cumulative enacted-bill and page counts per day of a Congress.

    The window runs from January 1 of the Congress's first year to
    September 11 of that year (or today, if earlier). Bills and joint
    resolutions of ``congress`` introduced by the end of the window are
    scanned for their first major action whose state is in
    ``BillStatus.final_status_enacted_bill``; bills without one are skipped.
    Matching bills are bucketed by days since the window start.

    Side effects: appends ``label`` to the global ``columns`` and stores
    ``stats[offset][label] = (cumulative_bills, cumulative_pages)`` for each
    day offset in the window.

    Raises ValueError when a bill from the 103rd Congress or later has no
    page count.
    """
    global stats
    global columns

    page_break = "\n=============================================\n"

    congress_dates = get_congress_dates(congress)
    start_date = datetime.date(congress_dates[0].year, 1, 1)

    # limit to now, for the current congress
    end_date = datetime.date(congress_dates[0].year, 9, 11)
    end_date = min(end_date, datetime.datetime.now().date())

    bills = Bill.objects.filter(
        congress=congress,
        bill_type__in=(BillType.house_bill, BillType.senate_bill,
                       BillType.house_joint_resolution,
                       BillType.senate_joint_resolution),
        introduced_date__lte=end_date,
    ).order_by('current_status_date')

    by_day = {}
    for b in tqdm.tqdm(bills, desc=label):
        match_date = None
        for datestr, state, text, srcxml in b.major_actions:
            if state in BillStatus.final_status_enacted_bill:
                # SECURITY NOTE(review): eval() executes the stored date
                # expression. This is only acceptable while major_actions is
                # trusted, internally generated data; replace it with a real
                # parser if that ever changes. It now runs only for the
                # matching action instead of for every action.
                match_date = eval(datestr).date()
                break
        else:
            # No event matched.
            continue

        # Load plain text.
        text = load_bill_text(b, None, plain_text=True)

        # Bills since 1993 have GPO MODS XML metadata with page counts.
        pages = None
        problem = None
        try:
            mods = load_bill_text(b, None, mods_only=True)
            pages = mods.get("numpages")
        except (IOError, AttributeError) as e:
            problem = e

        if pages is None:
            # Reached both when the metadata cannot be read and when it has
            # no "numpages" entry; the latter used to slip through and fail
            # later with a TypeError on the += below.
            #
            # For historical statutes we only have plain text from the
            # Statutes at Large, extracted from PDFs. Page counts come from
            # our replacement of the form feed character put in by pdftotext.
            # That marker only exists where text was extracted from PDFs, so
            # it cannot be used for modern bills whose text came from the GPO
            # plain text format.
            if b.congress < 103:
                pages = len([pgtext for pgtext in text.split(page_break)
                             if pgtext.strip() != ""])
            else:
                # One formatted string prints the same with or without the
                # print function; the old print statement does not compile
                # on Python 3.
                print("%s %s %s" % (b.id, b, problem))
                raise ValueError("page date missing")

        rel_date = (match_date - start_date).days
        rec = by_day.setdefault(rel_date, {"bills": 0, "pages": 0})
        rec["bills"] += 1
        rec["pages"] += pages

    # Compute cumulative counts starting on day 0 and for every day till the
    # last day a bill was signed.
    columns.append(label)
    bills = 0
    pages = 0
    for rel_date in range((end_date-start_date).days+1):
        if rel_date in by_day:
            bills += by_day[rel_date]["bills"]
            pages += by_day[rel_date]["pages"]
        stats.setdefault(rel_date, {})[label] = (bills, pages)
def compute_productivity(congress, date_range):
    """Write one CSV row of productivity figures for a date window.

    Bills are selected by status (``BillStatus.final_status_passed_bill``)
    and by status date within ``date_range``. They are not restricted to
    ``congress``, because presidential activity is mostly being measured and
    the date of signing could be outside of the Congress. Votes are counted
    per chamber for ``congress`` within the same window.

    The row written to the module-level writer ``W`` is: Congress, year span,
    both window endpoints in ISO form, bill count, total pages, total words
    ("(no data)" before the 103rd Congress), House votes and Senate votes.
    """
    window_start = date_range[0]
    window_end = date_range[1]

    # Laws.
    passed = list(Bill.objects.filter(
        current_status__in=BillStatus.final_status_passed_bill,
        current_status_date__gte=window_start,
        current_status_date__lte=window_end))
    passed_count = len(passed)

    pages_sum = 0
    words_sum = 0
    for bill in passed:
        try:
            page_label = load_bill_text(bill, None, mods_only=True).get("numpages")
        except IOError:
            page_label = None
        if page_label is not None:
            pages_sum += int(page_label.replace(" pages", ""))
            plain = load_bill_text(bill, None, plain_text=True)
            words_sum += len(plain.split(" "))

    if congress < 103:
        pages_sum = "(no data)"
        words_sum = "(no data)"

    # Votes, House first and then Senate.
    vote_counts = []
    for chamber in (CongressChamber.house, CongressChamber.senate):
        vote_counts.append(Vote.objects.filter(
            congress=congress,
            created__gte=window_start,
            created__lte=window_end,
            chamber=chamber).count())
    house_votes, senate_votes = vote_counts

    # Year span; a Congress ending in January is labelled with the prior year.
    dates = get_congress_dates(congress)
    closing_year = (dates[1].year - 1) if dates[1].month == 1 else dates[1].year
    timespan = "%d-%d" % (dates[0].year, closing_year)

    W.writerow([
        congress, timespan,
        window_start.isoformat(), window_end.isoformat(),
        passed_count, pages_sum, words_sum,
        house_votes, senate_votes,
    ])
# before the 103rd, there's no MODS data if congress < 103: continue # to get bills in the first session, look for bills that were # introduced before the end of the session. bills = Bill.objects.filter( congress=congress, introduced_date__gte=startdate, introduced_date__lte=enddate, ) # get page counts by GPO version code page_counts = {} for b in bills: try: mods = load_bill_text(b, None, mods_only=True) except IOError: status = "MISSING_TEXT" page_counts[status] = page_counts.get(status, 0) + 1 continue status = mods["doc_version"] if status is None or status.strip() == "": status = "UNKNOWN" status_names[status] = mods["doc_version_name"] pp = mods.get("numpages") page_counts[status] = page_counts.get(status, 0) + pp for status, page_count in sorted(page_counts.items(), key=lambda kv: -kv[1]): w.writerow([ str(congress), session, status, status_names[status],
print(t, t.term_type, t.is_top_term()) for bill in Bill.objects.filter(congress__gte=108, terms=t).only( "id", "congress", "bill_type", "number", "title", "titles", "introduced_date", "sponsor_role__party").prefetch_related("sponsor_role"): w1.writerow([ t.name.encode("utf8"), bill.id, ]) if not bill.id in seen_bills[k]: seen_bills[k].add(bill.id) text = load_bill_text(bill, None, plain_text=True) w2.writerow([ bill.id, bill.title_no_number.encode("utf8"), get_secondary_bill_title_2(bill).encode("utf8"), bill.introduced_date.isoformat(), bill.sponsor_role.party if bill.sponsor_role else "N/A", "https://www.govtrack.us" + bill.get_absolute_url(), text[0:4096].encode("utf8"), ]) for cite_id in sorted(bill.usc_citations_uptree()): w3.writerow([ bill.id, cite_id, ])
k = (t.term_type, t.is_top_term()) w1, w2, w3 = files[k] print t, t.term_type, t.is_top_term() for bill in Bill.objects.filter(congress__gte=108, terms=t).only("id", "congress", "bill_type", "number", "title", "titles", "introduced_date", "sponsor_role__party").prefetch_related("sponsor_role"): w1.writerow([ t.name.encode("utf8"), bill.id, ]) if not bill.id in seen_bills[k]: seen_bills[k].add(bill.id) text = load_bill_text(bill, None, plain_text=True) w2.writerow([ bill.id, bill.title_no_number.encode("utf8"), get_secondary_bill_title_2(bill).encode("utf8"), bill.introduced_date.isoformat(), bill.sponsor_role.party if bill.sponsor_role else "N/A", "https://www.govtrack.us" + bill.get_absolute_url(), text[0:4096].encode("utf8"), ]) for cite_id in sorted(bill.usc_citations_uptree()): w3.writerow([ bill.id, cite_id, ])