def parse_bill(self, chamber, session, bill_id, bill_info_url):
    """Scrape a single bill's info page and register the resulting Bill.

    Fetches ``bill_info_url`` and records the bill's title, most recent
    version, sponsors, actions and (when linked) its vote-history PDF,
    then hands the bill to ``self.add_bill``.

    Args:
        chamber: chamber the bill belongs to; also used as the actor for
            any action whose text does not end in a chamber marker.
        session: session identifier.  ``session[2:4]`` is read as the
            two-digit year of every action date.
        bill_id: bill identifier; ``'<bill_id>/bill.doc'`` is the href
            searched for on the page.
        bill_info_url: URL of the bill's info page.

    Returns:
        None.  A page without the version link is treated as a withdrawn
        bill and skipped.

    Raises:
        ValueError: if an action line does not start with a ``'%b %d'``
            date followed by ``-``.
    """
    with self.urlopen_context(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)
        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)

        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        # Raw string: '\d' and '\.' are regex escapes, not string escapes.
        sponsor_links = bill_info.findAll(href=re.compile(
            r'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        # The last <p> after the version link holds one action per text node.
        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            # Everything before the first '-' is the date, the rest is
            # the action text.
            date_text, _, action = action.partition('-')

            # Parse the date together with the session year.  Parsing
            # '%b %d' alone defaults to 1900, which is not a leap year,
            # so "Feb 29" would be rejected before the year was fixed up.
            action_date = dt.datetime.strptime(
                '%s %04d' % (date_text.strip(), int('20' + session[2:4])),
                '%b %d %Y')

            if action.endswith(('House', '(H)')):
                actor = 'lower'
            elif action.endswith(('Senate', '(S)')):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.add_bill(bill)
def scrape_bills(self, chamber, year):
    """Scrape every bill of one chamber for the session starting in ``year``.

    Bill numbers are walked upward from SB 1 (``chamber == 'upper'``) or
    HB 4001 (any other chamber) until ``self.scrape_bill`` returns no page.
    Each bill found is given its sponsors, versions, documents, source URL
    and actions, then passed to ``self.add_bill``.

    Args:
        chamber: ``'upper'`` for Senate bills; anything else scrapes
            House bills.
        year: session year, as an int or a numeric string.

    Raises:
        NoDataForYear: if ``year`` is even.
        ValueError: if ``year`` is not numeric.
    """
    if int(year) % 2 == 0:
        raise NoDataForYear(year)

    # Normalise once: the year is formatted with %d below, so a numeric
    # string must not be carried through.
    year = int(year)
    # scrape_bill hands back a (possibly different) year on every call;
    # keep the session's own year for the session name.
    oyear = year

    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'

    while True:
        (bill_page, year) = self.scrape_bill(year, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        if bill_page is None:
            return

        title = ''.join(self.flatten(
            bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill("Regular Session %d" % oyear, chamber, bill_id, title)

        # sponsors: the first link is the primary sponsor, the rest cosponsors
        sponsor_links = bill_page.findAll(
            id='frg_billstatus_SponsorList')[0].findAll('a')
        for position, name in enumerate(sponsor_links):
            kind = 'primary' if position == 0 else 'cosponsor'
            the_bill.add_sponsor(kind, name.string)

        # versions
        version_rows = bill_page.findAll(
            id='frg_billstatus_DocumentGridTable')[0].findAll('tr')
        for doc in version_rows:
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # documents: both tables are optional, so look for their ids in the
        # page text (stringified once) before indexing into findAll().
        page_text = str(bill_page)
        for table_id in ('frg_billstatus_HlaTable',
                         'frg_billstatus_SfaSection'):
            if table_id not in page_text:
                continue
            for doc in bill_page.findAll(id=table_id)[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        the_bill.add_source(
            'http://legislature.mi.gov/doc.aspx?%d-%s-%04d'
            % (year, abbr, bill_no))
        self.parse_actions(
            the_bill,
            bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
        self.add_bill(the_bill)
        bill_no = bill_no + 1
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    """Scrape a single bill's info page and save the resulting Bill.

    Fetches ``bill_info_url`` and records the bill's title, most recent
    version, sponsors, actions and (when linked) its vote-history PDF,
    then hands the bill to ``self.save_bill``.

    Args:
        chamber: chamber the bill belongs to; also used as the actor for
            any action whose text does not end in a chamber marker.
        session: session identifier.  ``session[2:4]`` is read as the
            two-digit year of every action date.
        bill_id: bill identifier; ``"<bill_id>/bill.doc"`` is the href
            searched for on the page.
        bill_info_url: URL of the bill's info page.

    Returns:
        None.  A page without the version link is treated as a withdrawn
        bill and skipped.

    Raises:
        ValueError: if an action line does not start with a ``"%b %d"``
            date followed by ``-``.
    """
    with self.urlopen_context(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)
        version_url = "%s/bill.doc" % bill_id
        version_link = bill_info.find(href=version_url)

        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext("p").contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version", session_url(session) + version_url)
        bill.add_source(bill_info_url)

        # Raw string: "\d" and "\." are regex escapes, not string escapes.
        sponsor_pattern = re.compile(r"legislator/[SH]\d+\.htm")
        for sponsor_link in bill_info.findAll(href=sponsor_pattern):
            bill.add_sponsor("primary", sponsor_link.contents[0].strip())

        # The last <p> after the version link holds one action per text node.
        action_p = version_link.findAllNext("p")[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if not action or action == "last action" or "Prefiled" in action:
                continue

            # Text before the first "-" is the date; the remainder is the
            # action description.
            date_text, _, action = action.partition("-")

            # Parse month/day together with the session year.  A bare
            # "%b %d" parse assumes 1900 (not a leap year), so "Feb 29"
            # would raise before the year could be substituted.
            action_year = int("20" + session[2:4])
            action_date = dt.datetime.strptime(
                "%s %04d" % (date_text.strip(), action_year), "%b %d %Y"
            )

            if action.endswith(("House", "(H)")):
                actor = "lower"
            elif action.endswith(("Senate", "(S)")):
                actor = "upper"
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile(".*/vote_history.pdf"))
        if vote_link:
            bill.add_document(
                "vote_history.pdf",
                bill_info_url.replace(".htm", "") + "/vote_history.pdf",
            )

        self.save_bill(bill)
def scrape_bills(self, chamber, year):
    """Save one hard-coded sample bill for the 2009 session.

    Builds "SB 1" for the upper chamber or "HB 1" for any other chamber,
    attaches a fixed source, version, document, two sponsors, two votes
    and three actions, and passes it to ``self.save_bill``.

    Args:
        chamber: ``"upper"`` or the other chamber's name; decides the bill
            id and which chamber counts as the "other" one for the final
            action.
        year: must be the string ``"2009"``.

    Raises:
        NoDataForYear: for any ``year`` other than ``"2009"``.
    """
    if year != "2009":
        # Pass the offending year along, as the other scrape_bills does.
        raise NoDataForYear(year)

    if chamber == "upper":
        other_chamber = "lower"
        bill_id = "SB 1"
    else:
        other_chamber = "upper"
        bill_id = "HB 1"

    b1 = Bill("2009-2010", chamber, bill_id, "A super bill")
    b1.add_source("http://example.com")
    # NOTE(review): the version URL names SB1 even when bill_id is "HB 1".
    b1.add_version("As Introduced", "http://example.com/SB1.html")
    b1.add_document("Google", "http://google.com")
    b1.add_sponsor("primary", "Bob Smith")
    b1.add_sponsor("secondary", "Johnson, Sally")

    # First vote: passed 2-0-0 in the upper chamber.
    d1 = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
    v1 = Vote("upper", d1, "Final passage", True, 2, 0, 0)
    v1.yes("Bob Smith")
    v1.yes("Sally Johnson")

    # Second vote: failed 0-1-1 in the lower chamber.
    d2 = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
    v2 = Vote("lower", d2, "Final passage", False, 0, 1, 1)
    v2.no("B. Smith")
    v2.other("Sally Johnson")

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, "introduced", d1)
    b1.add_action(chamber, "read first time", d1)
    b1.add_action(other_chamber, "introduced", d2)

    self.save_bill(b1)
def scrape_bill(self, chamber, current_bill, session):
    """Scrape one bill from the ACAS status pages and add it.

    Loads the bill's status page, then its sponsor page and its history
    page.  Sponsors, amendment documents, votes and actions are attached
    to a new Bill, which is finally passed to ``self.add_bill``.

    Args:
        chamber: chamber recorded on the bill and on every vote and action.
        current_bill: bill number as it appears in the status query string;
            also used as the Bill's id.
        session: indexable pair; ``session[0]`` goes into the status query
            string and ``session[1]`` is passed to Bill as the session.

    Raises:
        Exception: if the status page reports an expired ACAS session, or
            if no ``BTN<digits>`` marker is found on it (no such bill).
    """
    status_url = ("http://alisondb.legislature.state.al.us/acas/SESSBillsStatusResultsMac.asp?BillNumber=%s&GetStatus=Get+Status&session=%s"
                  % (current_bill, session[0]))
    with self.soup_context(status_url) as status_page:
        page_text = str(status_page)
        if "Your ACAS Session has expired." in page_text:
            raise Exception("Expired cookie - you'll have to run with -n to skip caching")

        # The internal object id is embedded in the page as "BTN<digits>".
        # Only a missing or non-numeric match means "no bill"; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        try:
            bill_id = int(re.findall(r'BTN([0-9]+)', page_text)[0])
        except (IndexError, ValueError):
            raise Exception("No bill found. Hopefully that means it's the end of the session")

        title = status_page.find("td", {'colspan': '7'}).string
        self.log("Starting parse of %s" % current_bill)

        # create our bill!
        bill = Bill(session[1], chamber, current_bill, title.strip())

        # add sponsors and co-sponsors
        with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id) as sponsors:
            # This pains me.  Climb from the "Co-Sponsors" text to the
            # enclosing element and take its two tables.
            (primary, secondary) = sponsors.findAll("table", text="Co-Sponsors")[0].parent.parent.parent.findAll('table')
            for p in primary.findAll('td'):
                bill.add_sponsor('primary', p.string)
            for s in secondary.findAll('td'):
                bill.add_sponsor('cosponsor', s.string)

        with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONHistoryResultsMac.asp?OID=%d" % bill_id) as history:
            actions = history.findAll('table', text="Committee")[0].parent.parent.parent.findAll('tr')
            # Columns: Date Amend/Subst Matter Committee Nay Yea Abs Vote
            for event in actions:
                e = event.findAll('td')
                if not e:
                    continue

                date = e[0].string
                amend = e[1].find('input')
                matter = e[2].string
                y_votes = e[5].string
                n_votes = e[4].string
                a_votes = e[6].string

                if not matter:
                    continue

                roll = e[7].find('input')

                # A row without a date keeps the date of the previous row.
                # NOTE(review): if the very first row has no date,
                # act_date is still unbound when it is used below.
                if date is not None:
                    act_date = dt.datetime.strptime(date, '%m/%d/%Y')

                if amend is not None:
                    # Pull the document path out of the input's
                    # documentSelected(...) call.
                    splitter = re.findall(r'documentSelected\(\'(\w*)\',\'([\w\d-]*)\',\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\',\'([\w\s]*)\'', str(amend))[0]
                    amend = "http://alisondb.legislature.state.al.us/acas/%s/%s" % (splitter[3], splitter[2])
                    bill.add_document(matter, amend)

                if roll is not None:
                    # Build the roll-call URL from the voteSelected(...) call.
                    splitter = re.findall(r'voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\',\'(.*)\',\'(\d*)\'', str(roll))[0]
                    roll = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&SESS=%s" % (splitter[0], splitter[1], splitter[2], splitter[4])
                    with self.soup_context(roll) as votes:
                        vote_rows = votes.findAll('table', text='Member')[0].parent.parent.parent.findAll('tr')

                        yea_votes = int(votes.findAll('tr', text='Total Yea:')[0].parent.parent.findAll('td')[2].string)
                        nay_votes = int(votes.findAll('tr', text='Total Nay:')[0].parent.parent.findAll('td')[2].string)
                        abs_votes = int(votes.findAll('tr', text='Total Abs:')[0].parent.parent.findAll('td')[2].string)
                        p_votes = len(votes.findAll('tr', text='P'))

                        # chamber, date, motion, passed, yes_count, no_count, other_count
                        vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes + p_votes)
                        vote.add_source(roll)

                        for row in vote_rows:
                            skip = str(row)
                            if "Total Yea" in skip or "Total Nay" in skip or "Total Abs" in skip:
                                continue
                            cells = row.findAll('td')
                            if not cells:
                                continue

                            # Each row carries up to two (name, vote) pairs:
                            # cells 0/2 and, when present, cells 4/6.
                            (name, t) = cells[0].string, cells[2].string
                            self.dumb_vote(vote, name, t)

                            if len(cells) > 3:
                                (name, t) = cells[4].string, cells[6].string
                                self.dumb_vote(vote, name, t)
                        bill.add_vote(vote)

                # Vote totals listed inline on the history row.
                # NOTE(review): a row that also has a roll-call link ends
                # up with two votes recorded - confirm this is intended.
                if y_votes is not None:
                    yea_votes = self.dumber_vote(y_votes)
                    nay_votes = self.dumber_vote(n_votes)
                    abs_votes = self.dumber_vote(a_votes)
                    vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes)
                    bill.add_vote(vote)

                bill.add_action(chamber, matter, act_date)

        self.add_bill(bill)