def parse_vote_new(self, bill, chamber, url):
    """
    Scrape one roll-call page and attach the resulting vote to ``bill``.

    The second row of the page's first table supplies the date, motion,
    yes/no/absent counts and the pass flag; rows from the fourth onward
    that contain exactly two cells supply the individual votes.

    :param bill: object that receives the vote through ``add_vote``
    :param chamber: chamber value handed straight to ``Vote``
    :param url: address of the roll-call page, also recorded as the source
    """
    page = BeautifulSoup(self.urlopen(url))
    table = page.table
    all_rows = table.findAll('tr')

    # Summary row: date, motion, yes, no, absent, result.
    summary = all_rows[1]
    when = dt.datetime.strptime(summary.td.contents[0], '%m/%d/%Y')
    summary_cells = summary.findAll('td')
    motion = summary_cells[1].contents[0]
    yes_count = int(summary_cells[2].contents[0])
    no_count = int(summary_cells[3].contents[0])
    abs_count = int(summary_cells[4].contents[0])
    passed = summary_cells[5].contents[0] == 'Pass'

    vote = Vote(chamber, when, motion, passed,
                yes_count, no_count, abs_count)
    vote.add_source(url)

    for member_row in all_rows[3:]:
        member_cells = member_row.findAll('td')
        if len(member_cells) != 2:
            continue

        # Keep only the text in front of " of" in the first cell.
        name = member_row.td.contents[0].split(' of')[0]
        choice = member_cells[1].contents[0]

        if choice.startswith('Yea'):
            record = vote.yes
        elif choice.startswith('Nay'):
            record = vote.no
        else:
            record = vote.other
        record(name)

    bill.add_vote(vote)
def parse_vote(self, bill, action, act_chamber, act_date, url):
    """
    Build a ``Vote`` from an action line and the journal page it links to.

    The tallies are read from ``action``: ``Y<n> N<n>`` followed by up to
    three optional ``<letter><n>`` groups, which are summed into the
    "other" count. The motion text and the per-legislator lists are read
    from the page at ``url``.

    :param bill: unused here; kept for interface compatibility
    :param action: action text containing the tally
    :param act_chamber: chamber value handed to ``Vote``
    :param act_date: date value handed to ``Vote``
    :param url: page path relative to http://www.legis.state.ak.us/basis/
    :returns: the populated ``Vote``
    :raises IndexError: if the tally, the "Yeas" block or the motion
        cannot be found
    """
    url = "http://www.legis.state.ak.us/basis/%s" % url
    info_page = self.soup_parser(self.urlopen(url))

    tally = re.findall(
        r"Y(\d+) N(\d+)\s*(?:\w(\d+))*\s*(?:\w(\d+))*\s*(?:\w(\d+))*",
        action)[0]
    # Groups that did not take part in the match come back as "".
    yes, no, o1, o2, o3 = [int(count) if count else 0 for count in tally]
    other = o1 + o2 + o3

    votes = info_page.findAll(
        "pre", text=re.compile("Yeas"), limit=1)[0].split("\n\n")

    motion = info_page.findAll(text=re.compile("The question being"))[0]
    motion = re.findall(r'The question being:\s*"(.*)\?"',
                        motion, re.DOTALL)[0].replace("\n", " ")

    vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

    # The slice length is derived from the prefix itself; the previous
    # hard-coded 9 for "Absent: " (8 characters) cut off the first letter
    # of the first absent legislator.
    recorders = (
        ("Yeas: ", vote.yes),
        ("Nays: ", vote.no),
        ("Excused: ", vote.other),
        ("Absent: ", vote.other),
    )
    for vote_list in votes:
        for prefix, record in recorders:
            if not vote_list.startswith(prefix):
                continue
            for name in vote_list[len(prefix):].split(","):
                name = name.strip()
                if name:
                    record(name)
            break

    vote.add_source(url)
    return vote
def scrape_old_vote(self, url):
    """
    Parse a vote page whose header lives in the first ``<h3>`` element.

    The header is split on ", ": the second part names the chamber (and
    possibly a location) and the remaining parts form the motion. The
    counts come from cells 1, 3, 5 and 7 of the page's second table; the
    individual votes come from the first table.

    :param url: address of the vote page, also recorded as the source
    :returns: the populated ``Vote`` (its date is ``None``)
    """
    def get_count(cell):
        # An empty cell counts as zero.
        if not cell.contents:
            return 0
        return int(cell.contents[0])

    page = self.soup_parser(self.urlopen(url))

    header_parts = page.h3.contents[0].split(', ')
    chamber_name = header_parts[1]
    chamber = 'lower' if chamber_name.startswith('House') else 'upper'

    # Whatever follows the first word of the chamber name is the location,
    # unless that is just the tail of "House of Representatives".
    location = ' '.join(chamber_name.split(' ')[1:])
    if location.startswith('of Representatives'):
        location = ''

    motion = ', '.join(header_parts[2:])

    count_cells = page.findAll('table')[1].findAll('td')
    yes_count = get_count(count_cells[1])
    no_count = get_count(count_cells[3])
    excused_count = get_count(count_cells[5])
    absent_count = get_count(count_cells[7])
    other_count = excused_count + absent_count

    vote = Vote(chamber, None, motion, yes_count > no_count,
                yes_count, no_count, other_count,
                excused_count=excused_count,
                absent_count=absent_count,
                location=location)
    vote.add_source(url)

    for cell in page.table.findAll('td'):
        label = cell.contents[0]
        # The element just before the label cell holds the name.
        if label == 'Yea':
            vote.yes(cell.findPrevious().contents[0])
        elif label == 'Nay':
            vote.no(cell.findPrevious().contents[0])
        elif label in ['Excused', 'Absent']:
            vote.other(cell.findPrevious().contents[0])

    return vote
def scrape_new_vote(self, url):
    """
    Parse a vote page whose elements carry ``ctl00_contentMain_*`` ids.

    The header element is split on ", ": the second part names the
    chamber (and possibly a location) and the remaining parts form the
    motion. Counts and the vote table are looked up by element id.

    :param url: address of the vote page, also recorded as the source
    :returns: the populated ``Vote`` (its date is ``None``)
    """
    page = self.soup_parser(self.urlopen(url))

    def count_for(element_id):
        # Integer held in the first child of the element with this id.
        return int(page.find(id=element_id).contents[0])

    header_parts = page.find(
        id="ctl00_contentMain_hdVote").contents[0].split(', ')
    chamber_name = header_parts[1]
    chamber = 'lower' if chamber_name.startswith('House') else 'upper'

    # Whatever follows the first word of the chamber name is the location,
    # unless that is just the tail of "House of Representatives".
    location = ' '.join(chamber_name.split(' ')[1:])
    if location.startswith('of Representatives'):
        location = ''

    motion = ', '.join(header_parts[2:])

    yes_count = count_for("ctl00_contentMain_tdAyes")
    no_count = count_for("ctl00_contentMain_tdNays")
    excused_count = count_for("ctl00_contentMain_tdExcused")
    absent_count = count_for("ctl00_contentMain_tdAbsent")
    other_count = excused_count + absent_count

    vote = Vote(chamber, None, motion, yes_count > no_count,
                yes_count, no_count, other_count,
                excused_count=excused_count,
                absent_count=absent_count,
                location=location)
    vote.add_source(url)

    votes_table = page.find(id="ctl00_contentMain_tblVotes")
    for cell in votes_table.findAll('td'):
        label = cell.contents[0]
        # The element just before the label cell holds the name.
        if label == 'Yea':
            vote.yes(cell.findPrevious().contents[0])
        elif label == 'Nay':
            vote.no(cell.findPrevious().contents[0])
        elif label in ['Excused', 'Absent']:
            vote.other(cell.findPrevious().contents[0])

    return vote
def parse_status(self, bill, url):
    """
    Record the actions, and any plain-text floor votes, from a status page.

    Every row after the first in the page's first table becomes an action
    on ``bill``. When a row carries more than one link and the last one
    ends in ``txt``, that page is fetched and parsed into a ``Vote``.

    :param bill: mapping with a ``'chamber'`` key that also provides
        ``add_source``, ``add_action`` and ``add_vote``
    :param url: address of the status page; vote links are resolved
        relative to its directory
    """
    chamber = bill['chamber']
    status = self.soup_parser(self.urlopen(url))
    bill.add_source(url)
    act_table = status.table

    # Loop invariants, built once instead of once per row.
    base_url = '/'.join(url.split('/')[:-1])
    vote_re = re.compile(r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                         r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
                         r'(\d+)(.*)', re.MULTILINE | re.DOTALL)
    name_sep = re.compile(r'\s{2,}')
    actor_names = {
        'House': 'lower',
        'Senate': 'upper',
        'LFA': 'Office of the Legislative Fiscal Analyst',
    }

    def split_names(block):
        # Names are separated by runs of two or more whitespace
        # characters; an empty block must not produce a '' voter.
        return [name for name in name_sep.split(block.strip()) if name]

    # Get actions
    for row in act_table.findAll('tr')[1:]:
        act_date = row.td.find(text=True)
        act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")
        action = row.findAll('td')[1].find(text=True)

        # If not specified, assume action occurred
        # in originating house
        actor = chamber
        split_action = action.split('/')
        if len(split_action) > 1:
            actor = actor_names.get(split_action[0], split_action[0])
            action = '/'.join(split_action[1:]).strip()

        if action == 'Governor Signed':
            actor = 'Governor'

        bill.add_action(actor, action, act_date)

        # Check if this action is a vote
        links = row.findAll('a')
        if len(links) <= 1:
            continue

        vote_url = links[-1]['href']
        # Committee votes are of a different format that
        # we don't handle yet
        if not vote_url.endswith('txt'):
            continue

        vote_url = base_url + '/' + vote_url
        vote_page = self.urlopen(vote_url)

        # NOTE(review): a page that does not match the pattern leaves
        # ``match`` as None and the next line raises AttributeError.
        match = vote_re.search(vote_page)
        # The regex yields strings; Vote is given integers.
        yes_count = int(match.group(1))
        no_count = int(match.group(3))
        other_count = int(match.group(6))
        passed = yes_count > no_count

        if actor in ('upper', 'lower'):
            vote_chamber, vote_location = actor, ''
        else:
            vote_chamber, vote_location = '', actor

        vote = Vote(vote_chamber, act_date, action, passed,
                    yes_count, no_count, other_count,
                    location=vote_location)
        vote.add_source(vote_url)

        # Explicit loops: map() is lazy on Python 3 and would record
        # nothing there.
        for name in split_names(match.group(2)):
            vote.yes(name)
        for name in split_names(match.group(4)):
            vote.no(name)
        for name in split_names(match.group(7)):
            vote.other(name)

        bill.add_vote(vote)
def parse_vote_details(self, url):
    """
    Grab the details of a specific vote, such as how each legislator voted.

    :param url: address of the roll-call page, also recorded as the source
    :returns: the populated ``Vote``
    :raises ScrapeError: if the number of names collected for yes, no or
        other differs from the count printed on the page
    """
    def find_vote(letter):
        return vote_page.findAll('span', {'class': 'font8text'}, text=letter)

    with self.soup_context(url) as vote_page:
        header = vote_page.find('div', {'class': 'subHdrGraphic'})
        chamber = 'upper' if 'Senate' in header.string else 'lower'

        # we'll use the link back to the bill as a base to
        # get the motion/date
        linkback = vote_page.find(
            'a', href=re.compile('billinfo')).parent.parent
        date = dt.datetime.strptime(linkback.find('div').string,
                                    "%A, %B %d, %Y")

        motion = linkback.findNextSibling('div')
        if motion.a:
            motion = "%s %s" % (motion.a.string,
                                motion.contents[-1].string.strip())
        elif motion.span:
            motion = "%s %s" % (motion.span.string.strip(),
                                motion.contents[-1].string.strip())
        else:
            # Remove only non-breaking-space cruft (entity or decoded
            # character); ordinary spaces between words are kept.
            motion = (motion.string
                      .replace('&nbsp;', '')
                      .replace(u'\xa0', '')
                      .strip())

        yes_count = int(vote_page.find('div', text='YEAS').next.string)
        no_count = int(vote_page.find('div', text='NAYS').next.string)
        lve_count = int(vote_page.find('div', text='LVE').next.string)
        nv_count = int(vote_page.find('div', text='N/V').next.string)
        other_count = lve_count + nv_count

        passed = yes_count > no_count
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        # find the votes by the inner text. because background colors lie.
        tallies = (
            (vote.yes, find_vote('Y')),
            (vote.no, find_vote('N')),
            (vote.other, find_vote('E') + find_vote('X')),
        )
        for record, marks in tallies:
            for mark in marks:
                record(mark.parent.findNextSibling('span').string)

        # The collected names must agree with the printed totals.
        expectations = (
            ('yes', 'yes_votes', yes_count),
            ('no', 'no_votes', no_count),
            ('other', 'other_votes', other_count),
        )
        for label, key, expected in expectations:
            if len(vote[key]) != expected:
                raise ScrapeError('wrong %s count %d/%d' %
                                  (label, len(vote[key]), expected))

        return vote
def _scrape_bill_votes(self, soup, bill, chamber):
    """
    Scrape every roll call linked from a bill page and add it to ``bill``.

    Each link whose href contains ``ShowRollCall(<id>,<bienId>)`` is
    resolved to
    http://flooractivityext.leg.wa.gov/rollcall.aspx?id=<id>&bienId=<bienId>.
    On that page the third table row holds the motion, the fourth the
    date, and a text node containing "Yeas:" holds the tallies.

    :param soup: parsed bill page containing the roll-call links
    :param bill: object that receives each vote through ``add_vote``
    :param chamber: chamber used when the motion text does not start with
        "House" or "Senate"
    """
    nbsp = u'\xa0'
    count_re = re.compile(r'(Yeas|Nays|Absent|Excused):\s*(\d+)')

    def clean(text):
        # Turn non-breaking-space cruft into spaces, collapse whitespace.
        text = (text or '').replace('&nbsp;', ' ').replace(nbsp, ' ')
        return ' '.join(text.split())

    def split_names(name_list, title):
        # eg. Representatives Alexander, Angel, Simpson, G., and Mr. Speaker
        # eg. Representative Alexander
        text = clean(name_list)
        for prefix in (title + 's', title):
            if prefix in text:
                text = text.split(prefix, 1)[1]
                break

        names = []
        for piece in text.split(','):
            piece = piece.strip()
            if piece.startswith('and '):
                piece = piece[len('and '):].strip()
            if not piece:
                continue
            # A piece such as "G." is an initial that belongs to the
            # previous surname ("Simpson, G.").
            if piece[1:2] == '.' and names:
                names[-1] = '%s, %s' % (names[-1], piece)
                continue
            # Split on " and " with surrounding spaces so that surnames
            # containing "and" (Alexander) stay intact.
            names.extend(part.strip() for part in piece.split(' and ')
                         if part.strip())
        return names

    # scrape votes
    # http://flooractivityext.leg.wa.gov/rollcall.aspx?id=9695&bienId=4
    for roll_call_link in soup.findAll('a', href=re.compile('ShowRollCall')):
        href = roll_call_link['href']
        args = href.split('(')[1].split(')')[0].split(',')
        roll_call_id = args[0].strip()
        bien_id = args[1].strip()
        roll_call_url = ('http://flooractivityext.leg.wa.gov/rollcall.aspx'
                         '?id=%s&bienId=%s' % (roll_call_id, bien_id))

        with self.soup_context(roll_call_url) as roll_call_info:
            rows = roll_call_info.findAll('tr')
            # NOTE(review): the date is passed to Vote as the raw cell
            # text; its format is not visible here.
            date = rows[3].find('td').string

            # eg. "House vote on Final Passage"
            # first word is chamber; lop off first three words to get
            # the motion: "Final Passage"
            words = clean(rows[2].find('td').string).split()
            vote_chamber = chamber
            if words[:1] == ['House']:
                vote_chamber = 'lower'
            elif words[:1] == ['Senate']:
                vote_chamber = 'upper'
            motion = ' '.join(words[3:])

            # Any tally missing from the page stays at zero.
            tallies = {'Yeas': 0, 'Nays': 0, 'Absent': 0, 'Excused': 0}
            tally_text = roll_call_info.find(text=re.compile('Yeas:'))
            for label, number in count_re.findall(tally_text):
                tallies[label] = int(number)
            yes_count = tallies['Yeas']
            no_count = tallies['Nays']
            absent_count = tallies['Absent']
            excused_count = tallies['Excused']

            vote = Vote(vote_chamber, date, motion, yes_count > no_count,
                        yes_count, no_count, absent_count + excused_count)
            vote.add_source(roll_call_url)

            title = 'Senator' if vote_chamber == 'upper' else 'Representative'
            name_lists = iter(roll_call_info.findAll(text=re.compile(title)))

            # As in the code this replaces, a name list is consumed only
            # for categories with a non-zero count, in yes / no / absent /
            # excused order.
            categories = (
                (yes_count, vote.yes),
                (no_count, vote.no),
                (absent_count, vote.other),
                (excused_count, vote.other),
            )
            for count, record in categories:
                if count == 0:
                    continue
                name_list = next(name_lists, None)
                if not name_list:
                    continue
                for name in split_names(name_list, title):
                    record(name)

            bill.add_vote(vote)
def scrape_bill(self, chamber, current_bill, session):
    """
    Scrape one bill (sponsors, documents, votes, actions) and save it.

    :param chamber: chamber value used for the bill, its votes and actions
    :param current_bill: bill number placed in the status query
    :param session: indexable pair; ``session[0]`` goes into the status
        query and ``session[1]`` is handed to ``Bill``
    :raises Exception: when the site reports an expired ACAS session, or
        when no ``BTN<digits>`` identifier is found on the status page
    """
    base = "http://alisondb.legislature.state.al.us/acas/"

    with self.soup_context("http://alisondb.legislature.state.al.us/acas/SESSBillsStatusResultsMac.asp?BillNumber=%s&GetStatus=Get+Status&session=%s" % (current_bill, session[0])) as page:
        page_text = str(page)
        if "Your ACAS Session has expired." in page_text:
            raise Exception("Expired cookie - you'll have to run with -n to skip caching")

        # Only a missing or unparsable identifier means "no bill"; other
        # errors (KeyboardInterrupt included) must not be masked.
        try:
            bill_id = int(re.findall(r'BTN([0-9]+)', page_text)[0])
        except (IndexError, ValueError):
            raise Exception("No bill found. Hopefully that means it's the end of the session")

        title = page.find("td", {'colspan': '7'}).string
        self.log("Starting parse of %s" % current_bill)

        #create our bill!
        bill = Bill(session[1], chamber, current_bill, title.strip())

        #add sponsors and co-sponsors
        with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id) as sponsors:
            # The table three parents above the "Co-Sponsors" text is
            # expected to contain exactly two tables.
            (primary, secondary) = sponsors.findAll("table", text="Co-Sponsors")[0].parent.parent.parent.findAll('table')
            for p in primary.findAll('td'):
                bill.add_sponsor('primary', p.string)
            for s in secondary.findAll('td'):
                bill.add_sponsor('cosponsor', s.string)

        with self.soup_context("http://alisondb.legislature.state.al.us/acas/ACTIONHistoryResultsMac.asp?OID=%d" % bill_id) as history:
            actions = history.findAll('table', text="Committee")[0].parent.parent.parent.findAll('tr')
            #Date Amend/Subst Matter Committee Nay Yea Abs Vote
            for event in actions:
                e = event.findAll('td')
                if not e:
                    continue
                date = e[0].string
                amend = e[1].find('input')
                matter = e[2].string
                y_votes = e[5].string
                n_votes = e[4].string
                a_votes = e[6].string

                if not matter:
                    continue

                roll = e[7].find('input')

                # A row without a date keeps the date of the row above.
                if date is not None:
                    act_date = dt.datetime.strptime(date, '%m/%d/%Y')

                if amend is not None:
                    splitter = re.findall(r'documentSelected\(\'(\w*)\',\'([\w\d-]*)\',\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\',\'([\w\s]*)\'', str(amend))[0]
                    amend = base + "%s/%s" % (splitter[3], splitter[2])
                    bill.add_document(matter, amend)

                if roll is not None:
                    splitter = re.findall(r'voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\',\'(.*)\',\'(\d*)\'', str(roll))[0]
                    roll = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&SESS=%s" % (splitter[0], splitter[1], splitter[2], splitter[4])
                    with self.soup_context(roll) as votes:
                        def total(label):
                            # Third cell of the row holding this label.
                            return int(votes.findAll('tr', text=label)[0].parent.parent.findAll('td')[2].string)

                        vote_rows = votes.findAll('table', text='Member')[0].parent.parent.parent.findAll('tr')
                        yea_votes = total('Total Yea:')
                        nay_votes = total('Total Nay:')
                        abs_votes = total('Total Abs:')
                        p_votes = len(votes.findAll('tr', text='P'))

                        #chamber, date, motion, passed, yes_count, no_count, other_count
                        vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes + p_votes)
                        vote.add_source(roll)

                        for row in vote_rows:
                            skip = str(row)
                            if "Total Yea" in skip or "Total Nay" in skip or "Total Abs" in skip:
                                continue
                            cells = row.findAll('td')
                            if not cells:
                                continue
                            # Each row holds one member (cells 0 and 2)
                            # and optionally a second (cells 4 and 6).
                            self.dumb_vote(vote, cells[0].string, cells[2].string)
                            if len(cells) > 3:
                                self.dumb_vote(vote, cells[4].string, cells[6].string)
                        bill.add_vote(vote)

                if y_votes is not None:
                    yea_votes = self.dumber_vote(y_votes)
                    nay_votes = self.dumber_vote(n_votes)
                    abs_votes = self.dumber_vote(a_votes)
                    vote = Vote(chamber, act_date, matter, (yea_votes > nay_votes), yea_votes, nay_votes, abs_votes)
                    bill.add_vote(vote)

                bill.add_action(chamber, matter, act_date)

    self.add_bill(bill)
def get_vote(self, bill, url):
    """
    Scrape one roll-call page and attach the resulting vote to ``bill``.

    The chamber comes from the ``sChamber`` query parameter (``H`` or
    ``S``). A table row with a single cell is treated as the Lieutenant
    Governor's tie-breaking vote in the upper chamber: it sets the vote's
    ``passed`` flag and, the first time one is seen, a ``Person`` is
    created, saved and cached on ``self.lt_gov``.

    :param bill: mapping with a ``'session'`` key that also provides
        ``add_vote``
    :param url: path (with query string) relative to
        http://www.ncga.state.nc.us
    """
    url = 'http://www.ncga.state.nc.us' + url + '&bPrintable=true'
    chamber = {'H': 'lower', 'S': 'upper'}[
        re.findall(r'sChamber=(\w)', url)[0]]

    data = self.urlopen(url)
    soup = self.soup_parser(data)

    motion = soup.findAll('a', href=re.compile(r'BillLookUp\.pl'))[0] \
        .findParents('tr', limit=1)[0].findAll('td')[1] \
        .font.contents[-1]

    vote_time = soup.findAll('b', text='Time:')[0].next.strip()
    vote_time = dt.datetime.strptime(vote_time, '%b %d %Y %I:%M%p')

    vote_mess = soup.findAll('td', text=re.compile('Total Votes:'))[0]
    (yeas, noes, nots, absent, excused) = [
        int(count) for count in re.findall(
            r'Ayes: (\d+)\s+Noes: (\d+)\s+Not: (\d+)\s+Exc. '
            r'Absent: (\d+)\s+Exc. Vote: (\d+)', vote_mess, re.U)[0]]

    # chamber, date, motion, passed, yes_count, no_count, other_count
    v = Vote(chamber, vote_time, motion, (yeas > noes),
             yeas, noes, nots + absent + excused)

    # eh, it's easier to just get table[2] for this..
    vote_table = soup.findAll('table')[2]

    for row in vote_table.findAll('tr'):
        if 'Democrat' in self.flatten(row):
            continue

        cells = row.findAll('td')
        if len(cells) == 1:
            # I can't find any examples of ties in the House,
            # nor information on who would break them.
            if chamber != 'upper':
                continue

            # Read the name on every tie vote; it used to be bound only
            # while self.lt_gov was still unset, so a later tie raised
            # NameError.
            full_name = soup.findAll(
                'td', text=re.compile('Lieutenant Governor'))[0] \
                .parent.findAll('span')[0].contents[0]

            if not self.lt_gov:
                (first_name, last_name, middle_name, suffix) = split_name(
                    full_name)
                self.lt_gov = Person(full_name, first_name=first_name,
                                     last_name=last_name,
                                     middle_name=middle_name,
                                     suffix=suffix)
                self.lt_gov.add_role('Lieutenant Governor',
                                     bill['session'])
                self.save_person(self.lt_gov)

            if 'VOTES YES' in self.flatten(cells[0]):
                v['passed'] = True
                v.yes(full_name)
            else:
                v['passed'] = False
                v.no(full_name)
            continue
        elif len(cells) == 2:
            vote_type, a = cells
            bunch = [self.flatten(a)]
        elif len(cells) == 3:
            vote_type, d, r = cells
            bunch = [self.flatten(d), self.flatten(r)]
        else:
            continue

        # why doesn't .string work? ... bleh.
        vote_type = vote_type.font.b.contents[0]

        if 'Ayes' in vote_type:
            adder = v.yes
        elif 'Noes' in vote_type:
            adder = v.no
        else:
            adder = v.other

        for party in bunch:
            # Names follow the first ':' and are separated by ';'.
            # A real list is needed because it is indexed below.
            names = [name.replace(' (SPEAKER)', '')
                     for name in party[(party.index(':') + 1):].split(';')]
            if names[0] == 'None':
                names = []
            for name in names:
                adder(name)

    v.add_source(url)
    bill.add_vote(v)
def get_vote(self, bill, url):
    """
    Scrape one roll-call page and attach the resulting vote to ``bill``.

    The chamber comes from the ``sChamber`` query parameter (``H`` or
    ``S``). A table row with a single cell is treated as the Lieutenant
    Governor's tie-breaking vote in the upper chamber: it sets the vote's
    ``passed`` flag and, the first time one is seen, a ``Person`` is
    created, registered through ``add_legislator`` and cached on
    ``self.lt_gov``.

    :param bill: mapping with a ``"session"`` key that also provides
        ``add_vote``
    :param url: path (with query string) relative to
        http://www.ncga.state.nc.us
    """
    url = "http://www.ncga.state.nc.us" + url + "&bPrintable=true"
    chamber = {"H": "lower", "S": "upper"}[re.findall(r"sChamber=(\w)", url)[0]]

    data = self.urlopen(url)
    soup = self.soup_parser(data)

    motion = (
        soup.findAll("a", href=re.compile(r"BillLookUp\.pl"))[0]
        .findParents("tr", limit=1)[0]
        .findAll("td")[1]
        .font.contents[-1]
    )

    vote_time = soup.findAll("b", text="Time:")[0].next.strip()
    vote_time = dt.datetime.strptime(vote_time, "%b %d %Y %I:%M%p")

    vote_mess = soup.findAll("td", text=re.compile("Total Votes:"))[0]
    (yeas, noes, nots, absent, excused) = [
        int(count)
        for count in re.findall(
            r"Ayes: (\d+)\s+Noes: (\d+)\s+Not: (\d+)\s+Exc. "
            r"Absent: (\d+)\s+Exc. Vote: (\d+)",
            vote_mess,
            re.U,
        )[0]
    ]

    # chamber, date, motion, passed, yes_count, no_count, other_count
    v = Vote(chamber, vote_time, motion, (yeas > noes), yeas, noes, nots + absent + excused)

    # eh, it's easier to just get table[2] for this..
    vote_table = soup.findAll("table")[2]

    for row in vote_table.findAll("tr"):
        if "Democrat" in self.flatten(row):
            continue

        cells = row.findAll("td")
        if len(cells) == 1:
            # I can't find any examples of ties in the House,
            # nor information on who would break them.
            if chamber != "upper":
                continue

            # Read the name on every tie vote; it used to be bound only
            # while self.lt_gov was still unset, so a later tie raised
            # NameError.
            full_name = (
                soup.findAll("td", text=re.compile("Lieutenant Governor"))[0]
                .parent.findAll("span")[0]
                .contents[0]
            )

            if not self.lt_gov:
                (first_name, last_name, middle_name, suffix) = split_name(full_name)
                self.lt_gov = Person(
                    full_name,
                    first_name=first_name,
                    last_name=last_name,
                    middle_name=middle_name,
                    suffix=suffix,
                )
                self.lt_gov.add_role("Lieutenant Governor", bill["session"])
                self.add_legislator(self.lt_gov)

            if "VOTES YES" in self.flatten(cells[0]):
                v["passed"] = True
                v.yes(full_name)
            else:
                v["passed"] = False
                v.no(full_name)
            continue
        elif len(cells) == 2:
            vote_type, a = cells
            bunch = [self.flatten(a)]
        elif len(cells) == 3:
            vote_type, d, r = cells
            bunch = [self.flatten(d), self.flatten(r)]
        else:
            continue

        # why doesn't .string work? ... bleh.
        vote_type = vote_type.font.b.contents[0]

        if "Ayes" in vote_type:
            adder = v.yes
        elif "Noes" in vote_type:
            adder = v.no
        else:
            adder = v.other

        for party in bunch:
            # Names follow the first ":" and are separated by ";".
            # A real list is needed because it is indexed below.
            names = [
                name.replace(" (SPEAKER)", "")
                for name in party[(party.index(":") + 1) :].split(";")
            ]
            if names[0] == "None":
                names = []
            for name in names:
                adder(name)

    v.add_source(url)
    bill.add_vote(v)