def parse_senate_vote(self, sv_text, url):
    """Sets any overrides and creates the vote instance.

    Parses the visual grid of a senate vote document via
    parse_visual_grid, then sanity-checks the parsed per-member totals
    against the TOTAL row printed in the document.  Raises ValueError
    when either the counts or the parsed date fail the check.
    """
    # Known name-spelling corrections applied while parsing the grid.
    overrides = {"ONEILL": "O'NEILL"}
    # Add new columns as they appear to be safe
    vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
    vote.add_source(url)
    # parse_visual_grid fills the vote and also returns the column
    # headers plus the printed-totals row used for the checks below.
    # NOTE(review): sVoteHeader and rDate are not defined in this block —
    # presumably module-level patterns; verify they exist at call time.
    vote, rowHeads, saneRow = self.parse_visual_grid(
        vote, sv_text, overrides, sVoteHeader, rDate, 'TOTAL', 'TOTAL')
    # Sanity checks on vote data, checks that the calculated total and
    # listed totals match
    sane = {'yes': 0, 'no': 0, 'other': 0}
    # Make sure the header row and sanity row are in order
    sorted_rh = sorted(rowHeads.items(), key=operator.itemgetter(0))
    startCount = -1
    for cell in saneRow:
        if startCount >= 0:
            # Map each printed total to its column header (Y…/N…/other).
            saneVote = sorted_rh[startCount][1]
            if 'Y' == saneVote[0]:
                sane['yes'] = int(cell[0])
            elif 'N' == saneVote[0]:
                sane['no'] = int(cell[0])
            else:
                # Everything that is not Yes/No is folded into "other".
                sane['other'] += int(cell[0])
            startCount += 1
        elif 'TOTAL' in cell[0]:
            # Counting starts with the first cell after the TOTAL label.
            startCount = 0
    # Make sure the parsed vote totals match up with counts in the total field
    if sane['yes'] != vote['yes_count'] or sane['no'] != vote['no_count'] or\
            sane['other'] != vote['other_count']:
        raise ValueError("Votes were not parsed correctly")
    # Make sure the date is a date
    if not isinstance(vote['date'], datetime):
        raise ValueError("Date was not parsed correctly")
    # End Sanity Check
    return vote
def vote(self):
    """Build and return a billy Vote for this roll call."""
    tallies = self.get_counts()
    yeas = int(tallies.get("Yeas", 0))
    nays = int(tallies.get("Nays", 0))
    # Everything that is neither a yea nor a nay counts as "other".
    others = sum(int(v) for v in tallies.values()) - (yeas + nays)

    result = Vote(
        self.chamber,
        self.date(),
        self.motion(),
        self.passed(),
        yeas,
        nays,
        others,
        actual_vote=dict(collections.defaultdict(list)),
    )

    # vote_values yields (method-name, voter) pairs, e.g. ("yes", name).
    for how, member in self.vote_values():
        getattr(result, how)(member)

    result.add_source(self.url)
    return result
def scrape_vote(self, bill, vote_type_id, vote_type):
    """Scrape one DC LIMS vote page and attach the Vote to *bill*.

    Builds the voting URL from the vote-type id and the bill id, reads
    the counts and per-member votes out of the page, and records them.
    """
    base_url = "http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s"
    url = base_url % (vote_type_id, bill["bill_id"])
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        vote_date = convert_date(doc.get_element_by_id("VoteDate").text)

        # check if voice vote / approved boxes have an 'x'
        voice = doc.xpath('//span[@id="VoteTypeVoice"]/b/text()')[0] == "x"
        passed = doc.xpath('//span[@id="VoteResultApproved"]/b/text()')[0] == "x"

        yes_count = extract_int(doc.xpath('//span[@id="VoteCount1"]/b/text()')[0])
        no_count = extract_int(doc.xpath('//span[@id="VoteCount2"]/b/text()')[0])
        # every now and then this actually drops below 0 (error in count)
        # NOTE(review): 13 is presumably the council's seat count — confirm.
        other_count = max(13 - (yes_count + no_count), 0)

        vote = Vote("upper", vote_date, vote_type, passed, yes_count,
                    no_count, other_count, voice_vote=voice)
        vote.add_source(url)

        # members are only text on page in a <u> tag
        for member_u in doc.xpath("//u"):
            member = member_u.text
            # The Yes/No/other marker lives in an <i> up two levels.
            vote_text = member_u.xpath("../../i/text()")[0]
            if "Yes" in vote_text:
                vote.yes(member)
            elif "No" in vote_text:
                vote.no(member)
            else:
                vote.other(member)

        bill.add_vote(vote)
def scrape_senate_vote(self, bill, url):
    """Scrape a senate roll-call vote from a PDF and attach it to *bill*.

    Extracts the date/time, YEAS/NAYS/EXCUSED/NOT VOTING counts, the
    individual votes, and the motion line.  Logs and returns early when
    any required piece is missing.

    BUG FIXES: a missing bill-id line previously left motion_line as
    None and crashed on lines[None] (TypeError); the vote-count log
    message also read "Couldne't".
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, "text")
    os.remove(path)

    lines = text.split("\n")

    date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
    if not date_match:
        self.log("Couldn't find date on %s" % url)
        return

    time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
    date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                         time_match.group(2))
    date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
    date = self._tz.localize(date)

    vote_type = None
    yes_count, no_count, other_count = None, None, 0
    votes = []
    # The vote body starts after the 21-line header block.
    for line in lines[21:]:
        line = line.strip()
        if not line:
            continue

        if line.startswith("YEAS"):
            yes_count = int(line.split(" - ")[1])
            vote_type = "yes"
        elif line.startswith("NAYS"):
            no_count = int(line.split(" - ")[1])
            vote_type = "no"
        elif line.startswith("EXCUSED") or line.startswith("NOT VOTING"):
            other_count += int(line.split(" - ")[1])
            vote_type = "other"
        else:
            # Names are laid out in columns separated by runs of spaces;
            # each name belongs to the most recent section header.
            votes.extend([(n.strip(), vote_type)
                          for n in re.split(r"\s{2,}", line)])

    if yes_count is None or no_count is None:
        self.log("Couldn't find vote counts in %s" % url)
        return

    passed = yes_count > no_count + other_count

    clean_bill_id = fix_bill_id(bill["bill_id"])
    motion_line = None
    for i, line in enumerate(lines):
        if line.strip() == clean_bill_id:
            motion_line = i + 2
    if motion_line is None:
        # Guard before indexing: lines[None] used to raise TypeError.
        self.log("Couldn't find motion for %s" % url)
        return

    motion = lines[motion_line]
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote("upper", date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)
    insert_specific_votes(vote, votes)
    check_vote_counts(vote)
    bill.add_vote(vote)
def scrape_votes(self, link, chamber, bill):
    """Scrape the votes table linked from a bill page onto *bill*.

    BUG FIXES: the motion prefix used to be removed with
    str.lstrip("FINAL VOTE - "), which strips a *character set* (so it
    also ate leading letters like F/I/N/A/L/V/O/T/E from the motion
    itself); it is now removed as a true prefix.  The counts are also
    converted to int before building the Vote, and the None comparison
    uses `is`.
    """
    with self.urlopen(link) as votes_page_html:
        votes_page = lxml.html.fromstring(votes_page_html)
        page_tables = votes_page.cssselect("table")
        votes_table = page_tables[0]
        votes_elements = votes_table.cssselect("td")
        # Eliminate table headings and unnecessary element
        votes_elements = votes_elements[3:]
        # Rows come in groups of five cells.
        ve = grouper(5, votes_elements)
        for actor, date, name_and_text, name, text in ve:
            # Skip Committee of the Whole votes.
            if "cow" in text.text_content() or "COW" in text.text_content():
                continue

            vote_date = dt.datetime.strptime(date.text_content(), "%m/%d/%Y")

            motion_and_votes = text.text_content()
            prefix = "FINAL VOTE - "
            if motion_and_votes.startswith(prefix):
                motion_and_votes = motion_and_votes[len(prefix):]
            motion, sep, votes = motion_and_votes.partition(".")

            passed = "passed" in votes

            # e.g. "28-11" or "28-11-1" (yes-no[-other]).
            votes_match = re.search("([0-9]+)-([0-9]+)-?([0-9]+)?", votes)
            yes_count = int(votes_match.group(1))
            no_count = int(votes_match.group(2))
            other_count = votes_match.group(3)
            other_count = int(other_count) if other_count is not None else 0

            vote = Vote(chamber, vote_date, motion, passed,
                        yes_count, no_count, other_count)
            vote.add_source(link)
            bill.add_vote(vote)
def parse_vote(self, bill, action, act_chamber, act_date, url,
               re_vote_text=re.compile(
                   r'The question (?:being|to be reconsidered):\s*"(.*?\?)"',
                   re.S),
               re_header=re.compile(
                   r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')):
    """Parse roll-call votes out of a journal page and attach them.

    IMPROVEMENTS: the duplicated '//pre' length check is collapsed into
    one; empty tally captures (the count group may match an empty
    string) no longer crash int(); and the 'Absent: ' prefix slice was
    off by one, chopping the first letter of the first absent member.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    # Find all chunks of text representing voting reports.
    # (The votes live in the second <pre> element.)
    votes_text_container = doc.xpath('//pre')
    if len(votes_text_container) < 2:
        return
    votes_text = votes_text_container[1].text_content()
    votes_text = re_vote_text.split(votes_text)
    # Pair each motion (odd slots) with the report text that follows it.
    votes_data = zip(votes_text[1::2], votes_text[2::2])

    # Process each.
    for motion, text in votes_data:
        yes = no = other = 0
        # e.g. "YEAS:  32   NAYS:  4   EXCUSED:  2   ABSENT:  1"
        tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
        for vtype, vcount in tally:
            # The count group can legally match '' — treat non-digit
            # captures as zero instead of crashing int().
            vcount = int(vcount) if vcount.isdigit() else 0
            if vtype == 'Y':
                yes = vcount
            elif vtype == 'N':
                no = vcount
            else:
                # Excused and Absent both count as "other".
                other += vcount

        vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

        # In lengthy documents, the "header" can be repeated in the middle
        # of content. This regex gets rid of it.
        vote_lines = re_header.sub('', text)
        vote_lines = vote_lines.split('\r\n')

        vote_type = None
        for vote_list in vote_lines:
            if vote_list.startswith('Yeas: '):
                vote_list, vote_type = vote_list[6:], vote.yes
            elif vote_list.startswith('Nays: '):
                vote_list, vote_type = vote_list[6:], vote.no
            elif vote_list.startswith('Excused: '):
                vote_list, vote_type = vote_list[9:], vote.other
            elif vote_list.startswith('Absent: '):
                # 'Absent: ' is 8 characters; slicing [9:] dropped the
                # first letter of the first name.
                vote_list, vote_type = vote_list[8:], vote.other
            elif vote_list.strip() == '':
                vote_type = None
            if vote_type:
                for name in vote_list.split(','):
                    name = name.strip()
                    if name:
                        vote_type(name)

        vote.add_source(url)
        bill.add_vote(vote)
def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                action_text):
    """Scrape an ALISON roll-call results page and attach the Vote.

    The results table interleaves member-name cells and vote-letter
    cells (Y/N/P/A), so parsing alternates between capturing a name and
    recording its vote.
    """
    url = ('http://alisondb.legislature.state.al.us/Alison/'
           'GetRollCallVoteResults.aspx?'
           'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.
           format(vote_id, vote_chamber, bill_id, self.session_id))
    doc = lxml.html.fromstring(self.get(url=url).text)

    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    voters_and_votes = doc.xpath('//table/tr/td/font/text()')
    capture_vote = False
    name = ''
    for item in voters_and_votes:
        if capture_vote:
            capture_vote = False
            if name:
                # item is the vote letter for the previously seen name.
                voters[item].append(name)
        else:
            capture_vote = True
            name = item
            # Skip vacant seats, the totals footer, and blank cells.
            if (name.endswith(", Vacant") or
                    name.startswith("Total ") or
                    not name.strip()):
                name = ''

    # Check name counts against totals listed on the site
    total_yea = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
    if total_yea:
        total_yea = int(total_yea[0].split(":")[-1])
        assert total_yea == len(voters['Y']), "Yea count incorrect"
    else:
        total_yea = len(voters['Y'])

    total_nay = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
    if total_nay:
        total_nay = int(total_nay[0].split(":")[-1])
        assert total_nay == len(voters['N']), "Nay count incorrect"
    else:
        total_nay = len(voters['N'])

    total_absent = doc.xpath(
        '//*[starts-with(text(), "Total Absent")]/text()')
    if total_absent:
        total_absent = int(total_absent[0].split(":")[-1])
        assert total_absent == len(voters['A']), "Absent count incorrect"
    # 'P' (pass/present) and 'A' (absent) are both reported as "other".
    total_other = len(voters['P']) + len(voters['A'])

    vote = Vote(
        self.CHAMBERS[vote_chamber[0]], vote_date, action_text,
        total_yea > total_nay, total_yea, total_nay, total_other)
    vote.add_source(url)
    for member in voters['Y']:
        vote.yes(member)
    for member in voters['N']:
        vote.no(member)
    for member in (voters['A'] + voters['P']):
        vote.other(member)
    bill.add_vote(vote)
def scrape_vote(self, bill, date, motion, url):
    """Scrape a floor-vote page; bail out when it isn't published yet."""
    raw = self.urlopen(url)
    # Sometimes they link to vote pages before they go live
    if "not yet official" in raw:
        return
    page = lxml.html.fromstring(raw)

    actor = "upper" if url.endswith("Senate") else "lower"

    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

    def total_for(label):
        # The tally cell reads e.g. "Yeas: 52"; the count is the last token.
        return int(page.xpath(count_path % label).split()[-1])

    yes_count = total_for("Yeas")
    no_count = total_for("Nays")
    other_count = total_for("Non Voting") + total_for("Present")

    vote = Vote(actor, date, motion, yes_count > no_count + other_count,
                yes_count, no_count, other_count)
    vote.add_source(url)

    vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
    roll_sections = (("Yeas", vote.yes), ("Nays", vote.no),
                     ("Non Voting", vote.other), ("Present", vote.other))
    for label, record in roll_sections:
        for member in page.xpath(vote_path % label):
            record(member.text)

    bill.add_vote(vote)
def parse_senate_vote(self, url):
    """Parse a senate PDF vote sheet into a Vote.

    The sheet lays members out in two columns of "NAME   X" pairs; the
    number of spaces between the name and its 'X' encodes the column
    the mark falls in (1 space = YES, 2 = NO, more = ABS/EXC).
    """
    vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
    vote.add_source(url)
    fname, resp = self.urlretrieve(url)
    # Normalize the PDF text into a list of fixed-width lines.
    sv_text = convert_sv_text(convert_pdf(fname, 'text'))
    os.remove(fname)
    in_votes = False
    for line in sv_text:
        if not in_votes:
            # Header region: pick up the date and the PASSED marker,
            # and watch for the column-header line that starts the votes.
            dmatch = re.search('DATE:(\d{2}-\d{2}-\d{2})', line)
            if dmatch:
                date = dmatch.groups()[0]
                vote['date'] = datetime.strptime(date, '%m-%d-%y')
            if 'YES NO ABS EXC' in line:
                in_votes = True
            elif 'PASSED' in line:
                vote['passed'] = True
        else:
            if 'TOTALS' in line:
                # Lt. Governor voted
                if 'GOVERNOR' in line:
                    name, spaces, line = re.match(
                        ' ([A-Z,.]+)(\s+)X(.*)', line).groups()
                    if len(spaces) == 1:
                        vote.yes(name)
                    else:
                        vote.no(name)
                _, yes, no, abs, exc = line.split()
                vote['yes_count'] = int(yes)
                vote['no_count'] = int(no)
                vote['other_count'] = int(abs) + int(exc)
                # no longer in votes
                in_votes = False
                continue
            # pull votes out
            # NOTE(review): a line without two NAME/X pairs makes
            # re.match return None and raises AttributeError — confirm
            # every vote line matches this shape.
            matches = re.match(
                ' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X', line).groups()
            name1, spaces1, name2, spaces2 = matches
            # vote can be determined by # of spaces
            if len(spaces1) == 1:
                vote.yes(name1)
            elif len(spaces1) == 2:
                vote.no(name1)
            else:
                vote.other(name1)
            if len(spaces2) == 1:
                vote.yes(name2)
            elif len(spaces2) == 2:
                vote.no(name2)
            else:
                vote.other(name2)
    return vote
def scrape_vote(self, bill, date, url):
    """Scrape a committee/floor vote page and attach the Vote to *bill*.

    BUG FIX: the bad-chamber branch previously formatted the still
    unbound variable `chamber` into the message, raising NameError
    instead of ScrapeError; it now reports the offending `location`.
    Also renamed `type` so the builtin is no longer shadowed.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

        # Header shape: "<bill>, <chamber/committee>, <motion...>"
        location = header.split(', ')[1]
        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        committee = ' '.join(location.split(' ')[1:]).strip()
        if not committee or committee.startswith('of Representatives'):
            committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()

        yes_count = int(
            page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(
            page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        if motion.startswith('Do Pass'):
            vote_type = 'passage'
        elif motion == 'Concurred in amendments':
            vote_type = 'amendment'
        elif motion == 'Veto override':
            vote_type = 'veto_override'
        else:
            vote_type = 'other'

        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)
        vote['type'] = vote_type

        if committee:
            vote['committee'] = committee

        vote.add_source(url)

        # Each vote cell's member name sits in the preceding cell.
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text == 'Yea':
                vote.yes(td.getprevious().text.strip())
            elif td.text == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif td.text in ('Excused', 'Absent'):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)
def scrape_bill_details(self, url, bill):
    """Scrape a Virginia bill page: summary, versions, actions, votes.

    BUG FIX: the version description's trailing token was meant to be
    chopped off ("chop off last part"), but the rsplit result was
    discarded; it is now assigned back to `desc`.
    """
    html = self.get(url, retry_on_404=True).text
    doc = lxml.html.fromstring(html)

    # summary sections
    summary = doc.xpath(
        '//h4[starts-with(text(), "SUMMARY")]/following-sibling::p/text()')
    if summary and summary[0].strip():
        bill['summary'] = summary[0].strip()

    # versions
    for va in doc.xpath(
            '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(u' \xa0')
        desc = desc.rsplit(' ', 1)[0]  # chop off last part
        link = va.get('href')
        date = datetime.datetime.strptime(date, '%m/%d/%y')

        # budget bills in VA are searchable but no full text available
        if '+men+' in link:
            self.warning('not adding budget version, bill text not available')
        else:
            # VA duplicates reprinted bills, lets keep the original name
            bill.add_version(desc, BASE_URL + link, date=date,
                             mimetype='text/html', on_duplicate='use_old')

    # actions
    for ali in doc.xpath(
            '//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
        date, action = ali.text_content().split(u' \xa0')
        actor, action = action.split(': ', 1)

        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

        # if action ends in (##-Y ##-N) remove that part
        vrematch = self.vote_strip_re.match(action)
        if vrematch:
            action, y, n, o = vrematch.groups()
            vote = Vote(actor, date, action, int(y) > int(n),
                        int(y), int(n), 0)
            vote_url = ali.xpath('a/@href')
            if vote_url:
                self.parse_vote(vote, vote_url[0])
                vote.add_source(BASE_URL + vote_url[0])
            # set other count, it isn't provided
            vote['other_count'] = len(vote['other_votes'])
            #vote.validate()
            bill.add_vote(vote)

        # categorize actions
        for pattern, atype in self._action_classifiers:
            if re.match(pattern, action):
                break
        else:
            atype = 'other'

        # if matched a 'None' atype, don't add the action
        if atype:
            bill.add_action(actor, action, date, type=atype)
def scrape_vote(self, bill, date, motion, url):
    """Scrape a floor-vote tally page and record the Vote on *bill*."""
    page = lxml.html.fromstring(self.urlopen(url))

    actor = 'upper' if url.endswith('Senate') else 'lower'

    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"
    # Each tally cell reads e.g. "Yeas: 52"; the count is the last token.
    counts = {
        label: int(page.xpath(count_path % label).split()[-1])
        for label in ('Yeas', 'Nays', 'Non Voting', 'Present')
    }
    yes_count = counts['Yeas']
    no_count = counts['Nays']
    other_count = counts['Non Voting'] + counts['Present']

    vote = Vote(actor, date, motion,
                yes_count > no_count + other_count,
                yes_count, no_count, other_count)
    vote.add_source(url)

    vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
    for label, record in (('Yeas', vote.yes), ('Nays', vote.no),
                          ('Non Voting', vote.other),
                          ('Present', vote.other)):
        for anchor in page.xpath(vote_path % label):
            record(anchor.text)

    bill.add_vote(vote)
def scrape_vote(self, bill, date, motion, url):
    """Scrape a floor vote; skip pages that are not yet published."""
    raw = self.urlopen(url)
    if 'not yet official' in raw:
        # Sometimes they link to vote pages before they go live
        return
    page = lxml.html.fromstring(raw)

    actor = 'upper' if url.endswith('Senate') else 'lower'

    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"
    totals = {}
    for label in ('Yeas', 'Nays', 'Non Voting', 'Present'):
        totals[label] = int(page.xpath(count_path % label).split()[-1])

    yes_count = totals['Yeas']
    no_count = totals['Nays']
    other_count = totals['Non Voting'] + totals['Present']
    passed = yes_count > no_count + other_count

    vote = Vote(actor, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # Member names are anchors inside the tables that follow the
    # ms-standardheader headings, in yes/no/other/other order.
    xpath = (
        '//*[contains(@class, "ms-standardheader")]/'
        'following-sibling::table')
    divs = page.xpath(xpath)
    recorders = (vote.yes, vote.no, vote.other, vote.other)
    for record, div in zip(recorders, divs):
        for a in div.xpath('.//a'):
            record(a.text_content())

    bill.add_vote(vote)
def add_vote(self, bill, chamber, date, text, url):
    """Build a Vote from an action line like "Ayes 52, Noes 41"."""
    tallies = re.findall(r'Ayes,? (\d+)[,;]\s+N(?:oes|ays),? (\d+)', text)
    yes, no = int(tallies[0][0]), int(tallies[0][1])

    # Classify the motion; fall back to 'other' when nothing matches.
    vtype = next((t for rgx, t in motion_classifiers.iteritems()
                  if re.match(rgx, text)), 'other')

    v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

    # fetch the vote itself
    if url:
        v.add_source(url)

        if 'av' in url:
            self.add_house_votes(v, url)
        elif 'sv' in url:
            self.add_senate_votes(v, url)

        # other count is brute forced
        v['other_count'] = len(v['other_votes'])
        v.validate()

    bill.add_vote(v)
def add_vote(self, bill, chamber, date, line, text):
    """Build a Vote from an "Ayes N, Noes M" action row on *bill*."""
    counts = re.findall(r'Ayes (\d+)\, Noes (\d+)', text)
    yes, no = int(counts[0][0]), int(counts[0][1])

    # First matching motion classifier wins; default to 'other'.
    vtype = next((t for rgx, t in motion_classifiers.iteritems()
                  if re.match(rgx, text)), 'other')

    v = Vote(chamber, date, text, yes > no, yes, no, 0, type=vtype)

    # fetch the vote document itself, if the row links to one
    link = line.xpath('//a[contains(@href, "/votes/")]')
    if link:
        link = link[0].get('href')
        v.add_source(link)

        filename, resp = self.urlretrieve(link)

        if 'av' in link:
            self.add_house_votes(v, filename)
        elif 'sv' in link:
            self.add_senate_votes(v, filename)

    bill.add_vote(v)
def scrape_vote(self, bill, chamber, date, url):
    """Scrape a roll-call PDF, parse counts and member votes, attach it.

    IMPROVEMENT: removed the unused locals (y, n, o) the original
    initialized but never read, and renamed the misspelled
    `break_outter` flag.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    try:
        motion = text.split('\n')[4].strip()
    except IndexError:
        return

    try:
        yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    except AttributeError:
        # No tally present — not a roll-call page.
        return
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    # Member names start after the 9-line header; stop at the PAIR
    # section or the post-roll-call notes.
    stop_scanning = False
    for line in text.split('\n')[9:]:
        if stop_scanning:
            break
        if 'after roll call' in line:
            break
        if 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue

        # Columns are separated by "-<seat number>" markers.
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if match:
                if match.group(2) == "PAIR":
                    stop_scanning = True
                    break
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())

    vote.validate()
    bill.add_vote(vote)
def scrape_committee_vote(self, bill, actor, date, motion, page, url,
                          uniqid):
    """Parse a committee-vote table out of *page* and attach the Vote."""
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        # New website layout: the vote cells live two rows down.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        # Only a yes column is present on this page.
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # Turn one table cell into {"type", "count", "votes"}; member
        # names are the text fragments following each <br>.
        if obj is None:
            return {
                "type": None,
                "count": None,
                "votes": []
            }
        votes = []
        for vote in obj.xpath(".//br"):
            if vote.tail:
                vote = vote.tail.strip()
                if vote:
                    votes.append(vote)
        count = len(votes)
        return {
            "type": typ,
            "count": count,
            "votes": votes
        }

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }

    yes_count = vote_dict['yes']['count']
    # Missing columns produce a None count; treat those as zero.
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0

    vote = Vote(
        actor,
        date,
        motion,
        (yes_count > no_count),
        yes_count,
        no_count,
        other_count,
        _vote_id=uniqid)
    vote.add_source(url)

    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            getattr(vote, key)(voter)

    bill.add_vote(vote)
def scrape_vote(self, bill, date, url):
    """Scrape a committee/floor vote page and attach the Vote to *bill*.

    BUG FIX: the bad-chamber branch previously formatted the still
    unbound variable `chamber` into its message, raising NameError
    instead of ScrapeError; it now reports the offending `location`.
    Also renamed `type` so the builtin is no longer shadowed.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

        # Header shape: "<bill>, <chamber/committee>, <motion...>"
        location = header.split(", ")[1]
        if location.startswith("House"):
            chamber = "lower"
        elif location.startswith("Senate"):
            chamber = "upper"
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        committee = " ".join(location.split(" ")[1:]).strip()
        if not committee or committee.startswith("of Representatives"):
            committee = None

        motion = ", ".join(header.split(", ")[2:]).strip()
        if not motion:
            # If we can't detect a motion, skip this vote
            return

        yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        if motion.startswith("Do Pass"):
            vote_type = "passage"
        elif motion == "Concurred in amendments":
            vote_type = "amendment"
        elif motion == "Veto override":
            vote_type = "veto_override"
        else:
            vote_type = "other"

        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)
        vote["type"] = vote_type

        if committee:
            vote["committee"] = committee

        vote.add_source(url)

        # Each vote cell's member name sits in the preceding cell.
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text == "Yea":
                vote.yes(td.getprevious().text.strip())
            elif td.text == "Nay":
                vote.no(td.getprevious().text.strip())
            elif td.text in ("Excused", "Absent"):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)
def scrape_vote(self, bill, action_text, url):
    """Scrape a Maryland vote page described by *action_text*.

    BUG FIX: an action naming neither chamber previously fell through
    and crashed later with NameError on `chamber`; it now raises a
    clear ValueError up front.
    """
    doc = lxml.html.fromstring(self.urlopen(url))

    date = None
    yes_count = no_count = other_count = None

    # process action_text - might look like "Vote - Senate Floor - Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'
    else:
        raise ValueError('unknown vote chamber: %s' % action_text)

    motion, unused_date = action_text.split(' - ')
    yes_count, no_count = re.findall('\((\d+)-(\d+)\)', motion)[0]
    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Floor Amendment' in motion:
        passed = int(yes_count) > int(no_count)
    else:
        raise Exception('unknown motion: %s' % motion)

    vote = Vote(chamber=chamber, date=None, motion=motion,
                yes_count=int(yes_count), no_count=int(no_count),
                other_count=0, passed=passed)
    vfunc = None

    nobrs = doc.xpath('//nobr/text()')
    for text in nobrs:
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            vote['date'] = datetime.datetime.strptime(
                text.split(': ', 1)[1], '%b %d, %Y %H:%M %p')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            # Authoritative counts line overrides the header tallies.
            self.debug(text)
            yeas, nays, nv, exc, absent = re.match(
                '(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) '
                'Excused \(Absent\)\s+(\d+) Absent', text).groups()
            vote['yes_count'] = int(yeas)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(exc) + int(absent)
        elif 'Voting Yea' in text:
            vfunc = vote.yes
        elif 'Voting Nay' in text:
            vfunc = vote.no
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = vote.other
        elif vfunc:
            # A plain name: record it under the current section.
            vfunc(text)

    vote.validate()
    vote.add_source(url)
    bill.add_vote(vote)
def scrape_votes(self, bill, link):
    """Scrape every vote in the lblVoteData text blob for *bill*."""
    with self.urlopen(link) as page:
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
        # Split the blob into one chunk per vote; each chunk starts
        # after a "<motion> by <member> -" heading.
        raw_vote_data = re.split("\w+? by [\w ]+?\s+-", raw_vote_data.strip())[1:]
        for raw_vote in raw_vote_data:
            # Fields within a vote are separated by runs of &nbsp;.
            raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
            motion = raw_vote[0]

            vote_date = re.search("(\d+/\d+/\d+)", motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")
            # NOTE(review): when no date is found, vote_date stays None
            # and is passed to Vote below as-is — confirm that is OK.

            passed = "Passed" in motion or "Recommended for passage" in motion or "Adopted" in raw_vote[1]
            vote_regex = re.compile("\d+$")
            aye_regex = re.compile("^.+voting aye were: (.+) -")
            no_regex = re.compile("^.+voting no were: (.+) -")
            other_regex = re.compile("^.+present and not voting were: (.+) -")
            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []

            for v in raw_vote[1:]:
                v = v.strip()
                # "Ayes...52" style lines give counts; "voting aye
                # were: ..." style lines list member names.
                if v.startswith("Ayes...") and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith("Noes...") and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif v.startswith("Present and not voting...") and vote_regex.search(v):
                    other_count += int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(", ")
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(", ")
                elif other_regex.search(v):
                    others += other_regex.search(v).groups()[0].split(", ")

            # The chamber is encoded in the query string of the link.
            if "ChamberVoting=H" in link:
                chamber = "lower"
            else:
                chamber = "upper"

            vote = Vote(chamber, vote_date, motion, passed, yes_count, no_count, other_count)
            vote.add_source(link)
            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            for o in others:
                vote.other(o)
            vote.validate()
            bill.add_vote(vote)

    return bill
def scrape_vote(self, bill, name, url):
    """Scrape a Connecticut roll-call page and attach the Vote.

    House and senate pages lay members out in a different number of
    table columns, with different offsets from each column start to
    the name cell and the Y/N mark cells.
    """
    if "VOTE/H" in url:
        vote_chamber = "lower"
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = "upper"
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    page = requests.get(url, verify=False).text
    if "BUDGET ADDRESS" in page:
        return

    page = lxml.html.fromstring(page)

    yes_count = page.xpath(
        "string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(re.match(r"[^\d]*(\d+)[^\d]*", yes_count).group(1))

    no_count = page.xpath(
        "string(//span[contains(., 'Those voting Nay')])")
    no_count = int(re.match(r"[^\d]*(\d+)[^\d]*", no_count).group(1))

    other_count = page.xpath(
        "string(//span[contains(., 'Those absent')])")
    other_count = int(re.match(r"[^\d]*(\d+)[^\d]*", other_count).group(1))

    need_count = page.xpath(
        "string(//span[contains(., 'Necessary for')])")
    need_count = int(re.match(r"[^\d]*(\d+)[^\d]*", need_count).group(1))

    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", date).group(1)
    date = date.replace(" ", "")
    # The page shows month/day only; the session supplies the year.
    date = datetime.datetime.strptime(
        date + " " + bill["session"], "%m/%d %Y").date()

    # NOTE(review): passage is yes_count strictly greater than the
    # "necessary for passage" threshold; if exactly reaching the
    # threshold suffices, this should be >= — confirm against the site.
    vote = Vote(vote_chamber, date, name, yes_count > need_count,
                yes_count, no_count, other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            name = row.xpath("string(td[%d])" % (i + name_offset)).strip()
            if not name or name == "VACANT":
                continue

            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(name)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(name)
            else:
                vote.other(name)

    bill.add_vote(vote)
def scrape_current(self, chamber, term):
    """Scrape current-session bills for one chamber from the KS API.

    BUG FIXES vs the original:
    * the chamber-membership flag was initialized under one name
      (bill_equal_chamber) but set and tested under another
      (bill_is_in_chamber), raising NameError for bills that never
      touched this chamber;
    * the committee/conferee actor branches tested membership in the
      current event (or the stale `history` loop variable) but read the
      lists from `bill_data`; they now consistently use `event`;
    * the fallback actor compared `chamber` ('upper'/'lower') against
      "Senate", so it was always 'lower'; it now uses `chamber`.
    """
    chamber_name = "Senate" if chamber == "upper" else "House"
    # perhaps we should save this data so we can make one request for
    # both chambers?
    with self.urlopen(ksapi.url + "bill_status/") as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:
            # filter out bills that never touched this chamber
            bill_is_in_chamber = False
            for history in bill_data["HISTORY"]:
                if history["chamber"] == chamber_name:
                    bill_is_in_chamber = True
            if not bill_is_in_chamber:
                continue

            # main
            bill = Bill(term, chamber, bill_data["BILLNO"],
                        bill_data["SHORTTITLE"])
            bill.add_source(ksapi.url + "bill_status/" +
                            bill_data["BILLNO"].lower())
            if bill_data["LONGTITLE"]:
                bill.add_title(bill_data["LONGTITLE"])

            bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
            bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

            for sponsor in bill_data["SPONSOR_NAMES"]:
                stype = ("primary"
                         if len(bill_data["SPONSOR_NAMES"]) == 1
                         else "cosponsor")
                bill.add_sponsor(stype, sponsor)

            for event in bill_data["HISTORY"]:
                if "committee_names" in event and "conferee_names" in event:
                    actor = " and ".join(event["committee_names"] +
                                         event["conferee_names"])
                elif "committee_names" in event:
                    actor = " and ".join(event["committee_names"])
                elif "conferee_names" in event:
                    actor = " and ".join(event["conferee_names"])
                else:
                    # No committee involved: the chamber itself acted.
                    actor = chamber

                date = datetime.datetime.strptime(
                    event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                bill.add_action(actor, event["status"], date)

                if event["action_code"] in ksapi.voted:
                    votes = votes_re.match(event["status"])
                    if votes:
                        vote = Vote(chamber, date, votes.group(1),
                                    event["action_code"] in ksapi.passed,
                                    int(votes.group(2)),
                                    int(votes.group(3)), 0)
                        vote.add_source(ksapi.ksleg + "bill_status/" +
                                        bill_data["BILLNO"].lower())
                        bill.add_vote(vote)

            self.save_bill(bill)
def scrape_votes(self, bill, bill_prefix, number, session):
    """Scrape journal roll-call votes for an Ohio bill and attach them."""
    vote_url = ('http://www.legislature.state.oh.us/votes.cfm?ID=' +
                session + '_' + bill_prefix + '_' + str(number))
    page = lxml.html.fromstring(self.urlopen(vote_url))

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        date = datetime.datetime.strptime(jlink.text, "%m/%d/%Y").date()

        details = jlink.xpath("string(../../../td[2])")

        chamber_label = details.split(" - ")[0]
        try:
            chamber = {'House': 'lower', 'Senate': 'upper'}[chamber_label]
        except KeyError:
            raise ScrapeError("Bad chamber: %s" % chamber_label)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        vote_row = jlink.xpath("../../..")[0].getnext()

        def names_in(div_id):
            # Collect non-empty member names out of the Yea/Nay div.
            block = vote_row.xpath(
                "td/font/div[contains(@id, '%s')]" % div_id)[0]
            found = []
            for cell in block.xpath("table/tr/td"):
                member = cell.xpath("string()")
                if member:
                    found.append(member)
            return found

        yeas = names_in('Yea')
        nays = names_in('Nay')

        yes_count = len(yeas)
        no_count = len(nays)

        vote = Vote(chamber, date, motion, yes_count > no_count,
                    yes_count, no_count, 0)
        for member in yeas:
            vote.yes(member)
        for member in nays:
            vote.no(member)

        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape_votes(self, bill, bill_prefix, number, session):
    """Scrape journal roll-call votes for an Ohio bill and attach them.

    NOTE(review): this appears to duplicate an identical scrape_votes
    elsewhere in this file — consider consolidating.
    """
    vote_url = ('http://www.legislature.state.oh.us/votes.cfm?ID=' +
                session + '_' + bill_prefix + '_' + str(number))
    page = self.urlopen(vote_url)
    page = lxml.html.fromstring(page)

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        date = datetime.datetime.strptime(jlink.text,
                                          "%m/%d/%Y").date()

        # Cell two rows up holds "<Chamber> - <motion>...".
        details = jlink.xpath("string(../../../td[2])")

        chamber = details.split(" - ")[0]
        if chamber == 'House':
            chamber = 'lower'
        elif chamber == 'Senate':
            chamber = 'upper'
        else:
            raise ScrapeError("Bad chamber: %s" % chamber)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        vote_row = jlink.xpath("../../..")[0].getnext()

        # Yea names live in a div whose id contains 'Yea'.
        yea_div = vote_row.xpath(
            "td/font/div[contains(@id, 'Yea')]")[0]
        yeas = []
        for td in yea_div.xpath("table/tr/td"):
            name = td.xpath("string()")
            if name:
                yeas.append(name)

        no_div = vote_row.xpath(
            "td/font/div[contains(@id, 'Nay')]")[0]
        nays = []
        for td in no_div.xpath("table/tr/td"):
            name = td.xpath("string()")
            if name:
                nays.append(name)

        # Counts are derived from the listed names; no "other" tally
        # is available on the page.
        yes_count = len(yeas)
        no_count = len(nays)

        vote = Vote(chamber, date, motion, yes_count > no_count,
                    yes_count, no_count, 0)

        for yes in yeas:
            vote.yes(yes)
        for no in nays:
            vote.no(no)

        vote.add_source(vote_url)

        bill.add_vote(vote)
def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
    """Download a KS vote document, convert it via abiword, and parse it.

    IMPROVEMENTS: abiword is now invoked with an argv list instead of a
    shell-interpolated string (the downloaded filename can no longer be
    interpreted by the shell), and the converted-text file handle is
    closed via `with` instead of leaking.
    """
    vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
    vote_date = datetime.datetime.strptime(vote_date, '%a %d %b %Y')

    vote_doc, resp = self.urlretrieve(vote_url)
    # SECURITY: list-form argv with shell=False; output lands in
    # /tmp/ksvote.txt because of cwd.
    subprocess.check_call(['abiword', '--to=ksvote.txt', vote_doc],
                          cwd='/tmp/')
    with open('/tmp/ksvote.txt') as converted:
        vote_lines = converted.readlines()
    os.remove(vote_doc)

    vote = None
    passed = True
    for line in vote_lines:
        totals = re.findall(
            'Yeas (\d+)[;,] Nays (\d+)[;,] (?:Present but not voting|Present and Passing):? (\d+)[;,] (?:Absent or not voting|Absent or Not Voting):? (\d+)',
            line)
        if totals:
            totals = totals[0]
            yeas = int(totals[0])
            nays = int(totals[1])
            nv = int(totals[2])
            absent = int(totals[3])
            # default passed to true
            vote = Vote(vote_chamber, vote_date, vote_status, True,
                        yeas, nays, nv + absent)
        elif line.startswith('Yeas:'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.yes(member)
        elif line.startswith('Nays:'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.no(member)
        elif line.startswith('Present '):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.other(member)
        elif line.startswith('Absent or'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.other(member)
        elif 'the motion did not prevail' in line:
            passed = False

    if vote:
        vote['passed'] = passed
        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape_votes(self, bill, votes_url):
    """Scrape IL roll-call PDFs linked from the bill's vote-history page."""
    html = self.urlopen(votes_url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(votes_url)

    # valid single-member vote codes that may appear in the PDF text
    EXPECTED_VOTE_CODES = ['Y','N','E','NV','A','P','-']

    # vote indicator, a few spaces, a name, newline or multiple spaces
    VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')

    for link in doc.xpath('//a[contains(@href, "votehistory")]'):
        pieces = link.text.split(' - ')
        date = pieces[-1]
        # a three-part link text carries an explicit motion; otherwise the
        # vote is assumed to be a Third Reading
        if len(pieces) == 3:
            motion = pieces[1]
        else:
            motion = 'Third Reading'

        chamber = link.xpath('../following-sibling::td/text()')[0]
        if chamber == 'HOUSE':
            chamber = 'lower'
        elif chamber == 'SENATE':
            chamber = 'upper'
        else:
            self.warning('unknown chamber %s' % chamber)

        date = datetime.datetime.strptime(date, "%A, %B %d, %Y")

        # download the file
        fname, resp = self.urlretrieve(link.get('href'))
        pdflines = convert_pdf(fname, 'text').splitlines()
        os.remove(fname)

        # counts are filled in after parsing, below
        vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)

        for line in pdflines:
            for match in VOTE_RE.findall(line):
                vcode, name = match
                if vcode == 'Y':
                    vote.yes(name)
                elif vcode == 'N':
                    vote.no(name)
                else:
                    vote.other(name)

        # fake the counts
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])
        vote['passed'] = vote['yes_count'] > vote['no_count']
        vote.add_source(link.get('href'))
        bill.add_vote(vote)
def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
    """Download a KS vote document, convert it to text with abiword and
    parse the totals line plus the Yeas/Nays/Present/Absent name lists
    into a Vote attached to *bill*."""
    vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
    vote_date = datetime.datetime.strptime(vote_date, '%a %d %b %Y')

    vote_doc, resp = self.urlretrieve(vote_url)
    # pass argv as a list (shell=False): avoids shell injection via the
    # downloaded filename and handles paths containing spaces
    subprocess.check_call(['abiword', '--to=ksvote.txt', vote_doc],
                          cwd='/tmp/')
    # close the converted file promptly instead of leaking the handle
    with open('/tmp/ksvote.txt') as converted:
        vote_lines = converted.readlines()
    os.remove(vote_doc)

    vote = None
    passed = True
    for line in vote_lines:
        line = line.strip()
        # totals line, e.g. "Yeas 30; Nays 9; Present but not voting: 1; ..."
        totals = re.findall(
            r'Yeas (\d+)[;,] Nays (\d+)[;,] '
            r'(?:Present but not voting:|Present and Passing) (\d+)[;,] '
            r'(?:Absent or not voting:|Absent or Not Voting) (\d+)',
            line)
        if totals:
            totals = totals[0]
            yeas = int(totals[0])
            nays = int(totals[1])
            nv = int(totals[2])
            absent = int(totals[3])
            # default passed to true
            vote = Vote(vote_chamber, vote_date, vote_status, True,
                        yeas, nays, nv + absent)
        elif line.startswith('Yeas:'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.yes(member)
        elif line.startswith('Nays:'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.no(member)
        elif line.startswith('Present '):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.other(member)
        elif line.startswith('Absent or'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.other(member)
        elif 'the motion did not prevail' in line:
            passed = False

    if vote:
        vote['passed'] = passed
        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape_chamber_votes(self, chamber, session, url):
    """Parse the GA vote XML feed and save each vote for *chamber*."""
    xml = self.urlopen(url)
    doc = lxml.etree.fromstring(xml)

    for vxml in doc.xpath('//vote'):
        legislation = vxml.get('legislation')
        # caption can be missing; fall back to 'unknown' (consistent with
        # the sibling scraper that already guards this)
        motion = vxml.get('caption') or 'unknown'
        timestamp = datetime.datetime.strptime(vxml.get('dateTime'),
                                               '%Y-%m-%dT%H:%M:%S')
        leg_prefix = legislation.split(' ')[0]
        if leg_prefix in ('SB', 'SR'):
            bill_chamber = 'upper'
        elif leg_prefix in ('HB', 'HR'):
            bill_chamber = 'lower'
        elif leg_prefix in ('', 'EX', 'ELECTION'):
            continue
        else:
            raise Exception('unknown legislation prefix: ' + legislation)

        # skip bills from other chamber
        if bill_chamber != chamber:
            continue

        unknown_count = int(vxml.xpath('totals/@unknown')[0])
        excused_count = int(vxml.xpath('totals/@excused')[0])
        nv_count = int(vxml.xpath('totals/@not-voting')[0])
        no_count = int(vxml.xpath('totals/@nays')[0])
        yes_count = int(vxml.xpath('totals/@yeas')[0])
        other_count = unknown_count + excused_count + nv_count

        vote = Vote(chamber, timestamp, motion,
                    passed=yes_count > no_count,
                    yes_count=yes_count,
                    no_count=no_count,
                    other_count=other_count,
                    session=session,
                    bill_id=legislation,
                    bill_chamber=bill_chamber)
        vote.add_source(url)

        for m in vxml.xpath('member'):
            vote_letter = m.get('vote')
            member = m.get('name')
            if vote_letter == 'Y':
                vote.yes(member)
            elif vote_letter == 'N':
                vote.no(member)
            else:
                vote.other(member)

        self.save_vote(vote)
def scrape_committee_vote(self, bill, actor, date, motion, url, uniqid):
    """Scrape a UT committee vote page and attach the vote to *bill*."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        committee = page.xpath("//b")[0].text_content()
        votes = page.xpath("//table")[0]
        rows = votes.xpath(".//tr")[0]
        yno = rows.xpath(".//td")
        if len(yno) < 3:
            yes = yno[0]
            no, other = None, None
        else:
            # slice to the first three cells: extra cells would make a
            # bare three-way unpack raise ValueError (matches the newer
            # committee-vote scraper's behavior)
            yes, no, other = rows.xpath(".//td")[:3]

        def proc_block(obj):
            # a missing cell yields an empty block
            if obj is None:
                return {"type": None, "count": None, "votes": []}
            typ = obj.xpath("./b")[0].text_content()
            # the count trails the bold label, e.g. "Yes - 5"
            count = obj.xpath(".//b")[0].tail.replace("-", "").strip()
            count = int(count)
            votes = []
            for vote in obj.xpath(".//br"):
                vote = vote.tail
                if vote:
                    vote = vote.strip()
                    votes.append(vote)
            return {"type": typ, "count": count, "votes": votes}

        vote_dict = {
            "yes": proc_block(yes),
            "no": proc_block(no),
            "other": proc_block(other),
        }

        yes_count = vote_dict['yes']['count']
        no_count = vote_dict['no']['count'] or 0
        other_count = vote_dict['other']['count'] or 0
        vote = Vote(actor, date, motion, (yes_count > no_count),
                    yes_count, no_count, other_count,
                    _vote_id=uniqid)
        vote.add_source(url)

        for key in vote_dict:
            for voter in vote_dict[key]['votes']:
                getattr(vote, key)(voter)

        bill.add_vote(vote)
def scrape_vote(self, bill, chamber, date, url):
    """Parse a FL roll-call PDF and attach the resulting vote to *bill*."""
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    # line 5 of the PDF text carries the motion
    motion = text.split('\n')[4].strip()
    yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)
    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    y,n,o = 0,0,0
    break_outter = False

    # member listing starts at line 10; several sentinel phrases mark the
    # end of the roll call
    for line in text.split('\n')[9:]:
        if break_outter:
            break
        if 'after roll call' in line:
            break
        if 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue
        # each line holds several columns separated by "-<district number>"
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if match:
                # "PAIR" marks the start of paired votes; stop parsing
                if match.group(2) == "PAIR":
                    break_outter = True
                    break
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())

    vote.validate()
    bill.add_vote(vote)
def scrape_vote(self, bill, motion, url):
    """Scrape a ME roll-call page and attach the vote to *bill*."""
    page = self.urlopen(url, retry_on_404=True)
    page = lxml.html.fromstring(page)

    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))
    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))
    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))
    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))
    other_count = abs_count + ex_count

    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'
    else:
        # previously `chamber` was left unbound here, producing a
        # confusing NameError below; fail with a clear message instead
        raise ValueError('unexpected chamber in vote url: %s' % url)

    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        # abbreviated month form, e.g. "Jan. 5, 2011"
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")

    vote = Vote(chamber, date, motion, outcome == 'PREVAILS',
                yes_count, no_count, other_count)
    vote.add_source(url)

    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]
        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.yes(name)
        elif vtype == 'N':
            vote.no(name)
        elif vtype == 'X' or vtype == 'E':
            vote.other(name)

    bill.add_vote(vote)
def scrape_votes(self, bill, sponsor, link):
    """Parse the lblVoteData text blob on a TN bill page into Vote objects."""
    with self.urlopen(link) as page:
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath(
            "//span[@id='lblVoteData']")[0].text_content()
        # individual votes are separated by "<bill_id> by <sponsor> - "
        raw_vote_data = raw_vote_data.strip().split(
            '%s by %s - ' % (bill['bill_id'], sponsor))[1:]
        for raw_vote in raw_vote_data:
            # fields inside one vote are separated by a run of 10 nbsp chars
            raw_vote = raw_vote.split(
                u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            vote_date = re.search('(\d+/\d+/\d+)', motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(
                    vote_date.group(), '%m/%d/%Y')

            passed = ('Passed' in motion) or ('Adopted' in raw_vote[1])
            vote_regex = re.compile('\d+$')
            aye_regex = re.compile('^.+voting aye were: (.+) -')
            no_regex = re.compile('^.+voting no were: (.+) -')
            yes_count = None
            no_count = None
            other_count = 0
            ayes = []
            nos = []

            for v in raw_vote[1:]:
                if v.startswith('Ayes...') and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith('Noes...') and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(', ')
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(', ')

            # recompute passage from counts when both are present (and
            # nonzero); otherwise zero the counts and keep the motion text
            if yes_count and no_count:
                passed = yes_count > no_count
            else:
                yes_count = no_count = 0

            vote = Vote(bill['chamber'], vote_date, motion, passed,
                        yes_count, no_count, other_count)
            vote.add_source(link)
            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            bill.add_vote(vote)

    return bill
def scrape_vote(self, bill, motion, url):
    """Scrape a ME roll-call page and attach the vote to *bill*."""
    page = self.get(url, retry_on_404=True).text
    page = lxml.html.fromstring(page)

    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))
    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))
    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))
    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))
    other_count = abs_count + ex_count

    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'
    else:
        # previously `chamber` was left unbound here, producing a
        # confusing NameError below; fail with a clear message instead
        raise ValueError('unexpected chamber in vote url: %s' % url)

    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        # abbreviated month form, e.g. "Jan. 5, 2011"
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")

    vote = Vote(chamber, date, motion, outcome == 'PREVAILS',
                yes_count, no_count, other_count)
    vote.add_source(url)

    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]
        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.yes(name)
        elif vtype == 'N':
            vote.no(name)
        elif vtype == 'X' or vtype == 'E':
            vote.other(name)

    bill.add_vote(vote)
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse an ID vote page (YEAS/NAYS/ABSENT text) into a Vote."""
    page = self.get(url).text
    bill.add_source(url)

    # totals plus the free-text name blocks that follow each total
    vote_re = re.compile(
        "YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        "(.*)ABSENT( OR NOT VOTING)? -?\s?"
        "(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))

    if yes_count > no_count:
        passed = True
    else:
        passed = False

    # committee votes record the committee name as the location
    if actor == "upper" or actor == "lower":
        vote_chamber = actor
        vote_location = ""
    else:
        vote_chamber = ""
        vote_location = actor

    vote = Vote(
        vote_chamber,
        date,
        motion,
        passed,
        yes_count,
        no_count,
        other_count,
        location=vote_location,
        _vote_id=uniqid,
    )
    vote.add_source(url)

    # names within a block are separated by runs of two or more spaces
    yes_votes = re.split("\s{2,}", match.group(2).strip())
    no_votes = re.split("\s{2,}", match.group(4).strip())
    other_votes = re.split("\s{2,}", match.group(7).strip())

    for yes in yes_votes:
        if yes:
            vote.yes(yes)
    for no in no_votes:
        if no:
            vote.no(no)
    for other in other_votes:
        if other:
            vote.other(other)

    bill.add_vote(vote)
def scrape_vote(self, bill, chamber, url):
    """Parse a VT roll-call page and attach the vote to *bill*."""
    page = self.urlopen(url)
    if 'There are no details available for this roll call' in page:
        return
    page = lxml.html.fromstring(page.replace(' ', ' '))

    # second row of the first table summarizes the vote
    summary = page.xpath("//table[1]/tr[2]")[0]
    date = datetime.datetime.strptime(summary.xpath("string(td[1])"),
                                      "%m/%d/%Y")
    motion = summary.xpath("string(td[2])")
    yes_count = int(summary.xpath("string(td[3])"))
    no_count = int(summary.xpath("string(td[4])"))
    other_count = int(summary.xpath("string(td[5])"))
    passed = summary.xpath("string(td[6])") == 'Pass'

    # classify the motion (renamed so the builtin `type` isn't shadowed)
    if motion == 'Shall the bill pass?':
        motion_type = 'passage'
    elif motion == 'Shall the bill be read the third time?':
        motion_type = 'reading:3'
    elif 'be amended as' in motion:
        motion_type = 'amendment'
    else:
        motion_type = 'other'

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # member rows follow the summary; each has exactly two cells
    for row in page.xpath("//table[1]/tr")[3:]:
        if len(row.xpath("td")) != 2:
            continue
        name = row.xpath("string(td[1])").strip()
        # avoid splitting duplicate names
        if not name.startswith(DOUBLED_NAMES):
            name = name.split(' of')[0]
        cast = row.xpath("string(td[2])").strip()
        if cast.startswith('Yea'):
            vote.yes(name)
        elif cast.startswith('Nay'):
            vote.no(name)
        elif cast.startswith('Not Voting'):
            pass
        else:
            vote.other(name)

    bill.add_vote(vote)
def scrape_vote(self, bill, chamber, url):
    """Parse a VT roll-call page and attach the vote to *bill*."""
    page = self.urlopen(url)
    if 'There are no details available for this roll call' in page:
        return
    page = lxml.html.fromstring(page.replace(' ', ' '))

    # second row of the first table summarizes the vote
    summary = page.xpath("//table[1]/tr[2]")[0]
    date = datetime.datetime.strptime(summary.xpath("string(td[1])"),
                                      "%m/%d/%Y")
    motion = summary.xpath("string(td[2])")
    yes_count = int(summary.xpath("string(td[3])"))
    no_count = int(summary.xpath("string(td[4])"))
    other_count = int(summary.xpath("string(td[5])"))
    passed = summary.xpath("string(td[6])") == 'Pass'

    # classify the motion (renamed so the builtin `type` isn't shadowed)
    if motion == 'Shall the bill pass?':
        motion_type = 'passage'
    elif motion == 'Shall the bill be read the third time?':
        motion_type = 'reading:3'
    elif 'be amended as' in motion:
        motion_type = 'amendment'
    else:
        motion_type = 'other'

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # member rows follow the summary; each has exactly two cells
    for row in page.xpath("//table[1]/tr")[3:]:
        if len(row.xpath("td")) != 2:
            continue
        name = row.xpath("string(td[1])").strip()
        # avoid splitting duplicate names
        if not name.startswith(DOUBLED_NAMES):
            name = name.split(' of')[0]
        cast = row.xpath("string(td[2])").strip()
        if cast.startswith('Yea'):
            vote.yes(name)
        elif cast.startswith('Nay'):
            vote.no(name)
        elif cast.startswith('Not Voting'):
            pass
        else:
            vote.other(name)

    bill.add_vote(vote)
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse an ID vote page (YEAS/NAYS/ABSENT text) into a Vote."""
    page = self.get(url).text
    bill.add_source(url)

    # totals plus the free-text name blocks that follow each total
    vote_re = re.compile('YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                         '(.*)ABSENT( OR NOT VOTING)? -?\s?'
                         '(\d+)(.*)', re.MULTILINE | re.DOTALL)
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))
    if yes_count > no_count:
        passed = True
    else:
        passed = False

    # committee votes record the committee name as the location
    if actor == 'upper' or actor == 'lower':
        vote_chamber = actor
        vote_location = ''
    else:
        vote_chamber = ''
        vote_location = actor

    vote = Vote(vote_chamber, date, motion, passed,
                yes_count, no_count, other_count,
                location=vote_location, _vote_id=uniqid)
    vote.add_source(url)

    # names within a block are separated by runs of two or more spaces
    yes_votes = re.split('\s{2,}', match.group(2).strip())
    no_votes = re.split('\s{2,}', match.group(4).strip())
    other_votes = re.split('\s{2,}', match.group(7).strip())

    for yes in yes_votes:
        if yes:
            vote.yes(yes)
    for no in no_votes:
        if no:
            vote.no(no)
    for other in other_votes:
        if other:
            vote.other(other)

    bill.add_vote(vote)
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Parse a UT committee vote table already loaded into *page*."""
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        #New webste
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # a missing cell yields an empty block
        if obj is None:
            return {"type": None, "count": None, "votes": []}
        votes = []
        # member names are the text trailing each <br>
        for vote in obj.xpath(".//br"):
            if vote.tail:
                vote = vote.tail.strip()
                if vote:
                    votes.append(vote)
        count = len(votes)
        return {"type": typ, "count": count, "votes": votes}

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }

    yes_count = vote_dict['yes']['count']
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0
    vote = Vote(
        actor,
        date,
        motion,
        (yes_count > no_count),
        yes_count,
        no_count,
        other_count,
        _vote_id=uniqid)
    vote.add_source(url)

    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            getattr(vote, key)(voter)

    bill.add_vote(vote)
def process_vote(self, data):
    """Convert a pupa-style vote event dict into a billy Vote and save it."""
    chamber = parse_psuedo_id(data['organization'])['classification']
    bill_chamber, bill_id = self.get_bill_details(data['bill'])

    # unicameral legislatures are stored as 'upper'
    if chamber == 'legislature':
        chamber = 'upper'
    if bill_chamber == 'legislature':
        bill_chamber = 'upper'

    yes_count = None
    no_count = None
    other_count = 0
    for count_rec in data['counts']:
        option = count_rec['option']
        if option == 'yes':
            yes_count = count_rec['value']
        elif option == 'no':
            no_count = count_rec['value']
        else:
            other_count += count_rec['value']

    vote = Vote(
        chamber=chamber,
        date=parse_date(data['start_date']),
        motion=data['motion_text'],
        passed=data['result'] == 'pass',
        yes_count=yes_count,
        no_count=no_count,
        other_count=other_count,
        action=data['bill_action'],
        # TODO: was data['motion_classification'],
        type='other',
        session=data['legislative_session'],
        bill_chamber=bill_chamber,
        bill_id=bill_id,
    )

    # map individual voter records onto the vote's yes/no/other lists
    dispatch = {'yes': vote.yes, 'no': vote.no}
    for record in data['votes']:
        dispatch.get(record['option'], vote.other)(record['voter_name'])

    for source in data['sources']:
        vote.add_source(source['url'])

    vote.update(**data['extras'])
    self.save_vote(vote)
def scrape_chamber_votes(self, chamber, session, url):
    """Parse the GA vote XML feed and save each vote for *chamber*."""
    xml = self.urlopen(url)
    doc = lxml.etree.fromstring(xml)

    for vxml in doc.xpath('//vote'):
        legislation = vxml.get('legislation')
        # caption can be missing from the feed
        motion = vxml.get('caption') or 'unknown'
        timestamp = datetime.datetime.strptime(vxml.get('dateTime'),
                                               '%Y-%m-%dT%H:%M:%S')
        leg_prefix = legislation.split(' ')[0]
        if leg_prefix in ('SB', 'SR'):
            bill_chamber = 'upper'
        elif leg_prefix in ('HB', 'HR'):
            bill_chamber = 'lower'
        elif leg_prefix in ('', 'EX', 'ELECTION'):
            continue
        else:
            raise Exception('unknown legislation prefix: ' + legislation)

        # skip bills from other chamber
        if bill_chamber != chamber:
            continue

        unknown_count = int(vxml.xpath('totals/@unknown')[0])
        excused_count = int(vxml.xpath('totals/@excused')[0])
        nv_count = int(vxml.xpath('totals/@not-voting')[0])
        no_count = int(vxml.xpath('totals/@nays')[0])
        yes_count = int(vxml.xpath('totals/@yeas')[0])
        other_count = unknown_count + excused_count + nv_count

        vote = Vote(chamber, timestamp, motion,
                    passed=yes_count > no_count,
                    yes_count=yes_count,
                    no_count=no_count,
                    other_count=other_count,
                    session=session,
                    bill_id=legislation,
                    bill_chamber=bill_chamber)
        vote.add_source(url)

        for m in vxml.xpath('member'):
            vote_letter = m.get('vote')
            member = m.get('name')
            if vote_letter == 'Y':
                vote.yes(member)
            elif vote_letter == 'N':
                vote.no(member)
            else:
                vote.other(member)

        self.save_vote(vote)
def scrape_votes(self, bill, sponsor, link):
    """Parse the lblVoteData text blob on a TN bill page into Vote objects."""
    with self.urlopen(link) as page:
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
        # individual votes are separated by "<bill_id> by <sponsor> - "
        raw_vote_data = raw_vote_data.strip().split('%s by %s - ' % (bill['bill_id'], sponsor))[1:]
        for raw_vote in raw_vote_data:
            # fields inside one vote are separated by a run of 10 nbsp chars
            raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
            motion = raw_vote[0]

            vote_date = re.search('(\d+/\d+/\d+)', motion)
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(), '%m/%d/%Y')

            passed = ('Passed' in motion) or ('Adopted' in raw_vote[1])
            vote_regex = re.compile('\d+$')
            aye_regex = re.compile('^.+voting aye were: (.+) -')
            no_regex = re.compile('^.+voting no were: (.+) -')
            yes_count = None
            no_count = None
            other_count = 0
            ayes = []
            nos = []

            for v in raw_vote[1:]:
                if v.startswith('Ayes...') and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith('Noes...') and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(', ')
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(', ')

            # recompute passage from counts when both are present (and
            # nonzero); otherwise zero the counts and keep the motion text
            if yes_count and no_count:
                passed = yes_count > no_count
            else:
                yes_count = no_count = 0

            vote = Vote(bill['chamber'], vote_date, motion, passed,
                        yes_count, no_count, other_count)
            vote.add_source(link)
            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            bill.add_vote(vote)

    return bill
def scrape_vote(self, bill, vote_type_id, vote_type):
    """Scrape one DC council vote page and attach the vote to *bill*."""
    base_url = 'http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s'
    url = base_url % (vote_type_id, bill['bill_id'])
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        vote_date = convert_date(doc.get_element_by_id('VoteDate').text)

        # check if voice vote / approved boxes have an 'x'
        voice = (
            doc.xpath('//span[@id="VoteTypeVoice"]/b/text()')[0] == 'x')
        passed = (doc.xpath('//span[@id="VoteResultApproved"]/b/text()')[0] == 'x')

        yes_count = extract_int(
            doc.xpath('//span[@id="VoteCount1"]/b/text()')[0])
        no_count = extract_int(
            doc.xpath('//span[@id="VoteCount2"]/b/text()')[0])
        # VoteCount3 through VoteCount8 hold the remaining (non yes/no)
        # tallies; sum them all as "other"
        other_count = 0
        for n in xrange(3, 9):
            other_count += extract_int(
                doc.xpath('//span[@id="VoteCount%s"]/b/text()' % n)[0])

        vote = Vote('upper', vote_date, vote_type, passed, yes_count,
                    no_count, other_count, voice_vote=voice)
        vote.add_source(url)

        # members are only text on page in a <u> tag
        for member_u in doc.xpath('//u'):
            member = member_u.text
            # normalize case
            vote_text = member_u.xpath('../../i/text()')[0].upper()
            if 'YES' in vote_text:
                vote.yes(member)
            elif 'NO' in vote_text:
                vote.no(member)
            else:
                vote.other(member)

        bill.add_vote(vote)
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape a MN House roll call; bails out when no vote page exists."""
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
    html = self.urlopen(vote_url)

    # broken links redirect to the "no vote found" page
    if html.response.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)

    heading = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")
    if not heading:
        self.logger.warning("Bill was missing a motion number, skipping")
        return
    motion = heading[0]

    tally = doc.xpath(
        ".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
    yeas = int(tally[0])
    nays = int(tally[3])

    # scan the paragraphs for the first one that parses as a date
    date = None
    for text in doc.xpath(".//div[@id='leg_PageContent']/div/p/text()"):
        try:
            date = datetime.datetime.strptime(text.strip(), '%m/%d/%Y')
            break
        except ValueError:
            pass
    if date is None:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    vote = Vote('lower', date, motion, yeas > nays, yeas, nays, 0,
                session=session, bill_id=bill_id, bill_chamber=chamber)
    vote.add_source(vote_url)

    # first table has YEAs, second table has NAYs
    for name in doc.xpath('//table[1]/tr/td/font/text()'):
        vote.yes(name.strip())
    for name in doc.xpath('//table[2]/tr/td/font/text()'):
        vote.no(name.strip())

    self.save_vote(vote)
def scrape_senate_vote(self, bill, url, date):
    """Parse a MD senate vote PDF; delegates to the 3-column parser when
    the document uses the "Yea: N Nay: N Absent: N" summary layout."""
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    # counts start at zero and are incremented as names are parsed
    vote = Vote('upper', date, 'Passage', passed=None,
                yes_count=0, no_count=0, other_count=0)
    vote.add_source(url)

    text = convert_pdf(filename, 'text')
    os.remove(filename)

    if re.search('Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        return self.scrape_senate_vote_3col(bill, vote, text, url, date)

    # split on the Yea/Nay/Absent headers; reversed so pop() walks the
    # text front-to-back as (header, names, header, names, ...)
    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    data = filter(None, data)
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)

    while True:
        if not data:
            break
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            # trim stray page numbers / hyphens the PDF text picks up
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            getattr(vote, key)(name)
            actual_vote[vote_val] += 1
            vote[key + '_count'] += 1

        # sanity: tallied names must match the running count
        assert actual_vote[vote_val] == vote[key + '_count']

    vote['passed'] = vote['no_count'] < vote['yes_count']
    bill.add_vote(vote)
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape a MN House roll call laid out as paragraphs after the <h1>."""
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
    html = self.urlopen(vote_url)

    # broken links redirect to the "no vote found" page
    if html.response.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)
    paragraphs = doc.xpath('//h1/following-sibling::p')

    # first paragraph: motion text plus a final "__ YEA and __ Nay" line
    header_lines = paragraphs[0].text_content().splitlines()
    # 3rd line is the motion except in cases where first line is gone
    motion = header_lines[2] or header_lines[1]
    yeas, nays = self.yeanay_re.match(header_lines[-1]).groups()
    yeas = int(yeas)
    nays = int(nays)

    # second paragraph carries the date
    date_text = self.date_re.match(paragraphs[1].text_content()).groups()[0]
    date = datetime.datetime.strptime(date_text, '%m/%d/%Y')

    vote = Vote('lower', date, motion, yeas > nays, yeas, nays, 0,
                session=session, bill_id=bill_id, bill_chamber=chamber)
    vote.add_source(vote_url)

    # first table lists YEAs, second lists NAYs
    for name in doc.xpath('//table[1]/tr/td/font/text()'):
        vote.yes(name.strip())
    for name in doc.xpath('//table[2]/tr/td/font/text()'):
        vote.no(name.strip())

    self.save_vote(vote)
def scrape_chamber_votes(self, chamber, url):
    """Parse the GA vote XML feed and stash Vote objects in self.votes,
    keyed by the feed's vote id."""
    xml = self.urlopen(url)
    doc = lxml.etree.fromstring(xml)

    for vxml in doc.xpath('//vote'):
        motion = vxml.get('caption') or 'unknown'
        timestamp = datetime.datetime.strptime(vxml.get('dateTime'),
                                               '%Y-%m-%dT%H:%M:%S')

        # legislaton key is often blank, so we're ignoring it now
        #legislation = vxml.get('legislation')

        unknown_count = int(vxml.xpath('totals/@unknown')[0])
        excused_count = int(vxml.xpath('totals/@excused')[0])
        nv_count = int(vxml.xpath('totals/@not-voting')[0])
        no_count = int(vxml.xpath('totals/@nays')[0])
        yes_count = int(vxml.xpath('totals/@yeas')[0])
        other_count = unknown_count + excused_count + nv_count

        vote = Vote(chamber, timestamp, motion,
                    passed=yes_count > no_count,
                    yes_count=yes_count,
                    no_count=no_count,
                    other_count=other_count)
        vote.add_source(url)

        for m in vxml.xpath('member'):
            vote_letter = m.get('vote')
            member = m.get('name')
            if vote_letter == 'Y':
                vote.yes(member)
            elif vote_letter == 'N':
                vote.no(member)
            else:
                vote.other(member)

        # other count is frequently wrong, not sure why they can't count
        if len(vote['other_votes']) != vote['other_count']:
            self.warning("vote XML had wrong other count: said %s got %s" %
                         (len(vote['other_votes']), vote['other_count']))
            vote['other_count'] = len(vote['other_votes'])

        # store vote
        self.votes[vxml.get('id')] = vote
def parse_house_vote(self, url):
    """ house votes are pdfs that can be converted to text, require some
    nasty regex to get votes out reliably """
    fname, resp = self.urlretrieve(url)
    text = convert_pdf(fname, 'text')
    # an empty conversion means the PDF is an image scan we can't parse
    if not text.strip():
        self.warning('image PDF %s' % url)
        return
    os.remove(fname)

    # get date
    if text.strip() == 'NEW MEXICO HOUSE OF REPRESENTATIVES':
        self.warning("What the heck: %s" % (url))
        return None
    date = re.findall('(\d+/\d+/\d+)', text)[0]
    date = datetime.strptime(date, '%m/%d/%Y')

    # get totals
    yea, nay, exc, absent = self.HOUSE_TOTAL_RE.findall(text)[0]

    # make vote (faked passage indicator)
    vote = Vote('lower', date, 'house passage', int(yea) > int(nay),
                int(yea), int(nay), int(absent) + int(exc))
    vote.add_source(url)

    # votes
    real_votes = False
    for v, name in HOUSE_VOTE_RE.findall(text):
        # our regex is a bit broad, wait until we see 'Nays' to start
        # and end when we see CERTIFIED or ____ signature line
        if 'Nays' in name or 'Excused' in name:
            real_votes = True
            continue
        elif 'CERTIFIED' in name or '___' in name:
            break
        elif real_votes and name.strip():
            if v == 'Y':
                vote.yes(name)
            elif v == 'N':
                vote.no(name)
            else:
                # excused/absent
                vote.other(name)
    return vote
def parse_vote(self, bill, action, act_chamber, act_date, url):
    """Parse an AK journal page referenced from an action line; the
    tallies (Y/N/E/A) come from the action text itself."""
    url = "http://www.legis.state.ak.us/basis/%s" % url
    info_page = self.soup_parser(self.urlopen(url))

    yes = no = other = 0
    # tally embedded in the action string, e.g. "Y15 N3 E2 A-"
    # ('-' means no count recorded)
    tally = re.findall('(?:(Y|N|E|A)(-|\d+)\s*)', action)
    for vtype, vcount in tally:
        vcount = int(vcount) if vcount != '-' else 0
        if vtype == 'Y':
            yes = vcount
        elif vtype == 'N':
            no = vcount
        else:
            other += vcount

    # name blocks live in the first <pre> containing 'Yeas',
    # separated by blank lines
    votes = info_page.findAll('pre', text=re.compile('Yeas'),
                              limit=1)[0].split('\n\n')

    motion = info_page.findAll(text=re.compile('The question being'))[0]
    motion = re.findall('The question being:\s*"(.*)\?"', motion,
                        re.DOTALL)[0].replace('\n', ' ')

    vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

    for vote_list in votes:
        vote_type = False
        if vote_list.startswith('Yeas: '):
            vote_list, vote_type = vote_list[6:], vote.yes
        elif vote_list.startswith('Nays: '):
            vote_list, vote_type = vote_list[6:], vote.no
        elif vote_list.startswith('Excused: '):
            vote_list, vote_type = vote_list[9:], vote.other
        elif vote_list.startswith('Absent: '):
            # NOTE(review): 'Absent: ' is 8 characters but the slice is
            # [9:] — verify against the journal text (may rely on a
            # double space) before changing
            vote_list, vote_type = vote_list[9:], vote.other
        if vote_type:
            for name in vote_list.split(','):
                name = name.strip()
                if name:
                    vote_type(name)

    vote.add_source(url)
    return vote
def scrape_vote(self, bill, vote_type_id, vote_type):
    """Scrape one DC council vote page and attach the vote to *bill*."""
    base_url = 'http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s'
    url = base_url % (vote_type_id, bill['bill_id'])
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        vote_date = convert_date(doc.get_element_by_id('VoteDate').text)

        # check if voice vote / approved boxes have an 'x'
        voice = (
            doc.xpath('//span[@id="VoteTypeVoice"]/b/text()')[0] == 'x')
        passed = (doc.xpath('//span[@id="VoteResultApproved"]/b/text()')[0] == 'x')

        yes_count = extract_int(
            doc.xpath('//span[@id="VoteCount1"]/b/text()')[0])
        no_count = extract_int(
            doc.xpath('//span[@id="VoteCount2"]/b/text()')[0])
        # every now and then this actually drops below 0 (error in count)
        # NOTE(review): 13 is presumably the council's total membership —
        # confirm before reusing this for other bodies/sessions
        other_count = max(13 - (yes_count + no_count), 0)

        vote = Vote('upper', vote_date, vote_type, passed, yes_count,
                    no_count, other_count, voice_vote=voice)
        vote.add_source(url)

        # members are only text on page in a <u> tag
        for member_u in doc.xpath('//u'):
            member = member_u.text
            vote_text = member_u.xpath('../../i/text()')[0]
            if 'Yes' in vote_text:
                vote.yes(member)
            elif 'No' in vote_text:
                vote.no(member)
            else:
                vote.other(member)

        bill.add_vote(vote)
def parse_committee_votes(self, committee, chamber, bill, url):
    """Scrape committee roll calls linked from a PA committee page,
    recursing into any further listVotes.cfm pages."""
    bill.add_source(url)
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

        # Date
        date = link.xpath('../../td')[0].text_content()
        date = datetime.datetime.strptime(date, "%m/%d/%Y")

        # Motion
        motion = link.xpath('..')[0].text_content().strip()
        _, motion = motion.split('-', 1)
        motion = motion.strip()

        vote_url = link.attrib['href']

        # Roll call.
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        motion = 'Committee vote (%s): %s' % (committee, motion)
        vote = Vote(chamber, date, motion, type='other',
                    committee=committee, **rollcall)

        for voteval in ('yes', 'no', 'other'):
            for name in rollcall.get(voteval + '_votes', []):
                getattr(vote, voteval)(name)

        vote.add_source(url)
        vote.add_source(vote_url)
        bill.add_vote(vote)

    # recurse into linked vote-listing pages
    for link in doc.xpath("//a[contains(@href, 'listVotes.cfm')]"):
        self.parse_committee_votes(committee, chamber, bill,
                                   link.attrib['href'])
def parse_committee_votes(self, bill, url):
    """Scrape committee roll calls linked from a PA committee page."""
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # chamber and committee name come from the page headings
    chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
    committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()

    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

        # Date
        # NOTE(review): if neither format matches, `date` remains the raw
        # string and is passed to Vote unparsed — verify the two formats
        # cover the site's output
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            date = link.xpath('../../td')[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            break

        # Motion
        motion = link.text_content().split(' - ')[-1].strip()
        motion = 'Committee vote (%s): %s' % (committee, motion)

        # Roll call.
        vote_url = link.attrib['href']
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        vote = Vote(chamber, date, motion, type='other',
                    committee=committee, **rollcall)

        for voteval in ('yes', 'no', 'other'):
            for name in rollcall.get(voteval + '_votes', []):
                getattr(vote, voteval)(name)

        vote.add_source(url)
        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape(self, chamber, session):
    """Scrape floor votes for one chamber across every available date."""
    chamber_dir = {"upper": "SVotes", "lower": "HVotes"}[chamber]
    url = "%s/%s" % (RI_URL_BASE, chamber_dir)
    action = "%s/%s" % (url, "votes.asp")

    for date in self.get_dates(url):
        vote_dicts = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in vote_dicts:
            for vote in vote_dict.values():
                count = vote['count']
                # NOTE: this rebinds the `chamber` argument to the vote's own
                # chamber, matching the original behavior.
                chamber = {"H": "lower", "S": "upper"}[vote['meta']['chamber']]
                v = Vote(
                    chamber,
                    vote['time'],
                    vote['meta']['extra']['motion'],
                    count['passage'],
                    int(count['YEAS']),
                    int(count['NAYS']),
                    int(count['NOT VOTING']),
                    session=session,
                    bill_id=vote['meta']['bill'],
                    bill_chamber=chamber,
                    bill_session=vote['meta']['year'],
                )
                v.add_source(vote['source'])
                for record in vote['votes']:
                    if record['vote'] == "Y":
                        v.yes(record['name'])
                    elif record['vote'] == "N":
                        v.no(record['name'])
                    else:
                        v.other(record['name'])
                self.save_vote(v)
def scrape_vote(self, bill, date, motion, url):
    """Scrape one floor-vote page; silently skip dead or not-yet-official links."""
    try:
        page = self.get(url).text
    except scrapelib.HTTPError:
        # sometimes the link is there but is dead
        return

    if 'not yet official' in page:
        # Sometimes they link to vote pages before they go live
        return

    page = lxml.html.fromstring(page)
    actor = 'upper' if url.endswith('Senate') else 'lower'

    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

    def tally(label):
        # The centered cell's text ends with the numeric count.
        return int(page.xpath(count_path % label).split()[-1])

    yes_count = tally("Yeas")
    no_count = tally("Nays")
    other_count = tally("Non Voting") + tally("Present")
    passed = yes_count > no_count + other_count

    vote = Vote(actor, date, motion, passed, yes_count, no_count, other_count)
    vote.add_source(url)

    xpath = ('//*[contains(@class, "ms-standardheader")]/'
             'following-sibling::table')
    name_tables = page.xpath(xpath)
    # Tables appear in order: yes, no, then two "other" groupings.
    for voteval, table in zip(('yes', 'no', 'other', 'other'), name_tables):
        record = getattr(vote, voteval)
        for anchor in table.xpath('.//a'):
            name = anchor.text_content().strip()
            if name:
                record(name)

    bill.add_vote(vote)
def scrape_vote(self, bill, moid, vote_id, body, inst, motion, chamber):
    """Scrape an Alabama roll-call results page and attach the Vote to *bill*."""
    url = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&INST=%s&SESS=%s" % (
        moid, vote_id, body, inst, self.session_id)
    doc = lxml.html.fromstring(self.urlopen(url))

    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    for cell in doc.xpath('//td[@width="33%"]'):
        label = cell.text
        value = cell.xpath('following-sibling::td')[1].text
        if label == 'Total Yea:':
            total_yea = int(value)
        elif label == 'Total Nay:':
            total_nay = int(value)
        elif label == 'Total Abs:':
            total_abs = int(value)
        elif label == 'Legislative Date:':
            vote_date = datetime.datetime.strptime(value, '%m/%d/%Y')
        elif label in ('Legislative Day:', 'Vote ID:'):
            # lines to ignore
            pass
        elif 'Vacant' in label:
            # vacant seats cast no vote
            pass
        else:
            # Any other cell is a legislator; the sibling cell holds Y/N/P/A.
            voters[value].append(label)

    # TODO: passed is faked
    total_other = total_abs + len(voters['P'])
    vote = Vote(chamber, vote_date, motion, total_yea > total_nay,
                total_yea, total_nay, total_other)
    vote.add_source(url)

    for member in voters['Y']:
        vote.yes(member)
    for member in voters['N']:
        vote.no(member)
    for member in voters['A'] + voters['P']:
        vote.other(member)

    bill.add_vote(vote)
def parse_vote(self, bill, link):
    """Parse a single vote page and attach the resulting Vote to *bill*.

    The chamber, date, and status come from the first <h3> heading; per-kind
    counts come from "(N)" groups in the remaining headings; voter names come
    from the page's anchor list.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if len(opinions) > 0:
        temp = opinions[0].split()
        vote_chamber = temp[0]
        vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
        vote_status = " ".join(temp[2:-2])
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

        # BUG FIX: default all counts so a heading missing from the page
        # cannot leave them unbound (NameError) when Vote is constructed.
        yes_count = no_count = p_count = a_count = 0
        for i in opinions:
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # BUG FIX: was a bare `except: pass`, which let a stale
                # `count` from a previous heading be re-assigned below.
                continue
            if "yea" in i.lower():
                yes_count = count
            elif "nay" in i.lower():
                no_count = count
            elif "present" in i.lower():
                p_count = count
            elif "absent" in i.lower():
                a_count = count

        vote = Vote(vote_chamber, vote_date, vote_status,
                    yes_count > no_count, yes_count, no_count,
                    p_count + a_count)
        vote.add_source(link)

        # Anchor 0 is not a voter; names are listed yes-first, then no,
        # then everything else — presumably in count order (TODO confirm).
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.yes(re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.no(re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.other(re.sub(',', '', a_links[i]).split()[0])
        bill.add_vote(vote)
    else:
        # BUG FIX: was `print self.warning(...)` — Python-2-only syntax that
        # pointlessly printed the warning call's return value.
        self.warning("No Votes for: %s", link)
def vote(self):
    '''Return a billy vote.

    Counts and voter names are pulled from ``self._parse()``; fused
    two-person column cells containing "Candelaria Reardon" are split.
    '''
    actual_vote_dict = collections.defaultdict(list)
    vote = Vote('lower', self.date(), self.motion(), self.passed(),
                0, 0, 0,
                actual_vote=dict(actual_vote_dict))
    for (vote_val, count), (actual_vote, _), text in self._parse():
        vote[vote_val + '_count'] = count
        for name in filter(None, PlaintextColumns(text)):
            names = [name]
            if 'Candelaria Reardon' in name:
                # Two names can be fused into one cell; record the known
                # multi-word surname and the remainder as separate voters.
                names.append('Candelaria Reardon')
                other_name = name.replace('Candelaria Reardon', '').strip()
                names.append(other_name)
            for name in filter(None, names):
                actual_vote_dict[actual_vote].append(name)
                getattr(vote, vote_val)(name)
    # BUG FIX: dict(actual_vote_dict) at construction time snapshots an
    # EMPTY dict (the loop above populates it afterwards), so the stored
    # actual_vote was always {}. Re-assign after populating.
    vote['actual_vote'] = dict(actual_vote_dict)
    vote.add_source(self.url)
    return vote
def scrape_vote_history(self, bill, vurl):
    """Scrape the vote-history table at *vurl*, attaching each Vote to *bill*.

    Each qualifying row yields one Vote; the roll-call PDF linked from the
    row is parsed via ``self.scrape_rollcall``.
    """
    html = self.urlopen(vurl)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(vurl)

    # skip first two rows (headers)
    for row in doc.xpath('//table/tr')[2:]:
        tds = row.getchildren()
        if len(tds) != 10:
            self.warning('irregular vote row: %s' % vurl)
            continue
        (timestamp, motion, vote, yeas, nays, nv, exc, abst,
         total, result) = tds

        timestamp = timestamp.text.replace(u'\xa0', ' ')
        # NOTE(review): %H (24-hour) combined with %p means the AM/PM marker
        # is effectively ignored; if the site emits 12-hour times this should
        # be %I — confirm against real page data before changing.
        timestamp = datetime.datetime.strptime(timestamp,
                                               '%m/%d/%Y %H:%M %p')
        yeas = int(yeas.text)
        nays = int(nays.text)
        others = int(nv.text) + int(exc.text) + int(abst.text)
        # BUG FIX: was a bare assert, which is stripped under `python -O`;
        # validate the listed total explicitly instead.
        if yeas + nays + others != int(total.text):
            raise ValueError('vote count mismatch on %s' % vurl)

        passed = (result.text == 'Passed')

        vote_link = vote.xpath('a')[0]
        # '[H]' marks a House roll call; anything else is Senate.
        chamber = 'lower' if '[H]' in vote_link.text else 'upper'

        vote = Vote(chamber, timestamp, motion.text, passed,
                    yeas, nays, others)
        vote.add_source(vurl)

        rollcall_pdf = vote_link.get('href')
        self.scrape_rollcall(vote, rollcall_pdf)
        vote.add_source(rollcall_pdf)

        bill.add_vote(vote)
def vote(self):
    '''Return a billy vote.

    Counts come from ``self.get_counts()``; both singular and plural
    "Yea"/"Nay" keys are honored, and everything else counts as "other".
    '''
    actual_vote_dict = collections.defaultdict(list)

    when = self.date()
    motion_text = self.motion()
    did_pass = self.passed()
    counts = self.get_counts()

    yes_total = int(counts.get('Yea', 0)) + int(counts.get('Yeas', 0))
    no_total = int(counts.get('Nay', 0)) + int(counts.get('Nays', 0))
    grand_total = sum(int(n) for n in counts.values())

    vote = Vote(self.chamber, when, motion_text, did_pass,
                yes_total, no_total,
                grand_total - yes_total - no_total,
                actual_vote=dict(actual_vote_dict))

    for vote_val, voter in self.vote_values():
        getattr(vote, vote_val)(voter)

    vote.add_source(self.url)
    return vote
def parse_vote(self, bill, action, act_chamber, act_date, url):
    """Parse an Alaska floor vote referenced from an action line.

    The Y/N/other tally is taken from the *action* text itself; the motion
    and per-voter names come from the linked vote page.
    """
    url = "http://www.legis.state.ak.us/basis/%s" % url
    info_page = self.soup_parser(self.urlopen(url))

    # BUG FIX: the patterns were plain (non-raw) literals; '\d'/'\s' are
    # invalid escape sequences (DeprecationWarning, SyntaxError in future
    # Python versions). Raw strings are byte-identical patterns.
    tally = re.findall(r'Y(\d+) N(\d+)\s*(?:\w(\d+))*\s*(?:\w(\d+))*'
                       r'\s*(?:\w(\d+))*', action)[0]
    yes, no, o1, o2, o3 = [0 if not x else int(x) for x in tally]
    other = o1 + o2 + o3

    votes = info_page.findAll('pre', text=re.compile('Yeas'),
                              limit=1)[0].split('\n\n')

    motion = info_page.findAll(text=re.compile('The question being'))[0]
    motion = re.findall(r'The question being:\s*"(.*)\?"', motion,
                        re.DOTALL)[0].replace('\n', ' ')

    vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

    for vote_list in votes:
        vote_type = False
        if vote_list.startswith('Yeas: '):
            vote_list, vote_type = vote_list[6:], vote.yes
        elif vote_list.startswith('Nays: '):
            vote_list, vote_type = vote_list[6:], vote.no
        elif vote_list.startswith('Excused: '):
            vote_list, vote_type = vote_list[9:], vote.other
        elif vote_list.startswith('Absent: '):
            # BUG FIX: 'Absent: ' is 8 characters; slicing at [9:] chopped
            # the first letter off the first absent legislator's name.
            vote_list, vote_type = vote_list[8:], vote.other
        if vote_type:
            for name in vote_list.split(','):
                name = name.strip()
                if name:
                    vote_type(name)

    vote.add_source(url)
    return vote