def apply_votes(self, bill):
    """Parse all votes for a bill and attach them as Vote objects.

    Assumes ``bill['status_url']`` is present in the bill dict; every
    roll call found at that URL becomes one Vote on the bill.
    """
    bill_votes = votes.all_votes_for_url(self, bill['status_url'])
    for (chamber, vote_desc, pdf_url, these_votes) in bill_votes:
        # BUG FIX: str.split() always returns at least one element, so
        # indexing [-1] can never raise IndexError -- the old
        # ``except IndexError`` guard was dead code and a descriptor
        # without a date slipped through silently.  Check for the
        # separator explicitly to restore the intended warn-and-skip.
        if "-" not in vote_desc:
            self.warning("[%s] Couldn't get date out of [%s]" %
                         (bill['bill_id'], vote_desc))
            continue
        date = vote_desc.split("-")[-1]
        yes_votes = []
        no_votes = []
        other_votes = []
        # .items() works on both Python 2 and 3 (iteritems() is 2-only).
        for voter, vote in these_votes.items():
            if vote == 'Y':
                yes_votes.append(voter)
            elif vote == 'N':
                no_votes.append(voter)
            else:
                other_votes.append(voter)
        # not necessarily correct, but not sure where else to get it.
        # maybe from pdf
        passed = len(yes_votes) > len(no_votes)
        vote = Vote(standardize_chamber(chamber), date, vote_desc, passed,
                    len(yes_votes), len(no_votes), len(other_votes),
                    pdf_url=pdf_url)
        for voter in yes_votes:
            vote.yes(voter)
        for voter in no_votes:
            vote.no(voter)
        for voter in other_votes:
            vote.other(voter)
        bill.add_vote(vote)
def scrape(self, chamber, session):
    """Emit one hard-coded example bill with sponsors, votes and actions."""
    self.validate_session(session)

    if chamber == 'upper':
        other_chamber, bill_id = 'lower', 'SB 1'
    else:
        other_chamber, bill_id = 'upper', 'HB 1'

    bill = Bill(session, chamber, bill_id, 'A super bill')
    bill.add_source('http://example.com/')
    bill.add_version('As Introduced', 'http://example.com/SB1.html')
    bill.add_document('Google', 'http://google.com')
    bill.add_sponsor('primary', 'Bob Smith')
    bill.add_sponsor('secondary', 'Johnson, Sally')

    first_date = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    second_date = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')

    # One passing upper-chamber vote, one failing lower-chamber vote.
    passage = Vote('upper', first_date, 'Final passage', True, 2, 0, 0)
    passage.yes('Smith')
    passage.yes('Johnson')

    failure = Vote('lower', second_date, 'Final passage', False, 0, 1, 1)
    failure.no('Bob Smith')
    failure.other('S. Johnson')

    bill.add_vote(passage)
    bill.add_vote(failure)

    bill.add_action(chamber, 'introduced', first_date)
    bill.add_action(chamber, 'read first time', second_date)
    bill.add_action(other_chamber, 'introduced', second_date)
    self.save_bill(bill)
def scrape_vote(self, bill, date, motion, url):
    """Parse one roll-call page and attach the resulting Vote to *bill*."""
    page = self.urlopen(url)
    # Sometimes they link to vote pages before they go live
    if "not yet official" in page:
        return

    page = lxml.html.fromstring(page)
    actor = "upper" if url.endswith("Senate") else "lower"

    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

    def tally(label):
        # The cell reads e.g. "Yeas: 31"; keep the trailing number.
        return int(page.xpath(count_path % label).split()[-1])

    yes_count = tally("Yeas")
    no_count = tally("Nays")
    other_count = tally("Non Voting") + tally("Present")

    passed = yes_count > no_count + other_count
    vote = Vote(actor, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
    for heading, record in (("Yeas", vote.yes), ("Nays", vote.no),
                            ("Non Voting", vote.other),
                            ("Present", vote.other)):
        for cell in page.xpath(vote_path % heading):
            record(cell.text)

    bill.add_vote(vote)
def parse_vote(self, actor, date, row):
    """Build a Vote from one result row.

    ``row`` is an lxml element whose leading text is the motion and
    whose <span> children carry the tallies and voter-name lists.
    """
    spans = row.xpath('.//span')
    motion = row.text
    # The first span apparently packs outcome and counts dash-separated,
    # e.g. "Passed-34-16-0" -- confirmed only by this 4-way unpack.
    passed, yes_count, no_count, other_count = spans[0].text_content().split('-')
    # Voter names trail each span; "\xa0--\xa0" separators are stripped
    # before splitting on commas, and empty pieces are dropped.
    yes_votes = [ name for name in spans[1].tail.replace(u'\xa0--\xa0', '').split(',') if name ]
    no_votes = [ name for name in spans[2].tail.replace(u'\xa0--\xa0', '').split(',') if name ]
    other_votes = []
    if spans[3].text.startswith('Absent'):
        other_votes = [ name for name in spans[3].tail.replace(u'\xa0--\xa0', '').split(',') if name ]
    # Map the textual outcome to a boolean.  NOTE(review): if none of the
    # keywords match, ``passed`` stays a (truthy) string -- confirm every
    # source page uses one of these words.
    for key, val in {'adopted': True, 'passed': True, 'failed':False}.items():
        if key in passed.lower():
            passed = val
            break
    vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count), int(other_count))
    # 'None' is the literal placeholder printed for an empty voter list.
    for name in yes_votes:
        if name and name != 'None':
            vote.yes(name)
    for name in no_votes:
        if name and name != 'None':
            vote.no(name)
    for name in other_votes:
        if name and name != 'None':
            vote.other(name)
    return vote
def parse_vote(self, actor, date, row):
    """Build a Vote from one result row (variant tolerating odd motions)."""
    spans = row.xpath('.//span')
    # Motion text precedes the spans; strip non-breaking spaces and
    # dashes, falling back to "passage" when nothing usable remains.
    motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    motion = motion if motion else "passage"
    # rsplit from the right so dashes inside the outcome text survive.
    passed, yes_count, no_count, other_count = spans[0].text_content().rsplit('-', 3)
    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(('Absent', 'Excused')):
            other_votes += self.get_names(span.tail)
    # NOTE(review): if no keyword matches, ``passed`` remains a (truthy)
    # string -- confirm source pages always use one of these words.
    for key, val in {'adopted': True, 'passed': True, 'failed': False}.items():
        if key in passed.lower():
            passed = val
            break
    vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count), int(other_count))
    # 'None' is the literal placeholder used for an empty voter list.
    for name in yes_votes:
        if name and name != 'None':
            vote.yes(name)
    for name in no_votes:
        if name and name != 'None':
            vote.no(name)
    for name in other_votes:
        if name and name != 'None':
            vote.other(name)
    return vote
def scrape_vote(self, bill, date, url):
    """Scrape one chamber/committee vote page and attach the vote to *bill*."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        else:
            # BUG FIX: the original formatted this message with
            # ``chamber``, which is unbound on this branch and raised
            # NameError instead of ScrapeError; report the offending
            # location string instead.
            raise ScrapeError("Bad chamber: %s" % location)

        committee = ' '.join(location.split(' ')[1:]).strip()
        if not committee or committee.startswith('of Representatives'):
            committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()

        yes_count = int(
            page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(
            page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        # Classify the motion (renamed from ``type`` to avoid shadowing
        # the builtin).
        if motion.startswith('Do Pass'):
            vote_type = 'passage'
        elif motion == 'Concurred in amendments':
            vote_type = 'amendment'
        elif motion == 'Veto override':
            vote_type = 'veto_override'
        else:
            vote_type = 'other'

        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)
        vote['type'] = vote_type
        if committee:
            vote['committee'] = committee
        vote.add_source(url)

        # Member names sit in the cell immediately before each vote cell.
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text == 'Yea':
                vote.yes(td.getprevious().text.strip())
            elif td.text == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif td.text in ('Excused', 'Absent'):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)
def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                action_text):
    """Fetch an Alison roll-call page, bucket the voters and attach a Vote."""
    url = ('http://alisondb.legislature.state.al.us/Alison/'
           'GetRollCallVoteResults.aspx?'
           'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.
           format(vote_id, vote_chamber, bill_id, self.session_id))
    doc = lxml.html.fromstring(self.get(url=url).text)

    # Yea / Nay / Present / Absent buckets; the page alternates name
    # cells and vote-code cells, walked with the capture_vote toggle.
    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    voters_and_votes = doc.xpath('//table/tr/td/font/text()')
    capture_vote = False
    name = ''
    for item in voters_and_votes:
        if capture_vote:
            capture_vote = False
            if name:
                voters[item].append(name)
        else:
            capture_vote = True
            name = item
            # Skip vacancies, total rows and whitespace-only cells.
            if (name.endswith(", Vacant") or
                    name.startswith("Total ") or
                    not name.strip()):
                name = ''

    # Check name counts against totals listed on the site
    total_yea = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
    if total_yea:
        total_yea = int(total_yea[0].split(":")[-1])
        assert total_yea == len(voters['Y']), "Yea count incorrect"
    else:
        total_yea = len(voters['Y'])

    total_nay = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
    if total_nay:
        total_nay = int(total_nay[0].split(":")[-1])
        assert total_nay == len(voters['N']), "Nay count incorrect"
    else:
        total_nay = len(voters['N'])

    total_absent = doc.xpath(
        '//*[starts-with(text(), "Total Absent")]/text()')
    if total_absent:
        total_absent = int(total_absent[0].split(":")[-1])
        assert total_absent == len(voters['A']), "Absent count incorrect"
    # "Present" and "Absent" members are folded into the other count.
    total_other = len(voters['P']) + len(voters['A'])

    vote = Vote(
        self.CHAMBERS[vote_chamber[0]], vote_date, action_text,
        total_yea > total_nay, total_yea, total_nay, total_other)
    vote.add_source(url)
    for member in voters['Y']:
        vote.yes(member)
    for member in voters['N']:
        vote.no(member)
    for member in (voters['A'] + voters['P']):
        vote.other(member)
    bill.add_vote(vote)
def scrape_vote(self, bill, date, motion, url):
    """Parse a roll-call page and attach the resulting Vote to *bill*."""
    page = lxml.html.fromstring(self.urlopen(url))

    actor = 'upper' if url.endswith('Senate') else 'lower'

    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

    def tally(label):
        # Cell text is e.g. "Yeas: 31"; keep the trailing number.
        return int(page.xpath(count_path % label).split()[-1])

    yes_count = tally("Yeas")
    no_count = tally("Nays")
    other_count = tally("Non Voting") + tally("Present")

    passed = yes_count > no_count + other_count
    vote = Vote(actor, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
    for heading, record in (("Yeas", vote.yes), ("Nays", vote.no),
                            ("Non Voting", vote.other),
                            ("Present", vote.other)):
        for link in page.xpath(vote_path % heading):
            record(link.text)

    bill.add_vote(vote)
def scrape_vote(self, bill, vote_type_id, vote_type):
    """Scrape a DC Council LIMS voting page and attach the vote to *bill*."""
    base_url = "http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s"
    url = base_url % (vote_type_id, bill["bill_id"])
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        vote_date = convert_date(doc.get_element_by_id("VoteDate").text)

        # check if voice vote / approved boxes have an 'x'
        voice = doc.xpath('//span[@id="VoteTypeVoice"]/b/text()')[0] == "x"
        passed = doc.xpath('//span[@id="VoteResultApproved"]/b/text()')[0] == "x"

        yes_count = extract_int(doc.xpath('//span[@id="VoteCount1"]/b/text()')[0])
        no_count = extract_int(doc.xpath('//span[@id="VoteCount2"]/b/text()')[0])
        # every now and then this actually drops below 0 (error in count)
        # NOTE(review): 13 is presumably the council's total seat count
        # -- confirm, and consider naming the constant.
        other_count = max(13 - (yes_count + no_count), 0)

        vote = Vote("upper", vote_date, vote_type, passed, yes_count,
                    no_count, other_count, voice_vote=voice)
        vote.add_source(url)

        # members are only text on page in a <u> tag
        for member_u in doc.xpath("//u"):
            member = member_u.text
            vote_text = member_u.xpath("../../i/text()")[0]
            if "Yes" in vote_text:
                vote.yes(member)
            elif "No" in vote_text:
                vote.no(member)
            else:
                vote.other(member)
        bill.add_vote(vote)
def test_vote():
    """Vote records voters, equals its dict form, and validates its counts."""
    v = Vote('upper', datetime.datetime(2012, 1, 1), 'passage', True,
             3, 1, 2, note='note')
    expected = {
        'chamber': 'upper',
        'date': datetime.datetime(2012, 1, 1),
        'motion': 'passage',
        'passed': True,
        'yes_count': 3,
        'no_count': 1,
        'other_count': 2,
        'type': 'other',
        'yes_votes': [],
        'no_votes': [],
        'other_votes': [],
        'note': 'note',
        '_type': 'vote',
        'sources': [],
    }
    assert_equal(v, expected)

    # Record each roster through its method and check the stored list.
    rosters = (
        ('yes_votes', v.yes, ['Lincoln', 'Adams', 'Johnson']),
        ('no_votes', v.no, ['Kennedy']),
        ('other_votes', v.other, ['Polk', 'Pierce']),
    )
    for key, record, names in rosters:
        for name in names:
            record(name)
        assert_equal(v[key], names)

    # validate should work
    v.validate()

    # now add someone else and make sure it doesn't validate
    v.yes('Clinton')
    with assert_raises(ValueError):
        v.validate()
def _build_lower_votes(self):
    """Fetch and parse the Assembly floor-vote tables for this bill."""
    url = self.shared_url + '&Votes=Y'
    self.urls.add(votes=url)
    self.bill.add_source(url)
    doc = self.urls.votes.doc
    if doc is None:
        return
    # Grab bill information.
    try:
        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this legislative '
                    'session.')
        if pre == no_votes:
            raise ValueError('No votes for this bill.')
    # Skip bill if votes can't be found.
    except (IndexError, ValueError) as e:
        return
    actual_vote = collections.defaultdict(list)
    for table in doc.xpath('//table'):
        # The date label's next sibling holds the text (Python 2
        # iterator ``.next()``).
        date = table.xpath('caption/label[contains(., "DATE:")]')
        date = date[0].itersiblings().next().text
        date = datetime.datetime.strptime(date, '%m/%d/%Y')
        # Tallies appear as "yes/no" next to the YEA/NAY label.
        votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
        votes = votes[0].itersiblings().next().text
        yes_count, no_count = map(int, votes.split('/'))
        passed = yes_count > no_count
        vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                    no_count, other_count=0)
        # Cells alternate: name, vote value, name, vote value, ...
        tds = table.xpath('tr/td/text()')
        votes = iter(tds)
        while True:
            try:
                data = list(islice(votes, 2))
                name, vote_val = data
            except (StopIteration, ValueError):
                # End of data. Stop.
                break
            name = self._scrub_name(name)
            if vote_val.strip() == 'Y':
                vote.yes(name)
            elif vote_val.strip() in ('N', 'NO'):
                vote.no(name)
            else:
                vote.other(name)
            actual_vote[vote_val].append(name)
        # The page doesn't provide an other_count.
        vote['other_count'] = len(vote['other_votes'])
        vote['actual_vote'] = actual_vote
        self.bill.add_vote(vote)
def scrape_vote(self, bill, chamber, date, url):
    """Download a vote PDF, extract the roll call and attach it to *bill*."""
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)
    # The motion sits on the fifth line of the extracted text.
    try:
        motion = text.split('\n')[4].strip()
    except IndexError:
        return
    # No "Yeas" line means this isn't a parsable roll call.
    try:
        yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    except AttributeError:
        return
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)
    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)
    # NOTE(review): these counters are never read afterwards.
    y, n, o = 0, 0, 0
    break_outter = False
    # Names start on line 10; "-<n>" markers separate the PDF columns.
    for line in text.split('\n')[9:]:
        if break_outter:
            break
        if 'after roll call' in line:
            break
        if 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            # A vote code (Y/N/EX/*) followed by the member name.
            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if match:
                # Paired votes end the listing entirely.
                if match.group(2) == "PAIR":
                    break_outter = True
                    break
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())
    vote.validate()
    bill.add_vote(vote)
def build_lower_votes(self):
    """Fetch and parse the Assembly floor-vote page for this bill."""
    # Adjacent string literals concatenate into one URL template.
    url = "http://assembly.state.ny.us/leg/?" "default_fld=&bn=%s&term=%s&Votes=Y"
    url = url % (self.bill_id, self.term_start_year)
    self.urls.add(votes=url)
    self.bill.add_source(url)
    doc = self.urls.votes.doc
    if doc is None:
        return
    # Grab bill information.
    try:
        pre = doc.xpath("//pre")[0].text_content()
        no_votes = "There are no votes for this bill in this legislative " "session."
        if pre == no_votes:
            raise ValueError("No votes for this bill.")
    # Skip bill if votes can't be found.
    except (IndexError, ValueError) as e:
        return
    actual_vote = collections.defaultdict(list)
    for table in doc.xpath("//table"):
        # Date label's next sibling holds the text (Python 2 ``.next()``).
        date = table.xpath('caption/label[contains(., "DATE:")]')
        date = date[0].itersiblings().next().text
        date = datetime.datetime.strptime(date, "%m/%d/%Y")
        # Tallies appear as "yes/no" next to the YEA/NAY label.
        votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
        votes = votes[0].itersiblings().next().text
        yes_count, no_count = map(int, votes.split("/"))
        passed = yes_count > no_count
        vote = Vote("lower", date, "Floor Vote", passed, yes_count,
                    no_count, other_count=0)
        # Cells alternate: name, vote value, name, vote value, ...
        tds = table.xpath("tr/td/text()")
        votes = iter(tds)
        while True:
            try:
                data = list(islice(votes, 2))
                name, vote_val = data
            except (StopIteration, ValueError):
                # End of data. Stop.
                break
            name = self._scrub_name(name)
            if vote_val.strip() == "Y":
                vote.yes(name)
            elif vote_val.strip() in ("N", "NO"):
                vote.no(name)
            else:
                vote.other(name)
            actual_vote[vote_val].append(name)
        # The page doesn't provide an other_count.
        vote["other_count"] = len(vote["other_votes"])
        vote["actual_vote"] = actual_vote
        self.bill.add_vote(vote)
def scrape_bill(self, session, bills):
    """Build and save a bill (with its Senate votes) from pre-fetched data."""
    billdata, details = bills[0]
    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (letter, number, is_amd)) = details
    data = billdata["data"]["bill"]
    assembly = AssemblyBillPage(self, session, bill_chamber, details)
    assembly.build()
    bill = assembly.bill
    bill.add_source(billdata["url"])
    # Add companion.
    if data["sameAs"]:
        bill.add_companion(data["sameAs"])
    if data["summary"]:
        bill["summary"] = data["summary"]
    if data["votes"]:
        for vote_data in data["votes"]:
            # Counts start at zero and are incremented per recorded name;
            # passed is recomputed after counting.
            vote = Vote(
                chamber="upper",
                date=self.date_from_timestamp(vote_data["voteDate"]),
                motion=vote_data["description"] or "[No motion available.]",
                passed=False,
                yes_votes=[],
                no_votes=[],
                other_votes=[],
                yes_count=0,
                no_count=0,
                other_count=0,
            )
            for name in vote_data["ayes"]:
                vote.yes(name)
                vote["yes_count"] += 1
            # Absent, excused and abstaining members all count as "other".
            for names in map(vote_data.get, ["absent", "excused", "abstains"]):
                for name in names:
                    vote.other(name)
                    vote["other_count"] += 1
            for name in vote_data["nays"]:
                vote.no(name)
                vote["no_count"] += 1
            vote["passed"] = vote["yes_count"] > vote["no_count"]
            bill.add_vote(vote)
    # if data['previousVersions']:
    # These are instances of the same bill from prior sessions.
    # import pdb; pdb.set_trace()
    if not data["title"]:
        bill["title"] = bill["summary"]
    self.save_bill(bill)
def scrape_vote(self, bill, name, url):
    """Scrape a Connecticut roll-call page and attach the vote to *bill*."""
    if "VOTE/H" in url:
        vote_chamber = "lower"
        # House tables lay members out in four column groups; offsets
        # locate the name / yea / nay cells within each group.
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = "upper"
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2
    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    # NOTE(review): verify=False disables certificate verification.
    page = requests.get(url, verify=False).text
    if "BUDGET ADDRESS" in page:
        return
    page = lxml.html.fromstring(page)
    yes_count = page.xpath("string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(re.match(r"[^\d]*(\d+)[^\d]*", yes_count).group(1))
    no_count = page.xpath("string(//span[contains(., 'Those voting Nay')])")
    no_count = int(re.match(r"[^\d]*(\d+)[^\d]*", no_count).group(1))
    other_count = page.xpath("string(//span[contains(., 'Those absent')])")
    other_count = int(re.match(r"[^\d]*(\d+)[^\d]*", other_count).group(1))
    need_count = page.xpath("string(//span[contains(., 'Necessary for')])")
    need_count = int(re.match(r"[^\d]*(\d+)[^\d]*", need_count).group(1))
    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", date).group(1)
    date = date.replace(" ", "")
    # The page omits the year; borrow it from the bill's session.
    date = datetime.datetime.strptime(date + " " + bill["session"], "%m/%d %Y").date()
    # NOTE(review): strict ``>`` means a tally exactly at the required
    # threshold counts as failed -- confirm whether ``>=`` is intended.
    vote = Vote(vote_chamber, date, name, yes_count > need_count,
                yes_count, no_count, other_count)
    vote.add_source(url)
    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            name = row.xpath("string(td[%d])" % (i + name_offset)).strip()
            if not name or name == "VACANT":
                continue
            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(name)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(name)
            else:
                vote.other(name)
    bill.add_vote(vote)
def scrape_vote(self, bill, date, url):
    """Scrape one chamber/committee vote page and attach the vote to *bill*."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")
        location = header.split(", ")[1]

        if location.startswith("House"):
            chamber = "lower"
        elif location.startswith("Senate"):
            chamber = "upper"
        else:
            # BUG FIX: the original interpolated ``chamber`` here, which
            # is unbound on this branch and raised NameError instead of
            # ScrapeError; report the unrecognized location instead.
            raise ScrapeError("Bad chamber: %s" % location)

        committee = " ".join(location.split(" ")[1:]).strip()
        if not committee or committee.startswith("of Representatives"):
            committee = None

        motion = ", ".join(header.split(", ")[2:]).strip()
        if not motion:
            # If we can't detect a motion, skip this vote
            return

        yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
        other_count = excused_count + absent_count

        passed = yes_count > no_count

        # Classify the motion (renamed from ``type`` to avoid shadowing
        # the builtin).
        if motion.startswith("Do Pass"):
            vote_type = "passage"
        elif motion == "Concurred in amendments":
            vote_type = "amendment"
        elif motion == "Veto override":
            vote_type = "veto_override"
        else:
            vote_type = "other"

        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)
        vote["type"] = vote_type
        if committee:
            vote["committee"] = committee
        vote.add_source(url)

        # Member names sit in the cell immediately before each vote cell.
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text == "Yea":
                vote.yes(td.getprevious().text.strip())
            elif td.text == "Nay":
                vote.no(td.getprevious().text.strip())
            elif td.text in ("Excused", "Absent"):
                vote.other(td.getprevious().text.strip())

        bill.add_vote(vote)
def scrape_bill(self, session, bills):
    """Build and save a bill (with its Senate votes) from pre-fetched data."""
    billdata, details = bills[0]
    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (letter, number, is_amd)) = details
    data = billdata['data']['bill']
    assembly = AssemblyBillPage(self, session, bill_chamber, details)
    assembly.build()
    bill = assembly.bill
    bill.add_source(billdata['url'])
    # Add companion.
    if data['sameAs']:
        bill.add_companion(data['sameAs'])
    if data['summary']:
        bill['summary'] = data['summary']
    if data['votes']:
        for vote_data in data['votes']:
            # Counts start at zero and are incremented per recorded name;
            # passed is recomputed after counting.
            vote = Vote(
                chamber='upper',
                date=self.date_from_timestamp(vote_data['voteDate']),
                motion=vote_data['description'] or '[No motion available.]',
                passed=False,
                yes_votes=[],
                no_votes=[],
                other_votes=[],
                yes_count=0,
                no_count=0,
                other_count=0)
            for name in vote_data['ayes']:
                vote.yes(name)
                vote['yes_count'] += 1
            # Absent, excused and abstaining members all count as "other".
            for names in map(vote_data.get, ['absent', 'excused', 'abstains']):
                for name in names:
                    vote.other(name)
                    vote['other_count'] += 1
            for name in vote_data['nays']:
                vote.no(name)
                vote['no_count'] += 1
            vote['passed'] = vote['yes_count'] > vote['no_count']
            bill.add_vote(vote)
    # if data['previousVersions']:
    # These are instances of the same bill from prior sessions.
    # import pdb; pdb.set_trace()
    if not data['title']:
        bill['title'] = bill['summary']
    self.save_bill(bill)
def scrape_votes(self, bill, link):
    """Parse the vote-data blob on a TN bill page and attach each vote."""
    with self.urlopen(link) as page:
        page = lxml.html.fromstring(page)
        raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
        # Each chunk begins "<motion> by <member> -"; drop the leader.
        raw_vote_data = re.split("\w+? by [\w ]+?\s+-", raw_vote_data.strip())[1:]
        for raw_vote in raw_vote_data:
            # Fields within a chunk are separated by runs of ten nbsp.
            raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
            motion = raw_vote[0]
            vote_date = re.search("(\d+/\d+/\d+)", motion)
            # NOTE(review): if no date is found, vote_date stays None and
            # the Vote is created with date=None -- confirm downstream
            # handles that.
            if vote_date:
                vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")
            passed = "Passed" in motion or "Recommended for passage" in motion or "Adopted" in raw_vote[1]
            vote_regex = re.compile("\d+$")
            aye_regex = re.compile("^.+voting aye were: (.+) -")
            no_regex = re.compile("^.+voting no were: (.+) -")
            other_regex = re.compile("^.+present and not voting were: (.+) -")
            yes_count = 0
            no_count = 0
            other_count = 0
            ayes = []
            nos = []
            others = []
            for v in raw_vote[1:]:
                v = v.strip()
                # Tally lines end in a number; roster lines list names.
                if v.startswith("Ayes...") and vote_regex.search(v):
                    yes_count = int(vote_regex.search(v).group())
                elif v.startswith("Noes...") and vote_regex.search(v):
                    no_count = int(vote_regex.search(v).group())
                elif v.startswith("Present and not voting...") and vote_regex.search(v):
                    other_count += int(vote_regex.search(v).group())
                elif aye_regex.search(v):
                    ayes = aye_regex.search(v).groups()[0].split(", ")
                elif no_regex.search(v):
                    nos = no_regex.search(v).groups()[0].split(", ")
                elif other_regex.search(v):
                    others += other_regex.search(v).groups()[0].split(", ")
            if "ChamberVoting=H" in link:
                chamber = "lower"
            else:
                chamber = "upper"
            vote = Vote(chamber, vote_date, motion, passed, yes_count,
                        no_count, other_count)
            vote.add_source(link)
            for a in ayes:
                vote.yes(a)
            for n in nos:
                vote.no(n)
            for o in others:
                vote.other(o)
            vote.validate()
            bill.add_vote(vote)
    return bill
def record_votes(root, session):
    """Yield Vote objects parsed from journal divs beginning "Yeas —"."""
    for el in root.xpath(u'//div[starts-with(., "Yeas \u2014")]'):
        # The record text sits two siblings before the "Yeas" div.
        text = ''.join(el.getprevious().getprevious().itertext())
        # NOTE(review): str.replace returns a new string; this result is
        # discarded, so newlines survive in ``text``.
        text.replace('\n', ' ')
        m = re.search(r'(?P<bill_id>\w+\W+\d+)(,?\W+as\W+amended,?)?\W+was\W+'
                      '(?P<type>adopted|passed'
                      '(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+'
                      'by\W+\(Record\W+(?P<record>\d+)\):\W+'
                      '(?P<yeas>\d+)\W+Yeas,\W+(?P<nays>\d+)\W+Nays,\W+'
                      '(?P<present>\d+)\W+Present', text)
        if m:
            yes_count = int(m.group('yeas'))
            no_count = int(m.group('nays'))
            other_count = int(m.group('present'))
            bill_id = m.group('bill_id')
            bill_id = bill_id.replace(u'\xa0', ' ')
            # Strip committee-substitute prefixes (CSSB -> SB, CSHB -> HB).
            bill_id = re.sub(r'CS(SB|HB)', r'\1', bill_id)
            if bill_id.startswith('H') or bill_id.startswith('CSHB'):
                bill_chamber = 'lower'
            elif bill_id.startswith('S') or bill_id.startswith('CSSB'):
                bill_chamber = 'upper'
            else:
                continue
            motion = get_motion(m)
            # Chamber and date are unknown here; callers get them from
            # the surrounding journal context.
            vote = Vote(None, None, motion, True,
                        yes_count, no_count, other_count)
            vote['bill_id'] = bill_id
            vote['bill_chamber'] = bill_chamber
            vote['session'] = session[0:2]
            vote['method'] = 'record'
            vote['record'] = m.group('record')
            vote['type'] = get_type(motion)
            # The matched div lists the Yea voters; subsequent sibling
            # tags list Nays, then Present/Absent.
            for name in names(el):
                vote.yes(name)
            el = next_tag(el)
            if el.text and el.text.startswith('Nays'):
                for name in names(el):
                    vote.no(name)
                el = next_tag(el)
            while el.text and re.match(r'Present|Absent', el.text):
                for name in names(el):
                    vote.other(name)
                el = next_tag(el)
            # Recompute from the names actually collected.
            vote['other_count'] = len(vote['other_votes'])
            yield vote
        else:
            pass
def scrape_votes(self, bill, bill_prefix, number, session):
    """Collect journal roll calls for an Ohio bill and attach them."""
    vote_url = ('http://www.legislature.state.oh.us/votes.cfm?ID=' +
                session + '_' + bill_prefix + '_' + str(number))
    page = lxml.html.fromstring(self.urlopen(vote_url))

    def names_in(div):
        # Non-empty cell strings are member names.
        return [txt for txt in
                (td.xpath("string()") for td in div.xpath("table/tr/td"))
                if txt]

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        date = datetime.datetime.strptime(jlink.text, "%m/%d/%Y").date()

        details = jlink.xpath("string(../../../td[2])")
        chamber = details.split(" - ")[0]
        if chamber == 'House':
            chamber = 'lower'
        elif chamber == 'Senate':
            chamber = 'upper'
        else:
            raise ScrapeError("Bad chamber: %s" % chamber)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        vote_row = jlink.xpath("../../..")[0].getnext()
        yeas = names_in(
            vote_row.xpath("td/font/div[contains(@id, 'Yea')]")[0])
        nays = names_in(
            vote_row.xpath("td/font/div[contains(@id, 'Nay')]")[0])

        yes_count, no_count = len(yeas), len(nays)
        vote = Vote(chamber, date, motion, yes_count > no_count,
                    yes_count, no_count, 0)
        for member in yeas:
            vote.yes(member)
        for member in nays:
            vote.no(member)
        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape_vote(self, bill, vote_url, chamber, date):
    """Parse a vote-summary page and attach the vote to *bill*."""
    page = self.lxmlize(vote_url)
    try:
        motion = page.xpath('//td/b/font[text()="MOTION:"]/../../following-sibling::td/font/text()')[0]
    except IndexError:
        # BUG FIX: narrowed from a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and masked programming errors.
        # Only a missing element (empty xpath result) should skip the page.
        self.warning("Vote Summary Page Broken ")
        return

    if 'withdrawn' not in motion:
        # Every table row after the one with VOTE in a td/div/b/font
        rolls = page.xpath('//tr[preceding-sibling::tr/td/div/b/font/text()="VOTE"]')
        count_row = rolls[-1]
        yes_count = count_row.xpath('.//b/font[normalize-space(text())="YES:"]'
                                    '/../following-sibling::font[1]/text()')[0]
        no_count = count_row.xpath('.//b/font[normalize-space(text())="NO:"]'
                                   '/../following-sibling::font[1]/text()')[0]
        exc_count = count_row.xpath('.//b/font[normalize-space(text())="EXC:"]'
                                    '/../following-sibling::font[1]/text()')[0]
        nv_count = count_row.xpath('.//b/font[normalize-space(text())="ABS:"]'
                                   '/../following-sibling::font[1]/text()')[0]

        if count_row.xpath('.//b/font[normalize-space(text())="FINAL ACTION:"]'
                           '/../following-sibling::b[1]/font/text()'):
            final = count_row.xpath('.//b/font[normalize-space(text())="FINAL ACTION:"]'
                                    '/../following-sibling::b[1]/font/text()')[0]
            passed = True if 'pass' in final.lower() or int(yes_count) > int(no_count) else False
        elif 'passed without objection' in motion.lower():
            passed = True
            # No tallies on the page; every listed member counts as a yes.
            yes_count = int(len(rolls[:-2]))
        else:
            self.warning("No vote breakdown found for %s" % vote_url)
            return

        # Excused and absent members are folded into "other".
        other_count = int(exc_count) + int(nv_count)
        vote = Vote(chamber, date, motion, passed, int(yes_count),
                    int(no_count), int(other_count))

        # The last two rows are the totals, not member rolls.
        for roll in rolls[:-2]:
            voter = roll.xpath('td[2]/div/font')[0].text_content()
            voted = roll.xpath('td[3]/div/font')[0].text_content().strip()
            if voted:
                if 'Yes' in voted:
                    vote.yes(voter)
                elif 'No' in voted:
                    vote.no(voter)
                else:
                    vote.other(voter)
            elif 'passed without objection' in motion.lower() and voter:
                vote.yes(voter)
        bill.add_vote(vote)
def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
    """Download a vote document, convert it via abiword, and parse it.

    Attaches the parsed Vote to *bill*; does nothing when no totals
    line is found in the converted text.
    """
    vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
    vote_date = datetime.datetime.strptime(vote_date, '%a %d %b %Y')

    vote_doc, resp = self.urlretrieve(vote_url)

    # SECURITY FIX: run abiword with an argument list instead of a
    # shell-interpolated string -- a crafted download filename could
    # previously inject arbitrary shell commands.
    subprocess.check_call(['abiword', '--to=ksvote.txt', vote_doc],
                          shell=False, cwd='/tmp/')

    vote_lines = open('/tmp/ksvote.txt').readlines()
    os.remove(vote_doc)

    vote = None
    passed = True
    for line in vote_lines:
        line = line.strip()
        totals = re.findall('Yeas (\d+)[;,] Nays (\d+)[;,] (?:Present but not voting:|Present and Passing) (\d+)[;,] (?:Absent or not voting:|Absent or Not Voting) (\d+)', line)
        if totals:
            totals = totals[0]
            yeas = int(totals[0])
            nays = int(totals[1])
            nv = int(totals[2])
            absent = int(totals[3])
            # default passed to true
            vote = Vote(vote_chamber, vote_date, vote_status, True,
                        yeas, nays, nv + absent)
        elif line.startswith('Yeas:'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.yes(member)
        elif line.startswith('Nays:'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.no(member)
        elif line.startswith('Present '):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.other(member)
        elif line.startswith('Absent or'):
            line = line.split(':', 1)[1].strip()
            for member in line.split(', '):
                if member != 'None.':
                    vote.other(member)
        elif 'the motion did not prevail' in line:
            passed = False

    if vote:
        vote['passed'] = passed
        vote.add_source(vote_url)
        bill.add_vote(vote)
def scrape_votes(self, bill, votes_url):
    """Scrape vote-history PDFs linked from the bill page and attach them."""
    html = self.urlopen(votes_url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(votes_url)
    EXPECTED_VOTE_CODES = ['Y','N','E','NV','A','P','-']
    # vote indicator, a few spaces, a name, newline or multiple spaces
    VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
    for link in doc.xpath('//a[contains(@href, "votehistory")]'):
        # Link text is " - "-separated with the date last; a middle piece,
        # when present, is the motion.
        pieces = link.text.split(' - ')
        date = pieces[-1]
        if len(pieces) == 3:
            motion = pieces[1]
        else:
            motion = 'Third Reading'
        chamber = link.xpath('../following-sibling::td/text()')[0]
        if chamber == 'HOUSE':
            chamber = 'lower'
        elif chamber == 'SENATE':
            chamber = 'upper'
        else:
            self.warning('unknown chamber %s' % chamber)
        date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        # download the file
        fname, resp = self.urlretrieve(link.get('href'))
        pdflines = convert_pdf(fname, 'text').splitlines()
        os.remove(fname)
        # Counts are reconstructed from the parsed names below, so the
        # Vote starts with zeros and passed=False.
        vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)
        for line in pdflines:
            for match in VOTE_RE.findall(line):
                vcode, name = match
                if vcode == 'Y':
                    vote.yes(name)
                elif vcode == 'N':
                    vote.no(name)
                else:
                    vote.other(name)
        # fake the counts
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])
        vote['passed'] = vote['yes_count'] > vote['no_count']
        vote.add_source(link.get('href'))
        bill.add_vote(vote)
def get_lower_votes(self):
    """Fetch and parse Assembly floor votes for this bill."""
    url = ('http://assembly.state.ny.us/leg/?'
           'default_fld=&bn=%s&term=%s&Votes=Y')
    url = url % (self.bill_id, self.term_start_year)
    doc = self.url2lxml(url)
    if doc is None:
        return
    pre = doc.xpath('//pre')[0].text_content()
    no_votes = ('There are no votes for this bill in this '
                'legislative session.')
    if pre == no_votes:
        return
    actual_vote = collections.defaultdict(list)
    for table in doc.xpath('//table'):
        # The date label's next sibling holds the text (Python 2
        # iterator ``.next()``).
        date = table.xpath('caption/label[contains(., "DATE:")]')
        date = date[0].itersiblings().next().text
        date = datetime.datetime.strptime(date, '%m/%d/%Y')
        # Tallies appear as "yes/no" next to the YEA/NAY label.
        votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
        votes = votes[0].itersiblings().next().text
        yes_count, no_count = map(int, votes.split('/'))
        passed = yes_count > no_count
        vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                    no_count, other_count=0)
        # Cells alternate: name, vote value, name, vote value, ...
        tds = table.xpath('tr/td/text()')
        votes = iter(tds)
        while True:
            try:
                data = list(islice(votes, 2))
                name, vote_val = data
            except (StopIteration, ValueError):
                # End of data. Stop.
                break
            name = self._scrub_name(name)
            if vote_val.strip() == 'Y':
                vote.yes(name)
            elif vote_val.strip() in ('N', 'NO'):
                vote.no(name)
            else:
                vote.other(name)
            actual_vote[vote_val].append(name)
        # The page doesn't provide an other_count.
        vote['other_count'] = len(vote['other_votes'])
        vote['actual_vote'] = actual_vote
        self.bill.add_vote(vote)
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse a roll-call vote from a plain-text vote page and attach it
    to *bill*.

    *actor* is either a chamber ('upper'/'lower') or a committee/location
    name; *uniqid* is stored on the Vote as its identifier.
    """
    page = self.get(url).text
    bill.add_source(url)
    # Raw strings avoid invalid-escape warnings; the page is one blob of
    # "YEAS - n ... NAYS - n ... ABSENT [OR NOT VOTING] - n ..."
    vote_re = re.compile(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))
    # passage is approximated from the raw tallies
    passed = yes_count > no_count

    if actor == "upper" or actor == "lower":
        vote_chamber = actor
        vote_location = ""
    else:
        vote_chamber = ""
        vote_location = actor

    vote = Vote(
        vote_chamber,
        date,
        motion,
        passed,
        yes_count,
        no_count,
        other_count,
        location=vote_location,
        _vote_id=uniqid,
    )
    vote.add_source(url)

    # names within each section are separated by runs of 2+ spaces
    yes_votes = re.split(r"\s{2,}", match.group(2).strip())
    no_votes = re.split(r"\s{2,}", match.group(4).strip())
    other_votes = re.split(r"\s{2,}", match.group(7).strip())

    for yes in yes_votes:
        if yes:
            vote.yes(yes)
    for no in no_votes:
        if no:
            vote.no(no)
    for other in other_votes:
        if other:
            vote.other(other)

    bill.add_vote(vote)
def scrape_vote(self, bill, chamber, url):
    """Scrape one VT roll-call page and attach a Vote to *bill*.

    Fixes: the original used the name ``type`` (shadowing the builtin)
    for two different things — a motion classification that was then
    silently clobbered by the per-member vote string inside the loop.
    Both are renamed; the motion classification is kept but is still
    unused, as in the original.
    """
    page = self.urlopen(url)
    if 'There are no details available for this roll call' in page:
        return
    # normalize non-breaking spaces before parsing
    page = page.replace(' ', ' ')
    page = lxml.html.fromstring(page)

    info_row = page.xpath("//table[1]/tr[2]")[0]

    date = info_row.xpath("string(td[1])")
    date = datetime.datetime.strptime(date, "%m/%d/%Y")

    motion = info_row.xpath("string(td[2])")
    yes_count = int(info_row.xpath("string(td[3])"))
    no_count = int(info_row.xpath("string(td[4])"))
    other_count = int(info_row.xpath("string(td[5])"))
    passed = info_row.xpath("string(td[6])") == 'Pass'

    if motion == 'Shall the bill pass?':
        motion_type = 'passage'
    elif motion == 'Shall the bill be read the third time?':
        motion_type = 'reading:3'
    elif 'be amended as' in motion:
        motion_type = 'amendment'
    else:
        motion_type = 'other'
    # NOTE(review): motion_type is computed but never attached to the
    # Vote -- probably intended as a type= kwarg; left unused to
    # preserve existing output.

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    for tr in page.xpath("//table[1]/tr")[3:]:
        if len(tr.xpath("td")) != 2:
            continue

        # avoid splitting duplicate names
        name = tr.xpath("string(td[1])").strip()
        if not name.startswith(DOUBLED_NAMES):
            name = name.split(' of')[0]

        member_vote = tr.xpath("string(td[2])").strip()
        if member_vote.startswith('Yea'):
            vote.yes(name)
        elif member_vote.startswith('Nay'):
            vote.no(name)
        elif member_vote.startswith('Not Voting'):
            # deliberately excluded from 'other'
            pass
        else:
            vote.other(name)

    bill.add_vote(vote)
def scrape_vote(self, bill, motion, url):
    """Scrape a Maine-style roll-call page (Yeas/Nays/Absent/Excused
    labelled cells) and attach a Vote to *bill*."""
    page = self.urlopen(url, retry_on_404=True)
    page = lxml.html.fromstring(page)

    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))

    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

    # absent + excused are lumped together as 'other'
    other_count = abs_count + ex_count

    # NOTE(review): if the url names neither chamber, `chamber` is never
    # bound and the Vote() call below raises NameError -- confirm all
    # callers pass a chamber-qualified url.
    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'

    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    # the site uses both "January 1, 2011" and "Jan. 1, 2011" forms
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")

    vote = Vote(chamber, date, motion, outcome == 'PREVAILS',
                yes_count, no_count, other_count)
    vote.add_source(url)

    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]

        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.yes(name)
        elif vtype == 'N':
            vote.no(name)
        elif vtype == 'X' or vtype == 'E':
            vote.other(name)

    bill.add_vote(vote)
def _parse_senate_votes(self, vote_data):
    """Build and return a Vote from an NY Senate API vote JSON payload.

    Fix: the loop over EXC/ABS/ABD entries previously tested
    ``vote_rolls.get(vote_type, [])`` for truthiness but then indexed
    ``['items']`` unconditionally, raising KeyError when an entry exists
    without an 'items' key. It now uses the same
    ``'items' in vote_rolls.get(...)`` guard as the AYE/NAY handling.
    """
    vote_datetime = datetime.datetime.strptime(vote_data['voteDate'],
                                               '%Y-%m-%d')
    vote = Vote(
        chamber='upper',
        date=vote_datetime.date(),
        motion='[No motion available.]',
        passed=False,
        yes_votes=[],
        no_votes=[],
        other_votes=[],
        yes_count=0,
        no_count=0,
        other_count=0)

    if vote_data['voteType'] == 'FLOOR':
        vote['motion'] = 'Floor Vote'
    elif vote_data['voteType'] == 'COMMITTEE':
        vote['motion'] = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote_rolls = vote_data['memberVotes']['items']

    # Count all yea votes (AYEWR = aye with reservations).
    if 'items' in vote_rolls.get('AYE', {}):
        for legislator in vote_rolls['AYE']['items']:
            vote.yes(legislator['fullName'])
            vote['yes_count'] += 1
    if 'items' in vote_rolls.get('AYEWR', {}):
        for legislator in vote_rolls['AYEWR']['items']:
            vote.yes(legislator['fullName'])
            vote['yes_count'] += 1

    # Count all nay votes.
    if 'items' in vote_rolls.get('NAY', {}):
        for legislator in vote_rolls['NAY']['items']:
            vote.no(legislator['fullName'])
            vote['no_count'] += 1

    # Count all other types of votes (excused, absent, abstained).
    other_vote_types = ('EXC', 'ABS', 'ABD')
    for vote_type in other_vote_types:
        if 'items' in vote_rolls.get(vote_type, {}):
            for legislator in vote_rolls[vote_type]['items']:
                vote.other(legislator['fullName'])
                vote['other_count'] += 1

    vote['passed'] = vote['yes_count'] > vote['no_count']
    return vote
def scrape_votes(self, bill, page):
    """Parse each 'VOTE: FLOOR VOTE:' section on *page* and attach a
    Vote to *bill* for every one found."""
    for header in page.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
        vote_date = datetime.datetime.strptime(
            header.text.split('-')[1].strip(), "%b %d, %Y").date()

        names = {'yes': [], 'no': [], 'other': []}
        counts = {'yes': 0, 'no': 0, 'other': 0}
        current = None

        for node in header.xpath("following-sibling::blockquote/*"):
            if node.tag == 'b':
                label = node.text
                # e.g. "Ayes (42):" -- tally lives in parentheses
                if label.startswith('Ayes'):
                    current = 'yes'
                    counts['yes'] = int(
                        re.search(r'\((\d+)\):', label).group(1))
                elif label.startswith('Nays'):
                    current = 'no'
                    counts['no'] = int(
                        re.search(r'\((\d+)\):', label).group(1))
                elif label.startswith(('Excused', 'Abstains', 'Absent')):
                    current = 'other'
                    # several sections may accumulate into 'other'
                    counts['other'] += int(
                        re.search(r'\((\d+)\):', label).group(1))
                else:
                    raise ValueError('bad vote type: %s' % node.text)
            elif node.tag == 'a':
                # anchors before the first header are ignored
                if current in names:
                    names[current].append(node.text.strip())

        passed = counts['yes'] > (counts['no'] + counts['other'])
        vote = Vote('upper', vote_date, 'Floor Vote', passed,
                    counts['yes'], counts['no'], counts['other'])
        for member in names['yes']:
            vote.yes(member)
        for member in names['no']:
            vote.no(member)
        for member in names['other']:
            vote.other(member)

        bill.add_vote(vote)
def scrape_votes(self, bill_page, bill, insert, year):
    """Scrape NV passage votes linked from *bill_page* and attach them
    to *bill*.

    Fix: the timestamp format used %H (24-hour), which makes strptime
    ignore the trailing AM/PM marker, so "2:10 PM" parsed as 02:10.
    %I is required when %p is present.
    """
    root = lxml.html.fromstring(bill_page)
    for link in root.xpath('//a[contains(text(), "Passage")]'):
        motion = link.text
        if "Assembly" in motion:
            chamber = "lower"
        else:
            chamber = "upper"

        vote_url = "http://www.leg.state.nv.us/Session/%s/Reports/%s" % (
            insert, link.get("href"))
        bill.add_source(vote_url)

        with self.urlopen(vote_url) as page:
            page = page.decode("utf8").replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)

            date = root.xpath("//h1/text()")[-1].strip()
            # %I (12-hour) so the %p AM/PM marker is honored
            date = datetime.strptime(date, "%B %d, %Y at %I:%M %p")

            top_block_text = root.xpath(
                '//div[@align="center"]')[0].text_content()
            yes_count = int(re.findall(r"(\d+) Yea", top_block_text)[0])
            no_count = int(re.findall(r"(\d+) Nay", top_block_text)[0])
            excused = int(re.findall(r"(\d+) Excused", top_block_text)[0])
            not_voting = int(
                re.findall(r"(\d+) Not Voting", top_block_text)[0])
            absent = int(re.findall(r"(\d+) Absent", top_block_text)[0])
            # excused/not-voting/absent are lumped into 'other'
            other_count = excused + not_voting + absent
            passed = yes_count > no_count

            vote = Vote(
                chamber,
                date,
                motion,
                passed,
                yes_count,
                no_count,
                other_count,
                not_voting=not_voting,
                absent=absent,
            )
            for el in root.xpath("//table[2]/tr"):
                tds = el.xpath("td")
                name = tds[1].text_content().strip()
                vote_result = tds[2].text_content().strip()

                if vote_result == "Yea":
                    vote.yes(name)
                elif vote_result == "Nay":
                    vote.no(name)
                else:
                    vote.other(name)
            bill.add_vote(vote)
def parse_house_vote(self, url):
    """
    House votes are PDFs that can be converted to text; requires some
    nasty regex to get votes out reliably.

    Returns a Vote, or None for image-only PDFs.

    Fix: the downloaded temp file was only removed after the image-PDF
    check, so the early return leaked a file per unparseable PDF; the
    removal is now in a finally block.
    """
    fname, resp = self.urlretrieve(url)
    try:
        text = convert_pdf(fname, 'text')
    finally:
        # always clean up the downloaded PDF, even on the early return
        os.remove(fname)

    if not text.strip():
        self.warning('image PDF %s' % url)
        return

    # get date
    date = re.findall(r'(\d+/\d+/\d+)', text)[0]
    date = datetime.strptime(date, '%m/%d/%y')

    # get totals
    absent, yea, nay, exc = self.HOUSE_TOTAL_RE.findall(text)[0]

    # make vote (faked passage indicator)
    vote = Vote('lower', date, 'house passage', int(yea) > int(nay),
                int(yea), int(nay), int(absent) + int(exc))
    vote.add_source(url)

    # votes
    real_votes = False
    for v, name in HOUSE_VOTE_RE.findall(text):
        # our regex is a bit broad, wait until we see 'Nays' to start
        # and end when we see CERTIFIED or ____ signature line
        if 'Nays' in name or 'Excused' in name:
            real_votes = True
            continue
        elif 'CERTIFIED' in name or '___' in name:
            break
        elif real_votes and name.strip():
            if v == 'Y':
                vote.yes(name)
            elif v == 'N':
                vote.no(name)
            else:
                # excused/absent
                vote.other(name)
    return vote
def scrape_votes(self, bill_page, bill, insert, year):
    """Scrape NV passage votes linked from *bill_page* and attach them
    to *bill*.

    Fix: the timestamp format used %H (24-hour), which makes strptime
    ignore the trailing AM/PM marker, so "2:10 PM" parsed as 02:10.
    %I is required when %p is present.
    """
    root = lxml.html.fromstring(bill_page)
    for link in root.xpath('//a[contains(text(), "Passage")]'):
        motion = link.text
        if 'Assembly' in motion:
            chamber = 'lower'
        else:
            chamber = 'upper'

        vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
            insert, link.get('href'))
        bill.add_source(vote_url)

        page = self.urlopen(vote_url)
        page = page.replace(u"\xa0", " ")
        root = lxml.html.fromstring(page)

        date = root.xpath('//h1/text()')[-1].strip()
        # %I (12-hour) so the %p AM/PM marker is honored
        date = datetime.strptime(date, "%B %d, %Y at %I:%M %p")

        top_block_text = root.xpath(
            '//div[@align="center"]')[0].text_content()
        yes_count = int(re.findall(r"(\d+) Yea", top_block_text)[0])
        no_count = int(re.findall(r"(\d+) Nay", top_block_text)[0])
        excused = int(re.findall(r"(\d+) Excused", top_block_text)[0])
        not_voting = int(re.findall(r"(\d+) Not Voting", top_block_text)[0])
        absent = int(re.findall(r"(\d+) Absent", top_block_text)[0])
        # excused/not-voting/absent are lumped into 'other'
        other_count = excused + not_voting + absent
        passed = yes_count > no_count

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count, not_voting=not_voting, absent=absent)
        for el in root.xpath('//table[2]/tr'):
            tds = el.xpath('td')
            name = tds[1].text_content().strip()
            vote_result = tds[2].text_content().strip()

            if vote_result == 'Yea':
                vote.yes(name)
            elif vote_result == 'Nay':
                vote.no(name)
            else:
                vote.other(name)
        bill.add_vote(vote)
def scrape_vote(self, bill, moid, vote_id, body, inst, motion, chamber):
    """Scrape one Alabama ALISON roll-call result page and attach a
    Vote to *bill*."""
    url = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&INST=%s&SESS=%s" % (
        moid, vote_id, body, inst, self.session_id)
    doc = lxml.html.fromstring(self.urlopen(url))

    # vote code -> [names]; Y/N/P(present)/A(abstain or absent)
    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    leg_tds = doc.xpath('//td[@width="33%"]')
    for td in leg_tds:
        name = td.text
        # value cell is two siblings to the right of the label cell
        two_after = td.xpath('following-sibling::td')[1].text
        # NOTE(review): if any of the Total/Date labels are missing from
        # the page, total_yea/total_nay/total_abs/vote_date stay unbound
        # and the code below raises NameError; likewise an unexpected
        # vote code raises KeyError on voters[two_after] -- confirm the
        # page layout is stable.
        if name == 'Total Yea:':
            total_yea = int(two_after)
        elif name == 'Total Nay:':
            total_nay = int(two_after)
        elif name == 'Total Abs:':
            total_abs = int(two_after)
        elif name == 'Legislative Date:':
            vote_date = datetime.datetime.strptime(two_after, '%m/%d/%Y')
        # lines to ignore
        elif name in ('Legislative Day:', 'Vote ID:'):
            pass
        elif 'Vacant' in name:
            pass
        else:
            # add legislator to list of voters
            voters[two_after].append(name)

    # TODO: passed is faked
    total_other = total_abs + len(voters['P'])
    vote = Vote(chamber, vote_date, motion, total_yea > total_nay,
                total_yea, total_nay, total_other)
    vote.add_source(url)

    for member in voters['Y']:
        vote.yes(member)
    for member in voters['N']:
        vote.no(member)
    for member in (voters['A'] + voters['P']):
        vote.other(member)

    bill.add_vote(vote)
def scrape(self, chamber, session):
    """Scrape all RI votes for *chamber* in *session* and save each one."""
    chamber_paths = {
        "upper": "SVotes",
        "lower": "HVotes"
    }
    url = "%s/%s" % (RI_URL_BASE, chamber_paths[chamber])
    action = "%s/%s" % (url, "votes.asp")

    for date in self.get_dates(url):
        vote_dicts = self.parse_vote_page(
            self.post_to(action, date), url, session)
        for vote_dict in vote_dicts:
            for vote in vote_dict.values():
                count = vote['count']
                chamber = {
                    "H": "lower",
                    "S": "upper"
                }[vote['meta']['chamber']]
                v = Vote(
                    chamber,
                    vote['time'],
                    vote['meta']['extra']['motion'],
                    count['passage'],
                    int(count['YEAS']),
                    int(count['NAYS']),
                    int(count['NOT VOTING']),
                    session=session,
                    bill_id=vote['meta']['bill'],
                    bill_chamber=chamber,
                    bill_session=vote['meta']['year'],
                )
                v.add_source(vote['source'])
                for ballot in vote['votes']:
                    if ballot['vote'] == "Y":
                        v.yes(ballot['name'])
                    elif ballot['vote'] == "N":
                        v.no(ballot['name'])
                    else:
                        v.other(ballot['name'])
                self.save_vote(v)
def scrape_votes(self, bill):
    """Fetch roll calls for *bill* from the WA legislature web service
    and attach a Vote per roll call."""
    bill_num = bill['bill_id'].split()[1]

    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (
               bill_num, self.biennium))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)

    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")

        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        # absent + excused are lumped together as 'other'
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

        vote = Vote(chamber, date, motion,
                    yes_count > (no_count + other_count),
                    yes_count, no_count, other_count)

        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            # NOTE(review): 'VOte' casing looks like a typo but may match
            # the service's actual element name -- confirm against the
            # GetRollCalls response schema before "fixing".
            vtype = xpath(sv, "string(wa:VOte)")

            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.other(name)

        bill.add_vote(vote)
def parse_vote(self, bill, link):
    """Parse a member-vote page at *link* and attach a Vote to *bill*.

    Fixes: tallies defaulted to unbound names (NameError when a section
    was missing from the page); the bare ``except: pass`` let a stale
    ``count`` be reused for the next heading; a stray py2 ``print``
    wrapped the warning call.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if len(opinions) > 0:
        temp = opinions[0].split()
        vote_chamber = temp[0]
        vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
        vote_status = " ".join(temp[2:-2])
        # fall back to the h4 motion text if the h3 has no status words
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

        # default every tally to 0 so a missing section can't blow up
        yes_count = no_count = p_count = a_count = 0
        for i in opinions:
            # tally is the "(N)" fragment in each heading
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # heading without a numeric tally -- skip it entirely
                continue
            if "yea" in i.lower():
                yes_count = count
            elif "nay" in i.lower():
                no_count = count
            elif "present" in i.lower():
                p_count = count
            elif "absent" in i.lower():
                a_count = count

        vote = Vote(vote_chamber, vote_date, vote_status,
                    yes_count > no_count, yes_count, no_count,
                    p_count + a_count)
        vote.add_source(link)

        # member links are listed yeas first, then nays, then the rest
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.yes(re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.no(re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.other(re.sub(',', '', a_links[i]).split()[0])
        bill.add_vote(vote)
    else:
        self.warning("No Votes for: %s", link)
def scrape_vote(self, bill, chamber, date, url):
    """Download a vote PDF, parse the roll call out of its text, and
    attach a validated Vote to *bill*."""
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    # line 5 of the PDF text holds the motion
    motion = text.split('\n')[4].strip()

    yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))

    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    # names start at line 10; columns are separated by "-<district>"
    for line in text.split('\n')[9:]:
        if 'after roll call' in line:
            # trailing boilerplate -- nothing after this is a vote
            break
        if 'Presiding' in line:
            continue
        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r'(Y|N|EX)\s+(.+)$', col)
            if match:
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                else:
                    # EX (excused)
                    vote.other(match.group(2))
            else:
                # column without a Y/N/EX prefix -- treat as 'other'
                vote.other(col.strip())

    # raises if the recorded names don't match the header counts
    vote.validate()

    bill.add_vote(vote)
def apply_votes(self, bill):
    """Given a bill (and assuming it has a status_url in its dict), parse
    all of the votes.

    Fix: ``vote_desc.split("-")[-1]`` can never raise IndexError
    (str.split always returns at least one element), so the old
    ``except IndexError`` warning path was dead code; the missing-date
    case is now detected explicitly.
    """
    bill_votes = votes.all_votes_for_url(self, bill['status_url'])
    for (chamber, vote_desc, pdf_url, these_votes) in bill_votes:
        desc_parts = vote_desc.split("-")
        if len(desc_parts) < 2:
            # no "-" separator means no date suffix to extract
            self.warning("[%s] Couldn't get date out of [%s]" %
                         (bill['bill_id'], vote_desc))
            continue
        date = desc_parts[-1]
        yes_votes = []
        no_votes = []
        other_votes = []
        for voter, vote in these_votes.iteritems():
            if vote == 'Y':
                yes_votes.append(voter)
            elif vote == 'N':
                no_votes.append(voter)
            else:
                other_votes.append(voter)
        # not necessarily correct, but not sure where else to get it.
        # maybe from pdf
        passed = len(yes_votes) > len(no_votes)
        vote = Vote(standardize_chamber(chamber), date, vote_desc, passed,
                    len(yes_votes), len(no_votes), len(other_votes),
                    pdf_url=pdf_url)
        for voter in yes_votes:
            vote.yes(voter)
        for voter in no_votes:
            vote.no(voter)
        for voter in other_votes:
            vote.other(voter)
        bill.add_vote(vote)
def record_votes(root, session):
    """Yield a Vote for every valid recorded vote found under *root*."""
    selector = ''.join(vote_selectors)
    for div in root.xpath('//div' + selector):
        maybe = MaybeVote(div)
        if not maybe.is_valid:
            continue

        motion = 'passage' if maybe.passed else 'other'
        vote = Vote(None, None, motion, maybe.passed,
                    maybe.yeas or 0, maybe.nays or 0, maybe.present or 0)

        vote['bill_id'] = maybe.bill_id
        vote['bill_chamber'] = maybe.chamber
        vote['is_amendment'] = maybe.is_amendment
        # first two characters of the session identify the legislature
        vote['session'] = session[0:2]
        vote['method'] = 'record'

        for member in maybe.votes['yeas']:
            vote.yes(member)
        for member in maybe.votes['nays']:
            vote.no(member)
        # absentees are folded into 'other' alongside 'present'
        for member in maybe.votes['present'] + maybe.votes['absent']:
            vote.other(member)

        yield vote
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape a single MN House vote page and save the resulting Vote.

    Only produces 'lower' chamber votes; *chamber* is recorded as the
    bill's chamber, not the vote's.
    """
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
    with self.urlopen(vote_url) as html:
        # sometimes the link is broken, will redirect to NO_VOTE_URL
        if html.response.url == NO_VOTE_URL:
            return
        doc = lxml.html.fromstring(html)
        paragraphs = doc.xpath('//h1/following-sibling::p')

        # first paragraph has motion and vote total
        top_par = paragraphs[0].text_content()
        lines = top_par.splitlines()
        # 3rd line is the motion except in cases where first line is gone
        motion = lines[2] or lines[1]
        # last line is "__ YEA and __ Nay"
        yeas, nays = self.yeanay_re.match(lines[-1]).groups()
        yeas = int(yeas)
        nays = int(nays)

        # second paragraph has date
        date = self.date_re.match(paragraphs[1].text_content()).groups()[0]
        date = datetime.datetime.strptime(date, '%m/%d/%Y')

        # other_count is always 0: the page lists only yeas and nays
        vote = Vote('lower', date, motion, yeas>nays, yeas, nays, 0,
                    session=session, bill_id=bill_id, bill_chamber=chamber)
        vote.add_source(vote_url)

        # first table has YEAs
        for name in doc.xpath('//table[1]/tr/td/font/text()'):
            vote.yes(name.strip())

        # second table is nays
        for name in doc.xpath('//table[2]/tr/td/font/text()'):
            vote.no(name.strip())

        self.save_vote(vote)
def scrape_vote(self, bill, vote_type_id, vote_type, council_size=13):
    """Scrape one DC Council vote page and attach a Vote to *bill*.

    *council_size* generalizes the previously hard-coded 13 (the DC
    Council's membership) used to infer the 'other' count; the default
    preserves existing behavior.
    """
    base_url = 'http://www.dccouncil.washington.dc.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s'
    url = base_url % (vote_type_id, bill['bill_id'])
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        vote_date = convert_date(doc.get_element_by_id('VoteDate').text)

        # check if voice vote / approved boxes have an 'x'
        voice = (doc.xpath('//span[@id="VoteTypeVoice"]/b/text()')[0]
                 == 'x')
        passed = (doc.xpath(
            '//span[@id="VoteResultApproved"]/b/text()')[0] == 'x')

        yes_count = extract_int(doc.xpath(
            '//span[@id="VoteCount1"]/b/text()')[0])
        no_count = extract_int(doc.xpath(
            '//span[@id="VoteCount2"]/b/text()')[0])
        # 'other' is whoever didn't vote yes/no; every now and then this
        # actually drops below 0 (error in count), hence the clamp
        other_count = max(council_size - (yes_count + no_count), 0)

        vote = Vote('upper', vote_date, vote_type, passed, yes_count,
                    no_count, other_count, voice_vote=voice)
        vote.add_source(url)

        # members are only text on page in a <u> tag
        for member_u in doc.xpath('//u'):
            member = member_u.text
            vote_text = member_u.xpath('../../i/text()')[0]
            if 'Yes' in vote_text:
                vote.yes(member)
            elif 'No' in vote_text:
                vote.no(member)
            else:
                vote.other(member)

        bill.add_vote(vote)
def scrape_vote(self, bill, chamber, date, td):
    """Parse a vote out of a single table cell *td* and attach it to
    *bill*. The cell text holds the motion; its first span holds the
    result and a "yes-no-other" tally."""
    motion = td.text
    result_text = td.xpath("string(span[1])").strip()
    passed = result_text.split()[0] == "PASSED"

    # trailing "Y-N-O" tally at the end of the result string
    tally = re.search(r'(\d+)-(\d+)-(\d+)$', result_text).groups()
    yes_count, no_count, other_count = [int(part) for part in tally]

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)

    for member in split_names(td.xpath("span[. = 'AYES']")[0].tail):
        vote.yes(member)
    for member in split_names(td.xpath("span[. = 'NAYS']")[0].tail):
        vote.no(member)
    absent_span = td.xpath("span[contains(., 'Absent')]")[0]
    for member in split_names(absent_span.tail):
        vote.other(member)

    # sanity-check the parsed name lists against the header tallies
    assert len(vote['yes_votes']) == vote['yes_count']
    assert len(vote['no_votes']) == vote['no_count']
    assert len(vote['other_votes']) == vote['other_count']

    bill.add_vote(vote)
def parse_vote(self, actor, date, row):
    """
    Takes the actor, date and row element and returns a Vote object.

    The row's text holds the motion; the first span holds the outcome
    plus "yes-no-other" tallies, and later spans hold the name lists.
    """
    spans = row.xpath('.//span')
    motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    if not motion:
        motion = "passage"
    outcome, yes_count, no_count, other_count = \
        spans[0].text_content().rsplit('-', 3)

    yes_names = self.get_names(spans[1].tail)
    no_names = self.get_names(spans[2].tail)
    other_names = []
    for extra in spans[3:]:
        if extra.text.startswith(('Absent', 'Excused')):
            other_names += self.get_names(extra.tail)

    lowered = outcome.lower()
    if 'adopted' in lowered or 'passed' in lowered:
        passed = True
    elif 'failed' in lowered:
        passed = False
    else:
        # no keyword matched: leave the raw (truthy) outcome text
        passed = outcome

    vote = Vote(actor, date, motion, passed,
                int(yes_count), int(no_count), int(other_count))
    for name in yes_names:
        if name and name != 'None':
            vote.yes(name)
    for name in no_names:
        if name and name != 'None':
            vote.no(name)
    for name in other_names:
        if name and name != 'None':
            vote.other(name)
    return vote
def scrape_vote(self, bill, date, motion, url):
    """Scrape one roll-call page at *url* and attach a Vote to *bill*."""
    page = self.urlopen(url)
    # Sometimes they link to vote pages before they go live
    if 'not yet official' in page:
        return
    page = lxml.html.fromstring(page)

    actor = 'upper' if url.endswith('Senate') else 'lower'

    count_xpath = "string(//td[@align = 'center' and contains(., '%s: ')])"

    def tally(label):
        # each tally cell ends with the count, e.g. "Yeas: 34"
        return int(page.xpath(count_xpath % label).split()[-1])

    yes_count = tally("Yeas")
    no_count = tally("Nays")
    other_count = tally("Non Voting") + tally("Present")
    passed = yes_count > no_count + other_count

    vote = Vote(actor, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    names_xpath = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
    for anchor in page.xpath(names_xpath % "Yeas"):
        vote.yes(anchor.text)
    for anchor in page.xpath(names_xpath % "Nays"):
        vote.no(anchor.text)
    for label in ("Non Voting", "Present"):
        for anchor in page.xpath(names_xpath % label):
            vote.other(anchor.text)

    bill.add_vote(vote)
def scrape_bill_sheet(self, session, chamber):
    """
    Scrape the bill sheet (the page full of bills and other small bits
    of data), building a Bill (with versions, actions, sponsors and
    votes) per row and saving it.
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # column positions within each bill row
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    with self.urlopen(sheet_url) as sheet_html:
        sheet_page = lxml.html.fromstring(sheet_html)

        bills = sheet_page.xpath('//table/tr')

        for bill in bills:
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id == None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_loc]

            title_and_sponsor = bill[index["title_sponsor"]][0]

            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            # sponsors are whatever's left after stripping the title
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            # bill-id prefix -> openstates bill type
            cats = {
                "SB": "bill",
                "HB": "bill",
                "HR": "resolution",
                "SR": "resolution",
                "SCR": "concurrent resolution",
                "HCR": "concurrent resolution",
                "SJR": "joint resolution",
                "HJR": "joint resolution",
                "SM": "memorial",
                "HM": "memorial"
            }

            bill_type = None

            for cat in cats:
                if bill_id[:len(cat)] == cat:
                    bill_type = cats[cat]

            b = Bill(session, bill_chamber, bill_id, bill_title,
                     type=bill_type)
            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            versions_url = CO_URL_BASE + versions_url
            versions = self.parse_versions(versions_url)

            for version in versions:
                b.add_version(version['name'], version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = CO_URL_BASE + \
                bill[index["history"]][0][0].attrib['href']
            # ^^^^^^^ We assume this is a full path to the target.
            # might want to consider some better rel-path support
            # XXX: Look at this ^

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            for action in history:
                self.add_action_to_bill(b, action)

            for sponsor in sponsors:
                if sponsor != None and sponsor != "(NONE)" and \
                   sponsor != "":
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab some
            # votes
            bill_vote_href = self.get_vote_url(bill_id, session)
            votes = self.parse_votes(bill_vote_href)

            # the votes page echoes back the bill id; mismatch means we
            # scraped the wrong page entirely
            if votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID:  " + bill_id)
                assert votes['sanity-check'] == bill_id

            for vote in votes['votes']:
                filed_votes = vote['votes']
                passage = vote['meta']
                result = vote['result']

                composite_time = "%s %s" % (
                    passage['x-parent-date'],
                    passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")
                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']

                if hasHouse and hasSenate:
                    actor = "joint"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong.
                local_other = 0
                for voter in filed_votes:
                    l_vote = filed_votes[voter].lower().strip()
                    if l_vote != "yes" and l_vote != "no":
                        local_other = local_other + 1

                if local_other != other:
                    self.warning(
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" % (
                        other, local_other))
                    other = local_other

                v = Vote(actor, pydate, passage['MOTION'],
                         (result['FINAL_ACTION'] == "PASS"),
                         int(result['YES']), int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(vote['meta']['url'])
                # v.add_source( bill_vote_href )

                # XXX: Add more stuff to kwargs, we have a ton of data
                for voter in filed_votes:
                    who = voter
                    vote = filed_votes[who]
                    if vote.lower() == "yes":
                        v.yes(who)
                    elif vote.lower() == "no":
                        v.no(who)
                    else:
                        v.other(who)
                b.add_vote(v)
            self.save_bill(b)
def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
    """Download a KS vote document, convert it with abiword, parse the
    roll call from the text, and attach a Vote to *bill*.

    Fixes: the abiword invocation interpolated the downloaded filename
    into a ``shell=True`` string (breaks on spaces/metacharacters); it
    now uses an argument list with no shell. The converted text file is
    also opened via ``with`` so the handle is closed.
    """
    vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
    vote_date = datetime.datetime.strptime(vote_date, '%a %d %b %Y')

    vote_doc, resp = self.urlretrieve(vote_url)

    # abiword will create /tmp/ksvote.txt
    try:
        subprocess.check_call(
            ['timeout', '10', 'abiword', '--to=ksvote.txt', vote_doc],
            cwd='/tmp/')
    except subprocess.CalledProcessError:
        # timeout failed, some documents hang abiword
        self.error('abiword hung for longer than 10s on conversion')
        return
    with open('/tmp/ksvote.txt') as converted:
        vote_lines = converted.readlines()

    os.remove(vote_doc)

    comma_or_and = re.compile(r', |\sand\s')

    vote = None
    passed = True
    for line in vote_lines:
        totals = re.findall(
            'Yeas (\d+)[;,] Nays (\d+)[;,] (?:Present but not voting|Present and Passing):? (\d+)[;,] (?:Absent or not voting|Absent or Not Voting):? (\d+)',
            line)
        line = line.strip()
        if totals:
            totals = totals[0]
            yeas = int(totals[0])
            nays = int(totals[1])
            nv = int(totals[2])
            absent = int(totals[3])
            # default passed to true
            vote = Vote(vote_chamber, vote_date, vote_status.strip(), True,
                        yeas, nays, nv + absent)
        elif vote and line.startswith('Yeas:'):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.yes(member)
        elif vote and line.startswith('Nays:'):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.no(member)
        elif vote and line.startswith('Present '):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.other(member)
        elif vote and line.startswith('Absent or'):
            line = line.split(':', 1)[1].strip()
            for member in comma_or_and.split(line):
                if member != 'None.':
                    vote.other(member)
        elif 'the motion did not prevail' in line:
            passed = False

    if vote:
        vote['passed'] = passed
        vote.add_source(vote_url)
        bill.add_vote(vote)
class IDBillScraper(BillScraper):
    """Scrapes Idaho bills, actions and votes.

    Post-2009 sessions are parsed from HTML tables; 2008-and-earlier
    sessions are published as plain text inside a <pre> element and are
    parsed line-by-line using the stateful flags below.
    """
    state = 'id'

    # the following are only used for parsing legislation from 2008 and
    # earlier (line-oriented vote parsing state shared across lines)
    vote = None          # Vote object currently being filled in
    in_vote = False      # True while inside a roll-call block
    ayes = False         # True while reading AYES names
    nays = False         # True while reading NAYS names
    other = False        # True while reading absent/excused names
    last_date = None     # carries an action date onto undated lines

    def scrape_subjects(self, session):
        """Build self._subjects: maps bill-id text -> list of subject names
        from the session's topical index page."""
        self._subjects = defaultdict(list)
        url = 'http://legislature.idaho.gov/legislation/%s/topicind.htm' % session
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        # loop through anchors
        anchors = doc.xpath('//td[@width="95%"]//a')
        for a in anchors:
            # if anchor has a name, that's the subject
            if a.get('name'):
                subject = a.get('name')
            # if anchor is a link to a bill, save that reference
            # NOTE(review): assumes a subject anchor always precedes the
            # first bill link, otherwise `subject` is unbound -- confirm.
            elif 'legislation' in a.get('href'):
                self._subjects[a.text].append(subject)

    def scrape(self, chamber, session):
        """
        Scrapes all the bills for a given session and chamber
        """
        #url = BILLS_URL % session
        # pre-2009 sessions use a different page layout and no subject index
        if int(session[:4]) < 2009:
            self.scrape_pre_2009(chamber, session)
        else:
            self.scrape_subjects(session)
            self.scrape_post_2009(chamber, session)

    def scrape_post_2009(self, chamber, session):
        "scrapes legislation for 2009 and above"
        url = BILLS_URL % session
        with self.urlopen(url) as bill_index:
            html = lxml.html.fromstring(bill_index)
            # I check for rows with an id that contains 'bill' and startswith
            # 'H' or 'S' to make sure I dont get any links from the menus
            # might not be necessary
            bill_rows = html.xpath('//tr[contains(@id, "bill") and '\
                                   'starts-with(descendant::td/a/text(), "%s")]'\
                                   % _CHAMBERS[chamber][0])
            for row in bill_rows:
                # split e.g. "H0001" into prefix + number -> "H 0001"
                matches = re.match(r'([A-Z]*)([0-9]+)',
                                   row[0].text_content().strip())
                bill_id = " ".join(matches.groups()).strip()
                short_title = row[1].text_content().strip()
                self.scrape_bill(chamber, session, bill_id, short_title)

    def scrape_pre_2009(self, chamber, session):
        """scrapes legislation from 2008 and below."""
        # pre-2009 index pages have an 'l' suffix in the URL
        url = BILLS_URL + 'l'
        url = url % session
        with self.urlopen(url) as bill_index:
            html = lxml.html.fromstring(bill_index)
            html.make_links_absolute(url)
            links = html.xpath('//a')
            exprs = r'(%s[A-Z]*)([0-9]+)' % _CHAMBERS[chamber][0]
            for link in links:
                matches = re.match(exprs, link.text)
                if matches:
                    bill_id = " ".join(matches.groups())
                    # the link tail is "TITLE.." -- truncate at the dots.
                    # NOTE(review): raises ValueError if '..' is absent.
                    short_title = link.tail[:link.tail.index('..')]
                    self.scrape_pre_2009_bill(chamber, session, bill_id,
                                              short_title)

    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            html.make_links_absolute(
                'http://legislature.idaho.gov/legislation/%s/' % session)
            # table 0 = sponsors, table 1 = title, table 2 = action rows
            bill_tables = html.xpath('./body/table/tr/td[2]')[0].xpath(
                './/table')
            title = bill_tables[1].text_content().strip()
            bill_type = get_bill_type(bill_id)
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)
            bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

            if short_title and bill['title'].lower() != short_title.lower():
                bill.add_title(short_title)

            # documents
            doc_links = html.xpath('//span/a')
            for link in doc_links:
                name = link.text_content().strip()
                href = link.get('href')
                if 'Engrossment' in name or 'Bill Text' in name:
                    bill.add_version(name, href)
                else:
                    bill.add_document(name, href)

            # sponsors range from a committee to one legislator to a group
            # of legs
            sponsor_lists = bill_tables[0].text_content().split('by')
            if len(sponsor_lists) > 1:
                for sponsors in sponsor_lists[1:]:
                    for person in sponsors.split(','):
                        bill.add_sponsor('primary', person)

            actor = chamber
            last_date = None
            for row in bill_tables[2]:
                # lots of empty rows
                if len(row) == 1:
                    continue
                _, date, action, _ = [x.text_content().strip() for x in row]
                if date:
                    last_date = date
                else:
                    # undated rows inherit the previous row's date
                    date = last_date
                date = datetime.datetime.strptime(date + '/' + session[0:4],
                                                  "%m/%d/%Y")
                if action.startswith('House'):
                    actor = 'lower'
                elif action.startswith('Senate'):
                    actor = 'upper'

                # votes
                if 'AYES' in action or 'NAYS' in action:
                    vote = self.parse_vote(actor, date, row[2])
                    vote.add_source(url)
                    bill.add_vote(vote)
                # some td's text is seperated by br elements
                if len(row[2]):
                    action = "".join(row[2].itertext())
                action = action.replace(u'\xa0', ' ').strip()
                atype = get_action(actor, action)
                bill.add_action(actor, action, date, type=atype)
                # after voice vote/roll call and some actions the bill is
                # sent 'to House' or 'to Senate'
                if 'to House' in action:
                    actor = 'lower'
                elif 'to Senate' in action:
                    actor = 'upper'
            self.save_bill(bill)

    def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
        """bills from 2008 and below are in a 'pre' element and is simpler to
        parse them as text"""
        url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
            session, bill_id.replace(' ', ''))
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            text = html.xpath('//pre')[0].text.split('\r\n')
            # title: uppercase fragments of the second line
            title = " - ".join(
                [x.strip() for x in text[1].split('-') if x.isupper()])
            # bill type
            bill_type = get_bill_type(bill_id)
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            # sponsors: everything after 'by' on the first line
            sponsors = text[0].split('by')[-1]
            for sponsor in sponsors.split(','):
                bill.add_sponsor('primary', sponsor)
            actor = chamber
            self.flag()  # clear last bills vote flags
            self.vote = None
            # walk the text line by line; dated lines are actions, undated
            # lines are roll-call continuations
            for line in text:
                if re.match(r'^\d\d/\d\d', line):
                    # NOTE(review): duplicated `date = date =` is redundant
                    # but harmless; kept byte-identical.
                    date = date = datetime.datetime.strptime(
                        line[0:5] + '/' + session[0:4], "%m/%d/%Y")
                    self.last_date = date
                    action_text = line[5:].strip()
                    # actor
                    if action_text.lower().startswith('house') or \
                            action_text.lower().startswith('senate'):
                        actor = {'H': 'lower', 'S': 'upper'}[action_text[0]]
                    action = get_action(actor, action_text)
                    bill.add_action(actor, action_text, date, type=action)
                    if "bill:passed" in action or "bill:failed" in action:
                        passed = False if 'FAILED' in action_text else True
                        # counts appear inline as "yes-no-other"
                        votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                        if votes:
                            yes, no, other = votes.groups()
                            self.in_vote = True
                            self.vote = Vote(chamber, date, action_text,
                                             passed, int(yes), int(no),
                                             int(other))
                else:
                    date = self.last_date
                    # nothing to do if its not a vote
                    if "Floor Sponsor" in line:
                        # "Floor Sponsor" terminates the roll-call block
                        self.in_vote = False
                        if self.vote:
                            bill.add_vote(self.vote)
                            self.vote = None
                    if not self.in_vote:
                        continue
                    # flip the section flag when a header appears; names on
                    # the same and following lines go into that bucket
                    if 'AYES --' in line:
                        self.flag(ayes=True)
                    elif 'NAYS --' in line:
                        self.flag(nays=True)
                    elif 'Absent and excused' in line:
                        self.flag(other=True)
                    if self.ayes:
                        for name in line.replace('AYES --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.yes(name)
                    if self.nays:
                        for name in line.replace('NAYS --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.no(name)
                    if self.other:
                        for name in line.replace(
                                'Absent and excused --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.other(name)
            self.save_bill(bill)

    def parse_vote(self, actor, date, row):
        """ takes the actor, date and row element and returns a Vote object"""
        spans = row.xpath('.//span')
        motion = row.text
        # first span holds e.g. "PASSED-60-8-2" -> verdict and three counts
        passed, yes_count, no_count, other_count = spans[0].text_content(
            ).split('-')
        yes_votes = [
            name for name in
            spans[1].tail.replace(u'\xa0--\xa0', '').split(',')
            if name
        ]
        no_votes = [
            name for name in
            spans[2].tail.replace(u'\xa0--\xa0', '').split(',')
            if name
        ]
        other_votes = []
        if spans[3].text.startswith('Absent'):
            other_votes = [
                name for name in
                spans[3].tail.replace(u'\xa0--\xa0', '').split(',')
                if name
            ]
        # map the verdict word to a boolean
        for key, val in {
                'adopted': True,
                'passed': True,
                'failed': False
        }.items():
            if key in passed.lower():
                passed = val
                break
        vote = Vote(actor, date, motion, passed, int(yes_count),
                    int(no_count), int(other_count))
        for name in yes_votes:
            if name and name != 'None':
                vote.yes(name)
        for name in no_votes:
            if name and name != 'None':
                vote.no(name)
        for name in other_votes:
            if name and name != 'None':
                vote.other(name)
        return vote

    def flag(self, ayes=False, nays=False, other=False):
        """ help to keep track of where we are at parsing votes from text"""
        self.ayes = ayes
        self.nays = nays
        self.other = other
def scrape_bill_sheet(self, session, chamber):
    """ Scrape the bill sheet (the page full of bills and other small bits
    of data), creating a Bill with versions, actions, sponsors and votes
    for every row, and saving each bill.

    Changes over previous version: removed the no-op
    `versions_url = versions_url` assignment, replaced `== None` /
    `!= None` with identity checks, and renamed inner loop variables that
    shadowed `sponsor` and the outer `vote` loop variable. Behavior is
    unchanged.
    """
    sheet_url = self.get_bill_folder(session, chamber)
    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # column positions within each sheet row
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    sheet_html = self.urlopen(sheet_url)
    sheet_page = lxml.html.fromstring(sheet_html)
    sheet_page.make_links_absolute(sheet_url)

    bills = sheet_page.xpath('//table/tr')

    for bill in bills:
        bill_id = self.read_td(bill[index["id"]][0])

        if bill_id is None:
            # Every other entry is null for some reason
            continue

        # strip a trailing ".pdf"; budget bills have no dot, so nothing
        # is truncated for them
        dot_loc = bill_id.find('.')
        if dot_loc != -1:
            bill_id = bill_id[:dot_loc]

        title_and_sponsor = bill[index["title_sponsor"]][0]

        bill_title = title_and_sponsor.text
        bill_title_and_sponsor = title_and_sponsor.text_content()
        if bill_title is None:
            continue  # Odd ...

        # sponsors are whatever trails the title, split on "--"
        sponsors = bill_title_and_sponsor.replace(bill_title, "").\
            replace(" & ...", "").split("--")

        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        bill_type = None
        for cat in cats:
            if bill_id[:len(cat)] == cat:
                bill_type = cats[cat]

        b = Bill(session, bill_chamber, bill_id, bill_title, type=bill_type)
        b.add_source(sheet_url)

        versions_url = \
            bill[index["version"]].xpath('font/a')[0].attrib["href"]
        versions = self.parse_versions(versions_url)
        for version in versions:
            b.add_version(version['name'], version['link'],
                          mimetype=version['mimetype'])

        bill_history_href = bill[index["history"]][0][0].attrib['href']
        history = self.parse_history(bill_history_href)
        b.add_source(bill_history_href)

        chamber_map = dict(Senate='upper', House='lower')
        for action, date in history:
            action_actor = chamber_map.get(chamber, chamber)
            attrs = dict(actor=action_actor, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            b.add_action(**attrs)

        for sponsor in sponsors:
            if sponsor is not None and sponsor != "(NONE)" and \
                    sponsor != "":
                if "&" in sponsor:
                    # joint sponsors, e.g. "Smith & Jones"
                    for name in [x.strip() for x in sponsor.split("&")]:
                        b.add_sponsor("primary", name)
                else:
                    b.add_sponsor("primary", sponsor)

        # Now that we have history, let's see if we can't grab some
        # votes
        bill_vote_href, = bill.xpath(".//a[contains(text(), 'Votes')]")
        bill_vote_href = bill_vote_href.attrib['href']
        #bill_vote_href = self.get_vote_url(bill_id, session)
        votes = self.parse_votes(bill_vote_href)

        # the frames warning page means there are no votes to fetch
        if (votes['sanity-check'] == 'This site only supports frames '
                'compatible browsers!'):
            votes['votes'] = []
        elif votes['sanity-check'] != bill_id:
            self.warning("XXX: READ ME! Sanity check failed!")
            self.warning(" -> Scraped ID: " + votes['sanity-check'])
            self.warning(" -> 'Real' ID: " + bill_id)
            assert votes['sanity-check'] == bill_id

        for vote in votes['votes']:
            filed_votes = vote['votes']
            passage = vote['meta']
            result = vote['result']

            composite_time = "%s %s" % (passage['x-parent-date'],
                                        passage['TIME'])
            # It's now like: 04/01/2011 02:10:14 PM
            pydate = dt.datetime.strptime(composite_time,
                                          "%m/%d/%Y %I:%M:%S %p")

            hasHouse = "House" in passage['x-parent-ctty']
            hasSenate = "Senate" in passage['x-parent-ctty']

            if hasHouse and hasSenate:
                actor = "joint"
            elif hasHouse:
                actor = "lower"
            else:
                actor = "upper"

            other = (int(result['EXC']) + int(result['ABS']))
            # OK, sometimes the Other count is wrong.
            local_other = 0
            for voter in filed_votes:
                l_vote = filed_votes[voter].lower().strip()
                if l_vote != "yes" and l_vote != "no":
                    local_other = local_other + 1

            if local_other != other:
                self.warning(
                    "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                self.warning(" -> Old: %s // New: %s" % (
                    other, local_other))
                other = local_other

            passed = (result['FINAL_ACTION'] == "PASS")
            if passage['MOTION'].strip() == "":
                continue

            if "without objection" in passage['MOTION'].lower():
                passed = True

            v = Vote(actor, pydate, passage['MOTION'], passed,
                     int(result['YES']), int(result['NO']), other,
                     moved=passage['MOVED'],
                     seconded=passage['SECONDED'])

            v.add_source(vote['meta']['url'])
            # v.add_source( bill_vote_href )

            # XXX: Add more stuff to kwargs, we have a ton of data
            seen = set()
            for voter in filed_votes:
                who = voter
                if who in seen:
                    raise Exception("Seeing the double-thing. - bug #702")
                seen.add(who)
                ballot = filed_votes[who]
                if ballot.lower() == "yes":
                    v.yes(who)
                elif ballot.lower() == "no":
                    v.no(who)
                else:
                    v.other(who)
            b.add_vote(v)
        self.save_bill(b)
def scrape_votes(self, bill_page, bill, insert, year):
    """Follow every "Passage" link on a Nevada bill page, parse each vote
    report page (counts + per-member roll call) and attach the resulting
    Vote objects to `bill`.

    Changes over previous version: removed dead code that accumulated
    `full_name` character-by-character and never used it, and renamed the
    inner `page`/`root` locals so they no longer shadow the outer bill
    page's tree. Behavior is unchanged.
    """
    root = lxml.html.fromstring(bill_page)
    for link in root.xpath('//a[contains(text(), "Passage")]'):
        motion = link.text
        if 'Assembly' in motion:
            chamber = 'lower'
        else:
            chamber = 'upper'

        vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
            insert, link.get('href'))
        bill.add_source(vote_url)

        with self.urlopen(vote_url) as vote_html:
            vote_html = vote_html.decode("utf8").replace(u"\xa0", " ")
            vote_root = lxml.html.fromstring(vote_html)

            # report date carries no year; append the session year
            date = vote_root.xpath(
                'string(/html/body/center/font)').split()[-1]
            date = date + "-" + str(year)
            date = datetime.strptime(date, "%m-%d-%Y")

            # summary table: yea / nay / excused / not voting / absent
            yes_count = int(vote_root.xpath(
                'string(/html/body/center/table/tr/td[1])').split()[0])
            no_count = int(vote_root.xpath(
                'string(/html/body/center/table/tr/td[2])').split()[0])
            excused = int(vote_root.xpath(
                'string(/html/body/center/table/tr/td[3])').split()[0])
            not_voting = int(vote_root.xpath(
                'string(/html/body/center/table/tr/td[4])').split()[0])
            absent = int(vote_root.xpath(
                'string(/html/body/center/table/tr/td[5])').split()[0])
            other_count = excused + not_voting + absent

            # not necessarily correct, but the page gives no explicit result
            passed = yes_count > no_count

            vote = Vote(chamber, date, motion, passed, yes_count,
                        no_count, other_count, not_voting=not_voting,
                        absent=absent)
            for el in vote_root.xpath('/html/body/table[2]/tr'):
                name = el.xpath('string(td[1])').strip()
                # convert lxml smart string to a plain str
                name = str(name)
                vote_result = el.xpath('string(td[2])').split()[0]
                if vote_result == 'Yea':
                    vote.yes(name)
                elif vote_result == 'Nay':
                    vote.no(name)
                else:
                    vote.other(name)
            bill.add_vote(vote)
def scrape_vote(self, bill, name, url):
    """Parse a Connecticut roll-call page and attach a Vote to `bill`.

    House and Senate pages lay the member table out in a different number
    of column groups, so per-chamber column starts and offsets are chosen
    first; each group is (vote cells..., name cell).
    """
    if "VOTE/H" in url:
        vote_chamber = 'lower'
        cols = (1, 5, 9, 13)   # start column of each member group
        name_offset = 3        # name cell relative to group start
        yes_offset = 0         # yea mark cell relative to group start
        no_offset = 1          # nay mark cell relative to group start
    else:
        vote_chamber = 'upper'
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    with self.urlopen(url) as page:
        if 'BUDGET ADDRESS' in page:
            # not a roll call; nothing to record
            return

        page = lxml.html.fromstring(page)

        yes_count = page.xpath(
            "string(//span[contains(., 'Those voting Yea')])")
        yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1))

        no_count = page.xpath(
            "string(//span[contains(., 'Those voting Nay')])")
        no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1))

        other_count = page.xpath(
            "string(//span[contains(., 'Those absent')])")
        other_count = int(
            re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1))

        need_count = page.xpath(
            "string(//span[contains(., 'Necessary for')])")
        need_count = int(
            re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1))

        # date appears without a year; take it from the bill's session
        date = page.xpath("string(//span[contains(., 'Taken on')])")
        date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
        date = date.replace(' ', '')
        date = datetime.datetime.strptime(date + " " + bill['session'],
                                          "%m/%d %Y").date()

        # NOTE(review): passage is recorded when yeas strictly exceed the
        # "Necessary for" threshold; if the threshold is the minimum
        # required to pass, this should arguably be >= -- confirm against
        # the source pages before changing.
        vote = Vote(vote_chamber, date, name, yes_count > need_count,
                    yes_count, no_count, other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for i in cols:
                name = row.xpath("string(td[%d])" % (
                    i + name_offset)).strip()
                if not name or name == 'VACANT':
                    continue

                if "Y" in row.xpath("string(td[%d])" %
                                    (i + yes_offset)):
                    vote.yes(name)
                elif "N" in row.xpath("string(td[%d])" %
                                      (i + no_offset)):
                    vote.no(name)
                else:
                    # neither mark present: absent/other
                    vote.other(name)

        bill.add_vote(vote)
def scrape_senate(self, session):
    """Scrape Senate journal PDFs and emit a Vote for every roll call found.

    Each journal is converted to text and walked line-by-line with a small
    state machine: `in_question` accumulates a multi-line question string,
    `in_vote` collects voter-name lines until a non-parsable line closes
    the roll call, at which point the Vote is built and saved.
    """
    url = journals % (session, 'Senate')
    page = self.lxmlize(url)
    hrefs = page.xpath("//font//a")

    for href in hrefs:
        (path, response) = self.urlretrieve(href.attrib['href'])
        data = convert_pdf(path, type='text')

        # per-document parser state
        cur_bill_id = None
        cur_vote_count = None
        in_vote = False
        cur_question = None
        in_question = False
        known_date = None
        cur_vote = {}

        for line in data.split("\n"):
            if not known_date:
                # the journal date header gives the date for every vote
                dt = date_re.findall(line)
                if dt != []:
                    dt, dow = dt[0]
                    dt = dt.replace(',', '')
                    known_date = datetime.datetime.strptime(
                        dt, "%A %B %d %Y")

            if in_question:
                line = line.strip()
                # a bare number ends the question block (page number)
                if re.match("\d+", line):
                    in_question = False
                    continue
                try:
                    # drop the trailing line number token
                    line, _ = line.rsplit(" ", 1)
                    cur_question += line.strip()
                except ValueError:
                    in_question = False
                    continue
                # NOTE(review): this second append duplicates the one
                # inside the try on the success path -- looks like a
                # latent bug; kept byte-identical.
                cur_question += line.strip()

            if not in_vote:
                summ = vote_re.findall(line)
                if summ != []:
                    # summary line starts a roll call
                    cur_vote = {}
                    cur_vote_count = summ[0]
                    in_vote = True
                    continue
                if ("The question being" in line) or \
                        ("On motion of" in line) or \
                        ("the following" in line) or \
                        ("moved that the" in line):
                    # start accumulating the question text
                    cur_question, _ = line.strip().rsplit(" ", 1)
                    cur_question = cur_question.strip()
                    in_question = True

            if line.strip() == "":
                continue

            first = line[0]
            if first != " ":
                # column-0 text: possibly a bill id heading
                if " " not in line:
                    # wtf
                    continue
                bill_id, kruft = line.split(" ", 1)
                if len(bill_id) < 3:
                    continue
                if bill_id[0] != "H" and bill_id[0] != "S":
                    continue
                if bill_id[1] not in ['B', 'J', 'R', 'M']:
                    continue
                cur_bill_id = bill_id
            else:
                # indented text: voter names while in a roll call
                line = line.strip()
                try:
                    line, lineno = line.rsplit(" ", 1)
                except ValueError:
                    # no trailing line number: the roll call is over;
                    # flush the accumulated vote
                    in_vote = False
                    if cur_question is None:
                        continue
                    if cur_bill_id is None:
                        continue

                    yes, no, exc, ab = cur_vote_count
                    other = int(exc) + int(ab)
                    yes, no, other = int(yes), int(no), int(other)
                    bc = {'H': 'lower', 'S': 'upper'}[cur_bill_id[0]]

                    vote = Vote('upper', known_date, cur_question,
                                (yes > no), yes, no, other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)
                    for person in cur_vote:
                        if person is None:
                            continue
                        howvote = cur_vote[person]
                        # a vote letter sometimes sticks to the end of
                        # the name; peel it off and use it as the vote
                        if person.endswith("Y"):
                            howvote = "Y"
                            person = person[:-1]
                        if person.endswith("N"):
                            howvote = "N"
                            person = person[:-1]
                        if person.endswith("E"):
                            howvote = "E"
                            person = person[:-1]
                        howvote = howvote.upper()
                        if howvote == 'Y':
                            vote.yes(person)
                        elif howvote == 'N':
                            vote.no(person)
                        else:
                            vote.other(person)
                    vote.add_source(href.attrib['href'])
                    self.save_vote(vote)
                    cur_vote, cur_question, cur_vote_count = (
                        None, None, None)
                    continue

                votes = re.findall(votes_re, line)
                for person in votes:
                    name, li, vot = person
                    cur_vote[name] = vot
        os.unlink(path)
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all California bills of one measure type for a session from
    the local copy of CA's database, emitting a fully-populated Bill
    (titles, sponsors, actions, votes) per row.

    NOTE(review): the default for `committee_abbr_regex` is evaluated once
    at import time -- intentional caching, presumably, since the regex is
    read-only; confirm before changing.
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        # title is filled in later from version data
        fsbill = Bill(bill_session, chamber, bill_id, '')

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version(bill_id, source_url, 'text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # ensure a space follows every closing paren
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type_.append('fiscal committee')
            if version.local_program == 'Yes':
                type_.append('local program')
            if version.urgency == 'Yes':
                type_.append('urgency')
            if version.taxlevy == 'Yes':
                type_.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['summary'] = summary
        fsbill['type'] = type_
        fsbill['subjects'] = filter(None, [subject])
        fsbill['impact_clause'] = impact_clause

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        fsbill['alternate_titles'] = list(all_titles)

        # sponsors come from the last version seen in the loop above
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(SPONSOR_TYPES[author.contribution],
                                   author.name,
                                   official_type=author.contribution)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'other'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if
            # any.
            # NOTE(review): `kwargs` aliases `attrs` (no copy); additions
            # below mutate the categorizer's returned dict -- confirm the
            # categorizer returns a fresh dict per call.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)
            if 'Com. on' in action.action and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)
                    else:
                        committees.append(name)

                committees = filter(None, committees)
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    # expand the abbreviation in the action string
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)

            changed = False
            for string in ['upper', 'lower', 'joint']:
                if actor.startswith(string):
                    actor = string
                    changed = True
                    break
            if not changed:
                actor = 'other'
            if actor != action.actor:
                # keep the original actor text around for reference
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = '(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              **kwargs)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            # location string is "<chamber-ish word> <committee/floor>"
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ', '',
                            motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                            motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion,
                          result,
                          int(vote.ayes),
                          int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold,
                          type_=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            for s in ('yes', 'no', 'other'):
                # Kill dupe votes.
                key = s + '_votes'
                fsvote[key] = list(set(fsvote[key]))

            # In a small percentage of bills, the integer vote counts
            # are inaccurate, so let's ignore them.
            for k in ('yes', 'no', 'other'):
                fsvote[k + '_count'] = len(fsvote[k + '_votes'])

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def scrape_pdf_for_votes(self, session, chamber, date, motion, href):
    """Parse an Illinois roll-call PDF into a Vote object.

    Pass/fail is taken from an explicit word in the document when present
    (PASSED/FAILED/etc.); the counts are taken from the document's totals
    line; the per-member codes are column-parsed from the lines after the
    totals. Returns the Vote, or False if the PDF could not be fetched.

    Changes over previous version: removed the unused local `VOTE_RE`
    compiled regex (vote lines are matched via VOTE_VALUES instead) and
    documented that the COUNT_RE "NOT VOTING" group is optional/unused.
    """
    warned = False
    # totals line, e.g. "112 YEAS 3 NAYS 1 PRESENT [2 NOT VOTING]"
    COUNT_RE = re.compile(
        r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT'
        r'(?:\s+(\d+)\s+NOT\sVOTING)?\s*$')
    PASS_FAIL_WORDS = {
        'PASSED': True,
        'PREVAILED': True,
        'ADOPTED': True,
        'CONCURRED': True,
        'FAILED': False,
        'LOST': False,
    }

    pdflines = self.fetch_pdf_lines(href)

    if not pdflines:
        return False

    yes_count = no_count = present_count = other_count = 0
    yes_votes = []
    no_votes = []
    present_votes = []
    other_vote_detail = defaultdict(list)
    passed = None
    counts_found = False
    vote_lines = []
    for line in pdflines:
        # consider pass/fail as a document property instead of a result
        # of the vote count; extract the vote count from the document
        # instead of just using counts of names
        if not line.strip():
            continue
        elif line.strip() in PASS_FAIL_WORDS:
            if passed is not None:
                raise Exception(
                    "Duplicate pass/fail matches in [%s]" % href)
            passed = PASS_FAIL_WORDS[line.strip()]
        elif COUNT_RE.match(line):
            # fourth group (NOT VOTING) is optional and may be None;
            # it is intentionally unused -- 'other' is counted from names
            yes_count, no_count, present_count, not_voting_count = \
                COUNT_RE.match(line).groups()
            yes_count = int(yes_count)
            no_count = int(no_count)
            present_count = int(present_count)
            counts_found = True
        elif counts_found:
            # name lines start after the totals line; keep only lines
            # that begin with a known vote code
            for value in VOTE_VALUES:
                if re.search(r'^\s*({})\s+\w'.format(value), line):
                    vote_lines.append(line)
                    break

    votes = find_columns_and_parse(vote_lines)
    for name, vcode in votes.items():
        # presiding officers vote under a title; map to the real name
        if name == 'Mr. Speaker':
            name = self.metadata['session_details'][session]['speaker']
        elif name == 'Mr. President':
            name = self.metadata['session_details'][session]['president']
        if vcode == 'Y':
            yes_votes.append(name)
        elif vcode == 'N':
            no_votes.append(name)
        else:
            other_vote_detail[vcode].append(name)
            other_count += 1
            if vcode == 'P':
                present_votes.append(name)

    # fake the counts when the totals line was missing entirely
    if yes_count == 0 and no_count == 0 and present_count == 0:
        yes_count = len(yes_votes)
        no_count = len(no_votes)
    else:  # audit the document's totals against the parsed names
        if yes_count != len(yes_votes):
            self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                         (yes_count, len(yes_votes)))
            warned = True
        if no_count != len(no_votes):
            self.warning("Mismatched no count [expect: %i] [have: %i]" %
                         (no_count, len(no_votes)))
            warned = True
        if present_count != len(present_votes):
            self.warning(
                "Mismatched present count [expect: %i] [have: %i]" %
                (present_count, len(present_votes)))
            warned = True

    if passed is None:
        if chamber == 'lower':  # senate doesn't have these lines
            self.warning("No pass/fail word found; fall back to comparing "
                         "yes and no vote.")
            warned = True
        passed = yes_count > no_count

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count, other_vote_detail=other_vote_detail)
    for name in yes_votes:
        vote.yes(name)
    for name in no_votes:
        vote.no(name)
    for other_type, names in other_vote_detail.iteritems():
        for name in names:
            vote.other(name)
    vote.add_source(href)

    if warned:
        self.warning("Warnings were issued. Best to check %s" % href)
    return vote
def scrape_lower_committee_votes(self, session_number, bill):
    '''
    House committee roll calls are not available on the Senate's
    website. Furthermore, the House uses an internal ID system in
    its URLs, making accessing those pages non-trivial.

    This function will fetch all the House committee votes for the
    given bill, and add the votes to that object.
    '''
    house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

    # Keep the digits and all following characters in the bill's ID
    bill_number = re.search(r'^\w+\s(\d+\w*)$', bill['bill_id']).group(1)

    # search form: find this bill in either chamber's list
    form = {
        'rblChamber': 'B',
        'ddlSession': session_number,
        'ddlBillList': '-1',
        'txtBillNumber': bill_number,
        'ddlSponsor': '-1',
        'ddlReferredTo': '-1',
        'SubmittedByControl': '',
    }
    doc = lxml.html.fromstring(self.post(url=house_url, data=form).text)
    doc.make_links_absolute(house_url)
    # exactly one detail link expected; tuple-unpack enforces that
    (bill_link, ) = doc.xpath(
        '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')

    bill_doc = self.lxmlize(bill_link)
    links = bill_doc.xpath('//a[text()="See Votes"]/@href')

    for link in links:
        vote_doc = self.lxmlize(link)

        (date, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
        date = datetime.datetime.strptime(
            date, '%m/%d/%Y %I:%M:%S %p').date()

        # totals live in the last nested table on the page
        totals = vote_doc.xpath('//table//table')[-1].text_content()
        totals = re.sub(r'(?mu)\s+', " ", totals).strip()
        (yes_count, no_count, other_count) = [
            int(x) for x in re.search(
                r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
                'Total Missed:\s+(\d+)', totals).groups()
        ]
        # page gives no explicit result; infer from counts
        passed = yes_count > no_count

        (committee, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
        (action, ) = vote_doc.xpath(
            '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
        motion = "{} ({})".format(action, committee)

        vote = Vote('lower', date, motion, passed,
                    yes_count, no_count, other_count)
        vote.add_source(link)

        # each member cell: span[1] = vote code, span[2] = member name
        for member_vote in vote_doc.xpath('//table//table//table//td'):
            if not member_vote.text_content().strip():
                continue

            (member, ) = member_vote.xpath('span[2]//text()')
            (member_vote, ) = member_vote.xpath('span[1]//text()')

            if member_vote == "Y":
                vote.yes(member)
            elif member_vote == "N":
                vote.no(member)
            elif member_vote == "-":
                vote.other(member)
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            elif re.search(r'\([YN]\)', member_vote):
                continue
            else:
                raise IndexError(
                    "Unknown vote type found: {}".format(member_vote))

        vote.validate()
        bill.add_vote(vote)
def scrape_floor_vote(self, chamber, bill, date, url):
    """Parse a Florida floor-vote PDF and attach the Vote to `bill`.

    The PDF layout puts the motion name, totals line and roll call at
    roughly fixed line offsets; the three *_INDEX variables are shifted
    together when the motion text turns out to start earlier or to span
    extra lines.
    """
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    lines = text.split("\n")
    os.remove(path)

    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    motion = lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if (not motion and
            not lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
        motion = lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"

    # the motion may continue on up to two following lines; fold them in
    # and shift the later offsets accordingly
    for _extra_motion_line in range(2):
        MOTION_INDEX += 1
        if lines[MOTION_INDEX].strip():
            motion = "{}, {}".format(motion, lines[MOTION_INDEX].strip())
            TOTALS_INDEX += 1
            VOTE_START_INDEX += 1
        else:
            break

    (yes_count, no_count, other_count) = [
        int(x) for x in re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[TOTALS_INDEX]).groups()
    ]
    passed = (yes_count > no_count)

    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote.add_source(url)

    for line in lines[VOTE_START_INDEX:]:
        if not line.strip():
            # blank line ends the roll call
            break

        # strip titles so the name regexes match
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
            vote.yes(member)
        for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
            vote.no(member)
        for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
            vote.other(member)

    try:
        vote.validate()
    except ValueError:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.logger.info(
            "Votes don't add up; looking for additional ones")
        for line in lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            # codeless entries are deeply indented names
            for member in re.findall(
                    r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}', line):
                vote.other(member)
        vote.validate()
    bill.add_vote(vote)
def build_senate_votes(self):
    """Scrape Senate floor votes and committee votes for self.bill.

    Two passes over the same page structure: bold headings introduce a
    vote block, and the following <blockquote> alternates between <b>
    tags (vote-type headers with "(N):" counts) and <a> tags (member
    names belonging to the most recent header).
    """
    # --- Floor votes -------------------------------------------------
    xpath = "//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"
    for b in self.urls.senate.xpath(xpath):
        # Heading looks like "VOTE: FLOOR VOTE: - <Mon DD, YYYY>".
        date = b.text.split('-')[1].strip()
        date = datetime.datetime.strptime(date, "%b %d, %Y").date()

        yes_votes, no_votes, other_votes = [], [], []
        yes_count, no_count, other_count = 0, 0, 0
        # Maps the literal header text (e.g. "Excused") to member names,
        # preserving the finer-grained "other" categories.
        actual_vote = collections.defaultdict(list)

        vtype = None  # vote type of the header most recently seen
        for tag in b.xpath("following-sibling::blockquote/*"):
            if tag.tag == 'b':
                text = tag.text
                if text.startswith('Ayes'):
                    vtype = 'yes'
                    yes_count = int(
                        re.search(r'\((\d+)\):', text).group(1))
                elif text.startswith('Nays'):
                    vtype = 'no'
                    no_count = int(re.search(r'\((\d+)\):', text).group(1))
                elif (text.startswith('Excused') or
                      text.startswith('Abstain') or
                      text.startswith('Absent')):
                    vtype = 'other'
                    # += because several headers fold into "other".
                    other_count += int(
                        re.search(r'\((\d+)\):', text).group(1))
                else:
                    raise ValueError('bad vote type: %s' % tag.text)
            elif tag.tag == 'a':
                # Member name under the current header.
                name = tag.text.strip()
                if vtype == 'yes':
                    yes_votes.append(name)
                elif vtype == 'no':
                    no_votes.append(name)
                elif vtype == 'other':
                    # Keep the raw text too so actual_vote can record
                    # which "other" category the member fell under.
                    other_votes.append((name, tag.text))

        passed = yes_count > (no_count + other_count)
        vote = Vote('upper', date, 'Floor Vote', passed, yes_count,
                    no_count, other_count)
        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for name, vote_val in other_votes:
            vote.other(name)
            actual_vote[vote_val].append(name)
        vote['actual_vote'] = actual_vote
        vote.add_source(self.url)
        self.bill.add_vote(vote)

    # --- Committee votes ---------------------------------------------
    xpath = "//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"
    for b in self.urls.senate.xpath(xpath):
        # Heading is tab/dash separated: label, committee name, date.
        _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
        date = date.strip()
        date = datetime.datetime.strptime(date, "%b %d, %Y").date()

        yes_votes, no_votes, other_votes = [], [], []
        yes_count, no_count, other_count = 0, 0, 0

        vtype = None
        for tag in b.xpath("following-sibling::blockquote/*"):
            if tag.tag == 'b':
                text = tag.text
                if text.startswith('Ayes'):
                    vtype = 'yes'
                    # NOTE(review): this branch uses += where the floor
                    # vote above uses plain assignment — presumably to
                    # accumulate across repeated "Ayes" headers; confirm.
                    yes_count += int(
                        re.search(r'\((\d+)\):', text).group(1))
                elif text.startswith('Nays'):
                    vtype = 'no'
                    no_count += int(
                        re.search(r'\((\d+)\):', text).group(1))
                elif (text.startswith('Excused') or
                      text.startswith('Abstain') or
                      text.startswith('Absent')):
                    vtype = 'other'
                    other_count += int(
                        re.search(r'\((\d+)\):', text).group(1))
                else:
                    raise ValueError('bad vote type: %s' % tag.text)
            elif tag.tag == 'a':
                name = tag.text.strip()
                if vtype == 'yes':
                    yes_votes.append(name)
                elif vtype == 'no':
                    no_votes.append(name)
                elif vtype == 'other':
                    other_votes.append(name)

        passed = yes_count > (no_count + other_count)
        vote = Vote('upper', date, '%s Committee Vote' % committee,
                    passed, yes_count, no_count, other_count)
        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for name in other_votes:
            vote.other(name)
        vote.add_source(self.url)
        self.bill.add_vote(vote)
def scrape_digest(self, bill):
    """Scrape a bill's digest PDF: description, sponsors, actions, votes.

    The digest text is split by regex into a sponsor span, a
    description span, and an action log; the action log is then walked
    line by line, with an inner loop consuming ROLL CALL sections.
    """
    digest_url = 'http://legisweb.state.wy.us/%(session)s/Digest/%(bill_id)s.pdf' % bill
    bill.add_source(digest_url)

    try:
        (filename, response) = self.urlretrieve(digest_url)
        all_text = convert_pdf(filename, type='text')
    except scrapelib.HTTPError:
        # Missing digest is non-fatal; just log and bail.
        self.warning('no digest for %s' % bill['bill_id'])
        return
    if all_text.strip() == "":
        self.warning('Non-functional digest for bill {}'.format(
            bill['bill_id']))
        return

    # Split the digest's text into sponsors, description, and actions
    SPONSOR_RE = r'(?sm)Sponsored By:\s+(.*?)\n\n'
    DESCRIPTION_RE = r'(?sm)\n\n((?:AN\s*?ACT|A JOINT RESOLUTION) .*?)\n\n'
    ACTIONS_RE = r'(?sm)\n\n(\d{1,2}/\d{1,2}/\d{4}.*)'

    ext_title = re.search(DESCRIPTION_RE, all_text).group(1)
    bill_desc = ext_title.replace('\n', ' ')
    # Collapse runs of spaces; decode/encode keeps the round trip in
    # UTF-8 bytes (Python 2 str).
    bill_desc = re.sub(" *", " ",
                       bill_desc.decode('utf-8')).encode('utf-8')
    bill['description'] = bill_desc

    sponsor_span = re.search(SPONSOR_RE, all_text).group(1)
    sponsors = ''
    sponsors = sponsor_span.replace('\n', ' ')
    if sponsors:
        if 'Committee' in sponsors:
            # Committee sponsorship is recorded as a single sponsor.
            bill.add_sponsor('primary', sponsors)
        else:
            # Sponsor list mixes both chambers, joined by a phrase
            # naming the *other* chamber's members.
            if bill['chamber'] == 'lower':
                sp_lists = sponsors.split('and Senator(s)')
            else:
                sp_lists = sponsors.split('and Representative(s)')
            for spl in sp_lists:
                for sponsor in split_names(spl):
                    sponsor = sponsor.strip()
                    if sponsor != "":
                        bill.add_sponsor('primary', sponsor)

    action_re = re.compile('(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
    vote_total_re = re.compile(
        '(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)(\s*)Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)'
    )

    # initial actor is bill chamber
    actor = bill['chamber']
    actions = []

    action_lines = re.search(ACTIONS_RE, all_text).group(1).split('\n')
    # Shared iterator: the ROLL CALL inner loop below advances the same
    # stream, so the outer loop resumes past consumed lines.
    action_lines = iter(action_lines)
    for line in action_lines:
        line = clean_line(line)

        # skip blank lines
        if not line:
            continue

        amatch = action_re.match(line)
        if amatch:
            date, achamber, action = amatch.groups()

            # change actor if one is on this action
            if achamber == 'H ':
                actor = 'lower'
            elif achamber == 'S ':
                actor = 'upper'

            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            bill.add_action(actor, action.strip(), date,
                            type=categorize_action(action))
        elif line == 'ROLL CALL':
            voters = defaultdict(str)
            # if we hit a roll call, use an inner loop to consume lines
            # in a psuedo-state machine manner, 3 types
            # Ayes|Nays|Excused|... - indicates next line is voters
            # : (Senators|Representatives): ... - voters
            # \d+ Nays \d+ Excused ... - totals
            voters_type = None
            for ainext in action_lines:
                nextline = clean_line(ainext)
                if not nextline:
                    continue

                breakers = [
                    "Ayes:", "Nays:", "Nayes:", "Excused:",
                    "Absent:", "Conflicts:"
                ]
                for breaker in breakers:
                    if nextline.startswith(breaker):
                        voters_type = breaker[:-1]
                        if voters_type == "Nayes":
                            # Source PDFs occasionally misspell "Nays".
                            voters_type = "Nays"
                            self.log("Fixed a case of 'Naye-itis'")
                        # Keep the ": " prefix so the branch below
                        # recognizes the remainder as a voter list.
                        nextline = nextline[len(breaker) - 1:]

                if nextline.startswith(': '):
                    voters[voters_type] = nextline
                elif nextline in ('Ayes', 'Nays', 'Excused',
                                  'Absent', 'Conflicts'):
                    # Bare header: the voters follow on later lines.
                    voters_type = nextline
                elif vote_total_re.match(nextline):
                    #_, ayes, _, nays, _, exc, _, abs, _, con, _ = \
                    tupple = vote_total_re.match(nextline).groups()
                    ayes = tupple[1]
                    nays = tupple[4]
                    exc = tupple[7]
                    # NOTE(review): `abs` shadows the builtin within
                    # this scope; harmless here but worth renaming.
                    abs = tupple[10]
                    con = tupple[13]

                    # Passage is inferred from the action text since the
                    # digest doesn't state it directly.
                    passed = (('Passed' in action or
                               'Do Pass' in action or
                               'Did Concur' in action or
                               'Referred to' in action) and
                              'Failed' not in action)

                    vote = Vote(actor, date, action, passed,
                                int(ayes), int(nays),
                                int(exc) + int(abs) + int(con))
                    vote.add_source(digest_url)

                    # NOTE(review): the loop variable rebinds `voters`
                    # (the dict being iterated) to each value string;
                    # works because iteritems() snapshot is in progress,
                    # but fragile — confirm before restructuring.
                    for vtype, voters in voters.iteritems():
                        for voter in split_names(voters):
                            if voter:
                                if vtype == 'Ayes':
                                    vote.yes(voter)
                                elif vtype == 'Nays':
                                    vote.no(voter)
                                else:
                                    vote.other(voter)
                    # done collecting this vote
                    bill.add_vote(vote)
                    break
                else:
                    # if it is a stray line within the vote, is is a
                    # continuation of the voter list
                    # (sometimes has a newline)
                    voters[voters_type] += ' ' + nextline
def scrape_vote(self, bill, name, url):
    """Parse a Louisiana vote PDF at `url` and attach the Vote to `bill`.

    `name` must look like "Senate Vote on ..., <motion>"; anything else
    is silently ignored (returns None).  The vote date is recovered from
    a "<bill_id> MMDD" marker in the PDF body, and passage is inferred
    as yes > (no + other) since the PDFs don't state it.
    """
    match = re.match('^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    # Classify the motion. (`motion_type` avoids shadowing builtin `type`.)
    if motion.startswith('FINAL PASSAGE'):
        motion_type = 'passage'
    elif motion.startswith('AMENDMENT'):
        motion_type = 'amendment'
    # Bug fix: was 'ON 3RD READINT' (typo) — the sibling scraper matches
    # 'ON 3RD READING', so third-reading votes were always categorized
    # as 'other' here.
    elif 'ON 3RD READING' in motion:
        motion_type = 'reading:3'
    else:
        motion_type = 'other'

    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = motion_type
    vote.add_source(url)

    with self.urlopen(url) as text:
        # Write the PDF bytes to a temp file for pdf_to_lxml.
        (fd, temp_path) = tempfile.mkstemp()
        with os.fdopen(fd, 'wb') as w:
            w.write(text)
        html = pdf_to_lxml(temp_path)
        os.remove(temp_path)

        vote_type = None
        total_re = re.compile(r'^Total--(\d+)$')
        body = html.xpath('string(/html/body)')

        # Date appears as "<bill_id> MMDD"; e.g. "... 0612" = June 12.
        date_match = re.search(r'%s (\d{4,4})' % bill['bill_id'], body)
        try:
            date = date_match.group(1)
        except AttributeError:
            # Use the scraper's logger (consistent with the sibling
            # scrape_vote) instead of a bare print.
            self.warning("BAD VOTE")
            return
        month = int(date[0:2])
        day = int(date[2:4])
        date = datetime.date(int(bill['session']), month, day)
        vote['date'] = date

        for line in body.replace(u'\xa0', '\n').split('\n'):
            line = line.replace(' ', '').strip()
            if not line:
                continue

            if line in ('YEAS', 'NAYS', 'ABSENT'):
                # Section header: subsequent lines are member names.
                vote_type = {
                    'YEAS': 'yes',
                    'NAYS': 'no',
                    'ABSENT': 'other'
                }[line]
            elif vote_type:
                total_match = total_re.match(line)
                if total_match:
                    # "Total--N" line gives the official section count.
                    vote['%s_count' % vote_type] = int(total_match.group(1))
                elif vote_type == 'yes':
                    vote.yes(line)
                elif vote_type == 'no':
                    vote.no(line)
                elif vote_type == 'other':
                    vote.other(line)

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        if vote['yes_count'] > (vote['no_count'] + vote['other_count']):
            vote['passed'] = True
        else:
            vote['passed'] = False

        bill.add_vote(vote)
def scrape_vote(self, bill, name, url):
    """Parse a Louisiana vote PDF at `url` and attach the Vote to `bill`.

    `name` must look like "Senate Vote on ..., <motion>"; anything else
    is ignored.  The date is read from a "Date: MM/DD/YYYY" marker, and
    the yes/no/other counts are re-tallied from the collected member
    lists rather than trusting the PDF's "Total--N" lines.
    """
    match = re.match('^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    # Classify the motion from its leading text.
    if motion.startswith('FINAL PASSAGE'):
        type = 'passage'
    elif motion.startswith('AMENDMENT'):
        type = 'amendment'
    elif 'ON 3RD READING' in motion:
        type = 'reading:3'
    else:
        type = 'other'

    # Counts/date/passed are filled in below, so start them as None.
    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = type
    vote.add_source(url)

    # Download to a temp file so pdf_to_lxml can read it, then clean up.
    (fd, temp_path) = tempfile.mkstemp()
    self.urlretrieve(url, temp_path)
    html = pdf_to_lxml(temp_path)
    os.close(fd)
    os.remove(temp_path)

    vote_type = None
    total_re = re.compile('^Total--(\d+)$')
    body = html.xpath('string(/html/body)')

    date_match = re.search('Date: (\d{1,2}/\d{1,2}/\d{4})', body)
    try:
        date = date_match.group(1)
    except AttributeError:
        # No date marker found; skip this vote rather than guess.
        self.warning("BAD VOTE: date error")
        return

    vote['date'] = datetime.datetime.strptime(date, '%m/%d/%Y')

    for line in body.replace(u'\xa0', '\n').split('\n'):
        line = line.replace(' ', '').strip()
        if not line:
            continue

        if line in ('YEAS', 'NAYS', 'ABSENT'):
            # Section header: following lines are member names.
            vote_type = {
                'YEAS': 'yes',
                'NAYS': 'no',
                'ABSENT': 'other'
            }[line]
        elif line in ('Total', '--'):
            # End-of-section marker: stop attributing names.
            vote_type = None
        elif vote_type:
            match = total_re.match(line)
            if match:
                vote['%s_count' % vote_type] = int(match.group(1))
            elif vote_type == 'yes':
                vote.yes(line)
            elif vote_type == 'no':
                vote.no(line)
            elif vote_type == 'other':
                vote.other(line)

    # tally counts
    # Deliberately overrides any "Total--N" counts parsed above with the
    # number of names actually collected.
    vote['yes_count'] = len(vote['yes_votes'])
    vote['no_count'] = len(vote['no_votes'])
    vote['other_count'] = len(vote['other_votes'])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    if vote['yes_count'] > (vote['no_count'] + vote['other_count']):
        vote['passed'] = True
    else:
        vote['passed'] = False

    bill.add_vote(vote)
def scrape_house(self, session):
    """Scrape House journal PDFs for a session and save each roll-call vote.

    Each linked PDF is converted to text and walked line by line with a
    small state machine: a totals line (vote_re) arms `in_vote`, the
    following lines of "<name> <code>" pairs build `cur_vote`, and the
    vote is saved when the member listing ends.
    """
    url = journals % (session, 'House')
    page = self.lxmlize(url)
    hrefs = page.xpath("//font//a")

    for href in hrefs:
        (path, response) = self.urlretrieve(href.attrib['href'])
        data = convert_pdf(path, type='text')

        # Per-document state machine flags.
        in_vote = False
        cur_vote = {}          # member name -> vote code
        known_date = None      # journal date, once found
        cur_vote_count = None  # (yes, no, other) from the totals line
        in_question = False
        cur_question = None    # question text, may span several lines
        cur_bill_id = None

        for line in data.split("\n"):
            if known_date is None:
                # First date-like line gives the journal date.
                dt = date_re.findall(line)
                if dt != []:
                    dt, dow = dt[0]
                    known_date = datetime.datetime.strptime(
                        dt, "%A, %B %d, %Y")

            # Lines not starting with a line number are "non-standard";
            # those matching boilerplate terms are skipped entirely.
            non_std = False
            if re.match("(\s+)?\d+.*", line) is None:
                non_std = True
                l = line.lower().strip()
                skip = False
                blacklist = [
                    "house", "page", "general assembly",
                    "state of colorado", "session", "legislative day"
                ]
                for thing in blacklist:
                    if thing in l:
                        skip = True
                if skip:
                    continue

            # Track the most recent bill id mentioned in the text.
            found = re.findall(
                "(?P<bill_id>(H|S|SJ|HJ)(B|M|R)\d{2}-\d{3,4})", line
            )
            if found != []:
                found = found[0]
                cur_bill_id, chamber, typ = found

            try:
                if not non_std:
                    # Drop the leading line number.
                    _, line = line.strip().split(" ", 1)
                    line = line.strip()
            except ValueError:
                # Line number with nothing after it: reset vote/question
                # collection and move on.
                in_vote = False
                in_question = False
                continue

            if in_question:
                # Question text continues onto this line.
                cur_question += " " + line.strip()
                continue

            if ("The question being" in line) or \
               ("On motion of" in line) or \
               ("the following" in line) or \
               ("moved that the" in line):
                cur_question = line.strip()
                in_question = True

            if in_vote:
                if line == "":
                    likely_garbage = True
                # NOTE(review): this unconditionally overwrites the
                # assignment above, making the empty-line check dead
                # code — looks like an upstream bug; confirm intent
                # before changing.
                likely_garbage = False
                if "co-sponsor" in line.lower():
                    likely_garbage = True
                if 'the speaker' in line.lower():
                    likely_garbage = True

                votes = re.findall(votes_re, line)
                if likely_garbage:
                    votes = []
                for person, _, v in votes:
                    cur_vote[person] = v

                # The Speaker's entry is the last line of the listing.
                last_line = False
                for who, _, vote in votes:
                    if who.lower() == "speaker":
                        last_line = True

                if votes == [] or last_line:
                    in_vote = False
                    # save vote
                    yes, no, other = cur_vote_count
                    if cur_bill_id is None or cur_question is None:
                        continue
                    bc = {
                        "H": "lower",
                        "S": "upper",
                        "J": "joint"
                    }[cur_bill_id[0].upper()]
                    vote = Vote('lower', known_date, cur_question,
                                (yes > no), yes, no, other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)
                    vote.add_source(href.attrib['href'])
                    vote.add_source(url)
                    for person in cur_vote:
                        if person is None:
                            continue
                        vot = cur_vote[person]
                        # The vote code sometimes sticks to the end of
                        # the name; peel it off and use it instead.
                        if person.endswith("Y"):
                            vot = "Y"
                            person = person[:-1]
                        if person.endswith("N"):
                            vot = "N"
                            person = person[:-1]
                        if person.endswith("E"):
                            vot = "E"
                            person = person[:-1]
                        if vot == 'Y':
                            vote.yes(person)
                        elif vot == 'N':
                            vote.no(person)
                        elif vot == 'E' or vot == '-':
                            vote.other(person)
                    self.save_vote(vote)
                    # Reset all per-vote state for the next roll call.
                    cur_vote = {}
                    in_question = False
                    cur_question = None
                    in_vote = False
                    cur_vote_count = None
                continue

            # Not inside a vote: look for a totals line to arm one.
            summ = vote_re.findall(line)
            if summ == []:
                continue
            summ = summ[0]
            yes, no, exc, ab = summ
            yes, no, exc, ab = \
                int(yes), int(no), int(exc), int(ab)
            other = exc + ab
            cur_vote_count = (yes, no, other)
            in_vote = True
            continue
        os.unlink(path)  # remove the downloaded PDF