def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
    """Record every row of a bill's action table and yield any votes found.

    The first child of ``action_table`` is skipped.  Each later row is read
    as three cells: a ``MM/DD/YYYY`` date, a short actor code and the action
    text.  The row is added to ``bill`` as an action, and when
    ``self.parse_vote`` finds a vote in the action text a VoteEvent is
    built from it and yielded.

    Raises:
        ValueError: if a row's date does not match ``MM/DD/YYYY``.
        KeyError: if a row's actor code is not in the table below.
    """
    actor_names = {
        "S": "upper",
        "H": "lower",
        "D": "Data Systems",
        "$": "Appropriation measure",
        "ConAm": "Constitutional Amendment",
    }

    for row in action_table.xpath('*')[1:]:
        when = dt.datetime.strptime(
            row[0].text_content(), "%m/%d/%Y").strftime('%Y-%m-%d')
        actor_code = row[1].text_content()
        text = row[2].text_content()
        chamber = actor_names[actor_code]

        act_type, committees = categorize_action(text)

        # Translate committee short codes to full names for the matcher;
        # codes that self.short_ids does not know are dropped.
        full_names = []
        for short_code in committees or ():
            try:
                full_names.append(self.short_ids[short_code]['name'])
            except KeyError:
                continue

        recorded = bill.add_action(
            text, when, chamber=chamber, classification=act_type)
        for full_name in full_names:
            recorded.add_related_entity(
                name=full_name, entity_type="organization")

        parsed = self.parse_vote(text)
        if not parsed:
            continue

        tallies, motion = parsed
        event = VoteEvent(
            start_date=when,
            chamber=chamber,
            bill=bill_id,
            bill_chamber=bill_chamber,
            legislative_session=session,
            motion_text=motion,
            result='pass' if 'passed' in text.lower() else 'fail',
            classification='passage',
        )
        event.add_source(url)

        # Missing tallies are recorded as zero.
        for option, key in (('yes', 'n_yes'),
                            ('no', 'n_no'),
                            ('not voting', 'n_excused')):
            event.set_count(option, int(tallies[key] or 0))

        # Both the 'yes' and 'yes_resv' lists are recorded as yes votes.
        for key in ('yes', 'yes_resv'):
            for voter in split_specific_votes(tallies[key]):
                event.yes(voter)
        for voter in split_specific_votes(tallies['no']):
            event.no(voter)
        for voter in split_specific_votes(tallies['excused']):
            event.vote('not voting', voter)

        yield event
def record_votes(root, session, chamber):
    """Yield a VoteEvent for each valid vote block found under ``root``.

    Every div element matching the module-level ``vote_selectors`` is
    wrapped in ``MaybeVote``; wrappers whose ``is_valid`` is false are
    skipped.  Only the first two characters of ``session`` are used as the
    legislative session, and no start date is recorded.
    """
    selector = '//div' + ''.join(vote_selectors)

    for node in root.xpath(selector):
        tally = MaybeVote(node)
        if not tally.is_valid:
            continue

        # Motion text and classification share the same label.
        label = 'passage' if tally.passed else 'other'
        event = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=label,
            result='pass' if tally.passed else 'fail',
            classification=label,
            legislative_session=session[:2],
            bill=tally.bill_id,
            bill_chamber=tally.chamber,
        )

        # Falsy totals are stored as zero.
        for option, total in (('yes', tally.yeas),
                              ('no', tally.nays),
                              ('not voting', tally.present)):
            event.set_count(option, total or 0)

        by_choice = tally.votes
        for name in by_choice['yeas']:
            event.yes(name)
        for name in by_choice['nays']:
            event.no(name)
        for name in by_choice['present']:
            event.vote('not voting', name)
        for name in by_choice['absent']:
            event.vote('absent', name)

        yield event
def record_votes(root, session, chamber):
    """Generate VoteEvents from the vote div elements under ``root``.

    The XPath is ``//div`` followed by the joined module-level
    ``vote_selectors``.  Each match is handed to ``MaybeVote`` and ignored
    unless ``is_valid`` is true.  The legislative session is the first two
    characters of ``session``; the start date is left as ``None``.
    """
    matches = root.xpath('//div' + ''.join(vote_selectors))
    candidates = (MaybeVote(match) for match in matches)

    for candidate in candidates:
        if candidate.is_valid:
            passed = candidate.passed
            kind = 'passage' if passed else 'other'

            vote_event = VoteEvent(
                chamber=chamber,
                start_date=None,
                motion_text=kind,
                result='pass' if passed else 'fail',
                classification=kind,
                legislative_session=session[:2],
                bill=candidate.bill_id,
                bill_chamber=candidate.chamber,
            )

            # Totals that are missing or falsy are recorded as 0.
            vote_event.set_count('yes', candidate.yeas or 0)
            vote_event.set_count('no', candidate.nays or 0)
            vote_event.set_count('not voting', candidate.present or 0)

            ballots = candidate.votes
            for voter in ballots['yeas']:
                vote_event.yes(voter)
            for voter in ballots['nays']:
                vote_event.no(voter)
            # 'present' and 'absent' go through the generic recorder.
            for key, option in (('present', 'not voting'),
                                ('absent', 'absent')):
                for voter in ballots[key]:
                    vote_event.vote(option, voter)

            yield vote_event
def record_votes(root, session, chamber):
    """Yield one VoteEvent per valid vote element located under ``root``.

    Elements are selected with ``//div`` plus the concatenated module-level
    ``vote_selectors`` and wrapped in ``MaybeVote``.  Invalid wrappers are
    skipped.  ``session`` is truncated to its first two characters, and the
    event is created without a start date.
    """
    query = "//div" + "".join(vote_selectors)

    for element in root.xpath(query):
        parsed = MaybeVote(element)
        if not parsed.is_valid:
            continue

        if parsed.passed:
            motion, outcome = "passage", "pass"
        else:
            motion, outcome = "other", "fail"

        record = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=motion,
            result=outcome,
            classification=motion,
            legislative_session=session[:2],
            bill=parsed.bill_id,
            bill_chamber=parsed.chamber,
        )

        # A falsy total is stored as zero.
        record.set_count("yes", parsed.yeas or 0)
        record.set_count("no", parsed.nays or 0)
        record.set_count("not voting", parsed.present or 0)

        names = parsed.votes
        for person in names["yeas"]:
            record.yes(person)
        for person in names["nays"]:
            record.no(person)
        for person in names["present"]:
            record.vote("not voting", person)
        for person in names["absent"]:
            record.vote("absent", person)

        yield record
def _parse_senate_votes(self, vote_data, bill, url):
    """Build a VoteEvent from one Senate vote record.

    Args:
        vote_data: Mapping describing a single vote.  Reads ``voteDate``
            (``YYYY-MM-DD``), ``voteType`` (``FLOOR`` or ``COMMITTEE``),
            ``version``, ``committee["name"]`` for committee votes, and
            ``memberVotes["items"]``, which maps roll codes to mappings
            holding an ``items`` list of legislators.
        bill: Passed to the VoteEvent as ``bill``.
        url: Added as the vote's source.

    Returns:
        The VoteEvent with individual votes, counts and result filled in.
        The result is ``pass`` only when yes votes outnumber no votes.

    Raises:
        ValueError: If ``voteType`` is neither ``FLOOR`` nor ``COMMITTEE``,
            or ``voteDate`` is not in ``YYYY-MM-DD`` form.
    """
    vote_datetime = datetime.datetime.strptime(vote_data["voteDate"], "%Y-%m-%d")

    vote_type = vote_data["voteType"]
    if vote_type == "FLOOR":
        motion = "Floor Vote"
    elif vote_type == "COMMITTEE":
        motion = "{} Vote".format(vote_data["committee"]["name"])
    else:
        raise ValueError("Unknown vote type encountered.")

    if vote_data["version"]:
        motion += " - Version: " + vote_data["version"]

    vote = VoteEvent(
        chamber="upper",
        start_date=vote_datetime.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        # Placeholder; the real result is set once the rolls are counted.
        result="fail",
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data["memberVotes"]["items"]

    def roll_names(*roll_codes):
        """Return the full names listed under the given roll codes.

        A roll that is absent, empty, or lacks an ``items`` list simply
        contributes no names; every roll type is treated the same way.
        """
        names = []
        for code in roll_codes:
            members = (vote_rolls.get(code) or {}).get("items") or []
            names.extend(member["fullName"] for member in members)
        return names

    # AYEWR is counted together with AYE as a yes vote.
    yes_names = roll_names("AYE", "AYEWR")
    no_names = roll_names("NAY")
    other_names = roll_names("EXC", "ABS", "ABD")

    for name in yes_names:
        vote.yes(name)
    for name in no_names:
        vote.no(name)
    for name in other_names:
        vote.vote("other", name)

    vote.result = "pass" if len(yes_names) > len(no_names) else "fail"
    vote.set_count("yes", len(yes_names))
    vote.set_count("no", len(no_names))
    vote.set_count("other", len(other_names))

    return vote
def scrape_votes_old(self, bill, billname, session):
    """Scrape roll-call votes for a bill from archives.legislature.state.oh.us.

    Every link on the bill page whose href contains ``JournalText`` is
    treated as one vote: the link text is the date (``MM/DD/YYYY``), the
    second cell of its row holds ``"<chamber> - <motion>"``, and the next
    row holds the yea and nay name tables.

    Args:
        bill: Passed to each VoteEvent as ``bill``.
        billname: Combined with ``session`` to form the page's ``ID``.
        session: Session identifier used in the URL.

    Yields:
        One VoteEvent per journal link, with names and yes/no counts.

    Raises:
        ScrapeError: If the chamber is neither ``House`` nor ``Senate``.
    """
    vote_url = (
        "http://archives.legislature.state.oh.us/bills.cfm?ID="
        + session + "_" + billname
    )
    page = lxml.html.fromstring(self.get(vote_url).text)

    def names_in(vote_row, div_xpath):
        """Return the non-empty cell texts of the first div matching *div_xpath*."""
        div = vote_row.xpath(div_xpath)[0]
        cells = (td.xpath("string()") for td in div.xpath("table/tr/td"))
        return [name for name in cells if name]

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        date = self._tz.localize(
            datetime.datetime.strptime(jlink.text, "%m/%d/%Y")).date()
        date = "{:%Y-%m-%d}".format(date)

        details = jlink.xpath("string(../../../td[2])")
        parts = details.split(" - ")

        chamber = parts[0]
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        else:
            raise ScrapeError("Bad chamber: %s" % chamber)

        # Only the first line after the chamber is the motion.
        motion = parts[1].split("\n")[0].strip()

        vote_row = jlink.xpath("../../..")[0].getnext()
        yeas = names_in(vote_row, "td/font/div[contains(@id, 'Yea')]")
        nays = names_in(vote_row, "td/font/div[contains(@id, 'Nay')]")

        yes_count = len(yeas)
        no_count = len(nays)

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result="pass" if yes_count > no_count else "fail",
            bill=bill,
            classification="passed",
        )

        # Store the totals that decided the result alongside the names.
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)

        for yes in yeas:
            vote.yes(yes)
        for no in nays:
            vote.no(no)

        vote.add_source(vote_url)

        yield vote
def scrape_vote(self, bill, vote_id, session):
    """Fetch one roll call from legis.delaware.gov and yield it as a Vote.

    The roll-call id is POSTed to the JSON endpoint.  When the response is
    empty nothing is yielded.  Otherwise the ``Model`` object supplies the
    chamber, date, motion, status, counts and per-member votes.

    Args:
        bill: Bill object; its ``identifier`` is attached to the vote.
        vote_id: Roll-call id sent as ``rollCallId``.
        session: Legislative session stored on the vote.
    """
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    payload = {
        'rollCallId': vote_id,
        'sort': '',
        'group': '',
        'filter': '',
    }

    page = self.post(url=vote_url, data=payload, allow_redirects=True).json()
    if not page:
        return

    roll = page['Model']
    vote_chamber = self.chamber_map[roll['ChamberName']]

    # Timestamps look like "7/1/16 01:00 AM"; only the date part is kept.
    taken_at = dt.datetime.strptime(roll['TakenAtDateTime'], '%m/%d/%y %I:%M %p')
    vote_date = taken_at.strftime('%Y-%m-%d')

    # TODO: What does this code mean?
    vote_motion = roll['RollCallVoteType']

    if roll['RollCallStatus'] == 'Passed':
        vote_passed = 'pass'
    else:
        vote_passed = 'fail'

    # Everything that is neither yes nor no is lumped into "other".
    other_count = sum(
        int(roll[key])
        for key in ('NotVotingCount', 'VacantVoteCount',
                    'AbsentVoteCount', 'ConflictVoteCount')
    )

    vote = Vote(
        chamber=vote_chamber,
        start_date=vote_date,
        motion_text=vote_motion,
        result=vote_passed,
        classification='other',
        bill=bill.identifier,
        legislative_session=session,
    )
    vote.add_source(vote_url)
    vote.set_count('yes', roll['YesVoteCount'])
    vote.set_count('no', roll['NoVoteCount'])
    vote.set_count('other', other_count)

    for row in roll['AssemblyMemberVotes']:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        try:
            name = self.legislators_by_short[str(row['ShortName'])]['DisplayName']
        except KeyError:
            self.warning('could not find legislator short name %s',
                         row['ShortName'])
            name = row['ShortName']

        code = row['SelectVoteTypeCode']
        if code == 'Y':
            vote.yes(name)
        elif code == 'N':
            vote.no(name)
        else:
            vote.vote('other', name)

    # bill.add_vote_event(vote)
    yield vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Build a VoteEvent from one Senate vote record.

    Args:
        vote_data: Mapping describing a single vote.  Reads ``voteDate``
            (``YYYY-MM-DD``), ``voteType`` (``FLOOR`` or ``COMMITTEE``),
            ``committee['name']`` for committee votes, and
            ``memberVotes['items']``, which maps roll codes to mappings
            holding an ``items`` list of legislators.
        bill: Passed to the VoteEvent as ``bill``.
        url: Added as the vote's source.

    Returns:
        The VoteEvent with individual votes, counts and result filled in.
        The result is ``pass`` only when yes votes outnumber no votes.

    Raises:
        ValueError: If ``voteType`` is neither ``FLOOR`` nor ``COMMITTEE``,
            or ``voteDate`` is not in ``YYYY-MM-DD`` form.
    """
    vote_datetime = datetime.datetime.strptime(
        vote_data['voteDate'], '%Y-%m-%d')

    vote_type = vote_data['voteType']
    if vote_type == 'FLOOR':
        motion = 'Floor Vote'
    elif vote_type == 'COMMITTEE':
        motion = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote = VoteEvent(
        chamber='upper',
        start_date=vote_datetime.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        # Placeholder; the real result is set once the rolls are counted.
        result='fail',
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data['memberVotes']['items']

    def roll_names(*roll_codes):
        """Return the full names listed under the given roll codes.

        A roll that is absent, empty, or lacks an ``items`` list simply
        contributes no names; every roll type is treated the same way.
        """
        names = []
        for code in roll_codes:
            members = (vote_rolls.get(code) or {}).get('items') or []
            names.extend(member['fullName'] for member in members)
        return names

    # AYEWR is counted together with AYE as a yes vote.
    yes_names = roll_names('AYE', 'AYEWR')
    no_names = roll_names('NAY')
    other_names = roll_names('EXC', 'ABS', 'ABD')

    for name in yes_names:
        vote.yes(name)
    for name in no_names:
        vote.no(name)
    for name in other_names:
        vote.vote('other', name)

    vote.result = 'pass' if len(yes_names) > len(no_names) else 'fail'
    vote.set_count('yes', len(yes_names))
    vote.set_count('no', len(no_names))
    vote.set_count('other', len(other_names))

    return vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Convert a single Senate vote record into a VoteEvent.

    Args:
        vote_data: Mapping for one vote.  The keys read are ``voteDate``
            (``YYYY-MM-DD``), ``voteType`` (``FLOOR`` or ``COMMITTEE``),
            ``committee['name']`` (committee votes only) and
            ``memberVotes['items']``, a mapping from roll code to a
            mapping with an ``items`` list of legislators.
        bill: Stored on the VoteEvent as ``bill``.
        url: Source URL attached to the vote.

    Returns:
        The populated VoteEvent.  It passes only if yes votes strictly
        outnumber no votes.

    Raises:
        ValueError: For an unknown ``voteType`` or an unparsable date.
    """
    vote_datetime = datetime.datetime.strptime(vote_data['voteDate'], '%Y-%m-%d')

    vote_type = vote_data['voteType']
    if vote_type == 'FLOOR':
        motion = 'Floor Vote'
    elif vote_type == 'COMMITTEE':
        motion = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote = VoteEvent(
        chamber='upper',
        start_date=vote_datetime.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        # Provisional value, replaced after the rolls have been tallied.
        result='fail',
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data['memberVotes']['items']

    def roll_names(*roll_codes):
        """Collect ``fullName`` values from the listed rolls.

        Missing rolls, empty rolls and rolls without ``items`` all yield
        no names, so yes, no and other rolls share one lookup rule.
        """
        collected = []
        for code in roll_codes:
            members = (vote_rolls.get(code) or {}).get('items') or []
            collected.extend(member['fullName'] for member in members)
        return collected

    # Rolls grouped by the option they are recorded under.
    yes_names = roll_names('AYE', 'AYEWR')
    no_names = roll_names('NAY')
    other_names = roll_names('EXC', 'ABS', 'ABD')

    for name in yes_names:
        vote.yes(name)
    for name in no_names:
        vote.no(name)
    for name in other_names:
        vote.vote('other', name)

    vote.result = 'pass' if len(yes_names) > len(no_names) else 'fail'
    vote.set_count('yes', len(yes_names))
    vote.set_count('no', len(no_names))
    vote.set_count('other', len(other_names))

    return vote
def scrape_vote(self, bill, vote_json, session):
    """Yield one VoteEvent built from a roll-call JSON record.

    Args:
        bill: Passed to the VoteEvent as ``bill``.
        vote_json: Mapping for one roll call.  Reads ``amendmentNumber``,
            ``action``, ``chamber``, ``voteDate``, ``billNumber``, the
            ``*VotesCount`` totals, and the comma-separated name strings
            ``yesVotes``, ``noVotes``, ``absentVotes``, ``excusedVotes``
            and ``conflictVotes``.
        session: Legislative session; also used in the source URL.

    The motion is the action text, prefixed with the amendment number when
    there is one.  The vote passes when the yes total exceeds the no total.
    Conflict votes are recorded under the ``other`` option.
    """
    if vote_json['amendmentNumber']:
        motion = '{}: {}'.format(
            vote_json['amendmentNumber'], vote_json['action'])
    else:
        motion = vote_json['action']

    result = 'pass' if vote_json['yesVotesCount'] > vote_json['noVotesCount'] else 'fail'

    v = VoteEvent(
        chamber=self.chamber_abbrev_map[vote_json['chamber']],
        start_date=self.parse_local_date(vote_json['voteDate']),
        motion_text=motion,
        result=result,
        legislative_session=session,
        bill=bill,
        classification='other',
    )

    v.set_count(option='yes', value=vote_json['yesVotesCount'])
    v.set_count('no', vote_json['noVotesCount'])
    v.set_count('absent', vote_json['absentVotesCount'])
    v.set_count('excused', vote_json['excusedVotesCount'])
    v.set_count('other', vote_json['conflictVotesCount'])

    def names_in(field):
        """Return the stripped, non-blank names from a comma-separated field."""
        stripped = (name.strip() for name in (vote_json[field] or '').split(','))
        return [name for name in stripped if name]

    for name in names_in('yesVotes'):
        v.yes(name)
    for name in names_in('noVotes'):
        v.no(name)

    # add votes with other classifications
    # option can be 'yes', 'no', 'absent',
    # 'abstain', 'not voting', 'paired', 'excused'
    # Names are stripped here too, so voters are stored without the
    # whitespace that follows each comma.
    for field, option in (('absentVotes', 'absent'),
                          ('excusedVotes', 'excused'),
                          ('conflictVotes', 'other')):
        for name in names_in(field):
            v.vote(option=option, voter=name)

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, vote_json['billNumber'])
    v.add_source(source_url)

    yield v
def handle_page(self):
    """Parse a committee vote page in ``self.doc`` and yield a VoteEvent.

    The date, committee and action are read from spans with fixed ids;
    the totals are pulled out of the text of the last nested table; and
    each non-empty cell of the innermost tables supplies one member
    (``span[2]``) and that member's vote marker (``span[1]``).

    Yields:
        A single VoteEvent for chamber ``lower`` with classification
        ``committee``.  It passes when yeas outnumber nays.

    Raises:
        ValueError: If an expected span is missing or duplicated, or a
            vote marker is not ``Y``, ``N``, ``-`` or a parenthesised
            ``(Y)``/``(N)``.
    """
    (date_text, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
    date = format_datetime(
        datetime.datetime.strptime(date_text, '%m/%d/%Y %I:%M:%S %p'),
        'US/Eastern')

    # Collapse all whitespace so the totals can be matched on one line.
    totals = self.doc.xpath('//table//table')[-1].text_content()
    totals = re.sub(r'(?mu)\s+', " ", totals).strip()
    (yes_count, no_count, other_count) = [
        int(x) for x in re.search(
            r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
            r'Total Missed:\s+(\d+)', totals).groups()
    ]
    result = 'pass' if yes_count > no_count else 'fail'

    (committee, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
    (action, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs['bill'],
        chamber='lower',
        motion_text=motion,
        result=result,
        classification='committee',
    )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', other_count)

    for cell in self.doc.xpath('//table//table//table//td'):
        if not cell.text_content().strip():
            continue

        (member, ) = cell.xpath('span[2]//text()')
        (marker, ) = cell.xpath('span[1]//text()')

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote('not voting', member)
        # Parenthetical votes appear to not be counted in the
        # totals for Yea, Nay, _or_ Missed
        elif re.search(r'\([YN]\)', marker):
            continue
        else:
            raise ValueError(
                "Unknown vote type found: {}".format(marker))

    yield vote
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape one House vote page and yield it as a VoteEvent.

    Nothing is yielded when the request was redirected to the
    "no vote found" page, when the page has no motion heading, or when no
    paragraph on the page parses as a ``MM/DD/YYYY`` date.

    Args:
        chamber: Chamber of the bill, stored as ``bill_chamber``.
        session: Legislative session stored on the vote.
        bill_id: Identifier of the bill the vote belongs to.
        vote_url: Page to fetch; also used as source and ``pupa_id``.
    """
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'

    resp = self.get(vote_url)
    html = resp.text

    # sometimes the link is broken, will redirect to NO_VOTE_URL
    if resp.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)

    headings = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")
    if not headings:
        self.logger.warning("Bill was missing a motion number, skipping")
        return
    motion = headings[0]

    # The second h3 holds the tally; the yes total is its first word and
    # the no total its fourth.
    tally_words = doc.xpath(
        ".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
    yeas = int(tally_words[0])
    nays = int(tally_words[3])

    # Take the first paragraph that parses as a date.
    for text in doc.xpath(".//div[@id='leg_PageContent']/div/p/text()"):
        try:
            parsed = datetime.datetime.strptime(text.strip(), '%m/%d/%Y')
        except ValueError:
            continue
        date = parsed.date()
        break
    else:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    vote = VoteEvent(
        chamber='lower',
        start_date=date,
        motion_text=motion,
        result='pass' if yeas > nays else 'fail',
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber=chamber,
    )
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url

    # first table has YEAs, second table is nays
    for table_xpath, record in (('//table[1]/tr/td/font/text()', vote.yes),
                                ('//table[2]/tr/td/font/text()', vote.no)):
        for name in doc.xpath(table_xpath):
            record(name.strip())

    yield vote
def scrape_vote(self, bill, vote_json, session):
    """Yield one VoteEvent built from a roll-call JSON record.

    Args:
        bill: Passed to the VoteEvent as ``bill``.
        vote_json: Mapping for one roll call.  Reads ``amendmentNumber``,
            ``action``, ``chamber``, ``voteDate``, ``billNumber``, the
            ``*VotesCount`` totals, and the comma-separated name strings
            ``yesVotes``, ``noVotes``, ``absentVotes``, ``excusedVotes``
            and ``conflictVotes``.
        session: Legislative session; also used in the source URL.

    The motion is the action text, prefixed with the amendment number when
    there is one.  The vote passes when the yes total exceeds the no total.
    Conflict votes are recorded under the ``other`` option.
    """
    if vote_json["amendmentNumber"]:
        motion = "{}: {}".format(vote_json["amendmentNumber"], vote_json["action"])
    else:
        motion = vote_json["action"]

    result = (
        "pass"
        if vote_json["yesVotesCount"] > vote_json["noVotesCount"]
        else "fail"
    )

    v = VoteEvent(
        chamber=self.chamber_abbrev_map[vote_json["chamber"]],
        start_date=self.parse_local_date(vote_json["voteDate"]),
        motion_text=motion,
        result=result,
        legislative_session=session,
        bill=bill,
        classification="other",
    )

    v.set_count(option="yes", value=vote_json["yesVotesCount"])
    v.set_count("no", vote_json["noVotesCount"])
    v.set_count("absent", vote_json["absentVotesCount"])
    v.set_count("excused", vote_json["excusedVotesCount"])
    v.set_count("other", vote_json["conflictVotesCount"])

    def names_in(field):
        """Return the stripped, non-blank names from a comma-separated field."""
        stripped = (name.strip() for name in (vote_json[field] or "").split(","))
        return [name for name in stripped if name]

    for name in names_in("yesVotes"):
        v.yes(name)
    for name in names_in("noVotes"):
        v.no(name)

    # add votes with other classifications
    # option can be 'yes', 'no', 'absent',
    # 'abstain', 'not voting', 'paired', 'excused'
    # Names are stripped here too, so voters are stored without the
    # whitespace that follows each comma.
    for field, option in (
        ("absentVotes", "absent"),
        ("excusedVotes", "excused"),
        ("conflictVotes", "other"),
    ):
        for name in names_in(field):
            v.vote(option=option, voter=name)

    source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
        session, vote_json["billNumber"])
    v.add_source(source_url)

    yield v
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
    """Add each action-table row to ``bill`` and yield the votes it contains.

    The first child of ``action_table`` is skipped.  The remaining rows
    are read as date (``MM/DD/YYYY``), actor code and action text.  Actor
    codes are upper-cased before being looked up.  Whenever
    ``self.parse_vote`` recognises a vote in the action text, a VoteEvent
    is created and yielded.

    Raises:
        ValueError: if a row's date does not match ``MM/DD/YYYY``.
        KeyError: if the upper-cased actor code is not in the table below.
    """
    actor_names = {
        "S": "upper",
        "H": "lower",
        "D": "legislature",  # "Data Systems",
        "$": "Appropriation measure",
        "CONAM": "Constitutional Amendment",
    }

    data_rows = action_table.xpath('*')[1:]
    for cells in data_rows:
        stamp = dt.datetime.strptime(cells[0].text_content(), "%m/%d/%Y")
        when = stamp.strftime('%Y-%m-%d')
        actor_code = cells[1].text_content().upper()
        text = cells[2].text_content()
        chamber = actor_names[actor_code]

        act_type, committees = categorize_action(text)

        # Translate committee short codes to full names for the matcher;
        # codes that self.short_ids does not know are left out.
        full_names = []
        if committees:
            for short_code in committees:
                try:
                    full_name = self.short_ids[short_code]['name']
                except KeyError:
                    continue
                full_names.append(full_name)

        recorded = bill.add_action(text, when, chamber=chamber,
                                   classification=act_type)
        for full_name in full_names:
            recorded.add_related_entity(name=full_name,
                                        entity_type="organization")

        parsed = self.parse_vote(text)
        if parsed:
            tallies, motion = parsed
            outcome = 'pass' if 'passed' in text.lower() else 'fail'
            event = VoteEvent(
                start_date=when,
                chamber=chamber,
                bill=bill_id,
                bill_chamber=bill_chamber,
                legislative_session=session,
                motion_text=motion,
                result=outcome,
                classification='passage',
            )
            event.add_source(url)

            # Missing tallies are recorded as zero.
            event.set_count('yes', int(tallies['n_yes'] or 0))
            event.set_count('no', int(tallies['n_no'] or 0))
            event.set_count('not voting', int(tallies['n_excused'] or 0))

            # 'yes' and 'yes_resv' names are both recorded as yes votes.
            for key, record in (('yes', event.yes),
                                ('yes_resv', event.yes),
                                ('no', event.no)):
                for voter in split_specific_votes(tallies[key]):
                    record(voter)
            for voter in split_specific_votes(tallies['excused']):
                event.vote('not voting', voter)

            yield event
def handle_page(self):
    """Parse a committee vote page in ``self.doc`` and yield a VoteEvent.

    Date, totals, committee and action are read from spans whose ids
    contain ``lblDate``, ``lblYeas``, ``lblNays``, ``lblMissed``,
    ``lblCommittee`` and ``lblAction``.  Member votes come from the list
    items of any ``ul`` whose class contains ``vote-list``.

    Raises:
        ValueError: If a single-valued span is missing or duplicated, or a
            vote marker is not ``Y``, ``N``, ``-`` or a parenthesised
            ``(Y)``/``(N)``.
    """
    doc = self.doc

    date_text, = doc.xpath('//span[contains(@id, "lblDate")]/text()')
    date = format_datetime(
        datetime.datetime.strptime(date_text, '%m/%d/%Y %I:%M:%S %p'),
        'US/Eastern')

    yes_count = int(doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
    no_count = int(doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
    other_count = int(doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0])

    committee, = doc.xpath('//span[contains(@id, "lblCommittee")]/text()')
    action, = doc.xpath('//span[contains(@id, "lblAction")]/text()')

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs['bill'],
        chamber='lower',
        motion_text="{} ({})".format(action, committee),
        result='pass' if yes_count > no_count else 'fail',
        classification='committee',
    )
    vote.add_source(self.url)
    for option, total in (('yes', yes_count),
                          ('no', no_count),
                          ('not voting', other_count)):
        vote.set_count(option, total)

    for item in doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not item.text_content().strip():
            continue

        # span[2] is the member, span[1] the marker for how they voted.
        member, = item.xpath('span[2]//text()')
        marker, = item.xpath('span[1]//text()')

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote('not voting', member)
        # Parenthetical votes appear to not be counted in the
        # totals for Yea, Nay, _or_ Missed
        elif re.search(r'\([YN]\)', marker):
            continue
        else:
            raise ValueError(
                "Unknown vote type found: {}".format(marker))

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a roll-call page and yield the vote it describes.

    The page text is searched for ``YEAS``, ``NAYS`` and ``ABSENT`` (with
    optional ``OR NOT VOTING``) tallies.  The text after each tally is
    split on runs of two or more whitespace characters to get the names.
    ``url`` is added as a source on both the bill and the vote.

    Args:
        bill: Bill the vote belongs to.
        actor: Used as the vote's chamber only when it is ``upper`` or
            ``lower``; any other value gives an empty chamber.
        date: Start date of the vote.
        motion: Motion text.
        url: Roll-call page to download.
        uniqid: Stringified and used as the vote identifier.

    Raises:
        AttributeError: If the page does not contain the expected tallies.
    """
    text = self.get(url).text
    bill.add_source(url)

    tally = re.search(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        text,
        re.MULTILINE | re.DOTALL,
    )
    yes_total = int(tally.group(1))
    no_total = int(tally.group(3))
    other_total = int(tally.group(6))

    roll = Vote(
        chamber=actor if actor in ("upper", "lower") else "",
        start_date=date,
        motion_text=motion,
        result="pass" if yes_total > no_total else "fail",
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    roll.add_source(url)
    roll.set_count("yes", yes_total)
    roll.set_count("no", no_total)
    roll.set_count("other", other_total)

    # Names sit in the free text following each tally.
    yes_names = re.split(r"\s{2,}", tally.group(2).strip())
    no_names = re.split(r"\s{2,}", tally.group(4).strip())
    other_names = re.split(r"\s{2,}", tally.group(7).strip())

    for name in filter(None, yes_names):
        roll.yes(name)
    for name in filter(None, no_names):
        roll.no(name)
    for name in filter(None, other_names):
        roll.vote("other", name)

    yield roll
def test_full_vote_event():
    """Import a vote event with a bill, two voters and one count.

    Checks that exactly one VoteEvent is stored, that it is linked to the
    session and bill, that the single count is kept, and that both votes
    resolve to the right Person.
    """
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    john = ScrapePerson('John Smith', primary_org='lower')
    adam = ScrapePerson('Adam Smith', primary_org='lower')
    house = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      from_organization=house._id)

    scraped = ScrapeVoteEvent(
        legislative_session='1900',
        motion_text='passage',
        start_date='1900-04-01',
        classification='passage:bill',
        result='pass',
        bill_chamber='lower',
        bill='HB 1',
        organization=house._id,
    )
    scraped.set_count('yes', 20)
    scraped.yes('John Smith')
    scraped.no('Adam Smith')

    # Import dependencies first: organization, people, memberships, bill.
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict()])

    person_importer = PersonImporter('jid')
    person_importer.import_data([john.as_dict(), adam.as_dict()])

    membership_importer = MembershipImporter(
        'jid', person_importer, org_importer, DumbMockImporter())
    membership_importer.import_data(
        [john._related[0].as_dict(), adam._related[0].as_dict()])

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([bill.as_dict()])

    vote_importer = VoteEventImporter(
        'jid', person_importer, org_importer, bill_importer)
    vote_importer.import_data([scraped.as_dict()])

    assert VoteEvent.objects.count() == 1
    stored = VoteEvent.objects.get()
    assert stored.legislative_session == LegislativeSession.objects.get()
    assert stored.motion_classification == ['passage:bill']
    assert stored.bill == Bill.objects.get()

    tally = stored.counts.get()
    assert tally.option == 'yes'
    assert tally.value == 20

    ballots = list(stored.votes.all())
    assert len(ballots) == 2
    for ballot in stored.votes.all():
        # Anything not cast by John Smith must be Adam Smith's no vote.
        if ballot.voter_name == 'John Smith':
            expected_option, expected_name = 'yes', 'John Smith'
        else:
            expected_option, expected_name = 'no', 'Adam Smith'
        assert ballot.option == expected_option
        assert ballot.voter == Person.objects.get(name=expected_name)
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Download a House vote page and yield the VoteEvent it describes.

    The generator ends without yielding if the request was redirected to
    the "no vote found" page, if there is no motion heading, or if none of
    the page's paragraphs is a ``MM/DD/YYYY`` date.

    Args:
        chamber: Chamber of the bill, stored as ``bill_chamber``.
        session: Legislative session stored on the vote.
        bill_id: Identifier of the bill the vote belongs to.
        vote_url: Page to fetch; also used as source and ``pupa_id``.
    """
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'

    resp = self.get(vote_url)
    html = resp.text

    # sometimes the link is broken, will redirect to NO_VOTE_URL
    if resp.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)
    content = "div[@id='leg_PageContent']/div"

    motions = doc.xpath("//" + content + "/h2/text()")
    if not motions:
        self.logger.warning("Bill was missing a motion number, skipping")
        return
    motion = motions[0]

    # Second h3 carries the tally: word 0 is the yes total, word 3 the no
    # total.
    counts = doc.xpath(".//" + content + "/h3/text()")[1].split()
    yeas, nays = int(counts[0]), int(counts[3])

    # Use the first paragraph that parses as a date.
    date = None
    for paragraph in doc.xpath(".//" + content + "/p/text()"):
        try:
            date = datetime.datetime.strptime(
                paragraph.strip(), '%m/%d/%Y').date()
        except ValueError:
            continue
        else:
            break

    if date is None:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    outcome = 'pass' if yeas > nays else 'fail'
    vote = VoteEvent(
        chamber='lower',
        start_date=date,
        motion_text=motion,
        result=outcome,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber=chamber,
    )
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url

    # first table has YEAs
    yea_names = doc.xpath('//table[1]/tr/td/font/text()')
    for yea in yea_names:
        vote.yes(yea.strip())

    # second table is nays
    nay_names = doc.xpath('//table[2]/tr/td/font/text()')
    for nay in nay_names:
        vote.no(nay.strip())

    yield vote
def scrape_votes(self, bill):
    """Yield a Vote for every roll call on ``bill``.

    Roll calls are fetched from the ``GetRollCalls`` endpoint on
    wslwebservices.leg.wa.gov, using the second whitespace-separated part
    of ``bill.identifier`` and ``self.biennium``.  Absent and excused
    totals are combined into a single ``other`` count, and a roll call
    passes only when yeas exceed nays plus that combined count.

    Raises:
        KeyError: If a roll call's agency is neither ``House`` nor
            ``Senate``.
    """
    chambers = {"House": "lower", "Senate": "upper"}

    bill_num = bill.identifier.split()[1]
    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium)
    )
    response = self.get(url)
    tree = lxml.etree.fromstring(response.content)

    for roll_call in xpath(tree, "//wa:RollCall"):
        motion = xpath(roll_call, "string(wa:Motion)")
        seq_no = xpath(roll_call, "string(wa:SequenceNumber)")

        # VoteDate is a timestamp; only the part before "T" is used.
        day = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        chamber = chambers[xpath(roll_call, "string(wa:Agency)")]

        if yes_count > (no_count + other_count):
            outcome = "pass"
        else:
            outcome = "fail"

        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text="{} (#{})".format(motion, seq_no),
            result=outcome,
            classification="other",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)

        for member_vote in xpath(roll_call, "wa:Votes/wa:Vote"):
            name = xpath(member_vote, "string(wa:Name)")
            choice = xpath(member_vote, "string(wa:VOte)")
            if choice == "Yea":
                vote.yes(name)
            elif choice == "Nay":
                vote.no(name)
            else:
                vote.vote("other", name)

        yield vote
def handle_page(self):
    """Parse a committee vote page in ``self.doc`` and yield a VoteEvent.

    The date, committee and action are read from spans with fixed ids;
    the totals are pulled out of the text of the last nested table; and
    each non-empty cell of the innermost tables supplies one member
    (``span[2]``) and that member's vote marker (``span[1]``).  The start
    date is the parsed timestamp in ISO form with a space instead of
    ``T``.

    Yields:
        A single VoteEvent for chamber ``lower`` with classification
        ``committee``.  It passes when yeas outnumber nays.

    Raises:
        ValueError: If an expected span is missing or duplicated, or a
            vote marker is not ``Y``, ``N``, ``-`` or a parenthesised
            ``(Y)``/``(N)``.
    """
    (date_text, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
    date = datetime.datetime.strptime(
        date_text, '%m/%d/%Y %I:%M:%S %p').isoformat().replace('T', ' ')

    # Collapse all whitespace so the totals can be matched on one line.
    totals = self.doc.xpath('//table//table')[-1].text_content()
    totals = re.sub(r'(?mu)\s+', " ", totals).strip()
    (yes_count, no_count, other_count) = [
        int(x) for x in re.search(
            r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
            r'Total Missed:\s+(\d+)', totals).groups()
    ]
    result = 'pass' if yes_count > no_count else 'fail'

    (committee, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
    (action, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs['bill'],
        chamber='lower',
        motion_text=motion,
        result=result,
        classification='committee',
    )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', other_count)

    for cell in self.doc.xpath('//table//table//table//td'):
        if not cell.text_content().strip():
            continue

        (member, ) = cell.xpath('span[2]//text()')
        (marker, ) = cell.xpath('span[1]//text()')

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote('not voting', member)
        # Parenthetical votes appear to not be counted in the
        # totals for Yea, Nay, _or_ Missed
        elif re.search(r'\([YN]\)', marker):
            continue
        else:
            raise ValueError("Unknown vote type found: {}".format(marker))

    yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Yield a single VoteEvent built from the actor, date and row element.

    The row's own text (non-breaking spaces replaced, hyphens removed) is
    the motion, defaulting to ``passage`` when empty.  The first span's
    text is split from the right on ``-`` into an outcome and the yes, no
    and other totals.  Names follow the second and third spans; later
    spans starting with ``Absent`` or ``Excused`` supply the absent names.
    """
    spans = row.xpath(".//span")

    motion = row.text.replace("\u00a0", " ").replace("-", "").strip() or "passage"

    summary = spans[0].text_content()
    outcome_text, yes_count, no_count, other_count = summary.rsplit("-", 3)

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for extra in spans[3:]:
        if extra.text.startswith(("Absent", "Excused")):
            other_votes.extend(self.get_names(extra.tail))

    # Map the first matching keyword to a result; if none match, the raw
    # outcome text is passed through unchanged.
    result = outcome_text
    lowered = outcome_text.lower()
    for keyword, mapped in (("adopted", "pass"),
                            ("passed", "pass"),
                            ("failed", "fail")):
        if keyword in lowered:
            result = mapped
            break

    vote = VoteEvent(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        bill=bill_id,
        bill_chamber=bill_chamber,
        result=result,
        classification="passage",
        legislative_session=session,
    )
    vote.add_source(source)
    vote.set_count("yes", int(yes_count))
    vote.set_count("no", int(no_count))
    vote.set_count("absent", int(other_count))

    def real_names(names):
        """Drop empty entries and the literal placeholder "None"."""
        return [name for name in names if name and name != "None"]

    for voter in real_names(yes_votes):
        vote.yes(voter)
    for voter in real_names(no_votes):
        vote.no(voter)
    for voter in real_names(other_votes):
        vote.vote("absent", voter)

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a roll-call page and yield the vote it describes.

    The page text is searched for ``YEAS``, ``NAYS`` and ``ABSENT`` (with
    optional ``OR NOT VOTING``) tallies.  The text after each tally is
    split on runs of two or more whitespace characters to get the names.
    ``url`` is added as a source on both the bill and the vote.

    Args:
        bill: Bill the vote belongs to.
        actor: Used as the vote's chamber only when it is ``upper`` or
            ``lower``; any other value gives an empty chamber.
        date: Start date of the vote.
        motion: Motion text.
        url: Roll-call page to download.
        uniqid: Stringified and used as the vote identifier.

    Raises:
        AttributeError: If the page does not contain the expected tallies.
    """
    page = self.get(url).text
    bill.add_source(url)

    # Raw literals keep \s and \d as regex escapes rather than (invalid)
    # string escapes.
    vote_re = re.compile(r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                         r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
                         r'(\d+)(.*)',
                         re.MULTILINE | re.DOTALL)
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))

    passed = yes_count > no_count
    vote_chamber = actor if actor in ('upper', 'lower') else ''

    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                identifier=str(uniqid),
                classification='passage',
                bill=bill)
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    # Names sit in the free text following each tally.
    yes_votes = re.split(r'\s{2,}', match.group(2).strip())
    no_votes = re.split(r'\s{2,}', match.group(4).strip())
    other_votes = re.split(r'\s{2,}', match.group(7).strip())

    for yes in yes_votes:
        if yes:
            vote.yes(yes)
    for no in no_votes:
        if no:
            vote.no(no)
    for other in other_votes:
        if other:
            vote.vote('other', other)

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Download a roll-call page and yield the vote parsed from it.

    A single regular expression captures the ``YEAS``, ``NAYS`` and
    ``ABSENT`` (optionally ``OR NOT VOTING``) totals together with the
    text after each one.  That text is split on two or more whitespace
    characters to recover the names.  ``url`` is recorded as a source on
    the bill as well as on the vote.

    Args:
        bill: Bill the vote belongs to.
        actor: Becomes the chamber when it is ``upper`` or ``lower``;
            otherwise the chamber is the empty string.
        date: Start date of the vote.
        motion: Motion text.
        url: Roll-call page to download.
        uniqid: Converted with ``str`` and used as the identifier.

    Raises:
        AttributeError: If the totals cannot be found in the page.
    """
    body = self.get(url).text
    bill.add_source(url)

    pattern = (
        r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
        r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
        r'(\d+)(.*)'
    )
    found = re.compile(pattern, re.MULTILINE | re.DOTALL).search(body)

    yes_total = int(found.group(1))
    no_total = int(found.group(3))
    other_total = int(found.group(6))

    if actor in ('upper', 'lower'):
        chamber = actor
    else:
        chamber = ''

    ballot = Vote(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result='pass' if yes_total > no_total else 'fail',
        identifier=str(uniqid),
        classification='passage',
        bill=bill,
    )
    ballot.add_source(url)
    ballot.set_count('yes', yes_total)
    ballot.set_count('no', no_total)
    ballot.set_count('other', other_total)

    # Groups 2, 4 and 7 hold the name text after each total.
    name_lists = [
        re.split(r'\s{2,}', found.group(index).strip())
        for index in (2, 4, 7)
    ]
    yes_names, no_names, other_names = name_lists

    for name in yes_names:
        if name:
            ballot.yes(name)
    for name in no_names:
        if name:
            ballot.no(name)
    for name in other_names:
        if name:
            ballot.vote('other', name)

    yield ballot
def scrape_votes(self, bill):
    """Yield a Vote for every roll call on ``bill``.

    Roll calls are fetched from the ``GetRollCalls`` endpoint on
    wslwebservices.leg.wa.gov, using the second whitespace-separated part
    of ``bill.identifier`` and ``self.biennium``.  Absent and excused
    totals are combined into a single ``other`` count, and a roll call
    passes only when yeas exceed nays plus that combined count.  The
    sequence number is appended to the motion text.

    Raises:
        KeyError: If a roll call's agency is neither ``House`` nor
            ``Senate``.
    """
    chambers = {'House': 'lower', 'Senate': 'upper'}

    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (
               bill_num, self.biennium))
    response = self.get(url)
    tree = lxml.etree.fromstring(response.content)

    for roll_call in xpath(tree, "//wa:RollCall"):
        motion = xpath(roll_call, "string(wa:Motion)")
        seq_no = xpath(roll_call, "string(wa:SequenceNumber)")

        # VoteDate is a timestamp; only the part before "T" is used.
        day = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        chamber = chambers[xpath(roll_call, "string(wa:Agency)")]
        passed = yes_count > (no_count + other_count)

        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text='{} (#{})'.format(motion, seq_no),
            result='pass' if passed else 'fail',
            classification='other',
            bill=bill,
        )
        for option, total in (('yes', yes_count),
                              ('no', no_count),
                              ('other', other_count)):
            vote.set_count(option, total)
        vote.add_source(url)

        for member_vote in xpath(roll_call, "wa:Votes/wa:Vote"):
            name = xpath(member_vote, "string(wa:Name)")
            choice = xpath(member_vote, "string(wa:VOte)")
            if choice == 'Yea':
                vote.yes(name)
            elif choice == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)

        yield vote
def scrape_votes(self, bill):
    """Yield one ``Vote`` per roll call returned by the GetRollCalls service.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``.  Absent and excused counts are combined into one
    'other' count; a vote is recorded as passing only when the yes count is
    greater than no + other.
    """
    number = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (number, self.biennium))
    response = self.get(url)
    tree = lxml.etree.fromstring(response.content)

    agency_to_chamber = {'House': 'lower', 'Senate': 'upper'}

    for rc in xpath(tree, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")

        # VoteDate is a timestamp; keep only the date part before 'T'.
        stamp = xpath(rc, "string(wa:VoteDate)")
        date = datetime.datetime.strptime(
            stamp.split("T")[0], "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = agency_to_chamber[agency]

        if yes_count > (no_count + other_count):
            result = 'pass'
        else:
            result = 'fail'

        vote = Vote(chamber=chamber,
                    start_date=date,
                    motion_text=motion,
                    result=result,
                    classification='other',
                    bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            member = xpath(sv, "string(wa:Name)")
            choice = xpath(sv, "string(wa:VOte)")
            if choice == 'Yea':
                vote.yes(member)
            elif choice == 'Nay':
                vote.no(member)
            else:
                vote.vote('other', member)

        yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Build a ``VoteEvent`` from an action row and yield it.

    The first ``<span>`` under ``row`` holds a summary of the form
    ``<outcome>-<yes>-<no>-<other>``; the text trailing the second and
    third spans lists the yes and no voters.  Any later span whose text
    starts with 'Absent' or 'Excused' contributes to the absent voters.
    The motion is the row's own leading text with non-breaking spaces and
    hyphens removed, falling back to "passage" when that is empty.
    """
    spans = row.xpath('.//span')

    motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip() or "passage"

    summary = spans[0].text_content()
    passed, yes_count, no_count, other_count = summary.rsplit('-', 3)

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for extra in spans[3:]:
        if extra.text.startswith(('Absent', 'Excused')):
            other_votes.extend(self.get_names(extra.tail))

    # Translate the outcome wording into a result code.  If no keyword
    # matches, the raw outcome text is passed through unchanged.
    outcome_text = passed.lower()
    for keyword, code in (('adopted', 'pass'), ('passed', 'pass'), ('failed', 'fail')):
        if keyword in outcome_text:
            passed = code
            break

    vote = VoteEvent(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        bill=bill_id,
        bill_chamber=bill_chamber,
        result=passed,
        classification="passage",
        legislative_session=session,
    )
    vote.add_source(source)
    vote.set_count('yes', int(yes_count))
    vote.set_count('no', int(no_count))
    vote.set_count('absent', int(other_count))

    def is_real(name):
        # Empty entries and the literal placeholder 'None' are not voters.
        return bool(name) and name != 'None'

    for name in filter(is_real, yes_votes):
        vote.yes(name)
    for name in filter(is_real, no_votes):
        vote.no(name)
    for name in filter(is_real, other_votes):
        vote.vote('absent', name)

    yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Yield the ``VoteEvent`` described by one action row.

    The first ``<span>`` inside ``row`` carries
    ``<outcome>-<yes>-<no>-<other>``.  Yes and no voter names come from the
    tail text of the second and third spans; tails of later spans starting
    with 'Absent' or 'Excused' are recorded as absent.  The motion is the
    row's leading text stripped of non-breaking spaces and hyphens, or
    "passage" when nothing is left.
    """
    spans = row.xpath('.//span')

    cleaned = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    motion = cleaned or "passage"

    passed, yes_count, no_count, other_count = (
        spans[0].text_content().rsplit('-', 3))

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)

    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(('Absent', 'Excused')):
            other_votes.extend(self.get_names(span.tail))

    # First matching keyword wins; with no match the raw text is kept.
    lowered = passed.lower()
    for keyword, code in (('adopted', 'pass'),
                          ('passed', 'pass'),
                          ('failed', 'fail')):
        if keyword in lowered:
            passed = code
            break

    vote = VoteEvent(chamber=actor,
                     start_date=date,
                     motion_text=motion,
                     bill=bill_id,
                     bill_chamber=bill_chamber,
                     result=passed,
                     classification="passage",
                     legislative_session=session)
    vote.add_source(source)
    vote.set_count('yes', int(yes_count))
    vote.set_count('no', int(no_count))
    vote.set_count('absent', int(other_count))

    for voter in yes_votes:
        # Skip blanks and the literal placeholder 'None'.
        if not voter or voter == 'None':
            continue
        vote.yes(voter)
    for voter in no_votes:
        if not voter or voter == 'None':
            continue
        vote.no(voter)
    for voter in other_votes:
        if not voter or voter == 'None':
            continue
        vote.vote('absent', voter)

    yield vote
def asvote(self):
    """Assemble and return a ``VoteEvent`` from this object's accessors.

    Chamber, date, motion and outcome come from the corresponding methods;
    the yes/no/other counts and voter lists are then copied across and
    ``self.url`` is attached as the source.
    """
    outcome = 'pass' if self.passed() else 'fail'
    event = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result=outcome,
        classification='passage',
        bill=self.bill,
    )

    event.set_count('yes', self.yes_count())
    event.set_count('no', self.no_count())
    event.set_count('other', self.other_count())

    # Each voter list is fetched right before it is recorded.
    for record, fetch in ((event.yes, self.yes_votes),
                          (event.no, self.no_votes)):
        for voter in fetch():
            record(voter)
    for voter in self.other_votes():
        event.vote('other', voter)

    event.add_source(self.url)
    return event
def asvote(self):
    """Assemble and return a ``VoteEvent`` from this object's accessors.

    ``self.url`` serves both as the vote's ``pupa_id`` and as its source.
    Counts and voter lists are taken from the yes/no/other accessor methods.
    """
    result = "pass" if self.passed() else "fail"
    event = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result=result,
        classification="passage",
        bill=self.bill,
    )
    # URL contains sequence number
    event.pupa_id = self.url

    event.set_count("yes", self.yes_count())
    event.set_count("no", self.no_count())
    event.set_count("other", self.other_count())

    for member in self.yes_votes():
        event.yes(member)
    for member in self.no_votes():
        event.no(member)
    for member in self.other_votes():
        event.vote("other", member)

    event.add_source(self.url)
    return event
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one bill page, yielding its roll-call votes and then the bill.

    The bill page is looked up under the first year of the session
    (``session[:4]``) and, failing that, under the last year
    (``session[-4:]``).  If neither page is available a warning is logged
    and nothing is yielded.

    For each history row the action is recorded, a linked substitute (if
    the action mentions one) is added as a version, and a "Roll Call #"
    reference is resolved through the journal link into a ``VoteEvent``.
    Versions and analysis documents are added from their tables last.

    Yields:
        ``VoteEvent`` objects as they are found, followed by the ``Bill``.
    """
    unavailable_markers = (
        'Page Not Found',
        'The bill you are looking for is not available yet',
    )

    def page_missing(text):
        # The site answers with an HTML notice instead of an HTTP error.
        return any(marker in text for marker in unavailable_markers)

    def media_type_for(link):
        # Map a document URL to a media type by extension.  Unknown
        # extensions get an empty media type rather than raising or reusing
        # whatever the previous link happened to be.
        lowered = link.lower()
        if lowered.endswith('.pdf'):
            return 'application/pdf'
        if lowered.endswith('.htm'):
            return 'text/html'
        return ''

    slug = bill_id.replace(' ', '-')

    # try and get bill for the first year of the session biennium
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (session[:4], slug)
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if page_missing(html):
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (session[-4:], slug)
        html = self.get(url).text
        if page_missing(html):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute('http://legislature.mi.gov')

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            # With several sponsors, the text after the link marks the
            # primary one.
            is_primary = bool(sponsor.tail) and 'primary' in sponsor.tail
            classification = 'primary' if is_primary else 'cosponsor'
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor, classification=classification)

        # check if action mentions a sub
        submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE)
        if submatch and tds[2].xpath('a'):
            version_url = tds[2].xpath('a/@href')[0]
            version_name = tds[2].xpath('a/text()')[0].strip()
            version_name = 'Substitute {}'.format(version_name)
            self.info("Found Substitute {}".format(version_url))
            bill.add_version_link(version_name, version_url,
                                  media_type=media_type_for(version_url))

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if not rcmatch:
            continue
        rc_num = rcmatch.groups()[0]
        # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
        journal_link = tds[1].xpath('a/@href')
        if not journal_link:
            self.warning("missing journal link for %s %s" % (bill_id, journal))
            continue

        objectname = journal_link[0].rsplit('=', 1)[-1]
        chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
        vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
            session, chamber_name, objectname)
        results = self.parse_roll_call(vote_url, rc_num)
        vote = VoteEvent(
            start_date=date,
            chamber=actor,
            bill=bill,
            motion_text=action,
            result='pass' if len(results['yes']) > len(results['no']) else 'fail',
            classification='passage',
        )

        # check the expected counts vs actual
        for label, option in (('YEAS', 'yes'), ('NAYS', 'no')):
            stated = re.search(r'%s (\d+)' % label, action, re.IGNORECASE)
            stated = int(stated.groups()[0]) if stated else 0
            if stated != len(results[option]):
                self.warning('vote count mismatch for %s %s, %d != %d' %
                             (bill_id, action, stated, len(results[option])))

        vote.set_count('yes', len(results['yes']))
        vote.set_count('no', len(results['no']))
        vote.set_count('other', len(results['other']))

        for name in results['yes']:
            vote.yes(name)
        for name in results['no']:
            vote.no(name)
        for name in results['other']:
            vote.vote('other', name)

        vote.add_source(vote_url)
        yield vote

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, doc_url = parsed
            bill.add_version_link(name, doc_url,
                                  media_type=media_type_for(doc_url))

    # documents
    for table_id in ('frg_billstatus_HlaTable', 'frg_billstatus_SfaTable'):
        for row in doc.xpath('//table[@id="%s"]/tr' % table_id):
            document = self.parse_doc_row(row)
            if document:
                name, doc_url = document
                bill.add_document_link(name, doc_url)

    yield bill
# scrape_votes(bill, url): generator that parses the roll-call page at `url` and yields
# one Vote per distinct RCS# found on it.
#   * Non-breaking spaces in the page text are replaced with plain spaces before parsing.
#   * Each vote starts at a <p> whose text matches "OKLAHOMA HOUSE" or
#     "OKLAHOMA STATE SENATE" (matched with the EXSLT regular-expressions namespace).
#   * The motion is read from the 8th following <p> for House headers and the 13th for
#     Senate headers. A trailing PASSED/FAILED word, when present, is stripped from the
#     motion and decides the result; otherwise the result is yes > (no + other).
#   * An empty motion logs a warning and ends the whole generator (return, not continue).
#   * A roll call whose RCS# was already seen on this page is skipped.
#   * From the 14th following <p> onward, lines are read until one contains "*****".
#     Tally lines (YEAS, NAYS, EXCUSED, ...) select the current vote type and add to its
#     count; VACANT tallies are ignored; other lines after the first YEAS tally are split
#     into voter names for the current vote type.
#   * A tally line (after YEAS has been seen) with trailing text marks the vote as badly
#     formatted; such a vote is not yielded.
#   * A "no" voter name containing ':' raises Exception(name).
#   * pupa_id is set to url + '#' + the RCS number.
# NOTE(review): the delimiter passed to line.split(...) is whitespace-only; confirm its
# exact width against the upstream file before touching the name-splitting logic.
def scrape_votes(self, bill, url): page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' ')) seen_rcs = set() re_ns = "http://exslt.org/regular-expressions" path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]" for header in page.xpath(path, namespaces={'re': re_ns}): bad_vote = False # Each chamber has the motion name on a different line of the file if 'HOUSE' in header.xpath("string()"): chamber = 'lower' motion_index = 8 else: chamber = 'upper' motion_index = 13 motion = header.xpath( "string(following-sibling::p[%d])" % motion_index).strip() motion = re.sub(r'\s+', ' ', motion) if not motion.strip(): self.warning("Motion text not found") return match = re.match(r'^(.*) (PASSED|FAILED)$', motion) if match: motion = match.group(1) passed = match.group(2) == 'PASSED' else: passed = None rcs_p = header.xpath( "following-sibling::p[contains(., 'RCS#')]")[0] rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ') rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1) if rcs in seen_rcs: continue else: seen_rcs.add(rcs) date_line = rcs_p.getnext().xpath("string()") date = re.search(r'\d+/\d+/\d+', date_line).group(0) date = datetime.datetime.strptime(date, "%m/%d/%Y").date() vtype = None counts = collections.defaultdict(int) votes = collections.defaultdict(list) seen_yes = False for sib in header.xpath("following-sibling::p")[13:]: line = sib.xpath("string()").replace('\r\n', ' ').strip() if "*****" in line: break regex = (r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL ' 'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)') match = re.match(regex, line) if match: if match.group(1) == 'YEAS' and 'RCS#' not in line: vtype = 'yes' seen_yes = True elif match.group(1) == 'NAYS' and seen_yes: vtype = 'no' elif match.group(1) == 'VACANT': continue # skip these elif seen_yes: vtype = 'other' if seen_yes and match.group(3).strip(): self.warning("Bad vote format, skipping.") bad_vote = True counts[vtype] += int(match.group(2)) elif seen_yes: for name in line.split(' 
'): if not name: continue if 'HOUSE' in name or 'SENATE ' in name: continue votes[vtype].append(name.strip()) if bad_vote: continue if passed is None: passed = counts['yes'] > (counts['no'] + counts['other']) vote = Vote(chamber=chamber, start_date=date.strftime('%Y-%m-%d'), motion_text=motion, result='pass' if passed else 'fail', bill=bill, classification='passage') vote.set_count('yes', counts['yes']) vote.set_count('no', counts['no']) vote.set_count('other', counts['other']) vote.pupa_id = url + '#' + rcs vote.add_source(url) for name in votes['yes']: vote.yes(name) for name in votes['no']: if ':' in name: raise Exception(name) vote.no(name) for name in votes['other']: vote.vote('other', name) yield vote
def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict,
                 vote_results):
    """Yield a ``VoteEvent`` for every usable entry in ``votes["items"]``.

    Entries lacking a "yeas" key are re-fetched from ``base_url`` plus their
    "link"; entries that still lack it, or that carry neither a "date" nor
    an "occurred" field, are skipped with a warning.  Voter ids are
    translated to names through ``legislators`` and the counts are derived
    from the number of voters recorded.  ``vote_results`` maps the lowered
    result text to a truthy/falsy pass flag.  The vote's ``pupa_id`` is its
    "revno".
    """
    counted_keys = ["yeas", "nays", "absent", "excused"]

    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url+v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue

        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            chamber = "lower" if "house" in v["apn"] else "upper"

        # "date" is preferred; "occurred" is the fallback field.
        if "date" in v:
            raw_date = v["date"]
        elif "occurred" in v:
            raw_date = v["occurred"]
        else:
            self.logger.warning("No date found for vote, skipping")
            continue
        date = self._tz.localize(datetime.datetime.strptime(raw_date, "%m/%d/%y"))
        date = "{:%Y-%m-%d}".format(date)

        motion = v["action"] if "action" in v else v["motiontype"]

        # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
        if (not motion and isinstance(v['yeas'], str) and
                isinstance(v['nays'], str)):
            waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(waringText.format(v['revno']))
            continue

        result = v.get("results") or v.get("passed")
        if result is None:
            result = "passed" if len(v['yeas']) > len(v['nays']) else "failed"
        passed = vote_results[result.lower()]

        # Committee votes ("committee" in v) are built exactly like floor
        # votes; the committee is not attached to the event.
        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passed',
                         bill=bill)
        vote.pupa_id = str(v['revno'])

        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        tallies = {'yes': 0, 'no': 0, 'absent': 0, 'excused': 0}
        ballots = (('yes', 'yeas', True), ('no', 'nays', True),
                   ('absent', 'absent', False), ('excused', 'excused', False))
        for option, key, required in ballots:
            if not required and key not in v:
                continue
            for voter_id in v[key]:
                member = legislators[voter_id]
                if option == 'yes':
                    vote.yes(member)
                elif option == 'no':
                    vote.no(member)
                else:
                    vote.vote(option, member)
                tallies[option] += 1

        for option in ('yes', 'no', 'absent', 'excused'):
            vote.set_count(option, tallies[option])

        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            looks_like_voters = (type(val) is list and len(val) > 0 and
                                 key not in counted_keys)
            if looks_like_voters and val[0] in legislators:
                self.logger.warning("{k} looks like a vote type that's not being counted."
                                    " Double check it?".format(k=key))

        vote.add_source(url)
        yield vote
def scrape_pdf_for_votes(self, session, actor, date, motion, href):
    """Parse a roll-call PDF into a ``VoteEvent``.

    Returns ``False`` when no lines can be extracted from ``href``.
    Otherwise the pass/fail word and the YEAS/NAYS/PRESENT tally line are
    read from the document, the per-member lines that follow the tally are
    parsed by ``find_columns_and_parse``, and the resulting event is
    returned.  Conflicting pass/fail words in one document raise
    ``Exception``.  When the document states no tally the yes/no counts
    fall back to the number of names; when no pass/fail word is present the
    result falls back to yes > no.
    """
    warned = False
    # vote indicator, a few spaces, a name, newline or multiple spaces
    tally_re = re.compile(
        r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$'
    )
    outcome_words = {
        'PASSED': 'pass',
        'PREVAILED': 'fail',
        'ADOPTED': 'pass',
        'CONCURRED': 'pass',
        'FAILED': 'fail',
        'LOST': 'fail',
    }

    pdflines = self.fetch_pdf_lines(href)
    if not pdflines:
        return False

    yes_count = no_count = present_count = 0
    passed = None
    counts_found = False
    vote_lines = []

    for line in pdflines:
        # consider pass/fail as a document property instead of a result of the vote count
        # extract the vote count from the document instead of just using counts of names
        stripped = line.strip()
        if not stripped:
            continue

        if stripped in outcome_words:
            outcome = outcome_words[stripped]
            # Crash on duplicate pass/fail status that differs from previous status
            if passed is not None and passed != outcome:
                raise Exception("Duplicate pass/fail matches in [%s]" % href)
            passed = outcome
            continue

        tally = tally_re.match(line)
        if tally:
            # The optional fourth group (NOT VOTING) is not used.
            yes_count, no_count, present_count = (
                int(group) for group in tally.groups()[:3])
            counts_found = True
            continue

        # Member lines are only collected once the tally line has been seen.
        if counts_found and any(
                re.search(r'^\s*({})\s+\w'.format(value), line)
                for value in VOTE_VALUES):
            vote_lines.append(line)

    names_by_code = {'Y': [], 'N': [], 'P': [], 'E': [], 'NV': [], 'A': []}
    for name, vcode in find_columns_and_parse(vote_lines).items():
        if name == 'Mr. Speaker':
            name = session_details[session]['speaker']
        elif name == 'Mr. President':
            name = session_details[session]['president']
        else:
            # Converts "Davis,William" to "Davis, William".
            name = re.sub(r'\,([a-zA-Z])', r', \1', name)

        bucket = names_by_code.get(vcode)
        if bucket is not None:
            bucket.append(name)

    yes_votes = names_by_code['Y']
    no_votes = names_by_code['N']
    present_votes = names_by_code['P']
    excused_votes = names_by_code['E']
    not_voting = names_by_code['NV']
    absent_votes = names_by_code['A']

    if yes_count == 0 and no_count == 0 and present_count == 0:
        # fake the counts
        yes_count = len(yes_votes)
        no_count = len(no_votes)
    else:
        # audit
        if yes_count != len(yes_votes):
            self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                         (yes_count, len(yes_votes)))
            warned = True
        if no_count != len(no_votes):
            self.warning("Mismatched no count [expect: %i] [have: %i]" %
                         (no_count, len(no_votes)))
            warned = True

    if passed is None:
        if actor['classification'] == 'lower':  # senate doesn't have these lines
            self.warning("No pass/fail word found; fall back to comparing yes and no vote.")
            warned = True
        passed = 'pass' if yes_count > no_count else 'fail'

    classification, _ = _categorize_action(motion)
    vote_event = VoteEvent(legislative_session=session,
                           motion_text=motion,
                           classification=classification,
                           organization=actor,
                           start_date=date,
                           result=passed)

    for name in yes_votes:
        vote_event.yes(name)
    for name in no_votes:
        vote_event.no(name)
    for option, voters in (('other', present_votes),
                           ('excused', excused_votes),
                           ('not voting', not_voting),
                           ('absent', absent_votes)):
        for name in voters:
            vote_event.vote(option, name)

    vote_event.set_count('yes', yes_count)
    vote_event.set_count('no', no_count)
    vote_event.set_count('other', present_count)
    vote_event.set_count('excused', len(excused_votes))
    vote_event.set_count('absent', len(absent_votes))
    vote_event.set_count('not voting', len(not_voting))

    vote_event.add_source(href)

    # for distinguishing between votes with the same id and on same day
    vote_event.pupa_id = href

    if warned:
        self.warning("Warnings were issued. Best to check %s" % href)
    return vote_event
def scrape_vote(self, bill, name, url):
    """Fetch a vote tally page and yield the ``Vote`` it describes.

    URLs containing "VOTE/H" are treated as lower-chamber tallies and
    everything else as upper-chamber; the two layouts differ in which
    table columns hold the member name and the Y/N marks.  Pages
    containing 'BUDGET ADDRESS' yield nothing.  The page only gives month
    and day, so the year is taken from ``bill.legislative_session``.
    """
    if "VOTE/H" in url:
        vote_chamber = 'lower'
        cols, name_offset, yes_offset, no_offset = (1, 5, 9, 13), 3, 0, 1
    else:
        vote_chamber = 'upper'
        cols, name_offset, yes_offset, no_offset = (1, 6), 4, 1, 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    raw = requests.get(url, verify=False).text
    if 'BUDGET ADDRESS' in raw:
        return

    page = lxml.html.fromstring(raw)

    def first_number(text):
        # First run of digits in the label text.
        return int(re.match(r'[^\d]*(\d+)[^\d]*', text).group(1))

    yes_count = first_number(
        page.xpath("string(//span[contains(., 'Those voting Yea')])"))
    no_count = first_number(
        page.xpath("string(//span[contains(., 'Those voting Nay')])"))
    other_count = first_number(
        page.xpath("string(//span[contains(., 'Those absent')])"))
    need_count = first_number(
        page.xpath("string(//span[contains(., 'Necessary for')])"))

    taken_on = page.xpath("string(//span[contains(., 'Taken on')])")
    month_day = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', taken_on).group(1)
    month_day = month_day.replace(' ', '')
    date = datetime.datetime.strptime(
        month_day + " " + bill.legislative_session, "%m/%d %Y").date()

    # not sure about classification.
    # NOTE(review): the strict '>' records a yes tally exactly equal to the
    # 'Necessary for' figure as a failure -- confirm that is intended.
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                result='pass' if yes_count > need_count else 'fail',
                classification='passage',
                bill=bill)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for start in cols:
            voter = row.xpath("string(td[%d])" % (start + name_offset)).strip()
            if not voter or voter == 'VACANT':
                continue

            if "Y" in row.xpath("string(td[%d])" % (start + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" % (start + no_offset)):
                vote.no(voter)
            else:
                vote.vote('other', voter)

    yield vote
def scrape_vote(self, bill, date, url):
    """Fetch a vote page and yield the ``VoteEvent`` it describes.

    The ``hdVote`` heading is split on ', ': the second piece names the
    voting body and the remaining pieces form the motion.  Headings that
    contain 'No Bill Action' are skipped with a warning, and pages without
    a motion yield nothing.  Raises ``ScrapeError`` when the voting body is
    not a House, Senate or Joint one.
    """
    page = lxml.html.fromstring(self.get(url).text)

    header = page.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(', ')
    location = header_parts[1]
    for prefix, code in (('House', 'lower'),
                         ('Senate', 'upper'),
                         ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = int(page.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(page.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(page.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

    if motion.startswith('Do Pass'):
        vote_type = 'passage'
    else:
        vote_type = {
            'Concurred in amendments': 'amendment',
            'Veto override': 'veto_override',
        }.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if yes_count > no_count else 'fail',
                     classification=vote_type,
                     bill=bill)

    # The vote page URL has a unique ID
    # However, some votes are "consent calendar" events,
    # and relate to the passage of _multiple_ bills
    # These can't be modeled yet in Pupa, but for now we can
    # append a bill ID to the URL that forms the `pupa_id`
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    option_for = {'Aye': 'yes', 'Yea': 'yes', 'Nay': 'no',
                  'Excused': 'excused', 'Absent': 'absent'}
    for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        # A cell holds either a vote option or a member's name; the name
        # for an option cell is in the cell immediately before it.
        option = option_for.get(td.text.strip())
        if option is None:
            continue
        member = td.getprevious().text.strip()
        if option == 'yes':
            vote.yes(member)
        elif option == 'no':
            vote.no(member)
        else:
            vote.vote(option, member)

    yield vote
def process_vote(self, vote, bill, member_ids):
    """Turn one vote record into a ``VoteEvent`` and a matching bill action.

    Args:
        vote: dict for a single vote.  "ReadingDescription" is required;
            "VoteResult", "DateOfVote", "MemberVotes", "AttachmentPath" and
            "DocumentType" are read when relevant.  The dict gains a "type"
            key when the motion is an amendment or a reading.
        bill: the bill the vote belongs to; an action (and any attached
            document) is added to it.
        member_ids: mapping from integer member id to member name.

    Returns:
        The ``VoteEvent``, or ``None`` (after logging a warning) when the
        motion or its result cannot be determined.
    """
    try:
        motion = vote["ReadingDescription"]
    except KeyError:
        self.logger.warning("Can't even figure out what we're voting on. Skipping.")
        return

    motion_lower = motion.lower()

    if "VoteResult" not in vote:
        # Without an explicit result only postponed/tabled motions can be
        # interpreted.  The vote result is 'pass' because we're talking
        # about the motion (to postpone/table), not the amendment itself.
        if "postponed" in motion_lower:
            result = "Postponed"
            status = 'pass'
        elif "tabled" in motion_lower:
            result = "Tabled"
            status = 'pass'
        else:
            self.logger.warning("Could not find result of vote, skipping.")
            return
    else:
        result = vote["VoteResult"].strip().lower()
        statuses = {"approved": 'pass',
                    "disapproved": 'fail',
                    "failed": 'fail',
                    "declined": 'fail',
                    "passed": 'pass'}
        try:
            status = statuses[result]
        except KeyError:
            self.logger.warning("Unexpected vote result '{result},' skipping vote.".format(
                result=result)
            )
            return

    date = self.date_format(vote["DateOfVote"])
    leg_votes = vote["MemberVotes"]

    v = VoteEvent(chamber='legislature',
                  start_date=date,
                  motion_text=motion,
                  result=status,
                  classification='passage',
                  bill=bill)

    yes_count = no_count = other_count = 0
    for leg_vote in leg_votes:
        mem_name = member_ids[int(leg_vote["MemberId"])]
        # "1" is a yes vote, "2" a no vote; anything else counts as other.
        if leg_vote["Vote"] == "1":
            yes_count += 1
            v.yes(mem_name)
        elif leg_vote["Vote"] == "2":
            no_count += 1
            v.no(mem_name)
        else:
            other_count += 1
            v.vote('other', mem_name)

    v.set_count('yes', yes_count)
    v.set_count('no', no_count)
    v.set_count('other', other_count)

    # the documents for the readings are inside the vote
    # level in the json, so we'll deal with them here
    # and also add relevant actions
    if "amendment" in motion_lower:
        # Deferral is checked first: a tabled/postponed motion has a 'pass'
        # status (the motion carried) but the amendment itself is deferred.
        # The outcome is compared explicitly because both 'pass' and 'fail'
        # are truthy strings.
        if result in ("Tabled", "Postponed"):
            t = "amendment-deferral"
        elif status == 'pass':
            t = "amendment-passage"
        else:
            t = "amendment-failure"
    elif "first reading" in motion_lower or "1st reading" in motion_lower:
        t = "reading-1"
    elif "second reading" in motion_lower or "2nd reading" in motion_lower:
        t = "reading-2"
    elif ("third reading" in motion_lower or "3rd reading" in motion_lower or
            "final reading" in motion_lower):
        t = "reading-3"
    else:
        # Anything else (including plain tabled/postponed motions) gets no
        # action classification.
        t = None

    bill.add_action(motion, date, classification=t)

    # t is None for unclassified motions; guard before substring tests.
    if t is not None:
        if "amendment" in t:
            vote["type"] = "amendment"
        elif "reading" in t:
            vote["type"] = t

    # some documents/versions are hiding in votes.
    if "AttachmentPath" in vote:
        version_kinds = ("enrollment", "engrossment", "introduction")
        is_version = (vote.get("DocumentType") in version_kinds or
                      motion in version_kinds)
        self.add_documents(vote["AttachmentPath"], bill, is_version)

    return v
def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict,
                 vote_results):
    """Yield a ``VoteEvent`` for every usable entry in ``votes["items"]``.

    Entries lacking a "yeas" key are re-fetched from ``base_url`` plus their
    "link"; entries that still lack it, or that carry neither a "date" nor
    an "occurred" field, are skipped with a warning.  Voter ids are
    translated to names through ``legislators`` and the counts are derived
    from the number of voters recorded.  ``vote_results`` maps the lowered
    result text to a truthy/falsy pass flag.  The vote's ``pupa_id``
    combines the bill identifier with the entry's "revno".
    """
    counted_keys = ["yeas", "nays", "absent", "excused"]

    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url+v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue

        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            chamber = "lower" if "house" in v["apn"] else "upper"

        # "date" is preferred; "occurred" is the fallback field.
        if "date" in v:
            raw_date = v["date"]
        elif "occurred" in v:
            raw_date = v["occurred"]
        else:
            self.logger.warning("No date found for vote, skipping")
            continue
        date = self._tz.localize(datetime.datetime.strptime(raw_date, "%m/%d/%y"))
        date = "{:%Y-%m-%d}".format(date)

        motion = v["action"] if "action" in v else v["motiontype"]

        # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
        if (not motion and isinstance(v['yeas'], str) and
                isinstance(v['nays'], str)):
            waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(waringText.format(v['revno']))
            continue

        result = v.get("results") or v.get("passed")
        if result is None:
            result = "passed" if len(v['yeas']) > len(v['nays']) else "failed"
        passed = vote_results[result.lower()]

        # Committee votes ("committee" in v) are built exactly like floor
        # votes; the committee is not attached to the event.
        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passed',
                         bill=bill)

        # Concatenate the bill identifier and vote identifier to avoid collisions
        vote.pupa_id = '{}:{}'.format(bill.identifier.replace(' ', ''), v['revno'])

        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        tallies = {'yes': 0, 'no': 0, 'absent': 0, 'excused': 0}
        ballots = (('yes', 'yeas', True), ('no', 'nays', True),
                   ('absent', 'absent', False), ('excused', 'excused', False))
        for option, key, required in ballots:
            if not required and key not in v:
                continue
            for voter_id in v[key]:
                member = legislators[voter_id]
                if option == 'yes':
                    vote.yes(member)
                elif option == 'no':
                    vote.no(member)
                else:
                    vote.vote(option, member)
                tallies[option] += 1

        for option in ('yes', 'no', 'absent', 'excused'):
            vote.set_count(option, tallies[option])

        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            looks_like_voters = (type(val) is list and len(val) > 0 and
                                 key not in counted_keys)
            if looks_like_voters and val[0] in legislators:
                self.logger.warning("{k} looks like a vote type that's not being counted."
                                    " Double check it?".format(k=key))

        vote.add_source(url)
        yield vote
def scrape_votes_old(self, bill, billname, session):
    """Yield a ``VoteEvent`` for each journal link on the archive bill page.

    The link text is the vote date.  The second cell of the link's table
    row reads "<chamber> - <motion>", and the row after it holds the
    'Yea' and 'Nay' name tables.  A vote passes when it has more yes names
    than no names.  Raises ``ScrapeError`` when the chamber is neither
    'House' nor 'Senate'.
    """
    vote_url = ('http://archives.legislature.state.oh.us/bills.cfm?ID=' +
                session + '_' + billname)
    page = lxml.html.fromstring(self.get(vote_url).text)

    chamber_codes = {'House': 'lower', 'Senate': 'upper'}

    def names_in(div):
        # Non-empty cell texts of the name table inside ``div``.
        texts = (td.xpath("string()") for td in div.xpath("table/tr/td"))
        return [text for text in texts if text]

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        day = self._tz.localize(
            datetime.datetime.strptime(jlink.text, "%m/%d/%Y")).date()
        date = "{:%Y-%m-%d}".format(day)

        details = jlink.xpath("string(../../../td[2])")

        chamber_label = details.split(" - ")[0]
        chamber = chamber_codes.get(chamber_label)
        if chamber is None:
            raise ScrapeError("Bad chamber: %s" % chamber_label)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        vote_row = jlink.xpath("../../..")[0].getnext()

        yeas = names_in(
            vote_row.xpath("td/font/div[contains(@id, 'Yea')]")[0])
        nays = names_in(
            vote_row.xpath("td/font/div[contains(@id, 'Nay')]")[0])

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result='pass' if len(yeas) > len(nays) else 'fail',
            bill=bill,
            classification='passed'
        )

        for member in yeas:
            vote.yes(member)
        for member in nays:
            vote.no(member)

        vote.add_source(vote_url)
        yield vote
def scrape_vote(self, bill, date, url):
    """Fetch a vote page and yield the ``VoteEvent`` it describes.

    The ``hdVote`` heading is split on ', ': the second piece names the
    voting body and the remaining pieces form the motion.  Headings that
    contain 'No Bill Action' are skipped with a warning, and pages without
    a motion yield nothing.  The page URL doubles as the vote's
    ``pupa_id``.  Raises ``ScrapeError`` when the voting body is not a
    House, Senate or Joint one.
    """
    page = lxml.html.fromstring(self.get(url).text)

    header = page.xpath("string(//h4[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    pieces = header.split(', ')
    location = pieces[1]
    for prefix, code in (('House', 'lower'),
                         ('Senate', 'upper'),
                         ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(pieces[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
    no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
    excused_count = int(page.xpath("string(//td[contains(@id, 'tdExcused')])"))
    absent_count = int(page.xpath("string(//td[contains(@id, 'tdAbsent')])"))

    if motion.startswith('Do Pass'):
        vote_type = 'passage'
    else:
        vote_type = {
            'Concurred in amendments': 'amendment',
            'Veto override': 'veto_override',
        }.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if yes_count > no_count else 'fail',
                     classification=vote_type,
                     bill=bill)
    vote.pupa_id = url  # vote id is in URL
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    option_for = {'Aye': 'yes', 'Yea': 'yes', 'Nay': 'no',
                  'Excused': 'excused', 'Absent': 'absent'}
    for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
        # A cell holding a vote option is preceded by the member's name.
        option = option_for.get(td.text)
        if option is None:
            continue
        member = td.getprevious().text.strip()
        if option == 'yes':
            vote.yes(member)
        elif option == 'no':
            vote.no(member)
        else:
            vote.vote(option, member)

    yield vote
def scrape_votes(self, bill_page, page_url, bill, insert, year):
    """Yield passage votes found in the sixth table of a bill page.

    Each table row after the first is expected to hold one link whose text
    contains "Passage"; rows without such a link are logged and skipped.
    Counts and the vote date are read from the row's cell texts, and the
    linked report is fetched for per-member votes.  A report URL already in
    ``self._seen_votes`` is skipped entirely.

    Args:
        bill_page: HTML text of the bill page.
        page_url: Added as a source once the report has been parsed.
        bill: Passed through as the ``bill`` of each VoteEvent; its
            ``identifier`` is used in messages.
        insert: Session path segment placed into the report URL.
        year: Not used by this method.

    Yields:
        VoteEvent: one per passage row whose report was not seen before.
    """
    root = lxml.html.fromstring(bill_page)
    rows = root.xpath('/html/body/div/table[6]//tr')
    assert len(rows) >= 1, "Didn't find the Final Passage Votes' table"

    for row in rows[1:]:
        links = row.xpath('td/a[contains(text(), "Passage")]')
        if not links:
            self.warning("Non-passage vote found for {}; ".format(bill.identifier) +
                         "probably a motion for the calendar. It will be skipped.")
            continue

        assert len(links) == 1, \
            "Too many votes found for XPath query, on bill {}".format(bill.identifier)
        link = links[0]
        motion = link.text
        chamber = 'lower' if 'Assembly' in motion else 'upper'

        # Cells are either the vote date or "<category> <number>" tallies.
        tallies = {}
        for cell in row.xpath('td'):
            if not cell.text:
                continue
            text = cell.text.strip()
            tally = re.match('(?P<category>.*?) (?P<votes>[0-9]+)[,]?', text)
            if re.match('... .*?, ....', text):
                vote_date = datetime.strptime(text, '%b %d, %Y')
            elif tally:
                tallies[tally.group('category')] = int(tally.group('votes'))

        yes = tallies['Yea']
        no = tallies['Nay']
        excused = tallies['Excused']
        not_voting = tallies['Not Voting']
        absent = tallies['Absent']
        other = excused + not_voting + absent

        vote = VoteEvent(chamber=chamber,
                         start_date=self._tz.localize(vote_date),
                         motion_text=motion,
                         result='pass' if yes > no else 'fail',
                         classification='passage',
                         bill=bill,
                         )
        vote.set_count('yes', yes)
        vote.set_count('no', no)
        vote.set_count('other', other)
        vote.set_count('not voting', not_voting)
        vote.set_count('absent', absent)

        # try to get vote details
        vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
            insert, link.get('href'))
        vote.pupa_id = vote_url
        vote.add_source(vote_url)

        if vote_url in self._seen_votes:
            self.warning('%s is included twice, skipping second', vote_url)
            continue
        self._seen_votes.add(vote_url)

        try:
            report = self.get(vote_url).text
            report = report.replace(u"\xa0", " ")
            report_root = lxml.html.fromstring(report)

            for member_row in report_root.xpath('//table[2]/tr'):
                cells = member_row.xpath('td')
                name = cells[1].text_content().strip()
                choice = cells[2].text_content().strip()

                if choice == 'Yea':
                    vote.yes(name)
                elif choice == 'Nay':
                    vote.no(name)
                else:
                    vote.vote('other', name)
            vote.add_source(page_url)
        except scrapelib.HTTPError:
            self.warning("failed to fetch vote page, adding vote without details")

        yield vote
def _parse_votes(self, url, vote, bill):
    """Build a VoteEvent from the vote document at ``url``.

    PDF links are handed to ``PDFCommitteeVote``; HTML pages are parsed for
    the four totals (second table), the motion (text after the last ``<br>``)
    and the individual votes (third table).

    Args:
        url: Location of the vote document.
        vote: Mapping describing the vote.  The keys ``action``,
            ``vote_url``, ``chamber`` and ``date`` are read, and ``motion``
            is written back into it.
        bill: Passed through as the ``bill`` of the resulting vote.

    Returns:
        The vote object, or ``None`` when the document is missing, cannot be
        parsed, or has no totals table.

    Raises:
        Exception: If the action text contains both a passage and a failure
            indicator while yeas outnumber nays, or matches no indicator.
    """
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Couldn't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion.  `.tail` is None when nothing follows the <br>, so
    # treat that the same as an empty motion instead of crashing.
    try:
        motion = (doc.xpath('//br')[-1].tail or '').strip()
    except IndexError:
        # Some of them mysteriously have no motion listed.
        motion = vote['action']

    if not motion:
        motion = vote['action']

    vote['motion'] = motion

    # Read these before `vote` is rebound to the VoteEvent below.
    action = vote['action']
    vote_url = vote['vote_url']

    vote = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote.pupa_id = vote_url  # URL contains sequence number
    vote.add_source(vote_url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    # Compiled once rather than for every table cell.
    short_name_re = re.compile(r".*?\((.*?)\)")
    bracketed_re = re.compile(r"[\(\[].*?[\)\]]")

    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue

        # Each cell is "<vote letter>\xa0<name>".
        v, name = filter(None, text.split(u'\xa0'))

        # Considering Name is brackets as short name
        short_name = short_name_re.findall(name)
        if len(short_name) > 0:
            note = 'Short Name: ' + short_name[0]
        else:
            note = ''

        # Name without brackets like 'Kary, Douglas'
        name = bracketed_re.sub("", name)
        if v == 'Y':
            vote.yes(name, note=note)
        elif v == 'N':
            vote.no(name, note=note)
        elif v == 'E':
            vote.vote('excused', name, note=note)
        elif v == 'A':
            vote.vote('absent', name, note=note)

    # code to determine value of `passed`
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # outnumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                raise Exception("passage and failure indicator "
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote.result = 'pass' if passed else 'fail'

    return vote
def _parse_votes(self, url, vote, bill):
    """Turn the vote document at ``url`` into a vote object.

    A URL ending in ``.pdf`` is parsed by ``PDFCommitteeVote``.  Otherwise
    the page is read as HTML: the second table supplies the yes / no /
    excused / absent totals, the text after the last ``<br>`` supplies the
    motion, and the third table supplies one vote per cell.

    Args:
        url: Location of the vote document.
        vote: Mapping for this vote; ``action``, ``vote_url``, ``chamber``
            and ``date`` are read and ``motion`` is stored into it.
        bill: Passed through as the ``bill`` of the resulting vote.

    Returns:
        The vote object, or ``None`` if the document could not be fetched or
        parsed, or the page lacks a totals table.

    Raises:
        Exception: When the outcome cannot be decided from the action text
            (conflicting indicators with more yeas than nays, or none).
    """
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            self.logger.warning('No document found at url %r' % url)
            return

        try:
            return PDFCommitteeVote(url, resp.content, bill).asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Couldn't parse committee vote at %r" % url)
            return

    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    tables = doc.xpath('//table')
    try:
        vals = tables[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return
    yes_count, no_count, excused_count, absent_count = map(int, vals)

    action = vote['action']
    vote_url = vote['vote_url']

    # Get the motion; fall back to the action text when the page has no
    # <br>, when nothing follows the last one (tail is None), or when the
    # trailing text is blank.
    breaks = doc.xpath('//br')
    motion = (breaks[-1].tail or '').strip() if breaks else ''
    if not motion:
        motion = action
    vote['motion'] = motion

    event = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=motion,
        result='fail',  # placeholder
        classification='passage',
        bill=bill,
        bill_action=action,
    )
    event.pupa_id = vote_url  # URL contains sequence number
    event.add_source(vote_url)
    event.set_count('yes', yes_count)
    event.set_count('no', no_count)
    event.set_count('excused', excused_count)
    event.set_count('absent', absent_count)

    # Raw strings: "\(" in a normal literal is an invalid escape sequence.
    short_name_re = re.compile(r".*?\((.*?)\)")
    bracketed_re = re.compile(r"[\(\[].*?[\)\]]")

    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue

        # Each cell is "<vote letter>\xa0<name>".
        letter, name = filter(None, text.split(u'\xa0'))

        # Considering Name is brackets as short name
        short_names = short_name_re.findall(name)
        note = 'Short Name: ' + short_names[0] if short_names else ''

        # Name without brackets like 'Kary, Douglas'
        name = bracketed_re.sub("", name)
        if letter == 'Y':
            event.yes(name, note=note)
        elif letter == 'N':
            event.no(name, note=note)
        elif letter == 'E':
            event.vote('excused', name, note=note)
        elif letter == 'A':
            event.vote('absent', name, note=note)

    # Determine the outcome.  Some actions take a super majority, so we
    # aren't just comparing the yeas and nays here.
    passed = True if any(i in action for i in vote_passage_indicators) else None

    if any(i in action for i in vote_failure_indicators):
        if passed:
            # Both kinds of indicator are present (first seen on 2007
            # HB 160).  Treat it as failed when the nays are not outnumbered;
            # do not assume passage otherwise, because it is unknown what
            # requires a supermajority in MT.
            if no_count >= yes_count:
                passed = False
            else:
                raise Exception("passage and failure indicator "
                                "both present at: %s" % url)
        else:
            passed = False

    if any(i in action for i in vote_ambiguous_indicators):
        passed = yes_count > no_count

    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    event.result = 'pass' if passed else 'fail'

    return event
def scrape_vote(self, bill, date, url):
    """Fetch one roll-call page and yield the VoteEvent it describes.

    The header (an ``h3`` whose id contains ``hdVote``) is split on ", ":
    the second piece selects the chamber and the remaining pieces form the
    motion.  Nothing is yielded when the header contains "No Bill Action" or
    when the motion is empty.

    Args:
        bill: Passed through as the ``bill`` of the VoteEvent; its
            ``identifier`` is appended to the ``pupa_id``.
        date: Passed through as the ``start_date`` of the VoteEvent.
        url: Roll-call page URL, also recorded as the source.

    Yields:
        VoteEvent: with yes/no/excused/absent counts and per-member votes.

    Raises:
        ScrapeError: If the location starts with none of House/Senate/Joint.
    """
    page = lxml.html.fromstring(self.get(url).text)

    header = page.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(', ')
    location = header_parts[1]

    if location.startswith('House'):
        chamber = 'lower'
    elif location.startswith('Senate'):
        chamber = 'upper'
    elif location.startswith('Joint'):
        chamber = 'legislature'
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = int(
        page.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        page.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        page.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

    passed = yes_count > no_count

    # Named `classification` so the builtin `type` is not shadowed.
    if motion.startswith('Do Pass'):
        classification = 'passage'
    elif motion == 'Concurred in amendments':
        classification = 'amendment'
    elif motion == 'Veto override':
        classification = 'veto_override'
    else:
        classification = 'other'

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=classification,
                     bill=bill
                     )
    # The vote page URL has a unique ID
    # However, some votes are "consent calendar" events,
    # and relate to the passage of _multiple_ bills
    # These can't be modeled yet in Pupa, but for now we can
    # append a bill ID to the URL that forms the `pupa_id`
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        # `td.text` is None when the cell has no leading text (for example
        # when it starts with a child element); treat that as "not a vote
        # word" instead of raising AttributeError.
        option_or_person = (td.text or '').strip()
        if option_or_person in ('Aye', 'Yea'):
            vote.yes(td.getprevious().text.strip())
        elif option_or_person == 'Nay':
            vote.no(td.getprevious().text.strip())
        elif option_or_person == 'Excused':
            vote.vote('excused', td.getprevious().text.strip())
        elif option_or_person == 'Absent':
            vote.vote('absent', td.getprevious().text.strip())

    yield vote
def scrape_vote(self, bill, vote_id, session):
    """POST for one roll call by id and yield it as a VoteEvent.

    Nothing is yielded when the JSON response is falsy.  The generated-PDF
    URL is used as both the source and the ``pupa_id`` because the endpoint
    queried here is a generic URL driven by POSTed data.

    Args:
        bill: Passed through as the ``bill`` of the VoteEvent; its
            ``identifier`` is logged.
        vote_id: Roll call id sent as ``rollCallId``.
        session: Passed through as ``legislative_session``.

    Yields:
        VoteEvent: with yes/no/other counts and one vote per member row.
    """
    endpoint = (
        "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId"
    )
    payload = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}

    self.info("Fetching vote {} for {}".format(vote_id, bill.identifier))
    page = self.post(url=endpoint, data=payload, allow_redirects=True).json()
    if not page:
        return

    roll = page["Model"]
    vote_chamber = self.chamber_map[roll["ChamberName"]]

    # "7/1/16 01:00 AM"
    taken_at = dt.datetime.strptime(
        roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p"
    )
    vote_date = taken_at.strftime("%Y-%m-%d")

    # TODO: What does this code mean?
    vote_motion = roll["RollCallVoteType"]

    if roll["RollCallStatus"] == "Passed":
        vote_passed = "pass"
    else:
        vote_passed = "fail"

    # Everything that is neither a yes nor a no is reported as "other".
    other_count = sum(
        int(roll[field])
        for field in (
            "NotVotingCount",
            "VacantVoteCount",
            "AbsentVoteCount",
            "ConflictVoteCount",
        )
    )

    vote = VoteEvent(
        chamber=vote_chamber,
        start_date=vote_date,
        motion_text=vote_motion,
        result=vote_passed,
        classification="other",
        bill=bill,
        legislative_session=session,
    )

    vote_pdf_url = (
        "https://legis.delaware.gov"
        "/json/RollCallController/GenerateRollCallPdf"
        "?rollCallId={}&chamberId={}".format(
            vote_id, self.chamber_codes[vote_chamber]
        )
    )
    # Vote URL is just a generic search URL with POSTed data,
    # so provide a different link
    vote.add_source(vote_pdf_url)
    vote.pupa_id = vote_pdf_url

    vote.set_count("yes", roll["YesVoteCount"])
    vote.set_count("no", roll["NoVoteCount"])
    vote.set_count("other", other_count)

    for member_row in roll["AssemblyMemberVotes"]:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        try:
            voter = self.legislators_by_short[str(member_row["ShortName"])]
            name = voter["DisplayName"]
        except KeyError:
            self.warning(
                "could not find legislator short name %s", member_row["ShortName"]
            )
            name = member_row["ShortName"]

        code = member_row["SelectVoteTypeCode"]
        if code == "Y":
            vote.yes(name)
        elif code == "N":
            vote.no(name)
        else:
            vote.vote("other", name)

    yield vote
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse an HTML vote page and yield the vote it records.

    The description is the first ``<b>`` element (or, when that is empty,
    the ``center`` element inside ``div#content``).  Voice votes, pages
    whose description is the legislature banner or a lone "-", and pages
    that fail to download yield nothing.  Committee votes are delegated to
    ``scrape_committee_vote``.

    Args:
        bill: Passed through as the ``bill``; ``identifier`` is logged.
        actor: Passed through as the chamber of the vote.
        date: Passed through as the start date of the vote.
        motion: Passed through as the motion text.
        url: Page to fetch; recorded as the source.
        uniqid: Stringified and used as the vote identifier.

    Yields:
        The vote, or whatever ``scrape_committee_vote`` yields.

    Raises:
        NotImplementedError: If the description says neither "Passed" nor
            "Failed" and is not one of the ignorable cases.
    """
    try:
        html = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".
                     format(bill.identifier))
        return

    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    descr = page.xpath("//b")[0].text_content()
    if descr == '':
        # New page method
        descr = page.xpath("//div[@id='content']/center")[0].text

    if "on voice vote" in descr:
        return

    if "committee" in descr.lower():
        yield from self.scrape_committee_vote(
            bill, actor, date, motion, page, url, uniqid
        )
        return

    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr or descr.strip() == '-':
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    # Headings after the first look like "<label> - <count>"; each one is
    # paired positionally with a table of voter names.
    headings = page.xpath("//b")[1:]
    tables = page.xpath("//table")

    vdict = {}
    for heading, table in zip(headings, tables):
        pieces = [piece.strip() for piece in heading.text_content().split("-", 1)]
        if len(pieces) != 2:
            continue
        label, count = pieces
        label = label.strip()
        count = int(count)
        vdict[label] = {
            "count": count,
            "people": [
                font.text_content().strip()
                for font in table.xpath(".//font[@face='Arial']")
            ],
        }

    yeas = vdict['Yeas']
    nays = vdict['Nays']

    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                bill=bill,
                classification='passage',
                identifier=str(uniqid))
    vote.set_count('yes', yeas['count'])
    vote.set_count('no', nays['count'])
    vote.set_count('other', vdict['Absent or not voting']['count'])
    vote.add_source(url)

    for person in yeas['people']:
        vote.yes(person)
    for person in nays['people']:
        vote.no(person)
    for person in vdict['Absent or not voting']['people']:
        vote.vote('other', person)

    yield vote
def scrape(self, session=None):
    """Scrape bills, resolutions, their actions and roll-call votes.

    Three JSON listings (released bills, introduced bills, resolutions of
    both chambers) are combined.  For each entry a ``Bill`` is built from the
    listing data and its status page (sponsors, text versions).  When the
    status page exposes an internal bill id, the bill's actions and roll
    calls are loaded as well; each roll call is yielded as a ``VoteEvent``
    before the bill itself is yielded.

    Args:
        session: Session to scrape; defaults to ``self.latest_session()``.

    Yields:
        VoteEvent and Bill objects.

    Raises:
        AssertionError: On an unrecognised bill number prefix, a
            constitutional amendment with no House/Senate body, an action
            with an unknown chamber code, or a roll call whose status text
            states neither passage nor failure.
    """
    HTML_TAGS_RE = r'<.*?>'

    # (prefix, bill type, chamber); longer prefixes come before the bare
    # 'H.' / 'S.' so that e.g. 'H.C.R.' is not mistaken for a House bill.
    prefix_table = (
        ('J.R.H.', 'joint resolution', 'lower'),
        ('J.R.S.', 'joint resolution', 'upper'),
        ('H.C.R.', 'concurrent resolution', 'lower'),
        ('S.C.R.', 'concurrent resolution', 'upper'),
        ('H.R.', 'resolution', 'lower'),
        ('S.R.', 'resolution', 'upper'),
        ('H.', 'bill', 'lower'),
        ('S.', 'bill', 'upper'),
    )

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    released_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills = json.loads(self.get(released_url).text)['data'] or []

    # This URL is also the one recorded as the source of non-resolutions.
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills.extend(json.loads(self.get(bills_url).text)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    bills.extend(json.loads(self.get(resolutions_url).text)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        bill_number = info['BillNumber']

        # Identify the bill type and chamber
        for prefix, bill_type, bill_chamber in prefix_table:
            if bill_number.startswith(prefix):
                break
        else:
            if not bill_number.startswith('PR.'):
                raise AssertionError(
                    "Unknown bill type found: '{}'".
                    format(bill_number)
                )
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")

        bill_id = bill_number.replace('.', '').replace(' ', '')
        # put one space back in between type and number
        bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, bill_number)
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()

            # Skip a five-character entry that begins with "Less"
            # (presumably the list's "Less" + ellipsis toggle -- confirm).
            # The previous test compared a five-character slice with the
            # four-character "Less", which can never be equal, so the
            # filter never fired.
            is_less_toggle = (
                sponsor_name[:4] == "Less" and len(sponsor_name) == 5)
            if sponsor_name and not is_less_toggle:
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to
        # where it used to be' so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        id_match = re.search(
            r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
            lxml.etree.tostring(doc).decode('utf-8')
        )
        if id_match is None:
            self.warning("Bill {} appears to have no activity".format(bill_number))
            yield bill
            continue
        internal_bill_id = id_match.group(1)

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions = json.loads(self.get(actions_url).text)['data']
        bill.add_source(actions_url)

        # Only written to; kept for the disabled sanity check below.
        chambers_passed = set()
        for action in actions:
            # Drop null fields so missing values surface as KeyError.
            action = {k: v for k, v in action.items() if v is not None}
            status = action['FullStatus']

            if "Signed by Governor" in status:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in status:
                # assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif "Vetoed by the Governor" in status:
                action_type = 'executive-veto'
            elif "Read first time" in status \
                    or "Read 1st time" in status:
                action_type = 'introduction'
            elif "Reported favorably" in status:
                action_type = 'committee-passage-favorable'
            elif actor == 'lower' and any(
                    x.lower().startswith('aspassed')
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(
                    x.lower().startswith((' aspassed', 'aspassed'))
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", status),
                date=datetime.datetime.strptime(
                    action['StatusDate'], '%m/%d/%Y'
                ).strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type
            )

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes = json.loads(self.get(votes_url).text)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call = json.loads(self.get(roll_call_url).text)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Member names arrive as "<name> of <district>".
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            vote_status = vote['FullStatus']
            if ("Passed -- " in vote_status or
                    "Veto of Governor overridden" in vote_status):
                did_pass = True
            elif ("Failed -- " in vote_status or
                    'Veto of the Governor sustained' in vote_status):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = int(re.search(r'Yeas = (\d+)', vote_status).group(1))
            nay_count = int(re.search(r'Nays = (\d+)', vote_status).group(1))

            vote_to_add = VoteEvent(
                bill=bill,
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                start_date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'
                ).strftime('%Y-%m-%d'),
                motion_text=re.sub(HTML_TAGS_RE, "", vote_status).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)

            yield vote_to_add

        # Capture extra information- Not yet implemented
        # Witnesses:
        #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        yield bill
def scrape_votes(self, bill, url):
    """Yield the roll-call votes printed on a bill's votes page.

    Each vote starts at a paragraph matching the chamber banner.  The motion
    is read from a fixed paragraph offset after it (8 for the House, 13 for
    the Senate), the roll call number from the next paragraph containing
    "RCS#", and the date from the paragraph after that.  Tallies and names
    are read from the following paragraphs until a line of asterisks.
    Duplicate roll call numbers and badly formatted votes are skipped; a
    missing motion ends the scrape of this page.

    Args:
        bill: Passed through as the ``bill`` of each vote.
        url: Page to fetch; used as source and as the ``pupa_id`` prefix.

    Yields:
        One vote per distinct, well-formed roll call.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

    seen_rcs = set()

    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    tally_pattern = (
        r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
        r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
    )

    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False

        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber, motion_index = "lower", 8
        else:
            chamber, motion_index = "upper", 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index
        ).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return

        outcome = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if outcome:
            motion = outcome.group(1)
            passed = outcome.group(2) == "PASSED"
        else:
            # Decided from the tallies further down.
            passed = None

        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                break

            tally = re.match(tally_pattern, line)
            if tally:
                label = tally.group(1)
                if label == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif label == "NAYS" and seen_yes:
                    vtype = "no"
                elif label == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and tally.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(tally.group(2))
            elif seen_yes:
                # A line of names belonging to the most recent tally.
                for name in line.split(" "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        vote.pupa_id = url + "#" + rcs

        vote.add_source(url)

        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            # A colon means a tally line was mistaken for a name.
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)

        yield vote
def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
    """Yield the votes described in one chamber's block of vote text.

    ``vote_data`` is split into individual votes on "<word> by <words> -"
    headers, and each vote is split into fields on runs of ten non-breaking
    spaces.  The first field is the motion (and may contain the date); the
    remaining fields hold the tallies and the lists of voters.

    Args:
        chamber: Passed through as the chamber of each VoteEvent.
        vote_data: Text containing zero or more votes.
        bill: Passed through as the ``bill`` of each VoteEvent.
        link: Recorded as the source of each VoteEvent.

    Yields:
        VoteEvent: one per vote found.  A motion text already present in
        ``self._seen_votes`` gets a numeric suffix to keep it distinct.
    """
    # Compiled once instead of on every loop iteration.
    vote_regex = re.compile(r"\d+$")
    aye_regex = re.compile(r"^.+voting aye were: (.+) -")
    no_regex = re.compile(r"^.+voting no were: (.+) -")
    not_voting_regex = re.compile(r"^.+present and not voting were: (.+) -")
    field_separator = u"\xa0" * 10

    raw_vote_data = re.split(r"\w+? by [\w ]+?\s+-", vote_data.strip())[1:]

    motion_count = 1

    for raw_vote in raw_vote_data:
        raw_vote = raw_vote.split(field_separator)

        motion = raw_vote[0]
        vote_date = re.search(r"(\d+/\d+/\d+)", motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")

        passed = (
            "Passed" in motion
            or "Recommended for passage" in motion
            or "Rec. for pass" in motion
            or "Adopted" in raw_vote[1]
        )

        yes_count = 0
        no_count = 0
        not_voting_count = 0
        ayes = []
        nos = []
        not_voting = []

        for v in raw_vote[1:]:
            v = v.strip()
            if v.startswith("Ayes...") and vote_regex.search(v):
                yes_count = int(vote_regex.search(v).group())
            elif v.startswith("Noes...") and vote_regex.search(v):
                no_count = int(vote_regex.search(v).group())
            elif v.startswith("Present and not voting...") and vote_regex.search(v):
                not_voting_count += int(vote_regex.search(v).group())
            elif aye_regex.search(v):
                ayes = aye_regex.search(v).groups()[0].split(", ")
            elif no_regex.search(v):
                nos = no_regex.search(v).groups()[0].split(", ")
            elif not_voting_regex.search(v):
                not_voting += not_voting_regex.search(v).groups()[0].split(", ")

        motion = motion.strip()
        # un-escape ampersands (the previous replace("&", "&") was a no-op)
        motion = motion.replace("&amp;", "&")
        if motion in self._seen_votes:
            motion = "{} ({})".format(motion, motion_count)
            motion_count += 1
        self._seen_votes.add(motion)

        vote = VoteEvent(
            motion_text=motion,
            start_date=vote_date.strftime("%Y-%m-%d") if vote_date else None,
            classification="passage",
            result="pass" if passed else "fail",
            chamber=chamber,
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", not_voting_count)
        vote.add_source(link)

        # A name is recorded once only, under the first list it appears in.
        seen = set()
        for a in ayes:
            if a in seen:
                continue
            vote.yes(a)
            seen.add(a)
        for n in nos:
            if n in seen:
                continue
            vote.no(n)
            seen.add(n)
        for n in not_voting:
            if n in seen:
                continue
            vote.vote("not voting", n)
            seen.add(n)

        yield vote
def scrape_vote(self, bill, name, url):
    """Fetch a vote tally page and yield the vote it records.

    The column layout of the member table depends on the chamber, which is
    inferred from whether ``url`` contains "VOTE/h".  Pages containing
    "BUDGET ADDRESS" yield nothing.

    Args:
        bill: Passed through as the ``bill``; ``legislative_session`` is
            used as the year of the vote date.
        name: Motion text of the vote.
        url: Tally page to fetch; recorded as the source.

    Yields:
        The vote with yes/no/other counts and one entry per member.
    """
    if "VOTE/h" in url:
        vote_chamber = "lower"
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = "upper"
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # NOTE(review): verify=False disables TLS certificate verification.
    page = self.get(url, verify=False).text

    if "BUDGET ADDRESS" in page:
        return

    page = lxml.html.fromstring(page)

    def tally(label):
        # First run of digits in the span whose text contains `label`.
        text = page.xpath("string(//span[contains(., '%s')])" % label)
        return int(re.match(r"[^\d]*(\d+)[^\d]*", text).group(1))

    yes_count = tally("Those voting Yea")
    no_count = tally("Those voting Nay")
    other_count = tally("Those absent")
    need_count = tally("Necessary for")

    # The page gives month/day only; the year comes from the session.
    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", date).group(1)
    date = date.replace(" ", "")
    date = datetime.datetime.strptime(
        date + " " + bill.legislative_session, "%m/%d %Y").date()

    # not sure about classification.
    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=name,
        # Reaching the "Necessary for" number is enough to pass; the old
        # strict ">" failed votes that met the threshold exactly.
        result="pass" if yes_count >= need_count else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            # `member` rather than `name`, which is the motion parameter.
            member = row.xpath("string(td[%d])" % (i + name_offset)).strip()

            if not member or member == "VACANT":
                continue
            member = string.capwords(member)
            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(member)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(member)
            else:
                vote.vote("other", member)

    yield vote
def handle_page(self):
    """Yield the committee vote shown on this page, if it has totals.

    Nothing is yielded when no span whose id contains
    ``ctl00_MainContent_lblTotal`` has text.  The motion is the action
    followed by the committee name in parentheses.

    Yields:
        VoteEvent: with yes / no / not voting counts and member votes.

    Raises:
        ValueError: If a member's vote marker is not "Y", "N", "-" or a
            parenthesised Y/N, or if a single-value lookup does not return
            exactly one text node.
    """
    # Checks to see if any vote totals are provided
    totals = self.doc.xpath(
        '//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()'
    )
    if not totals:
        return

    [date_text] = self.doc.xpath('//span[contains(@id, "lblDate")]/text()')
    date = format_datetime(
        datetime.datetime.strptime(date_text, "%m/%d/%Y %I:%M:%S %p"),
        "US/Eastern")

    # ctl00_MainContent_lblTotal //span[contains(@id, "ctl00_MainContent_lblTotal")]
    yes_count = int(
        self.doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
    no_count = int(
        self.doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
    other_count = int(
        self.doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0])

    if yes_count > no_count:
        result = "pass"
    else:
        result = "fail"

    [committee] = self.doc.xpath('//span[contains(@id, "lblCommittee")]/text()')
    [action] = self.doc.xpath('//span[contains(@id, "lblAction")]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs["bill"],
        chamber="lower",
        motion_text=motion,
        result=result,
        classification="committee",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", other_count)

    for item in self.doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not item.text_content().strip():
            continue

        # The second span holds the member, the first the vote marker.
        [member] = item.xpath("span[2]//text()")
        [marker] = item.xpath("span[1]//text()")

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote("not voting", member)
        elif re.search(r"\([YN]\)", marker):
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            continue
        else:
            raise ValueError(
                "Unknown vote type found: {}".format(marker))

    yield vote
def scrape_vote(self, bill, name, url):
    """Fetch a vote tally page with ``requests`` and yield its vote.

    Whether ``url`` contains "VOTE/h" decides the chamber and the column
    layout used to read the member table.  Pages containing
    "BUDGET ADDRESS" yield nothing.

    Args:
        bill: Passed through as the ``bill``; ``legislative_session``
            supplies the year of the vote date.
        name: Motion text of the vote.
        url: Tally page to fetch; recorded as the source.

    Yields:
        The vote with yes/no/other counts and one entry per member.
    """
    if "VOTE/h" in url:
        vote_chamber = 'lower'
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = 'upper'
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    # NOTE(review): verify=False disables TLS certificate verification.
    page = requests.get(url, verify=False).text

    if 'BUDGET ADDRESS' in page:
        return

    page = lxml.html.fromstring(page)

    # Each summary span carries one number; take its first run of digits.
    number_re = re.compile(r'[^\d]*(\d+)[^\d]*')

    yes_count = page.xpath(
        "string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(number_re.match(yes_count).group(1))

    no_count = page.xpath(
        "string(//span[contains(., 'Those voting Nay')])")
    no_count = int(number_re.match(no_count).group(1))

    other_count = page.xpath(
        "string(//span[contains(., 'Those absent')])")
    other_count = int(number_re.match(other_count).group(1))

    need_count = page.xpath(
        "string(//span[contains(., 'Necessary for')])")
    need_count = int(number_re.match(need_count).group(1))

    # Only month/day is printed; the year is taken from the session.
    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
    date = date.replace(' ', '')
    date = datetime.datetime.strptime(date + " " + bill.legislative_session,
                                      "%m/%d %Y").date()

    # A vote that reaches the "Necessary for" number passes; the previous
    # strict ">" reported such votes as failed.
    passed = yes_count >= need_count

    # not sure about classification.
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                result='pass' if passed else 'fail',
                classification='passage',
                bill=bill
                )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            # `voter` rather than `name`, which is the motion parameter.
            voter = row.xpath("string(td[%d])" % (
                i + name_offset)).strip()

            if not voter or voter == 'VACANT':
                continue

            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(voter)
            else:
                vote.vote('other', voter)

    yield vote
def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
    """Parse a chamber's block of vote text and yield one ``VoteEvent`` each.

    Args:
        chamber: chamber value passed straight through to ``VoteEvent``.
        vote_data: raw text holding zero or more votes.  It is split on
            ``"<word> by <words> -"`` headers, and each piece is then split
            into fragments on runs of ten non-breaking spaces.
        bill: bill passed straight through to ``VoteEvent``.
        link: source URL attached to every vote.

    Yields:
        ``VoteEvent`` objects with yes / no / not voting counts and the
        named voters found in the text.  A voter is recorded only once per
        vote, in the order ayes, noes, not voting.

    Side effects:
        Adds every motion text to ``self._seen_votes``.  A motion already
        in that set gets a ``" (n)"`` suffix so its text stays distinct.
    """
    # Compiled once here rather than on every loop iteration.
    vote_regex = re.compile(r'\d+$')
    aye_regex = re.compile(r'^.+voting aye were: (.+) -')
    no_regex = re.compile(r'^.+voting no were: (.+) -')
    not_voting_regex = re.compile(r'^.+present and not voting were: (.+) -')

    raw_vote_data = re.split(r'\w+? by [\w ]+?\s+-', vote_data.strip())[1:]

    motion_count = 1

    for raw_vote in raw_vote_data:
        raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
        motion = raw_vote[0]

        # The date, when present, is embedded in the motion text.
        vote_date = re.search(r'(\d+/\d+/\d+)', motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(), '%m/%d/%Y')

        passed = (
            'Passed' in motion or
            'Recommended for passage' in motion or
            'Rec. for pass' in motion or
            'Adopted' in raw_vote[1]
        )

        yes_count = 0
        no_count = 0
        not_voting_count = 0
        ayes = []
        nos = []
        not_voting = []

        for v in raw_vote[1:]:
            v = v.strip()
            # Trailing digits, if any, are the tally for a count fragment.
            tally = vote_regex.search(v)
            if v.startswith('Ayes...') and tally:
                yes_count = int(tally.group())
            elif v.startswith('Noes...') and tally:
                no_count = int(tally.group())
            elif v.startswith('Present and not voting...') and tally:
                not_voting_count += int(tally.group())
            elif aye_regex.search(v):
                ayes = aye_regex.search(v).groups()[0].split(', ')
            elif no_regex.search(v):
                nos = no_regex.search(v).groups()[0].split(', ')
            elif not_voting_regex.search(v):
                not_voting += not_voting_regex.search(v).groups()[0].split(', ')

        motion = motion.strip()
        # un-escape ampersands: the entity has to be the search string,
        # replacing '&' with '&' would change nothing.
        motion = motion.replace('&amp;', '&')
        if motion in self._seen_votes:
            motion = '{} ({})'.format(motion, motion_count)
            motion_count += 1
        self._seen_votes.add(motion)

        vote = VoteEvent(
            motion_text=motion,
            start_date=vote_date.strftime('%Y-%m-%d') if vote_date else None,
            classification='passage',
            result='pass' if passed else 'fail',
            chamber=chamber,
            bill=bill,
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('not voting', not_voting_count)
        vote.add_source(link)

        # A name listed more than once is only recorded the first time.
        seen = set()
        for a in ayes:
            if a in seen:
                continue
            vote.yes(a)
            seen.add(a)
        for n in nos:
            if n in seen:
                continue
            vote.no(n)
            seen.add(n)
        for n in not_voting:
            if n in seen:
                continue
            vote.vote('not voting', n)
            seen.add(n)

        yield vote