def toy_vote():
    """Build a sample passing bill-passage ``Vote`` with one source attached."""
    fields = {
        'legislative_session': "2009",
        'motion_text': "passage of the bill",
        'start_date': "2009-01-07",
        'result': 'pass',
        'classification': 'bill-passage',
    }
    sample = Vote(**fields)
    sample.add_source("http://uri.example.com/", note="foo")
    return sample
def scrape_senate_vote(self, bill, url, date):
    """Scrape a Senate roll-call PDF and yield the resulting ``Vote``.

    The PDF at ``url`` is downloaded, converted to text and then deleted.
    Text matching the ``Yea: N  Nay: N  Absent: N`` layout is handed to
    ``scrape_senate_vote_3col``; otherwise the text is split on its
    ``Yea(s):`` / ``Nay(s):`` / ``Absent(s):`` headings and every listed
    name is recorded under the matching option.

    Yields nothing (after logging a warning) when the download raises
    ``scrapelib.HTTPError``.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # setting 'fail' for now.
        result='fail',
        classification='passage',
        bill=bill
    )
    vote.add_source(url)

    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)

    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Reversed so that pop() walks the document front to back, alternating
    # between a heading ("Yea"/"Nay"/"Absent") and the names that follow it.
    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    # Must be a real list: on Python 3 a filter object is always truthy and
    # has no pop(), so the loop below could never run or terminate.
    data = [chunk for chunk in data if chunk]
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {
        'yes': 0,
        'no': 0,
        'other': 0
    }
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            # drop stray digits, dashes and whitespace around the name
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)
    # updating result with actual value
    not_yes = vote_count['no'] + vote_count['other']
    vote.result = 'pass' if vote_count['yes'] > not_yes else 'fail'
    yield vote
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a ``Vote`` built from the first table of a committee vote page.

    The yes / no / other voters are read from the first three cells of the
    vote row; each voter name is the text that follows a ``<br>`` in its cell.
    """
    table = page.xpath("//table")[0]
    row = table.xpath(".//tr")[0]
    if row[0].text_content() == 'Votes:':
        # New website: the voter cells sit in the third row
        row = table.xpath(".//tr")[2]

    cells = row.xpath(".//td")
    if len(cells) >= 3:
        yes, no, other = cells[:3]
    else:
        yes = cells[0]
        no = other = None

    def proc_block(cell, typ):
        # Summarise one cell as {"type", "count", "votes"}.
        if cell is None:
            return {"type": None, "count": None, "votes": []}
        tails = (br.tail for br in cell.xpath(".//br"))
        names = [tail.strip() for tail in tails if tail and tail.strip()]
        return {"type": typ, "count": len(names), "votes": names}

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }
    yes_count = vote_dict['yes']['count']
    # a missing cell reports a count of None; treat that as zero
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0

    if yes_count > no_count:
        outcome = 'pass'
    else:
        outcome = 'fail'
    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                identifier=str(uniqid),
                result=outcome,
                classification='passage',
                bill=bill)
    vote.extras = {'_vote_id': uniqid}
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    for option, block in vote_dict.items():
        for voter in block['votes']:
            vote.vote(option, voter)
    yield vote
def test_vote_org_chamber():
    """A ``chamber`` argument yields a pseudo-id keyed on that classification."""
    base = dict(legislative_session="2009",
                motion_text="passage of the bill",
                start_date="2009-01-07",
                result='pass',
                classification='bill-passage')
    vote = Vote(chamber='upper', **base)
    expected = {'classification': 'upper'}
    assert get_pseudo_id(vote.organization) == expected
def scrape_vote(self, bill, vote_id, session):
    """Post ``vote_id`` to the roll-call JSON endpoint and yield its ``Vote``.

    Nothing is yielded when the endpoint returns an empty payload.
    """
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    form = {'rollCallId': vote_id, 'sort': '', 'group': '', 'filter': ''}

    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if not page:
        return

    roll = page['Model']
    vote_chamber = self.chamber_map[roll['ChamberName']]

    # TakenAtDateTime looks like "7/1/16 01:00 AM"
    taken_at = dt.datetime.strptime(roll['TakenAtDateTime'], '%m/%d/%y %I:%M %p')
    vote_date = taken_at.strftime('%Y-%m-%d')

    # TODO: What does this code mean?
    vote_motion = roll['RollCallVoteType']

    if roll['RollCallStatus'] == 'Passed':
        vote_passed = 'pass'
    else:
        vote_passed = 'fail'

    # everything that is neither a yes nor a no is reported as "other"
    other_fields = ('NotVotingCount', 'VacantVoteCount',
                    'AbsentVoteCount', 'ConflictVoteCount')
    other_count = sum(int(roll[field]) for field in other_fields)

    vote = Vote(chamber=vote_chamber,
                start_date=vote_date,
                motion_text=vote_motion,
                result=vote_passed,
                classification='other',
                bill=bill.identifier,
                legislative_session=session)
    vote.add_source(vote_url)
    vote.set_count('yes', roll['YesVoteCount'])
    vote.set_count('no', roll['NoVoteCount'])
    vote.set_count('other', other_count)

    for row in roll['AssemblyMemberVotes']:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        try:
            voter = self.legislators_by_short[str(row['ShortName'])]
            name = voter['DisplayName']
        except KeyError:
            self.warning('could not find legislator short name %s',
                         row['ShortName'])
            name = row['ShortName']

        code = row['SelectVoteTypeCode']
        if code == 'Y':
            vote.yes(name)
        elif code == 'N':
            vote.no(name)
        else:
            vote.vote('other', name)

    # bill.add_vote_event(vote)
    yield vote
def test_vote_bill_clearing():
    # ensure that we don't wind up with votes sitting around forever on bills as changes
    # make it look like there are multiple votes
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    session = jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(id='org-id', name='House',
                                        classification='lower')
    bill = Bill.objects.create(id='bill-1', identifier='HB 1',
                               legislative_session=session,
                               from_organization=house)
    Bill.objects.create(id='bill-2', identifier='HB 2',
                        legislative_session=session,
                        from_organization=house)
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    shared = dict(legislative_session='1900', start_date='2013',
                  classification='anything', result='passed',
                  bill=bill.identifier, bill_chamber='lower')
    vote1 = ScrapeVote(motion_text='a vote on somthing',  # typo intentional
                       **shared)
    vote2 = ScrapeVote(motion_text='a vote on something else', **shared)

    def import_both():
        # have to use import_data so postimport is called
        importer = VoteImporter('jid', dmi, dmi, bi)
        importer.import_data([vote1.as_dict(), vote2.as_dict()])

    import_both()
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 votes now
    vote1.motion_text = 'a vote on something'
    import_both()
    assert VoteEvent.objects.count() == 2
def test_vote_identifier_dedupe():
    """Votes that carry an identifier are deduplicated on that identifier."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    vote = ScrapeVote(legislative_session='1900', start_date='2013',
                      classification='anything', result='passed',
                      motion_text='a vote on something',
                      identifier='Roll Call No. 1')
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    def import_vote(scraped):
        # run a fresh importer and report what it did with the item
        _, what = VoteImporter('jid', dmi, dmi, bi).import_item(scraped.as_dict())
        return what

    assert import_vote(vote) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote, no changes
    assert import_vote(vote) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote.result = 'failed'
    assert import_vote(vote) == 'update'
    assert VoteEvent.objects.count() == 1

    # new bill, insert
    vote.identifier = 'Roll Call 2'
    assert import_vote(vote) == 'insert'
    assert VoteEvent.objects.count() == 2
def test_vote_identifier_dedupe():
    """Importing a vote with an identifier inserts, no-ops, updates, then inserts.

    The same identifier is imported three times (unchanged, then with a new
    result) and must stay a single ``VoteEvent``; changing the identifier
    must create a second one.
    """
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    # the session only needs to exist; its return value is not used
    j.legislative_sessions.create(name='1900', identifier='1900')

    vote = ScrapeVote(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        motion_text='a vote on something',
        identifier='Roll Call No. 1',
    )
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    _, what = VoteImporter('jid', dmi, dmi, bi).import_item(vote.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote, no changes
    _, what = VoteImporter('jid', dmi, dmi, bi).import_item(vote.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote.result = 'failed'
    _, what = VoteImporter('jid', dmi, dmi, bi).import_item(vote.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    vote.identifier = 'Roll Call 2'
    _, what = VoteImporter('jid', dmi, dmi, bi).import_item(vote.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_vote_org_obj():
    """Passing an ``Organization`` object stores that object's ``_id``."""
    committee = Organization('something', classification='committee')
    base = dict(legislative_session="2009",
                motion_text="passage of the bill",
                start_date="2009-01-07",
                result='pass',
                classification='bill-passage')
    vote = Vote(organization=committee, **base)
    assert vote.organization == committee._id
def test_org_and_chamber_conflict():
    """Supplying both ``organization`` and ``chamber`` raises ``ValueError``."""
    conflicting = dict(legislative_session="2009",
                       motion_text="passage of the bill",
                       start_date="2009-01-07",
                       result='pass',
                       classification='passage',
                       organization='test',
                       chamber='lower')
    with pytest.raises(ValueError):
        Vote(**conflicting)
def test_vote_org_dict():
    """A dict ``organization`` round-trips through the pseudo-id unchanged."""
    odict = {'name': 'Random Committee', 'classification': 'committee'}
    base = dict(legislative_session="2009",
                motion_text="passage of the bill",
                start_date="2009-01-07",
                result='pass',
                classification='bill-passage')
    vote = Vote(organization=odict, **base)
    assert get_pseudo_id(vote.organization) == odict
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a ``Vote`` built from the first table of a committee vote page.

    With three or more cells in the vote row, the yes / no / other voters are
    taken from the first, third and fifth cells; each voter name is a
    non-blank direct text node of its cell.
    """
    table = page.xpath("//table")[0]
    row = table.xpath(".//tr")[0]
    if row[0].text_content() == "Votes:":
        # New website: the voter cells sit in the third row
        row = table.xpath(".//tr")[2]

    cells = row.xpath(".//td")
    if len(cells) >= 3:
        # the second and fourth cells are ignored
        yes, _, no, _, other = cells[:5]
    else:
        yes = cells[0]
        no = other = None

    def proc_block(cell, typ):
        # Summarise one cell as {"type", "count", "votes"}.
        if cell is None:
            return {"type": None, "count": None, "votes": []}
        names = [text.strip() for text in cell.xpath("./text()") if text.strip()]
        return {"type": typ, "count": len(names), "votes": names}

    vote_dict = {
        "yes": proc_block(yes, "yes"),
        "no": proc_block(no, "no"),
        "other": proc_block(other, "other"),
    }
    yes_count = vote_dict["yes"]["count"]
    # a missing cell reports a count of None; treat that as zero
    no_count = vote_dict["no"]["count"] or 0
    other_count = vote_dict["other"]["count"] or 0

    if yes_count > no_count:
        outcome = "pass"
    else:
        outcome = "fail"
    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        identifier=str(uniqid),
        result=outcome,
        classification="passage",
        bill=bill,
    )
    vote.extras = {"_vote_id": uniqid}
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    for option, block in vote_dict.items():
        for voter in block["votes"]:
            vote.vote(option, voter)
    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a floor-vote page, parse its YEAS/NAYS/ABSENT text, yield a ``Vote``.

    ``url`` is also added to ``bill`` as a source.  The vote passes when the
    printed yes count exceeds the printed no count.
    """
    page = self.get(url).text
    bill.add_source(url)

    vote_re = re.compile(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    match = vote_re.search(page)
    yes_count, no_count, other_count = (int(match.group(i)) for i in (1, 3, 6))

    passed = yes_count > no_count
    # only the two chambers are passed through; any other actor becomes ""
    vote_chamber = actor if actor in ("upper", "lower") else ""

    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    def record_other(name):
        vote.vote("other", name)

    # names inside each section are separated by two or more whitespace chars
    sections = (
        (vote.yes, match.group(2)),
        (vote.no, match.group(4)),
        (record_other, match.group(7)),
    )
    for record, block in sections:
        for name in re.split(r"\s{2,}", block.strip()):
            if name:
                record(name)

    yield vote
def scrape_votes(self, bill):
    """Yield one ``Vote`` per ``wa:RollCall`` returned for ``bill``.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``.  Absent and excused counts are combined into
    "other", and a roll call passes only when yes exceeds no plus other.
    """
    bill_num = bill.identifier.split()[1]

    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium)
    )
    response = self.get(url)
    root = lxml.etree.fromstring(response.content)

    chambers = {"House": "lower", "Senate": "upper"}

    for rc in xpath(root, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")

        # VoteDate carries a time part after "T"; only the day is kept
        day = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = chambers[agency]

        if yes_count > (no_count + other_count):
            outcome = "pass"
        else:
            outcome = "fail"

        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text="{} (#{})".format(motion, seq_no),
            result=outcome,
            classification="other",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)

        for member in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(member, "string(wa:Name)")
            # the element name is spelled "VOte" in the query
            choice = xpath(member, "string(wa:VOte)")

            if choice == "Yea":
                vote.yes(name)
            elif choice == "Nay":
                vote.no(name)
            else:
                vote.vote("other", name)

        yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a floor-vote page, parse its YEAS/NAYS/ABSENT text, yield a ``Vote``.

    ``url`` is also added to ``bill`` as a source.  The vote passes when the
    printed yes count exceeds the printed no count.
    """
    page = self.get(url).text
    bill.add_source(url)

    vote_re = re.compile(
        r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
        r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
        r'(\d+)(.*)',
        re.MULTILINE | re.DOTALL)
    match = vote_re.search(page)
    yes_count, no_count, other_count = [int(match.group(i)) for i in (1, 3, 6)]

    outcome = 'pass' if yes_count > no_count else 'fail'
    # only the two chambers are passed through; any other actor becomes ''
    vote_chamber = actor if actor in ('upper', 'lower') else ''

    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=motion,
                result=outcome,
                identifier=str(uniqid),
                classification='passage',
                bill=bill)
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    def split_names(group_index):
        # names are separated by two or more whitespace characters
        return re.split(r'\s{2,}', match.group(group_index).strip())

    yes_votes = split_names(2)
    no_votes = split_names(4)
    other_votes = split_names(7)

    for name in filter(None, yes_votes):
        vote.yes(name)
    for name in filter(None, no_votes):
        vote.no(name)
    for name in filter(None, other_votes):
        vote.vote('other', name)

    yield vote
def addBillHistory(self, bill, history_table):
    """Add each dated history action to ``bill`` and return the votes found.

    Rows whose ``Date`` value has no ``.date()`` method are skipped.  For a
    described action whose details carry a ``url``, ``extractVotes`` is
    called and a ``Vote`` is built when it returns both a result and votes.
    """
    all_votes = []

    for action, _, _ in self.parseDataTable(history_table):
        action_description = action['Action']
        try:
            action_date = action['Date'].date().isoformat()
        except AttributeError:
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
            continue

        if not action_description:
            continue

        bill.add_action(action_description,
                        action_date,
                        organization=action['Action\xa0By'],
                        classification=ACTION_CLASSIFICATION[action_description])

        details = action['Action\xa0Details']
        if 'url' not in details:
            continue

        action_detail_url = details['url']
        result, votes = self.extractVotes(action_detail_url)
        # see https://github.com/datamade/municipal-scrapers-us/issues/15
        if not (votes and result):
            continue

        action_vote = Vote(legislative_session=bill.legislative_session,
                           motion_text=action_description,
                           classification=None,
                           start_date=action_date,
                           result=result,
                           bill=bill.identifier)
        action_vote.add_source(action_detail_url)
        for option, voter in votes:
            action_vote.vote(option, voter)
        all_votes.append(action_vote)

    return all_votes
def test_full_vote():
    """Import one fully populated vote and check every stored field.

    The ``Person`` and committee ``Organization`` rows are created only so
    that they exist in the database; their return values are not needed.
    """
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    session = j.legislative_sessions.create(name='1900', identifier='1900')
    Person.objects.create(id='person-id', name='Adam Smith')
    org = Organization.objects.create(id='org-id', name='House', classification='lower')
    bill = Bill.objects.create(id='bill-id', identifier='HB 1', legislative_session=session,
                               from_organization=org)
    Organization.objects.create(id='com-id', name='Arbitrary Committee', parent=org)

    vote = ScrapeVote(legislative_session='1900', motion_text='passage',
                      start_date='1900-04-01', classification='passage:bill',
                      result='pass', bill_chamber='lower', bill=bill.identifier)
    vote.set_count('yes', 20)
    vote.yes('John Smith')
    vote.no('Adam Smith')

    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)
    VoteImporter('jid', dmi, dmi, bi).import_data([vote.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session_id == session.id
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill_id == bill.id

    # only the 'yes' count was set on the scraped vote
    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20

    votes = list(ve.votes.all())
    assert len(votes) == 2
    for v in ve.votes.all():
        if v.voter_name == 'John Smith':
            assert v.option == 'yes'
        else:
            assert v.option == 'no'
def test_vote_bill_id_dedupe():
    """Votes without an identifier are deduplicated per bill."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    session = jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(id='org-id', name='House',
                                        classification='lower')
    bill = Bill.objects.create(id='bill-1', identifier='HB 1',
                               legislative_session=session,
                               from_organization=house)
    bill2 = Bill.objects.create(id='bill-2', identifier='HB 2',
                                legislative_session=session,
                                from_organization=house)

    shared = dict(legislative_session='1900', start_date='2013',
                  classification='anything', result='passed',
                  motion_text='a vote on something', bill_chamber='lower')
    vote = ScrapeVote(bill=bill.identifier, **shared)
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    def import_vote(scraped):
        # run a fresh importer and report what it did with the item
        _, what = VoteImporter('jid', dmi, dmi, bi).import_item(scraped.as_dict())
        return what

    assert import_vote(vote) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote, no changes
    assert import_vote(vote) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote.result = 'failed'
    assert import_vote(vote) == 'update'
    assert VoteEvent.objects.count() == 1

    # new vote, insert
    vote = ScrapeVote(bill=bill2.identifier, **shared)
    assert import_vote(vote) == 'insert'
    assert VoteEvent.objects.count() == 2
def scrape_votes(self, bill):
    """Yield one ``Vote`` per ``wa:RollCall`` returned for ``bill``.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``.  Absent and excused counts are combined into
    'other', and a roll call passes only when yes exceeds no plus other.
    """
    bill_num = bill.identifier.split()[1]

    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (
               bill_num, self.biennium))
    response = self.get(url)
    root = lxml.etree.fromstring(response.content)

    chambers = {'House': 'lower', 'Senate': 'upper'}

    for rc in xpath(root, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")

        # VoteDate carries a time part after "T"; only the day is kept
        day = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = chambers[agency]

        if yes_count > (no_count + other_count):
            outcome = 'pass'
        else:
            outcome = 'fail'

        vote = Vote(chamber=chamber,
                    start_date=date,
                    motion_text=motion,
                    result=outcome,
                    classification='other',
                    bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for member in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(member, "string(wa:Name)")
            # the element name is spelled "VOte" in the query
            choice = xpath(member, "string(wa:VOte)")

            if choice == 'Yea':
                vote.yes(name)
            elif choice == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)

        yield vote
def test_vote_bill_clearing():
    """Re-importing a bill's votes after a motion-text fix must not add a vote."""
    # ensure that we don't wind up with votes sitting around forever on bills as changes
    # make it look like there are multiple votes
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    session = j.legislative_sessions.create(name='1900', identifier='1900')
    org = Organization.objects.create(id='org-id', name='House', classification='lower')
    bill = Bill.objects.create(id='bill-1', identifier='HB 1', legislative_session=session,
                               from_organization=org)
    # a second bill only needs to exist; its return value is not used
    Bill.objects.create(id='bill-2', identifier='HB 2', legislative_session=session,
                        from_organization=org)
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    vote1 = ScrapeVote(
        legislative_session='1900', start_date='2013',
        classification='anything', result='passed',
        motion_text='a vote on somthing',  # typo intentional
        bill=bill.identifier, bill_chamber='lower')
    vote2 = ScrapeVote(legislative_session='1900', start_date='2013',
                       classification='anything', result='passed',
                       motion_text='a vote on something else',
                       bill=bill.identifier, bill_chamber='lower')

    # have to use import_data so postimport is called
    VoteImporter('jid', dmi, dmi, bi).import_data([vote1.as_dict(), vote2.as_dict()])
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 votes now
    vote1.motion_text = 'a vote on something'
    VoteImporter('jid', dmi, dmi, bi).import_data([vote1.as_dict(), vote2.as_dict()])
    assert VoteEvent.objects.count() == 2
def parse_vote(self, chamber, bill, row, action_text, action_date, url):
    """Read the four vote tallies out of ``row`` and yield a counts-only ``Vote``.

    The vote passes when the "A Favor" tally exceeds the "En Contra" tally.
    The row is also passed to ``parse_version`` with ``is_document=True``.
    """
    def tally(label):
        # first "smalltxt" span inside the div whose label contains ``label``
        path = (
            './/div[label[contains(text(), "%s")]]'
            '/span[contains(@class,"smalltxt")]/text()' % label
        )
        return int(row.xpath(path)[0])

    yes = tally("A Favor")
    no = tally("En Contra")
    abstain = tally("Abstenido")
    absent = tally("Ausente")

    vote_chamber = self.parse_vote_chamber(chamber, action_text)

    if u"Votación Final" in action_text:
        classification = "passage"
    else:
        classification = "other"

    vote = Vote(
        chamber=vote_chamber,
        start_date=action_date,
        motion_text=action_text,
        result="pass" if (yes > no) else "fail",
        bill=bill,
        classification=classification,
    )
    vote.add_source(url)
    for option, total in (("yes", yes), ("no", no),
                          ("absent", absent), ("abstain", abstain)):
        vote.set_count(option, total)

    # we don't want to add the attached vote PDF as a version,
    # so add it as a document
    # TODO: maybe this should be set as the source?
    self.parse_version(bill, row, is_document=True)

    yield vote
def test_vote_bill_id_dedupe():
    """Votes without an identifier are deduplicated per bill."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    session = jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(id='org-id', name='House',
                                        classification='lower')

    def make_bill(pk, identifier):
        return Bill.objects.create(id=pk, identifier=identifier,
                                   legislative_session=session,
                                   from_organization=house)

    bill = make_bill('bill-1', 'HB 1')
    bill2 = make_bill('bill-2', 'HB 2')

    def make_vote(target):
        return ScrapeVote(legislative_session='1900', start_date='2013',
                          classification='anything', result='passed',
                          motion_text='a vote on something',
                          bill=target.identifier,
                          bill_chamber='lower')

    vote = make_vote(bill)
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    def import_vote(scraped):
        # run a fresh importer and report what it did with the item
        _, what = VoteImporter('jid', dmi, dmi, bi).import_item(scraped.as_dict())
        return what

    assert import_vote(vote) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote, no changes
    assert import_vote(vote) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote.result = 'failed'
    assert import_vote(vote) == 'update'
    assert VoteEvent.objects.count() == 1

    # new vote, insert
    vote = make_vote(bill2)
    assert import_vote(vote) == 'insert'
    assert VoteEvent.objects.count() == 2
def test_full_vote():
    """Import one fully populated vote and check every stored field."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    session = jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    Person.objects.create(id='person-id', name='Adam Smith')
    house = Organization.objects.create(id='org-id', name='House',
                                        classification='lower')
    bill = Bill.objects.create(id='bill-id', identifier='HB 1',
                               legislative_session=session,
                               from_organization=house)
    Organization.objects.create(id='com-id', name='Arbitrary Committee',
                                parent=house)

    vote = ScrapeVote(legislative_session='1900', motion_text='passage',
                      start_date='1900-04-01', classification='passage:bill',
                      result='pass', bill_chamber='lower', bill=bill.identifier)
    vote.set_count('yes', 20)
    vote.yes('John Smith')
    vote.no('Adam Smith')

    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)
    importer = VoteImporter('jid', dmi, dmi, bi)
    importer.import_data([vote.as_dict()])

    assert VoteEvent.objects.count() == 1
    event = VoteEvent.objects.get()
    assert event.legislative_session_id == session.id
    assert event.motion_classification == ['passage:bill']
    assert event.bill_id == bill.id

    # only the 'yes' count was set on the scraped vote
    count = event.counts.get()
    assert count.option == 'yes'
    assert count.value == 20

    votes = list(event.votes.all())
    assert len(votes) == 2
    for cast in event.votes.all():
        expected = 'yes' if cast.voter_name == 'John Smith' else 'no'
        assert cast.option == expected
def scrape_vote(self, bill, name, url):
    """Download a roll-call PDF and yield the ``Vote`` it describes.

    ``name`` must look like ``"<Senate|House> Vote on <something>,<motion>"``;
    anything else yields nothing.  The motion text picks the classification.
    Voter names are the lines listed under the ``YEAS`` / ``NAYS`` /
    ``ABSENT`` headings, and the counts are the number of names collected.
    A vote with no ``Date: m/d/yyyy`` text is skipped with a warning.
    """
    match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    if motion.startswith('FINAL PASSAGE'):
        classification = 'passage'
    elif motion.startswith('AMENDMENT'):
        classification = 'amendment'
    elif 'ON 3RD READING' in motion:
        classification = 'reading:3'
    else:
        classification = 'other'

    fd, temp_path = tempfile.mkstemp()
    try:
        self.urlretrieve(url, temp_path)
        html = self.pdf_to_lxml(temp_path)
    finally:
        # always release the descriptor and the file, even when the
        # download or the PDF conversion fails
        os.close(fd)
        os.remove(temp_path)

    vote_type = None
    body = html.xpath('string(/html/body)')

    date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
    if date_match is None:
        self.warning("BAD VOTE: date error")
        return
    start_date = dt.datetime.strptime(date_match.group(1), '%m/%d/%Y')

    heading_to_option = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}
    d = defaultdict(list)
    for line in body.replace(u'\xa0', '\n').split('\n'):
        # NOTE(review): confirm the width of the replaced whitespace run
        line = line.replace(' ', '').strip()
        # Skip blank lines and "Total --"
        if not line or 'Total --' in line:
            continue

        if line in heading_to_option:
            vote_type = heading_to_option[line]
        elif line in ('Total', '--'):
            # end of the current section
            vote_type = None
        elif vote_type:
            d[vote_type].append(line)

    yes_count = len(d['yes'])
    no_count = len(d['no'])
    other_count = len(d['other'])
    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber=chamber,
                start_date=start_date.strftime('%Y-%m-%d'),
                motion_text=motion,
                result='pass' if passed else 'fail',
                classification=classification,
                bill=bill)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    for key, values in d.items():
        for item in values:
            vote.vote(key, item)
    vote.add_source(url)
    yield vote
def scrape_house_vote(self, bill, url):
    """Scrape a House roll-call PDF and yield the resulting ``Vote``.

    The PDF at ``url`` is downloaded, converted to text and then deleted.
    The text is read line by line: a trailing ``m/d/yyyy`` sets the date,
    the ``YEAS: n  NAYS: n  NOT VOTING: n`` summary line sets the counts,
    motion and outcome, and each ``<SECTION>: n`` heading selects the option
    under which the following names are filed.

    Yields nothing when the download raises ``scrapelib.HTTPError`` or when
    no date line is found; a warning is logged in both cases.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    text = convert_pdf(filename, 'text')
    os.remove(filename)

    lines = text.splitlines()

    section_to_option = {
        'YEAS': 'yes',
        'NAYS': 'no',
        'NOT VOTING': 'other',
        'EXCUSED': 'other',
        'PAIRED': 'paired'
    }
    vote_type = None
    votes = collections.defaultdict(list)
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode('utf-8')

        match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        match = re.match(
            r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)',
            line)
        if match:
            # the motion text sits two lines above the summary line
            motion = (lines[idx - 2].strip()).decode('utf-8')
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [
                int(g) for g in match.groups()
            ]

            # excused members are folded into the "other" count
            exc_match = re.search(r'EXCUSED: (\d+)', line)
            if exc_match:
                other_count += int(exc_match.group(1))

            passed = line.endswith(('ADOPTED', 'PASSED'))
            continue

        match = re.match(
            r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$',
            line)
        if match:
            vote_type = section_to_option[match.group(1)]
            continue

        # NOTE(review): confirm the separator width used to split names
        if vote_type == 'paired':
            for part in line.split(' '):
                part = part.strip()
                if not part:
                    continue
                # parse this entry, not the whole line, so that every pair
                # on the line is recorded instead of the first one repeated
                name, pair_type = re.match(
                    r'([^\(]+)\((YEA|NAY)\)', part).groups()
                name = name.strip()
                if pair_type == 'YEA':
                    votes['yes'].append(name)
                elif pair_type == 'NAY':
                    votes['no'].append(name)
        elif vote_type:
            for name in line.split(' '):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = Vote(chamber='lower',
                    start_date=date.strftime("%Y-%m-%d"),
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    classification='passage',
                    bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        vote.pupa_id = url

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)
        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def scrape_votes(self, bill, url):
    """Parse every roll call on the page at ``url`` and yield a ``Vote`` each.

    A roll call starts at a paragraph matching ``OKLAHOMA HOUSE`` or
    ``OKLAHOMA STATE SENATE``.  Roll calls are deduplicated on their ``RCS#``
    number, and one whose count line carries trailing text is skipped with a
    warning.  When the motion does not end in ``PASSED`` or ``FAILED`` the
    outcome is computed as yes > no + other.

    Scraping stops at the first roll call with no motion text.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

    seen_rcs = set()

    re_ns = "http://exslt.org/regular-expressions"
    # raw strings: \s and \d must reach the regex engines untouched
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    count_re = re.compile(
        r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL '
        r'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')

    for header in page.xpath(path, namespaces={'re': re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            passed = None

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break

            match = count_re.match(line)
            if match:
                if match.group(1) == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif match.group(1) == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue  # skip these
                elif seen_yes:
                    vtype = 'other'
                if seen_yes and match.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # NOTE(review): confirm the separator width used to split names
                for name in line.split(' '):
                    if not name:
                        continue
                    if 'HOUSE' in name or 'SENATE ' in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(chamber=chamber,
                    start_date=date.strftime('%Y-%m-%d'),
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    bill=bill,
                    classification='passage')
        vote.set_count('yes', counts['yes'])
        vote.set_count('no', counts['no'])
        vote.set_count('other', counts['other'])
        # the RCS number keeps several votes from one page distinct
        vote.pupa_id = url + '#' + rcs

        vote.add_source(url)

        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            # a colon means a count line was misread as a voter name
            if ':' in name:
                raise Exception(name)
            vote.no(name)
        for name in votes['other']:
            vote.vote('other', name)

        yield vote
def scrape_bill(self, bill_id):
    """Convert one bill fetched from the legacy API into Bill/Vote objects.

    The API record is consumed destructively with ``pop``; the trailing
    asserts fail when any field of the bill or of one of its votes was not
    handled.

    Args:
        bill_id: Identifier appended to ``'bills/'`` for the API request.

    Yields:
        Each converted ``Vote`` first, then the converted ``Bill``. Nothing
        is yielded when the first classification is 'miscellaneous', 'jres'
        or 'cres'.

    Raises:
        KeyError: If a required field is missing from the record.
        AssertionError: If unhandled fields remain on a vote or on the bill.
    """
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness
    for flag in ('fiscal committee', 'urgency', 'local program', 'tax levy'):
        if flag in classification:
            classification.remove(flag)

    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if chamber == 'upper' and self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        actor = act['actor']
        lowered = actor.lower()
        if lowered in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif lowered == 'house' or (lowered.startswith('lower (') and
                                    self.state == 'ca'):
            actor = 'lower'
        # 'upper' previously carried a stray backtick and could never match.
        elif lowered in ('senate', 'upper') or (lowered.startswith('upper (') and
                                                self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst', 'Became Law w',
                       'conference') or (lowered.startswith('legislature (') and
                                         self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor == 'upper' and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(
                act['action'], act['date'][:10], chamber=actor,
                classification=[action_types[c] for c in act['type']
                                if c != 'other'])
            # Loop variable renamed so it no longer shadows the ``re`` module.
            for entity in act.get('related_entities', []):
                if entity['type'] == 'committee':
                    entity['type'] = 'organization'
                elif entity['type'] == 'legislator':
                    entity['type'] = 'person'
                newact.add_related_entity(entity['name'], entity['type'])

    for comp in old.pop('companions', []):
        # NOTE(review): ``rtype`` is only bound for nj/ny/mn; a companion on
        # any other state fails on the call below. Confirm that is intended.
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
        new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
        new.add_identifier(abid)

    # generic OpenStates stuff
    for openstates_id in old.pop('all_ids'):
        new.add_identifier(openstates_id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number',
                 '+referral', '+companion', '+description', '+fiscal_note_probable:',
                 '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                 '+requester', '+transmittal_date:', '+by_request_of',
                 '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num',
                 '+fiscal_note', '+impact_clause', '+fiscal_notes', '+short_title',
                 '+type_', '+conference_committee', 'conference_committee',
                 '+companion_bill_ids']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        # Required fields: a missing one raises KeyError.
        for key in ('id', 'state', 'bill_id'):
            vote.pop(key)
        # Optional fields that are dropped without being carried over.
        for key in ('bill_chamber', '+state', '+country', '+level', '+vacant',
                    '+not_voting', '+amended', '+excused', '+NV', '+AB', '+P',
                    '+V', '+E', '+EXC', '+EMER', '+present', '+absent',
                    '+seconded', '+moved', '+vote_type', '+actual_vote',
                    '+skip_votes'):
            vote.pop(key, None)
        vote.pop('vote_id')
        for key in ('+bill_chamber', '+session', '+bill_id', '+bill_session',
                    'committee', 'committee_id'):
            vote.pop(key, None)

        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = Vote(legislative_session=vote.pop('session'),
                       motion_text=vote.pop('motion'),
                       result='pass' if vote.pop('passed') else 'fail',
                       chamber=chamber,
                       start_date=vote.pop('date'),
                       classification=vtype,
                       bill=new,
                       identifier=identifier)
        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])

        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)

        # Fall back to the bill's sources when the vote has none of its own.
        if not newvote.sources:
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_', '+threshold',
                     '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        # Anything left over is a field this converter does not know about.
        assert not vote, vote.keys()
        yield newvote

    assert not old, old.keys()

    yield new
def scrape(self):
    """Yield one Vote per dated motion found on the paged vote listing.

    Every "ContainerPanel" div is treated as one subject; its dates, motions
    and vote blocks are paired up positionally. Motions that are blank after
    stripping are skipped with a warning.
    """
    for page in self.iterpages():
        for panel in page.xpath('//div[@class="ContainerPanel"]'):
            date_texts = panel.xpath(".//font[@color='#276598']/b/text()")
            motion_texts = [
                raw.strip()
                for raw in panel.xpath(
                    ".//div[@style='width:260px; float:left;']/text()"
                )
            ]
            vote_blocks = panel.xpath(".//div[@style='width:150px; float:right;']")
            header_texts = panel.xpath(".//div[@class='HeaderContent']/b/text()")
            # First header fragment mentioning a docket, if there is one.
            docket = next(
                (text for text in header_texts if "docket" in text.lower()), None
            )

            for date_text, motion, block in zip(date_texts, motion_texts, vote_blocks):
                when = dt.datetime.strptime(date_text, "%m/%d/%Y")
                motion = motion.strip()
                if motion == "":
                    self.warning("Skipping vote.")
                    continue

                v = Vote(
                    session=self.session,
                    organization="Boston City Council",
                    type="other",
                    passed=False,
                    date=when.strftime("%Y-%m-%d"),
                    motion=motion,
                    yes_count=0,
                    no_count=0,
                )
                if docket:
                    v.set_bill(docket)

                tally = {"yes": 0, "no": 0, "other": 0}
                # Child divs come in groups of three: voter, choice, unused.
                cells = iter(block.xpath("./div"))
                for voter_cell, choice_cell, _ in zip(cells, cells, cells):
                    choice = choice_cell.text
                    voter = voter_cell.text
                    if choice == "Y":
                        v.yes(voter)
                        tally["yes"] += 1
                    elif choice == "N":
                        v.no(voter)
                        tally["no"] += 1
                    else:
                        v.other(voter)
                        tally["other"] += 1

                # Overwrite the placeholder counts with what was tallied.
                for entry in v.vote_counts:
                    entry["count"] = tally[entry["vote_type"]]

                v.add_source(DURL, note="root")
                yield v
def scrape_votes(self, session, zip_url):
    """Build Vote objects from the roll-call tables inside the session zip.

    ``tblrollcallsummary.txt`` supplies one row per roll call (tallies and
    motion); ``tblrollcallhistory.txt`` supplies one row per legislator per
    roll call and is used to attach individual votes.

    Args:
        session: Session year string; rows for other years are ignored.
        zip_url: Address of the zip archive, recorded as each vote's source.

    Yields:
        Every ``Vote`` created from the summary table.
    """
    votes = {}
    # Running number of non-yea/nay votes per roll call, keyed like ``votes``.
    # (A single counter reset on every row would never exceed 1.)
    other_counts = {}
    last_line = []

    for line in self.zf.open("tblrollcallsummary.txt"):
        if line.strip() == "":
            continue

        line = line.split("|")
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # NOTE(review): this keeps the previous fragment rather than
                # joining it with the current one (last_line + line[1:]);
                # confirm that is intended.
                line = last_line
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    for line in self.zf.open("tblrollcallhistory.txt"):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date = line.split("|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]["name"]
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            vote_key = body + v_num
            if vote_key not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % vote_key)
                continue

            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[vote_key].yes(leg)
            elif vote == "Nay":
                votes[vote_key].no(leg)
            else:
                votes[vote_key].other(leg)
                # Keep the stored 'other' count in step with the names added.
                other_counts[vote_key] = other_counts.get(vote_key, 0) + 1
                votes[vote_key].set_count("other", other_counts[vote_key])

    for vote in votes.values():
        yield vote
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch an HTML vote page and yield the Vote it describes.

    Voice votes and pages whose description is a bare banner or "-" produce
    nothing; committee pages are delegated to ``scrape_committee_vote``.

    Args:
        bill: Bill the vote belongs to.
        actor: Chamber value passed to ``Vote``.
        date: Start date passed to ``Vote``.
        motion: Motion text passed to ``Vote``.
        url: Address of the vote page; also recorded as the source.
        uniqid: Identifier for the vote (stringified).

    Raises:
        NotImplementedError: If the description states neither pass nor fail
            and is not one of the recognised skip cases.
    """
    try:
        html = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".format(
            bill.identifier))
        return

    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    descr = page.xpath("//b")[0].text_content()
    if descr == '':
        # New page method
        descr = page.xpath("//center")[0].text

    if "on voice vote" in descr:
        return

    if "committee" in descr.lower():
        yield from self.scrape_committee_vote(bill, actor, date, motion,
                                              page, url, uniqid)
        return

    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr or descr.strip() == '-':
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    # Bold headings after the first one pair up with the tables, in order.
    tallies = {}
    for heading, table in zip(page.xpath("//b")[1:], page.xpath("//table")):
        label, dash, number = heading.text_content().partition("-")
        if not dash:
            # Heading is not of the form "<label> - <count>".
            continue
        voters = [
            cell.text_content().strip()
            for cell in table.xpath(".//font[@face='Arial']")
        ]
        tallies[label.strip()] = {"count": int(number.strip()), "people": voters}

    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                bill=bill,
                classification='passage',
                identifier=str(uniqid))
    vote.set_count('yes', tallies['Yeas']['count'])
    vote.set_count('no', tallies['Nays']['count'])
    vote.set_count('other', tallies['Absent or not voting']['count'])
    vote.add_source(url)

    for voter in tallies['Yeas']['people']:
        vote.yes(voter)
    for voter in tallies['Nays']['people']:
        vote.no(voter)
    for voter in tallies['Absent or not voting']['people']:
        vote.vote('other', voter)

    yield vote
def scrape_votes(self, bill, url):
    """Yield a Vote for each distinct roll call listed on a bill's vote page.

    Args:
        bill: Bill the votes are attached to.
        url: Address of the vote page; recorded as the source and used,
            together with the RCS number, to form ``pupa_id``.

    Raises:
        Exception: If a name collected under "no" contains a colon.
    """
    html = self.get(url).text.replace(u"\xa0", " ")
    page = lxml.html.fromstring(html)

    handled_rcs = set()
    namespaces = {"re": "http://exslt.org/regular-expressions"}
    header_path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"

    for header in page.xpath(header_path, namespaces=namespaces):
        bad_vote = False

        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber, motion_index = "lower", 8
        else:
            chamber, motion_index = "upper", 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index
        ).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return

        passed = None
        outcome = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if outcome:
            motion = outcome.group(1)
            passed = outcome.group(2) == "PASSED"

        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_text = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_text).group(1)
        if rcs in handled_rcs:
            continue
        handled_rcs.add(rcs)

        date_text = rcs_p.getnext().xpath("string()")
        date_match = re.search(r"\d+/\d+/\d+", date_text).group(0)
        date = datetime.datetime.strptime(date_match, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                break

            regex = (
                r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
            )
            tally = re.match(regex, line)
            if not tally:
                if seen_yes:
                    # A non-tally line after YEAS lists voter names.
                    for name in line.split(" "):
                        if not name:
                            continue
                        if "HOUSE" in name or "SENATE " in name:
                            continue
                        votes[vtype].append(name.strip())
                continue

            label = tally.group(1)
            if label == "YEAS" and "RCS#" not in line:
                vtype = "yes"
                seen_yes = True
            elif label == "NAYS" and seen_yes:
                vtype = "no"
            elif label == "VACANT":
                continue  # skip these
            elif seen_yes:
                vtype = "other"

            if seen_yes and tally.group(3).strip():
                self.warning("Bad vote format, skipping.")
                bad_vote = True
            counts[vtype] += int(tally.group(2))

        if bad_vote:
            continue

        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        for option in ("yes", "no", "other"):
            vote.set_count(option, counts[option])
        vote.pupa_id = url + "#" + rcs
        vote.add_source(url)

        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)

        yield vote
def scrape(self):
    """Walk every listing page and yield a Vote for each non-blank motion.

    Dates, motions and vote blocks inside a "ContainerPanel" div are matched
    by position; a docket reference, when present in the panel header, is
    attached to each vote through ``set_bill``.
    """
    for page in self.iterpages():
        for subject in page.xpath('//div[@class="ContainerPanel"]'):
            dates = subject.xpath(".//font[@color='#276598']/b/text()")
            motion_nodes = subject.xpath(
                ".//div[@style='width:260px; float:left;']/text()")
            motions = [node.strip() for node in motion_nodes]
            vote_divs = subject.xpath(
                ".//div[@style='width:150px; float:right;']")

            headers = subject.xpath(".//div[@class='HeaderContent']/b/text()")
            docket_hits = [h for h in headers if "docket" in h.lower()]
            docket = docket_hits[0] if docket_hits else None

            for date, motion, vote_div in zip(dates, motions, vote_divs):
                when = dt.datetime.strptime(date, "%m/%d/%Y")
                motion = motion.strip()

                if motion == "":
                    self.warning("Skipping vote.")
                    continue

                v = Vote(session=self.session,
                         organization="Boston City Council",
                         type='other',
                         passed=False,
                         date=when.strftime("%Y-%m-%d"),
                         motion=motion,
                         yes_count=0,
                         no_count=0)

                if docket:
                    v.set_bill(docket)

                yes = no = other = 0

                # Consume the child divs three at a time.
                div_iter = iter(vote_div.xpath("./div"))
                for name_div, choice_div, _ in zip(div_iter, div_iter, div_iter):
                    how = choice_div.text
                    who = name_div.text

                    if how == 'Y':
                        v.yes(who)
                        yes += 1
                    elif how == 'N':
                        v.no(who)
                        no += 1
                    else:
                        v.other(who)
                        other += 1

                totals = {"yes": yes, "no": no, "other": other}
                for count in v.vote_counts:
                    count['count'] = totals[count['vote_type']]

                v.add_source(DURL, note='root')
                yield v
def scrape_votes(self, session):
    """Download the roll-call summary and history files and yield Votes.

    The summary file provides one row per roll call; the history file
    provides one row per legislator per roll call.

    Args:
        session: Session year string; rows for other years are ignored.

    Yields:
        Every ``Vote`` built from the summary file, with individual votes
        attached from the history file.
    """
    votes = {}
    other_counts = defaultdict(int)
    last_line = []
    vote_url = 'http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt'
    # 'utf-8-sig' drops a leading byte-order mark. With plain 'utf-8' the
    # mark survives as '\ufeff' on the first field, so the first row never
    # compares equal to ``session``.
    lines = self.get(vote_url).content.decode('utf-8-sig').splitlines()

    for line in lines:
        if len(line) < 2:
            continue

        if line.strip() == "":
            continue

        line = line.split('|')
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # NOTE(review): this keeps the previous fragment rather than
                # joining it with the current one; confirm that is intended.
                line = last_line
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        # Legacy replace kept for input where the mark shows up as these
        # three characters.
        session_yr = line[0].replace('\xef\xbb\xbf', '')
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp,
                                        '%m/%d/%Y %I:%M:%S %p')
            time = pytz.timezone('America/New_York').localize(
                time).isoformat()
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(chamber=actor,
                        start_date=time,
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=self.bills_by_id[bill_id])
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.add_source(vote_url)
            vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
            votes[body + vote_num] = vote

    history_url = 'http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt'
    for line in self.get(history_url).content.decode('utf-8-sig').splitlines():
        if len(line) < 2:
            continue

        # 2016|H|2|330795||Yea|
        # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = \
            line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                # Collapse runs of whitespace inside the stored name.
                leg = " ".join(self.legislators[employee]['name'].split())
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue

            # code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body + v_num].yes(leg)
            elif vote == 'Nay':
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote('other', leg)
                # hack-ish, but will keep the vote count sync'd
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count('other', other_counts[body + v_num])

    for vote in votes.values():
        yield vote
def scrape_chamber(self, chamber, session):
    """Scrape roll-call votes from one chamber's journal PDFs for a session.

    Each PDF linked from the journal index is converted to text and read
    line by line with a small state machine: a "ROLL CALL" header starts the
    motion text, a line ending in "VOTING" switches to collecting voters, and
    a line containing an outcome word closes the vote.

    Args:
        chamber: 'lower' selects the house journal; any other value selects
            the senate journal. Also passed to ``Vote`` as the chamber.
        session: Session key; must be one of the keys of ``session_slug``.

    Yields:
        One ``Vote`` per bill referenced by each completed roll call.

    Raises:
        KeyError: If ``session`` has no slug.
        AssertionError: If motion and vote capture are active at once, or if
            tally labels appear inside the motion text.
    """
    chamber_name = 'house' if chamber == 'lower' else 'senate'
    session_slug = {
        '62': '62-2011',
        '63': '63-2013',
        '64': '64-2015',
        '65': '65-2017',
    }[session]

    # Words whose presence marks the closing (decision) line of a roll call.
    outcome_words = ('passed', 'adopted', 'sustained', 'prevailed',
                     'lost', 'failed')
    bill_re = re.compile(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)")

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug, chamber_name)
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:

        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib['href']

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type='text').decode('utf-8')
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:

            # Ignore lines with no information
            if re.search(chamber_re, line) or \
                    re.search(date_re, line) or \
                    re.search(page_re, line) or \
                    line.strip() == "":
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion)

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if line.strip().endswith(("VOTING", "VOTING.")):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                mentions_outcome = any(
                    word in line.lower() for word in outcome_words)

                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" \
                        in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [
                        x.strip() for x in who.split(';')
                        if x.strip() != ""
                    ]
                    results[cur_vote] = who

                    # A line not ending in ";" may break a name in two.
                    name_may_be_continued = not line.endswith(";")

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif cur_vote is not None and \
                        bill_re.findall(line) and \
                        not mentions_outcome:
                    bills.extend(bill_re.findall(line))

                elif cur_vote is not None and not mentions_outcome:
                    who = [
                        x.strip() for x in line.split(";")
                        if x.strip() != ""
                    ]

                    if name_may_be_continued:
                        results[cur_vote][-1] = results[cur_vote][-1] + \
                            " " + who.pop(0)

                    name_may_be_continued = not line.endswith(";")

                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif mentions_outcome:

                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if improper information found
                    bills.extend(bill_re.findall(line))

                    if bills == [] or cur_motion.strip() == "":
                        # Log before clearing, so the message still carries
                        # the motion text that was collected.
                        self.warning("No motion or bill name found: " +
                                     "motion name: " + cur_motion + "; " +
                                     "decision text: " + line.strip())
                        results = {}
                        cur_motion = ""
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " +
                            cur_motion)

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other"
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = results[key]
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = \
                        len(res['yes']), \
                        len(res['no']), \
                        len(res['other'])
                    chambers = {
                        "H": "lower",
                        "S": "upper",
                        "J": "legislature"
                    }

                    # Almost all of the time, a vote only applies to one
                    # bill and this loop will only be run once.
                    # Some exceptions exist.

                    for bill in bills:

                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        try:
                            bc = chambers[cur_bill_id[0]]
                        except KeyError:
                            bc = 'other'

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            VETO_SUPERMAJORITY = 2 / 3
                            passed = (yes / (yes + no) > VETO_SUPERMAJORITY)
                        else:
                            passed = (yes > no)

                        # Create a Vote object based on the scraped information
                        vote = Vote(
                            chamber=chamber,
                            start_date=cur_date.strftime('%Y-%m-%d'),
                            motion_text=cur_motion,
                            result='pass' if passed else 'fail',
                            legislative_session=session,
                            classification='passage',
                            bill=cur_bill_id,
                            bill_chamber=bc)

                        vote.add_source(pdf_url)
                        vote.add_source(url)
                        vote.set_count('yes', yes)
                        vote.set_count('no', no)
                        vote.set_count('other', other)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            for voter in res[key]:
                                vote.vote(key, voter)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys.keys():
                            # Need to search for the singular, not plural,
                            # in the text so it can find, for example,
                            # " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(
                                category_name[:-1])
                            motion_count = int(
                                re.findall(vote_re, cur_motion)[0])

                            for item in vote.counts:
                                if item['option'] == keys[category_name]:
                                    vote_count = item['value']

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(
                                        motion_count) +
                                    "differed from roll call counts ({}) ".
                                    format(vote_count) +
                                    "for {0} on {1}".format(
                                        category_name, cur_bill_id))

                                # NOTE(review): this rebinds the local name
                                # only; the count stored on the Vote is not
                                # changed. Confirm whether item['value'] was
                                # meant to be updated.
                                for item in vote.counts:
                                    if item['option'] == keys[
                                            category_name]:
                                        vote_count = motion_count

                        yield vote

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def scrape_votes(self, session, zip_url):
    """Build Vote objects from the roll-call tables inside the session zip.

    ``tblrollcallsummary.txt`` holds one row per roll call with its tallies
    and motion; ``tblrollcallhistory.txt`` holds one row per legislator per
    roll call and is used to attach individual votes.

    Args:
        session: Session year string; rows for other years are ignored.
        zip_url: Address of the zip archive, recorded as each vote's source.

    Yields:
        Every ``Vote`` created from the summary table.
    """
    votes = {}
    # Number of non-yea/nay votes seen so far for each roll call. The former
    # single counter was reset on every row, so it never exceeded 1.
    other_counts = {}
    last_line = []

    for line in self.zf.open('tblrollcallsummary.txt'):
        if line.strip() == "":
            continue

        line = line.split('|')
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # NOTE(review): this keeps the previous fragment rather than
                # joining it with the current one (last_line + line[1:]);
                # confirm that is intended.
                line = last_line
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp,
                                        '%m/%d/%Y %I:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(chamber=actor,
                        start_date=time.strftime("%Y-%m-%d"),
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=self.bills_by_id[bill_id])
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    for line in self.zf.open('tblrollcallhistory.txt'):
        # 2012    | H   | 2    | 330795  | HB309  | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date \
            = line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]['name']
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            roll_call = body + v_num
            if roll_call not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % roll_call)
                continue

            # code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[roll_call].yes(leg)
            elif vote == 'Nay':
                votes[roll_call].no(leg)
            else:
                votes[roll_call].other(leg)
                # Keep the stored 'other' count in step with the names added.
                other_counts[roll_call] = other_counts.get(roll_call, 0) + 1
                votes[roll_call].set_count('other', other_counts[roll_call])

    for vote in votes.values():
        yield vote
def scrape_vote(self, bill, name, url):
    """Scrape one roll-call page and yield the resulting Vote.

    Args:
        bill: Bill the vote belongs to; its ``legislative_session`` supplies
            the year for the vote date.
        name: Motion text for the vote.
        url: Address of the vote page; a "VOTE/h" fragment selects the
            lower-chamber table layout, anything else the upper-chamber one.

    Yields:
        A single ``Vote``; nothing for pages containing "BUDGET ADDRESS".
    """
    # Table layout per chamber: starting columns of each member group and
    # the offsets of the name / yea / nay cells relative to them.
    if "VOTE/h" in url:
        vote_chamber = "lower"
        cols, name_offset, yes_offset, no_offset = (1, 5, 9, 13), 3, 0, 1
    else:
        vote_chamber = "upper"
        cols, name_offset, yes_offset, no_offset = (1, 6), 4, 1, 2

    html = self.get(url, verify=False).text
    if "BUDGET ADDRESS" in html:
        return

    page = lxml.html.fromstring(html)

    def span_text(marker):
        # String value of the first span containing ``marker``.
        return page.xpath("string(//span[contains(., '%s')])" % marker)

    def span_number(marker):
        # First run of digits in that span.
        return int(re.match(r"[^\d]*(\d+)[^\d]*", span_text(marker)).group(1))

    yes_count = span_number("Those voting Yea")
    no_count = span_number("Those voting Nay")
    other_count = span_number("Those absent")
    need_count = span_number("Necessary for")

    taken = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", span_text("Taken on")).group(1)
    taken = taken.replace(" ", "")
    date = datetime.datetime.strptime(
        taken + " " + bill.legislative_session, "%m/%d %Y"
    ).date()

    # not sure about classification.
    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=name,
        result="pass" if yes_count > need_count else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for start in cols:
            member = row.xpath("string(td[%d])" % (start + name_offset)).strip()
            if not member or member == "VACANT":
                continue
            member = string.capwords(member)

            if "Y" in row.xpath("string(td[%d])" % (start + yes_offset)):
                vote.yes(member)
            elif "N" in row.xpath("string(td[%d])" % (start + no_offset)):
                vote.no(member)
            else:
                vote.vote("other", member)

    yield vote
def toy_vote():
    """Return a small sample Vote with one attached source."""
    fields = {
        "session": "2009",
        "motion_text": "passage of the bill",
        "start_date": "2009-01-07",
        "result": 'pass',
        "classification": 'passage:bill',
    }
    sample = Vote(**fields)
    sample.add_source("http://uri.example.com/", note="foo")
    return sample
def scrape_vote(self, bill, name, url):
    """Scrape one roll-call page and yield the resulting Vote.

    Args:
        bill: Bill the vote belongs to; its ``legislative_session`` supplies
            the year for the vote date.
        name: Motion text for the vote.
        url: Address of the vote page; a "VOTE/h" fragment selects the
            lower-chamber table layout, anything else the upper-chamber one.

    Yields:
        A single ``Vote``; nothing for pages containing 'BUDGET ADDRESS'.
    """
    is_lower = "VOTE/h" in url
    vote_chamber = 'lower' if is_lower else 'upper'
    # Starting column of each member group in a table row.
    cols = (1, 5, 9, 13) if is_lower else (1, 6)
    # Offsets of the name / yea / nay cells relative to a group's start.
    name_offset = 3 if is_lower else 4
    yes_offset = 0 if is_lower else 1
    no_offset = 1 if is_lower else 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    raw = requests.get(url, verify=False).text

    if 'BUDGET ADDRESS' in raw:
        return

    page = lxml.html.fromstring(raw)

    def first_number(span_xpath):
        # Pull the first run of digits out of the span's string value.
        text = page.xpath(span_xpath)
        return int(re.match(r'[^\d]*(\d+)[^\d]*', text).group(1))

    yes_count = first_number(
        "string(//span[contains(., 'Those voting Yea')])")
    no_count = first_number(
        "string(//span[contains(., 'Those voting Nay')])")
    other_count = first_number(
        "string(//span[contains(., 'Those absent')])")
    need_count = first_number(
        "string(//span[contains(., 'Necessary for')])")

    date_text = page.xpath("string(//span[contains(., 'Taken on')])")
    month_day = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date_text).group(1)
    month_day = month_day.replace(' ', '')
    date = datetime.datetime.strptime(
        month_day + " " + bill.legislative_session, "%m/%d %Y").date()

    # not sure about classification.
    result = 'pass' if yes_count > need_count else 'fail'
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                result=result,
                classification='passage',
                bill=bill)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for col in cols:
            voter = row.xpath("string(td[%d])" % (col + name_offset)).strip()

            if not voter or voter == 'VACANT':
                continue

            if "Y" in row.xpath("string(td[%d])" % (col + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" % (col + no_offset)):
                vote.no(voter)
            else:
                vote.vote('other', voter)

    yield vote
def get_bills(self):
    """Yield sample ``Vote`` and ``Bill`` objects from hard-coded data.

    For every bill in the fixture list below, each of its votes is yielded
    first, followed by the bill itself with its sponsorships, version
    links and actions attached.

    Yields:
        ``Vote`` and ``Bill`` objects, in the order described above.
    """
    bills = [
        {"name": "HB500",
         "title": "Makes various changes to provisions governing "
                  "employment practices",
         "session": "2011",
         "versions": ["http://example.com/HB500.pdf"],
         "actions": [
             {"description": "Introduced",
              "actor": "Committee on Pudding Pops",
              "date": "2014-04-15"},
             {"date": "2014-04-15",
              "description": "Read first time. Referred to Committee on "
                             "Commerce and Labor. To printer.",
              "actor": "Test City Council"},
             {"date": "2014-04-15",
              "description": "From printer. To committee.",
              "actor": "Test City Council"},
             {"date": "2014-04-15",
              "description": "From committee: Do pass.",
              "actor": "Rules"},
             {"description": "Signed into law",
              "actor": "Fiscal Committee",
              "date": "2014-04-19"},
         ],
         "sponsors_people": [],
         "sponsors_committee": [],
         "votes": [
             {"motion": "Vote by the Committee on the Whole.",
              "yes_count": 1,
              "other_count": 1,
              "no_count": 3,
              "passed": True,
              "type": "passage:bill",
              "date": "2014-04-15",
              "session": "2011",
              "roll": {
                  "yes": [
                      "Eliana Meyer",
                  ],
                  "no": [
                      "Gunnar Luna",
                      "Regina Cruz",
                      "Makenzie Keller",
                  ],
                  "other": [
                      "Unknown Person",
                  ],
              }},
         ]},
        {"name": "HB101",
         "title": "Joint county ditch proceedings-conduct by "
                  "teleconference or video conference",
         "session": "2011",
         "versions": ["http://example.com/HB101.pdf"],
         "actions": [
             {"description": "Introduced",
              "actor": "council",
              "date": "2014-04-15"},
             {"description": "Referred to the Committee on Pudding Pops",
              "actor": "council",
              "date": "2014-04-16"},
             {"description": "Reported favorably",
              "actor": "council",
              "date": "2014-04-16"},
             {"description": "Referred to the Bills in the Third Read",
              "actor": "council",
              "date": "2014-04-17"},
             # Kept as one literal: a plain quoted string cannot span
             # physical lines.
             {"description": "Vote by the Committee on the Whole. Do pass.",
              "actor": "council",
              "date": "2014-04-18"},
             {"description": "Signed into law",
              "actor": "council",
              "date": "2014-04-19"},
         ],
         "sponsors_people": [
             "Shayla Fritz",
             "Gunnar Luna",
         ],
         "sponsors_committee": [
             "Standing Committee on Public Safety",
         ],
         "votes": [
             {"motion": "Vote by the Committee on the Whole.",
              "yes_count": 3,
              "no_count": 1,
              "passed": True,
              "type": "passage:bill",
              "date": "2014-04-18",
              "session": "2011",
              "roll": {
                  "yes": [
                      "Gunnar Luna",
                      "Regina Cruz",
                      "Makenzie Keller",
                  ],
                  "no": [
                      "Eliana Meyer",
                  ],
                  "other": [],
              }},
         ]},
    ]

    for bill in bills:
        b = Bill(identifier=bill['name'],
                 title=bill['title'],
                 legislative_session=bill['session'])
        b.add_source("ftp://example.com/some/bill")

        for vote in bill['votes']:
            # NOTE: only the yes/no counts and roll entries are forwarded;
            # 'other_count' and roll['other'] in the fixture are not used.
            v = Vote(motion_text=vote['motion'],
                     organization_id=make_psuedo_id(
                         name="Test City Council",
                         classification="legislature"
                     ),
                     yes_count=vote['yes_count'],
                     no_count=vote['no_count'],
                     result='pass' if vote['passed'] else 'fail',
                     classification=vote['type'],
                     start_date=vote['date'],
                     legislative_session=vote['session'],
                     )
            v.add_source("http://example.com/votes/vote.xls")

            for yv in vote['roll']['yes']:
                v.yes(yv)

            for nv in vote['roll']['no']:
                v.no(nv)

            yield v

        for sponsor in bill['sponsors_people']:
            b.add_sponsorship(name=sponsor, classification='primary',
                              entity_type='person', primary=True)

        for sponsor in bill['sponsors_committee']:
            b.add_sponsorship(name=sponsor, classification='primary',
                              entity_type='organization', primary=True)

        for version in bill['versions']:
            b.add_version_link(note="Bill Version", url=version)

        for action in bill['actions']:
            # Work on a copy so the fixture entry itself is left untouched
            # while 'actor' is swapped for an 'organization' pseudo id.
            details = dict(action)
            details['organization'] = make_psuedo_id(
                name=details.pop('actor'))
            b.add_action(**details)

        yield b