def scrape_senate_vote(self, bill, url, date):
    """Download a senate roll-call PDF and yield the vote event(s) found in it.

    The PDF at ``url`` is fetched, converted to text and then deleted from
    disk. When the text carries a ``Yea: N  Nay: N  Absent: N`` summary line
    the work is delegated to ``self.scrape_senate_vote_3col``. Otherwise the
    text is split on its ``Yea``/``Nay``/``Absent`` headings and every name
    listed under a heading is recorded against that option.

    Args:
        bill: bill object the vote is attached to.
        url: location of the roll-call PDF; also used as source and pupa_id.
        date: object with ``strftime`` giving the day of the vote.

    Yields:
        The populated vote event. Nothing is yielded (a warning is logged)
        when the PDF cannot be retrieved.
    """
    try:
        filename, _ = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now; the real result is computed from the tallies below.
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url

    text = convert_pdf(filename, "text").decode("utf-8")
    os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Reverse the split pieces so that pop() walks them in document order.
    # This must be a real list: in Python 3 ``filter`` returns an iterator,
    # which is always truthy and has no ``pop`` method.
    data = [piece for piece in re.split(r"(Yea|Nay|Absent)s?:", text)[::-1] if piece]

    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}

    # NOTE(review): any non-empty text preceding the first heading is popped
    # here as if it were a heading (and filed under "other") — confirm the
    # two-column PDFs never start with such text.
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), "other")
        values = data.pop()
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            # strip stray digits, dashes and whitespace around the name
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        "pass"
        if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
        else "fail"
    )
    yield vote
def toy_vote_event():
    """Build a small sample ``VoteEvent`` with one annotated source attached."""
    fields = {
        "legislative_session": "2009",
        "motion_text": "passage of the bill",
        "start_date": "2009-01-07",
        "result": "pass",
        "classification": "passage",
    }
    sample = VoteEvent(**fields)
    sample.add_source("http://uri.example.com/", note="foo")
    return sample
def add_vote(self, bill, chamber, date, text, url):
    """Create a vote event from an action text that embeds its own tallies.

    The yes/no totals are read from the first ``Ayes N; Noes N`` (or
    ``Nays``) pair in ``text``, and the classification is taken from the
    first ``motion_classifiers`` pattern matching the start of ``text``.
    When ``url`` is truthy it is added as a source and the individual votes
    are loaded through ``add_house_votes`` / ``add_senate_votes`` depending
    on whether the URL contains ``"av"`` or ``"sv"``.

    Args:
        bill: bill the vote belongs to.
        chamber: chamber passed straight to the vote event.
        date: naive datetime, localized with ``TIMEZONE``.
        text: action text holding the motion and the tallies.
        url: roll-call URL, or a falsy value when there is none.

    Returns:
        The populated vote event.

    Raises:
        IndexError: if ``text`` contains no recognisable tally.
    """
    votes = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)", text)
    yes, no = int(votes[0][0]), int(votes[0][1])

    vtype = []
    # ``classification`` rather than ``type`` so the builtin is not shadowed
    for regex, classification in motion_classifiers.items():
        if re.match(regex, text):
            vtype = classification
            break

    v = VoteEvent(
        chamber=chamber,
        start_date=TIMEZONE.localize(date),
        motion_text=text,
        result="pass" if yes > no else "fail",
        classification=vtype,
        bill=bill,
    )
    # The code below already allows for a missing URL, so do not crash on
    # ``None`` here; string URLs (including "") behave exactly as before.
    if url is not None:
        v.dedupe_key = url.split("/")[-1]
    v.set_count("yes", yes)
    v.set_count("no", no)

    # fetch the vote itself
    if url:
        v.add_source(url)
        if "av" in url:
            self.add_house_votes(v, url)
        elif "sv" in url:
            self.add_senate_votes(v, url)

    return v
def parse_vote_page(self, vote_url, bill):
    """Fetch an HTML roll-call page and return it as a ``VoteEvent``.

    The chamber comes from the URL, the date and totals from table cells on
    the page, and voter names from the cells that follow each
    "Voting Yea" / "Voting Nay" / "Not Voting" / "Excused" heading.
    """
    nbsp = u"\xa0"
    doc = lxml.html.fromstring(self.get(vote_url).text)

    # chamber
    chamber = "upper" if "senate" in vote_url else "lower"

    # date in the following format: Mar 23, 2009 (after an 18-character prefix)
    date_text = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
    date_text = date_text.replace(nbsp, " ")
    date = datetime.datetime.strptime(date_text[18:], "%b %d, %Y")

    # motion
    motion = "".join(cell.text_content() for cell in doc.xpath('//td[@colspan="23"]'))
    if motion == "":
        motion = "No motion given"  # XXX: Double check this. See SJ 3.
    motion = motion.replace(nbsp, " ")

    # totals: yes, no, then three categories that are all folded into "other"
    tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
    totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
    yes_count, no_count, *other_parts = [
        int(totals[position].split()[-1]) for position in range(5)
    ]
    other_count = sum(other_parts)

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if yes_count > no_count else "fail",
    )
    vote.pupa_id = vote_url  # contains sequence number
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    # A heading cell selects the recorder; every later cell is a voter name
    # until the next heading appears.
    record = None
    for cell_text in doc.xpath("//td/text()"):
        cell_text = cell_text.replace(nbsp, " ")
        if cell_text.startswith("Voting Yea"):
            record = vote.yes
        elif cell_text.startswith("Voting Nay"):
            record = vote.no
        elif cell_text.startswith("Not Voting"):
            record = vote.other
        elif cell_text.startswith("Excused"):
            record = vote.other
        elif record:
            record(cell_text.rstrip("*"))

    return vote
def test_vote_event_pupa_identifier_dedupe():
    """Vote events sharing a ``pupa_id`` are updated in place, not duplicated."""
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name="1900", identifier="1900")
    Organization.objects.create(
        id="org-id",
        name="Legislature",
        classification="legislature",
        jurisdiction=jurisdiction,
    )

    scraped = ScrapeVoteEvent(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        motion_text="a vote on something",
        identifier="Roll Call No. 1",
    )
    scraped.pupa_id = "foo"

    dmi = DumbMockImporter()
    oi = OrganizationImporter("jid")
    bi = BillImporter("jid", dmi, oi)

    def import_current():
        # a fresh importer each time, fed the current state of the scraped object
        _, outcome = VoteEventImporter("jid", dmi, oi, bi).import_item(scraped.as_dict())
        return outcome

    assert import_current() == "insert"
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    assert import_current() == "noop"
    assert VoteEvent.objects.count() == 1

    # new info, update
    scraped.result = "failed"
    assert import_current() == "update"
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    scraped.identifier = "First Roll Call"
    assert import_current() == "update"
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    scraped.pupa_id = "bar"
    assert import_current() == "insert"
    assert VoteEvent.objects.count() == 2
def scrape_votes_old(self, bill, billname, session):
    """Yield a ``VoteEvent`` for every journal link on the archived bill page.

    Each journal link supplies the date; the row it sits in supplies the
    chamber and motion, and the following row holds the Yea and Nay name
    tables. Raises ``ScrapeError`` for an unrecognised chamber label.
    """
    vote_url = (
        "http://archives.legislature.state.oh.us/bills.cfm?ID="
        + session + "_" + billname
    )
    page = lxml.html.fromstring(self.get(vote_url).text)
    chamber_codes = {"House": "lower", "Senate": "upper"}

    for jlink in page.xpath("//a[contains(@href, 'JournalText')]"):
        journal_day = self._tz.localize(
            datetime.datetime.strptime(jlink.text, "%m/%d/%Y")
        ).date()
        date = "{:%Y-%m-%d}".format(journal_day)

        details = jlink.xpath("string(../../../td[2])")
        chamber_label = details.split(" - ")[0]
        if chamber_label not in chamber_codes:
            raise ScrapeError("Bad chamber: %s" % chamber_label)
        chamber = chamber_codes[chamber_label]

        motion = details.split(" - ")[1].split("\n")[0].strip()

        # the names live in the table row right after the one holding the link
        vote_row = jlink.xpath("../../..")[0].getnext()

        yea_div = vote_row.xpath("td/font/div[contains(@id, 'Yea')]")[0]
        yeas = [
            name
            for name in (td.xpath("string()") for td in yea_div.xpath("table/tr/td"))
            if name
        ]

        no_div = vote_row.xpath("td/font/div[contains(@id, 'Nay')]")[0]
        nays = [
            name
            for name in (td.xpath("string()") for td in no_div.xpath("table/tr/td"))
            if name
        ]

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result="pass" if len(yeas) > len(nays) else "fail",
            bill=bill,
            classification="passage",
        )

        for supporter in yeas:
            vote.yes(supporter)
        for opponent in nays:
            vote.no(opponent)

        vote.add_source(vote_url)

        yield vote
def _get_votes(self, date, actor, action, bill, url): vre = r"(?P<leader>.*)(AYES|YEAS):\s+(?P<yeas>\d+)\s+(NOES|NAYS):\s+(?P<nays>\d+).*" if "YEAS" in action.upper() or "AYES" in action.upper(): match = re.match(vre, action) if match: v = match.groupdict() yes, no = int(v["yeas"]), int(v["nays"]) vote = VoteEvent( chamber=actor, motion_text=v["leader"], result="pass" if yes > no else "fail", classification="passage", start_date=TIMEZONE.localize(date), bill=bill, ) vote.add_source(url) yield vote
def test_full_vote_event():
    """Import a vote event with a count and two votes and check every link.

    Organizations, people, memberships and the bill are imported first so
    that the vote event's references can be resolved.
    """
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name="1900", identifier="1900")
    john = ScrapePerson("John Smith", primary_org="lower")
    adam = ScrapePerson("Adam Smith", primary_org="lower")
    house = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=house._id)

    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        organization=house._id,
    )
    vote_event.set_count("yes", 20)
    vote_event.yes("John Smith")
    vote_event.no("Adam Smith")

    oi = OrganizationImporter("jid")
    oi.import_data([house.as_dict()])

    pi = PersonImporter("jid")
    pi.import_data([john.as_dict(), adam.as_dict()])

    mi = MembershipImporter("jid", pi, oi, DumbMockImporter())
    mi.import_data([person._related[0].as_dict() for person in (john, adam)])

    bi = BillImporter("jid", oi, pi)
    bi.import_data([bill.as_dict()])

    VoteEventImporter("jid", pi, oi, bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ["passage:bill"]
    assert ve.bill == Bill.objects.get()

    count = ve.counts.get()
    assert count.option == "yes"
    assert count.value == 20

    votes = list(ve.votes.all())
    assert len(votes) == 2
    for cast in ve.votes.all():
        # anything that is not John's vote is expected to be Adam's "no"
        if cast.voter_name == "John Smith":
            expected_name, expected_option = "John Smith", "yes"
        else:
            expected_name, expected_option = "Adam Smith", "no"
        assert cast.option == expected_option
        assert cast.voter == Person.objects.get(name=expected_name)
def test_fix_bill_id():
    """A bill-identifier transformer lets a vote on "HB1" resolve to "HB 1".

    The transformer is installed in the global ``IMPORT_TRANSFORMERS`` for
    the duration of the imports and cleared again afterwards, even if an
    import raises, so it cannot leak into other tests.
    """
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")

    org1 = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill("HB 1", "1900", "Test Bill ID", classification="bill", chamber="lower")

    oi = OrganizationImporter("jid")
    oi.import_data([org1.as_dict()])

    from openstates.settings import IMPORT_TRANSFORMERS

    IMPORT_TRANSFORMERS["bill"] = {
        "identifier": lambda x: re.sub(r"([A-Z]*)\s*0*([-\d]+)", r"\1 \2", x, 1)
    }
    try:
        bi = BillImporter("jid", oi, DumbMockImporter())
        bi.import_data([bill.as_dict()])

        ve = ScrapeVoteEvent(
            legislative_session="1900",
            motion_text="passage",
            start_date="1900-04-02",
            classification="passage:bill",
            result="fail",
            bill_chamber="lower",
            bill="HB1",  # deliberately un-normalised
            identifier="4",
            bill_action="passage",
            organization=org1._id,
        )

        VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data([ve.as_dict()])
    finally:
        IMPORT_TRANSFORMERS["bill"] = {}

    ve = VoteEvent.objects.get()
    # the comparison used to be a bare expression and never checked anything
    assert ve.bill.identifier == "HB 1"
def test_vote_event_org_chamber():
    """A ``chamber`` argument becomes a pseudo-id keyed on that classification."""
    fields = dict(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result="pass",
        classification="passage",
        chamber="upper",
    )
    vote_event = VoteEvent(**fields)
    expected = {"classification": "upper"}
    assert get_pseudo_id(vote_event.organization) == expected
def test_vote_event_org_obj():
    """Passing an ``Organization`` object stores that object's ``_id``."""
    committee = Organization("something", classification="committee")
    fields = dict(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result="pass",
        classification="passage",
        organization=committee,
    )
    vote_event = VoteEvent(**fields)
    assert vote_event.organization == committee._id
def test_vote_event_org_dict():
    """Passing a dict as the organization round-trips through the pseudo-id."""
    odict = {"name": "Random Committee", "classification": "committee"}
    fields = dict(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result="pass",
        classification="passage",
        organization=odict,
    )
    vote_event = VoteEvent(**fields)
    decoded = get_pseudo_id(vote_event.organization)
    assert decoded == odict
def test_org_and_chamber_conflict():
    """Supplying both ``organization`` and ``chamber`` raises ``ValueError``."""
    conflicting = dict(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result="pass",
        classification="passage",
        organization="test",
        chamber="lower",
    )
    with pytest.raises(ValueError):
        VoteEvent(**conflicting)
def test_vote_event_bill_id_dedupe():
    """Without a ``pupa_id``, vote events are told apart by the bill they cite.

    Re-importing the same event is a no-op, changing its result is an
    update, and an otherwise identical event on another bill is an insert.
    """
    create_jurisdiction()
    bill = Bill.objects.create(
        id="bill-1",
        identifier="HB 1",
        legislative_session=LegislativeSession.objects.get(),
        from_organization=Organization.objects.get(classification="lower"),
    )
    bill2 = Bill.objects.create(
        id="bill-2",
        identifier="HB 2",
        legislative_session=LegislativeSession.objects.get(),
        from_organization=Organization.objects.get(classification="lower"),
    )

    def scraped_vote_on(bill_identifier):
        # identical in every field except the bill being voted on
        return ScrapeVoteEvent(
            legislative_session="1900",
            start_date="2013",
            classification="anything",
            result="passed",
            motion_text="a vote on something",
            bill=bill_identifier,
            bill_chamber="lower",
            chamber="lower",
        )

    bi = BillImporter("jid")

    def import_once(scraped):
        _, outcome = VoteEventImporter("jid", bi).import_item(scraped.as_dict())
        return outcome

    vote_event = scraped_vote_on(bill.identifier)

    assert import_once(vote_event) == "insert"
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    assert import_once(vote_event) == "noop"
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = "failed"
    assert import_once(vote_event) == "update"
    assert VoteEvent.objects.count() == 1

    # new vote event, insert
    vote_event = scraped_vote_on(bill2.identifier)
    assert import_once(vote_event) == "insert"
    assert VoteEvent.objects.count() == 2
def test_vote_event_bill_clearing():
    """Editing a vote event between imports must not leave a stale copy behind.

    Two events are imported for one bill; after the motion text of the first
    is corrected and both are imported again, the total must still be two.
    """
    jurisdiction = create_jurisdiction()
    session = jurisdiction.legislative_sessions.create(name="1900", identifier="1900")
    org = Organization.objects.create(
        id="org-id", name="House", classification="lower", jurisdiction=jurisdiction
    )
    bill = Bill.objects.create(
        id="bill-1",
        identifier="HB 1",
        legislative_session=session,
        from_organization=org,
    )
    Bill.objects.create(
        id="bill-2",
        identifier="HB 2",
        legislative_session=session,
        from_organization=org,
    )
    oi = OrganizationImporter("jid")
    dmi = DumbMockImporter()
    bi = BillImporter("jid", dmi, oi)

    common = dict(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        bill=bill.identifier,
        bill_chamber="lower",
        chamber="lower",
    )
    # typo intentional
    vote_event1 = ScrapeVoteEvent(motion_text="a vote on somthing", **common)
    vote_event2 = ScrapeVoteEvent(motion_text="a vote on something else", **common)

    def import_both():
        # have to use import_data so postimport is called
        VoteEventImporter("jid", dmi, oi, bi).import_data(
            [vote_event1.as_dict(), vote_event2.as_dict()]
        )

    import_both()
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 vote events now
    vote_event1.motion_text = "a vote on something"
    import_both()
    assert VoteEvent.objects.count() == 2
def parse_vote(
    self, bill, journal_entry_number, action, act_chamber, act_date, url
):
    """Yield a ``VoteEvent`` built from tallies embedded in an action string.

    Tokens made of a non-digit followed by digits are read as counts: a
    token containing ``Y`` sets yes, ``N`` sets no, and ``E`` or ``A`` adds
    to other. ``PASSED`` / ``FAILED`` in the action decide the result;
    otherwise yes must exceed no to pass.
    """
    tally = {"yes": 0, "no": 0, "other": 0}
    for token in action.split():
        if not re.match(r"[\D][\d]", token):
            continue
        if "Y" in token:
            tally["yes"] = int(token[1:])
        elif "N" in token:
            tally["no"] = int(token[1:])
        elif "E" in token or "A" in token:
            tally["other"] += int(token[1:])

    # an explicit outcome word in the action wins over the arithmetic
    if "PASSED" in action:
        result = "pass"
    elif "FAILED" in action:
        result = "fail"
    elif tally["yes"] > tally["no"]:
        result = "pass"
    else:
        result = "fail"

    vote = VoteEvent(
        bill=bill,
        start_date=act_date.strftime("%Y-%m-%d"),
        chamber=act_chamber,
        motion_text=action + " #" + journal_entry_number,
        result=result,
        classification="passage",
    )

    for option in ("yes", "no", "other"):
        vote.set_count(option, tally[option])
    vote.add_source(url)

    yield vote
def viva_voce_votes(root, session, chamber):
    """Yield a zero-count ``VoteEvent`` for each valid viva voce notice.

    Every ``div`` whose text starts with "All Members are deemed" is wrapped
    in ``MaybeViva``; invalid ones are skipped. The legislative session is
    the first two characters of ``session``.
    """
    notices = root.xpath(u'//div[starts-with(., "All Members are deemed")]')
    for mv in (MaybeViva(el) for el in notices):
        if not mv.is_valid:
            continue

        if mv.passed:
            motion_text, result, classification = "passage", "pass", "passage"
        else:
            motion_text, result, classification = "other", "fail", None

        v = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=motion_text,
            result=result,
            classification=classification,
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber,
        )
        # every tally is recorded as zero for these votes
        for option in ("yes", "no", "absent", "not voting"):
            v.set_count(option, 0)
        yield v
def test_vote_event_bill_actions_two_stage():
    """Two votes naming the same bill action must not both claim it.

    Very similar to the ve3/ve4 case in test_vote_event_bill_actions, but the
    votes arrive in two separate imports so the conflict has to be detected
    even when they were not part of the same scrape.
    """
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower")
    bill.add_action(description="passage", date="1900-04-02", chamber="lower")

    shared = dict(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        chamber="lower",
    )
    ve1 = ScrapeVoteEvent(**shared)
    ve2 = ScrapeVoteEvent(**shared)
    # disambiguate them
    ve1.pupa_id = "one"
    ve2.pupa_id = "two"

    bi = BillImporter("jid")
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter("jid", bi).import_data([ve1.as_dict()])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 1
    assert stored[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter("jid", bi).import_data([ve1.as_dict(), ve2.as_dict()])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 2
    assert stored[0].bill_action is not None
    assert stored[1].bill_action is None
def test_fix_bill_id():
    """With ``fix_bill_id`` installed, a vote on "HB1" resolves to bill "HB 1".

    The transformer is placed in the global ``IMPORT_TRANSFORMERS`` while
    the imports run and cleared afterwards, even if an import raises, so it
    cannot leak into other tests.
    """
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Test Bill ID", classification="bill", chamber="lower")

    from openstates.settings import IMPORT_TRANSFORMERS

    IMPORT_TRANSFORMERS["bill"] = {
        "identifier": fix_bill_id,
    }
    try:
        bi = BillImporter("jid")
        bi.import_data([bill.as_dict()])

        ve = ScrapeVoteEvent(
            legislative_session="1900",
            motion_text="passage",
            start_date="1900-04-02",
            classification="passage:bill",
            result="fail",
            bill_chamber="lower",
            bill="HB1",  # deliberately un-normalised
            identifier="4",
            bill_action="passage",
            chamber="lower",
        )

        VoteEventImporter("jid", bi).import_data([ve.as_dict()])
    finally:
        IMPORT_TRANSFORMERS["bill"] = {}

    ve = VoteEvent.objects.get()
    # the comparison used to be a bare expression and never checked anything
    assert ve.bill.identifier == "HB 1"
def test_full_vote_event():
    """Import a vote event with a count and two votes and check every link.

    Both voters are created directly in the database and given a membership
    in the lower-chamber organization before the import runs.
    """
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower")
    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        chamber="lower",
    )
    vote_event.set_count("yes", 20)
    vote_event.yes("John Smith")
    vote_event.no("Adam Smith")

    for voter_name in ("John Smith", "Adam Smith"):
        Person.objects.create(name=voter_name)
    for person in Person.objects.all():
        person.memberships.create(
            organization=Organization.objects.get(classification="lower")
        )

    bi = BillImporter("jid")
    bi.import_data([bill.as_dict()])

    VoteEventImporter("jid", bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ["passage:bill"]
    assert ve.bill == Bill.objects.get()

    count = ve.counts.get()
    assert count.option == "yes"
    assert count.value == 20

    votes = list(ve.votes.all())
    assert len(votes) == 2
    for cast in ve.votes.all():
        # anything that is not John's vote is expected to be Adam's "no"
        if cast.voter_name == "John Smith":
            expected_name, expected_option = "John Smith", "yes"
        else:
            expected_name, expected_option = "Adam Smith", "no"
        assert cast.option == expected_option
        assert cast.voter == Person.objects.get(name=expected_name)
def add_archived_votes(self, bill, bill_id):
    """Yield ``VoteEvent`` objects for archived roll calls recorded for a bill.

    ``bill_id`` is turned into the lookup key used by ``self.archived_votes``:
    the first whitespace-separated part is cut to its first character and a
    two- or three-character last part is left-padded with zeros to four
    characters. Nothing is yielded when the key is not present.
    """
    parts = bill_id.split()
    parts[0] = parts[0][0]
    number = parts[-1]
    if len(number) in (2, 3):
        parts[-1] = "0" * (4 - len(number)) + number
    lookup_key = "".join(parts)

    if lookup_key not in self.archived_votes:
        return

    for vote_key, legislator_votes in self.archived_votes[lookup_key].items():
        (
            vote_date,
            r_number,
            action_number,
            action_vote_result,
            archive_url,
            cod,
            _,
        ) = vote_key

        # the archive URL ends in "S" for one chamber; everything else is "lower"
        chamber = "upper" if archive_url[-1] == "S" else "lower"

        start_date = eastern.localize(vote_date).isoformat()

        joined = action_number + r_number + cod + action_vote_result
        motion_text = joined.replace(" ", "_")

        ve = VoteEvent(
            chamber=chamber,  # TODO: check this
            start_date=start_date,
            motion_text=motion_text,
            bill=bill,
            classification="other",  # No indication on classification for archived votes
            result=action_vote_result,
        )
        ve.add_source(archive_url)

        for cast in legislator_votes:
            ve.vote(cast["how_voted"], cast["leg"])

        yield ve
def process_committee_vote(self, committee_action, bill):
    """Build a committee ``VoteEvent`` from an action's vote tallies.

    Args:
        committee_action: mapping with ``"ActionDate"`` and ``"Vote"``; the
            latter is an iterable of mappings with ``"VoteType"`` and
            ``"VoteCount"`` (an empty-string count is treated as zero).
        bill: bill the vote is attached to.

    Returns:
        The vote event, or ``None`` (after logging a warning) when the
        action has no date or no vote data.
    """
    try:
        date = committee_action["ActionDate"]
        vote_info = committee_action["Vote"]
    except KeyError:
        self.logger.warning("Committee vote has no data. Skipping.")
        return
    date = self.date_format(date)

    # Start every tally at zero: a vote list without a "Yes" or "No" entry
    # used to leave the matching counter unassigned and crash below.
    yes_count = no_count = other_count = 0
    for tally in vote_info:
        vote_count = 0 if tally["VoteCount"] == "" else int(tally["VoteCount"])
        if tally["VoteType"] == "Yes":
            yes_count = vote_count
        elif tally["VoteType"] == "No":
            no_count = vote_count
        else:
            # any other vote type is folded into "other"
            other_count += vote_count

    result = "pass" if yes_count > no_count else "fail"

    vote = VoteEvent(
        chamber="legislature",
        start_date=date,
        motion_text="Committee Vote",
        result=result,
        classification="committee",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    return vote
def scrape_votes(self, vote_url, bill, chamber):
    """Parse a roll-call PDF line by line and yield the vote events in it.

    The PDF is downloaded, converted to text and deleted. Each line is then
    inspected for a date, a time-stamped motion line, the numeric totals
    ("YEAS:", "NAYS:", ...) and the name sections ("YEAS :", "NAYS :", ...).
    A vote event is built and yielded each time a "NOT VOTING : " section is
    processed.

    Yields nothing (a warning is logged) when the PDF cannot be retrieved.
    """
    try:
        filename, response = self.urlretrieve(vote_url)
    except scrapelib.HTTPError:
        self.logger.warning("PDF not posted or available")
        return

    # Grabs text from pdf
    pdflines = [
        line.decode("utf-8") for line in convert_pdf(filename, "text").splitlines()
    ]
    os.remove(filename)

    vote_date = 0
    voters = defaultdict(list)
    for x in range(len(pdflines)):
        line = pdflines[x]

        # a line holding something like 1/2/2020 is remembered as the date
        if re.search(r"(\d+/\d+/\d+)", line):
            initial_date = line.strip()

        # a line with AM/PM carries "<motion words> <time>"; the first token
        # containing ":" marks where the time starts
        if ("AM" in line) or ("PM" in line):
            split_l = line.split()
            for y in split_l:
                if ":" in y:
                    time_location = split_l.index(y)
                    motion = " ".join(split_l[0:time_location])
                    time = split_l[time_location:]
                    if len(time) > 0:
                        time = "".join(time)
                    dt = initial_date + " " + time
                    dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                    vote_date = central.localize(dt)
                    vote_date = vote_date.isoformat()
                    # In rare case that no motion is provided
                    if len(motion) < 1:
                        motion = "No Motion Provided"

        # numeric totals: the count is the last token on the line
        if "YEAS:" in line:
            yeas = int(line.split()[-1])
        if "NAYS:" in line:
            nays = int(line.split()[-1])
        if "ABSTAINED:" in line:
            abstained = int(line.split()[-1])
        # "PASSES:" is stored in the same variable as "ABSTAINED:"
        if "PASSES:" in line:
            abstained = int(line.split()[-1])
        if "NOT VOTING:" in line:
            not_voting = int(line.split()[-1])

        # Name sections: starting at the heading line, read forward until the
        # next section heading shows up.
        # NOTE(review): lines are split on a single space, which would also
        # break multi-word names apart — confirm against a real PDF.
        if "YEAS :" in line:
            y = 0
            next_line = pdflines[x + y]
            while "NAYS : " not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("YEAS" not in next_line):
                    for v in next_line:
                        if v and "YEAS" not in v:
                            voters["yes"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1

        if line and "NAYS :" in line:
            y = 0
            next_line = 0
            next_line = pdflines[x + y]
            while ("ABSTAINED : " not in next_line) and ("PASSES :" not in next_line):
                next_line = next_line.split(" ")
                if next_line and "NAYS" not in next_line:
                    for v in next_line:
                        if v and "NAYS" not in v:
                            voters["no"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1

        if line and ("ABSTAINED :" in line or "PASSES :" in line):
            # this section starts two lines below its heading
            y = 2
            next_line = 0
            next_line = pdflines[x + y]
            while "NOT VOTING :" not in next_line:
                next_line = next_line.split(" ")
                # NOTE(review): with "or" this test is true unless both words
                # are elements of the list — confirm that is intended.
                if next_line and ("ABSTAINED" not in next_line or "PASSES" not in next_line):
                    for v in next_line:
                        if v:
                            voters["abstain"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1

        if line and "NOT VOTING : " in line:
            # number of following lines to read, derived from the not-voting
            # total and the token count of the heading line
            lines_to_go_through = math.ceil(not_voting / len(line.split()))
            next_line = pdflines[x]
            for y in range(lines_to_go_through):
                if len(pdflines) > (x + y + 2):
                    next_line = pdflines[x + y + 2].split(" ")
                    for v in next_line:
                        if v:
                            voters["not voting"].append(v.strip())

            # passing requires more yeas than all other categories combined
            if yeas > (nays + abstained + not_voting):
                passed = True
            else:
                passed = False

            ve = VoteEvent(
                chamber=chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            ve.add_source(vote_url)
            for how_voted, how_voted_voters in voters.items():
                for voter in how_voted_voters:
                    if len(voter) > 0:
                        ve.vote(how_voted, voter)
            # Resets voters dictionary before going onto next page in pdf
            voters = defaultdict(list)
            yield ve
def parse_vote_pdf(self, vote_url, bill):
    """Parse a roll-call PDF and return it as a ``VoteEvent`` for ``bill``.

    The chamber is taken from the URL, the date from the first text line,
    the five totals from the first line mentioning both "Yeas" and "Nays",
    the motion from one of the lines just above the totals, and the voter
    names from the lines below them.

    Raises:
        ValueError: if no totals line is found, or if the number of names
            collected per option does not equal the stated totals.
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()

    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    # first line holds "... Calendar Date: <Mon DD, YYYY HH:MM (AM|PM)>"
    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # locate the totals line
    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]

    if page_index:
        # columns are separated by runs of two or more spaces; each column
        # reads "<number> <label>" and columns map to vote_types by position
        counts = re.split(r"\s{2,}", lines[page_index].strip())

        for index, count in enumerate(counts):
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion

    # Try candidate lines above the totals in the order given, stopping as
    # soon as the current motion text contains one of the keywords.
    for i in motion_lines:
        if any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        # reached only when the loop ran out of candidates without a break
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        # blank lines and the first section heading carry no names
        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue

        # every show-stopper line advances to the next vote type
        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue

        names = re.split(r"\s{2,}", current_line)

        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_vote(self, bill, date, url):
    """Fetch a JSON roll call and yield it as a ``VoteEvent``.

    The chamber is derived from the action log's ``FullName``; a warning is
    logged and nothing is yielded when it names none of House, Senate or
    Joint. Nothing is yielded either when the action log has no status text
    to use as the motion.
    """
    page = self.get(url).json()
    location = page["actionLog"]["FullName"]

    chamber = None
    if location:
        # first marker found in the location wins, in this order
        for marker, code in (
            ("House", "lower"),
            ("Senate", "upper"),
            ("Joint", "legislature"),
        ):
            if marker in location:
                chamber = code
                break
    if chamber is None:
        self.warning("Bad Vote chamber: '%s', skipping" % location)
        return

    motion = page["actionLog"]["StatusText"]
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = page["Yeas"]
    no_count = page["Nays"]
    excused_count = page["Excused"]
    absent_count = page["Absent"]

    if motion.startswith("Do Pass"):
        vtype = "passage"
    elif motion == "Concurred in amendments":
        vtype = "amendment"
    else:
        # a "veto-override" classification is held back until OS-core supports it again
        vtype = []

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if yes_count > no_count else "fail",
        classification=vtype,
        bill=bill,
    )
    # differentiate nearly identical votes
    vote.dedupe_key = url

    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for person in page["RollCalls"]:
        option = person["Vote1"]
        if option in ("Aye", "Yea"):
            vote.yes(person["UniqueName"])
        elif option == "Nay":
            vote.no(person["UniqueName"])
        elif option == "Excused":
            vote.vote("excused", person["UniqueName"])
        elif option == "Absent":
            vote.vote("absent", person["UniqueName"])

    yield vote
def scrape_votes(self, bill, page):
    """Yield a ``VoteEvent`` for each floor action reported for a bill.

    For every entry in ``page["FloorHeaders"]`` the floor-action API is
    queried (with votes included) and each returned action other than
    "No Action" becomes one vote event.

    Args:
        bill: bill the votes are attached to.
        page: mapping with ``"BillId"`` and ``"FloorHeaders"``; each header
            provides ``"BillStatusActionId"`` and ``"LegislativeBody"``
            (``"S"`` or ``"H"``).

    Yields:
        One vote event per recorded floor action.
    """
    base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
    chamber_by_body = {"S": "upper", "H": "lower"}

    for header in page["FloorHeaders"]:
        params = {
            "billStatusId": page["BillId"],
            "billStatusActionId": header["BillStatusActionId"],
            "includeVotes": "true",
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode("utf-8"))

        for action in actions:
            if action["Action"] == "No Action":
                continue

            action_date = datetime.datetime.strptime(
                action["ReportDate"], "%Y-%m-%dT%H:%M:%S"
            )

            # Counts may be null in the payload (hence the ``or 0`` used for
            # every tally); normalise before comparing so that a null count
            # cannot raise TypeError in the result computation.
            ayes = action["Ayes"] or 0
            nays = action["Nays"] or 0

            vote = VoteEvent(
                chamber=chamber_by_body[header["LegislativeBody"]],
                motion_text=action["Action"],
                classification="passage",
                result=(
                    "pass"
                    if action["UnanimouslyAdopted"] or ayes > nays
                    else "fail"
                ),
                start_date=action_date.strftime("%Y-%m-%d"),
                bill=bill,
            )
            vote.add_source(resp.url)
            vote.set_count("yes", ayes)
            vote.set_count("no", nays)
            vote.set_count("other", (action["Present"] or 0))
            vote.set_count("absent", (action["Absent"] or 0))
            vote.set_count("excused", (action["Excused"] or 0))
            vote.set_count("not voting", (action["NotVoting"] or 0))

            for v in action["Votes"]:
                # anything other than Y/N is recorded as "other"
                vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                vote.vote(vote_type, v["Legislator"]["FullName"])

            vote.pupa_id = resp.url + str(action["ReferralNumber"])
            yield vote
def scrape_vote(self, url, session):
    """Parse the roll-call PDF at ``url`` into a VoteEvent.

    The PDF is converted to text and read in two phases: a "preamble"
    (bill id, motion, then the line of totals) followed by the "votes"
    section, where heading lines switch the current vote type and every
    other line contributes names separated by runs of whitespace.

    Args:
        url: location of the PDF; also used as source and dedupe key.  The
            voting chamber is "upper" when the URL contains "senate".
        session: stored as the VoteEvent's ``legislative_session``.

    Returns:
        The VoteEvent, or None when the document is empty, mentions a veto
        in its preamble, or lacks a bill id or a motion.
    """
    fname, _ = self.urlretrieve(url)
    text = convert_pdf(fname, type="text").decode()
    lines = text.splitlines()

    chamber = "upper" if "senate" in url else "lower"
    if "Maryland" not in text:
        self.warning(f"empty vote from {url}")
        return

    date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

    section = "preamble"
    motion = None
    bill_id = None
    how = None
    voters = defaultdict(list)

    for line in lines:
        if section == "preamble":
            if "vetoed" in line.lower():
                self.warning(
                    f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                )
                return
            possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
            if possible_bill_id:
                bill_id = possible_bill_id[0]

            # preamble has metadata, then motion, then counts. our process then is to
            # store the last line as the motion, but if the last line looks like a
            # continuation, append it to the prior line
            line = line.strip()
            counts = re.findall(
                r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                line,
            )
            if counts:
                (
                    yes_count,
                    no_count,
                    nv_count,
                    excused_count,
                    absent_count,
                ) = map(int, counts[0])
                section = "votes"
            elif line and line != "(Const)":
                # questions seem to be split across two lines; only join when
                # there is an earlier line to join to (motion may still be None)
                if line.endswith("?") and motion:
                    motion = motion + " " + line
                else:
                    motion = line
        elif section == "votes":
            if line.startswith("Voting Yea"):
                how = "yes"
            elif line.startswith("Voting Nay"):
                how = "no"
            elif line.startswith("Not Voting"):
                how = "not voting"
            elif line.startswith("Excused from Voting"):
                how = "excused"
            elif line.startswith("Excused (Absent)"):
                how = "absent"
            elif how:
                names = re.split(r"\s{2,}", line)
                voters[how].extend(names)

    if not bill_id and not motion:
        return
    if not motion:
        self.warning(f"got {bill_id} but no motion, not registering as a vote")
        # actually skip the vote, as the warning says
        return
    if not bill_id:
        self.warning(f"got {motion} but no bill_id, not registering as a vote")
        return

    # bleh - result not indicated anywhere
    result = "pass" if yes_count > no_count else "fail"
    bill_chamber = "upper" if bill_id.startswith("S") else "lower"
    date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")
    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        result=result,
        classification="passage",
        motion_text=motion,
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    # URL includes sequence ID, will be unique
    vote.dedupe_key = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for how, names in voters.items():
        for name in names:
            name = name.strip().replace("*", "")
            # drop blanks and fragments containing "COPY" or the vote-change
            # legend text, which are not voter names
            if name and "COPY" not in name and "Indicates Vote Change" not in name:
                vote.vote(how, name)

    check_counts(vote, raise_error=True)
    return vote
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
    """Add every row of ``action_table`` to ``bill`` and yield embedded votes.

    Each child of ``action_table`` after the first is read as (date, actor
    code, action text).  The action is added to ``bill`` together with any
    committees whose short code resolves through ``self.short_ids``.  When
    ``self.parse_vote`` recognises a vote in the action text a VoteEvent is
    yielded; its motion is prefixed with "Reconsider: " if the same actor had
    a "reconsider" action since its previous vote.
    """
    # actors with a "reconsider" action since their last recorded vote
    pending_reconsideration = set()

    for row in action_table.xpath("*")[1:]:
        when = dt.datetime.strptime(
            row[0].text_content(), "%m/%d/%Y"
        ).strftime("%Y-%m-%d")
        actor_code = row[1].text_content().upper()
        description = row[2].text_content()
        actor = self._vote_type_map[actor_code]

        act_type, committees = categorize_action(description)

        # XXX: Translate short-code to full committee name for the matcher.
        committee_names = []
        for short_code in committees or ():
            try:
                committee_names.append(self.short_ids[short_code]["name"])
            except KeyError:
                # unknown short code: leave this committee out
                continue

        recorded = bill.add_action(
            description, when, chamber=actor, classification=act_type
        )
        for committee_name in committee_names:
            recorded.add_related_entity(
                name=committee_name, entity_type="organization"
            )

        parsed = self.parse_vote(description)
        if not parsed:
            if re.search("reconsider", description, re.IGNORECASE):
                pending_reconsideration.add(actor)
            continue

        tally, motion = parsed
        if actor in pending_reconsideration:
            motion_text = "Reconsider: " + motion
        else:
            motion_text = motion

        vote = VoteEvent(
            start_date=when,
            chamber=actor,
            bill=bill_id,
            bill_chamber=bill_chamber,
            legislative_session=session,
            motion_text=motion_text,
            result="pass" if "passed" in description.lower() else "fail",
            classification="passage",
        )
        pending_reconsideration.discard(actor)
        vote.add_source(url)
        vote.set_count("yes", int(tally["n_yes"] or 0))
        vote.set_count("no", int(tally["n_no"] or 0))
        vote.set_count("not voting", int(tally["n_excused"] or 0))

        # "yes_resv" names are recorded as plain yes votes.
        for key, record in (("yes", vote.yes), ("yes_resv", vote.yes), ("no", vote.no)):
            for voter in split_specific_votes(tally[key]):
                record(self.clean_voter_name(voter))
        for voter in split_specific_votes(tally["excused"]):
            vote.vote("not voting", self.clean_voter_name(voter))

        yield vote
def handle_page(self):
    """Yield the committee vote shown on this page, if it reports totals.

    Nothing is yielded when no span whose id contains
    "ctl00_MainContent_lblTotal" has text.  Otherwise the date, the
    yea/nay/missed totals, the committee and the action are read from their
    labelled spans and each entry of the "vote-list" is recorded.

    Raises:
        ValueError: if a member's vote marker is not one of the known forms.
    """
    doc = self.doc

    # Checks to see if any vote totals are provided
    if not doc.xpath('//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()'):
        return

    (raw_date,) = doc.xpath('//span[contains(@id, "lblDate")]/text()')
    start = format_datetime(
        datetime.datetime.strptime(raw_date, "%m/%d/%Y %I:%M:%S %p"),
        "US/Eastern",
    )

    yes_count, no_count, other_count = (
        int(doc.xpath('//span[contains(@id, "{}")]/text()'.format(label))[0])
        for label in ("lblYeas", "lblNays", "lblMissed")
    )
    result = "pass" if yes_count > no_count else "fail"

    (committee,) = doc.xpath('//span[contains(@id, "lblCommittee")]/text()')
    (action,) = doc.xpath('//span[contains(@id, "lblAction")]/text()')

    vote = VoteEvent(
        start_date=start,
        bill=self.kwargs["bill"],
        chamber="lower",
        motion_text="{} ({})".format(action, committee),
        result=result,
        classification="committee",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", other_count)

    for item in doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not item.text_content().strip():
            continue

        (member,) = item.xpath("span[2]//text()")
        (marker,) = item.xpath("span[1]//text()")
        member = member.strip()

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote("not voting", member)
        elif not re.search(r"\([YN]\)", marker):
            raise ValueError("Unknown vote type found: {}".format(marker))
        # Parenthetical votes appear to not be counted in the
        # totals for Yea, Nay, _or_ Missed, so they are not recorded.

    yield vote
def handle_page(self):
    """Yield the committee vote parsed from a plain-text vote sheet.

    The motion is whatever follows "FINAL ACTION:" on ``self.lines[5]``; an
    empty motion logs a warning and yields nothing.  Member rows start right
    after the first "Yea  Nay ... Yea  Nay" header row and run until the
    first blank line.  A row counts as a cast vote only when it begins with
    an X, VA or VC code, and the column of that code decides yes or no.
    Totals come from the two numbers preceding "TOTALS" in ``self.text``.

    Raises:
        ValueError: if a vote code sits between the Yea and Nay columns.
    """
    _, final_action = self.lines[5].split("FINAL ACTION:")
    motion = final_action.strip()
    if not motion:
        self.scraper.warning("Vote appears to be empty")
        return

    header_rows = [
        idx
        for idx, row in enumerate(self.lines)
        if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", row)
    ]
    vote_top_row = header_rows[0]
    header = self.lines[vote_top_row]
    yea_columns_end = header.index("Yea") + len("Yea")
    nay_columns_begin = header.index("Nay")

    votes = {"yes": [], "no": [], "other": []}
    for line in self.lines[vote_top_row + 1:]:
        # End loop as soon as no more members are found
        if not line.strip():
            break

        member = re.search(
            r"""(?x)
                ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                ([A-Z][a-z]+            # Name must have lower-case characters
                [\w\-\s]+)              # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                (?:\s{2,}.*)?           # Name ends when many spaces are seen
                """,
            line,
        ).group(1)
        # sometimes members have trailing X's from other motions in the
        # vote sheet we aren't collecting
        member = re.sub(r"(\s+X)+", "", member)

        # Usually non-voting members won't even have a code listed
        # Only a couple of codes indicate an actual vote:
        # "VA" (vote after roll call) and "VC" (vote change)
        if re.search(r"^\s+(X|VA|VC)\s+[A-Z][a-z]", line) is None:
            votes["other"].append(member)
            continue

        # Check where the "X" or vote code is on the page
        vote_column = len(line) - len(line.lstrip())
        if vote_column <= yea_columns_end:
            votes["yes"].append(member)
        elif vote_column >= nay_columns_begin:
            votes["no"].append(member)
        else:
            raise ValueError(
                "Unparseable vote found for {0} in {1}:\n{2}".format(
                    member, self.url, line
                )
            )

    totals = re.search(
        r"(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS", self.text
    ).groups()
    yes_count, no_count = int(totals[0]), int(totals[1])

    vote = VoteEvent(
        start_date=self.kwargs["date"],
        bill=self.kwargs["bill"],
        chamber="upper",
        motion_text=motion,
        classification="committee",
        result="pass" if yes_count > no_count else "fail",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", len(votes["other"]))

    # set voters
    for vtype, names in votes.items():
        for raw_name in names:
            voter = raw_name.strip()
            # Removes the few voter names with a ton of extra spaces with VA at the end.
            if " VA" in voter:
                voter = " ".join(voter.split()[:-2])
            if voter:
                vote.vote(vtype, voter)

    yield vote