def test_vote_event_pupa_identifier_dedupe():
    """Verify that VoteEventImporter dedupes on pupa_id.

    Walks through insert -> noop -> update (changed data) -> update
    (changed identifier, same pupa_id) -> insert (new pupa_id),
    asserting the importer's reported action and the row count each time.
    """
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    Organization.objects.create(
        id="org-id",
        name="Legislature",
        classification="legislature",
        jurisdiction=j,
    )
    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        motion_text="a vote on something",
        identifier="Roll Call No. 1",
    )
    vote_event.pupa_id = "foo"

    dmi = DumbMockImporter()
    oi = OrganizationImporter("jid")
    bi = BillImporter("jid", dmi, oi)

    _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == "insert"
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == "noop"
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = "failed"
    _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == "update"
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    # (pupa_id is unchanged, so this must NOT create a second row)
    vote_event.identifier = "First Roll Call"
    _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == "update"
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    vote_event.pupa_id = "bar"
    _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == "insert"
    assert VoteEvent.objects.count() == 2
def test_vote_event_bill_actions_two_stage():
    """Two votes referencing the same bill action, imported in two passes.

    The OneToOneField from vote to action must stay pinned to the first
    vote; the second vote imports with bill_action=None instead of
    stealing or conflicting with the link.
    """
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower")
    bill.add_action(description="passage", date="1900-04-02", chamber="lower")
    ve1 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        chamber="lower",
    )
    ve2 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        chamber="lower",
    )
    # disambiguate them
    ve1.pupa_id = "one"
    ve2.pupa_id = "two"

    bi = BillImporter("jid")
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter("jid", bi).import_data([ve1.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter("jid", bi).import_data([ve1.as_dict(), ve2.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
def add_vote(self, bill, chamber, date, text, url):
    """Build a VoteEvent from an action line's text plus a roll-call URL.

    Parses "Ayes N; Noes M" totals out of *text*, classifies the motion
    against ``motion_classifiers``, and (when *url* is given) fetches
    the individual house/senate roll call.

    Returns the constructed VoteEvent.
    """
    votes = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)", text)
    yes, no = int(votes[0][0]), int(votes[0][1])

    # classify the motion; fall back to "other" when nothing matches
    vtype = "other"
    # FIX: loop variable renamed from `type`, which shadowed the builtin
    for regex, motion_type in motion_classifiers.items():
        if re.match(regex, text):
            vtype = motion_type
            break

    v = VoteEvent(
        chamber=chamber,
        start_date=TIMEZONE.localize(date),
        motion_text=text,
        result="pass" if yes > no else "fail",
        classification=vtype,
        bill=bill,
    )
    v.set_count("yes", yes)
    v.set_count("no", no)

    # fetch the vote itself
    if url:
        # FIX: pupa_id (last URL path component, the roll-call sequence
        # number) was previously assigned before this guard, so a falsy
        # url raised AttributeError on .split()
        v.pupa_id = url.split("/")[-1]
        v.add_source(url)
        if "av" in url:
            self.add_house_votes(v, url)
        elif "sv" in url:
            self.add_senate_votes(v, url)

    return v
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Construct a VoteEvent from a parsed roll-call record.

    *vote_record* maps vote types ("yes", "no", "excused", "absent",
    "other") to lists of voter names and carries a "date" datetime.
    """
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace("XHB", "HB")

    tallies = {
        kind: len(vote_record[kind])
        for kind in ("yes", "no", "excused", "absent", "other")
    }

    vote_event = VoteEvent(
        result="pass" if tallies["yes"] > tallies["no"] else "fail",
        chamber=chamber,
        start_date=vote_record["date"].strftime("%Y-%m-%d"),
        motion_text=motion_text,
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber="upper" if bill_id[0] == "S" else "lower",
    )
    # the source URL doubles as the dedupe key
    vote_event.pupa_id = url

    for kind, count in tallies.items():
        vote_event.set_count(kind, count)
    for kind in ("yes", "no", "excused", "absent", "other"):
        for voter in vote_record[kind]:
            vote_event.vote(kind, voter)

    vote_event.add_source(url)
    return vote_event
def parse_vote_page(self, vote_url, bill):
    """Fetch one roll-call page and return a populated VoteEvent.

    Chamber is inferred from the URL, counts from the totals row, and
    individual voters from "Voting Yea"/"Voting Nay"/etc. section
    headers whose following cells list the names.
    """
    vote_html = self.get(vote_url).text
    doc = lxml.html.fromstring(vote_html)

    # chamber
    if "senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    # date in the following format: Mar 23, 2009
    date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
    date = date.replace(u"\xa0", " ")  # non-breaking spaces from the HTML
    # slice off the "Legislative Date: " prefix (18 chars) before parsing
    date = datetime.datetime.strptime(date[18:], "%b %d, %Y")

    # motion
    motion = "".join(x.text_content() for x in doc.xpath('//td[@colspan="23"]'))
    if motion == "":
        motion = "No motion given"  # XXX: Double check this. See SJ 3.
    motion = motion.replace(u"\xa0", " ")

    # totals: Yeas, Nays, then three "other" rows folded together
    tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
    totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
    yes_count = int(totals[0].split()[-1])
    no_count = int(totals[1].split()[-1])
    other_count = int(totals[2].split()[-1])
    other_count += int(totals[3].split()[-1])
    other_count += int(totals[4].split()[-1])
    passed = yes_count > no_count

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )
    vote.pupa_id = vote_url  # contains sequence number
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
    # NOTE: order-dependent — `func` is the recorder for the section most
    # recently seen; every non-header cell after it is treated as a name
    func = None
    for td in doc.xpath("//td/text()"):
        td = td.replace(u"\xa0", " ")
        if td.startswith("Voting Yea"):
            func = vote.yes
        elif td.startswith("Voting Nay"):
            func = vote.no
        elif td.startswith("Not Voting"):
            func = vote.other
        elif td.startswith("Excused"):
            func = vote.other
        elif func:
            td = td.rstrip("*")
            func(td)
    return vote
def scrape_senate_vote(self, bill, url, date):
    """Download a senate vote PDF and yield VoteEvent(s) parsed from it.

    Delegates to scrape_senate_vote_3col for the three-column layout;
    otherwise splits the text on Yea:/Nay:/Absent: section markers.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now; corrected from the tallies below
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url

    text = convert_pdf(filename, "text").decode("utf-8")
    os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # alternating [marker, names, marker, names, ...], reversed so pop()
    # yields them in document order
    data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
    # FIX: filter() returns an iterator on Python 3, which has no .pop()
    # and is always truthy — materialize it so the loop below works
    data = list(filter(None, data))
    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), "other")
        values = data.pop()
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # each PDF section maps to exactly one count bucket
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        "pass"
        if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
        else "fail"
    )
    yield vote
def scrape_votes(self, bill, bill_page, chamber):
    """Yield a VoteEvent for each roll-call link in the bill's history table."""
    vote_links = bill_page.xpath(
        '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
    )
    for vote_link in vote_links:
        vote_url = vote_link.attrib["href"]
        # first two cells of the row containing the link: date and motion
        date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
        date = datetime.strptime(date_td.text, "%b %d, %Y")
        motion_text = motion_td.text_content()
        vote_page = self.lxmlize(vote_url)
        # result is inferred from the motion wording, not from counts
        passed = "Passed" in motion_text or "Advanced" in motion_text
        cells = vote_page.xpath(
            '//div[contains(@class,"table-responsive")]/table//td')
        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=motion_text,
            classification="passage",
            result="pass" if passed else "fail",
        )
        yes_count = self.process_count(vote_page, "Yes:")
        no_count = self.process_count(vote_page, "No:")
        exc_count = self.process_count(vote_page, "Excused - Not Voting:")
        absent_count = self.process_count(vote_page, "Absent - Not Voting:")
        present_count = self.process_count(vote_page, "Present - Not Voting:")
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", exc_count)
        vote.set_count("absent", absent_count)
        vote.set_count("abstain", present_count)
        # the KeyID query parameter uniquely identifies the roll call
        query_params = urllib.parse.parse_qs(
            urllib.parse.urlparse(vote_url).query)
        vote.pupa_id = query_params["KeyID"][0]
        vote.add_source(vote_url)
        # voter table alternates (name cell, vote-type cell)
        for chunk in range(0, len(cells), 2):
            name = cells[chunk].text
            vote_type = cells[chunk + 1].text
            if name and vote_type:
                vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), "other"), name)
        yield vote
def parse_committee_votes(self, bill, url):
    """Yield a VoteEvent for each committee roll call linked from *url*."""
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    chamber = "upper" if "Senate" in doc.xpath("string(//h1)") else "lower"
    committee = tuple(doc.xpath("//h2")[0].itertext())[-2].strip()
    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):
        # Date — try both separator styles
        # NOTE(review): if BOTH formats fail, `date` stays a str and
        # tz.localize(date) below will raise — presumably the site only
        # ever uses these two formats; confirm
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            date = link.xpath("../../td")[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            break
        # Motion
        motion = link.text_content().split(" - ")[-1].strip()
        motion = "Committee vote (%s): %s" % (committee, motion)
        # Roll call
        vote_url = link.attrib["href"]
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)
        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=[],
            result="pass" if rollcall["passed"] else "fail",
            bill=bill,
        )
        vote.pupa_id = vote_url  # summary URL is unique per roll call
        vote.set_count("yes", rollcall["yes_count"])
        vote.set_count("no", rollcall["no_count"])
        vote.set_count("other", rollcall["other_count"])
        for voteval in ("yes", "no", "other"):
            for name in rollcall.get(voteval + "_votes", []):
                vote.vote(voteval, name)
        vote.add_source(url)
        vote.add_source(vote_url)
        yield vote
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape a single lower-chamber roll-call page and yield a VoteEvent.

    Silently skips pages that 404/error. Yea names come from the first
    table on the page, nay names from the second.
    """
    try:
        resp = self.get(vote_url)
        html = resp.text
    except scrapelib.HTTPError:
        # missing/unavailable roll call page — nothing to yield
        return
    doc = lxml.html.fromstring(html)

    motion = doc.xpath("//p[1]//b[1]/text()")[-1].strip()
    if not motion:
        # FIX: removed a leftover debug print() of the empty motion;
        # some pages carry the motion in the heading instead
        motion = doc.xpath("//h2[1]/text()")[0].strip()

    vote_count = (
        doc.xpath("//h3[contains(text(),'YEA and ')]/text()")[0].strip().split()
    )
    yeas = int(vote_count[0])
    nays = int(vote_count[3])

    date = doc.xpath("//b[contains(text(),'Date:')]/../text()")[1].strip()
    date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

    vote = VoteEvent(
        chamber="lower",
        start_date=date,
        motion_text=motion,
        result="pass" if yeas > nays else "fail",
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber=chamber,
    )
    vote.set_count("yes", yeas)
    vote.set_count("no", nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url

    # first table has YEAs
    for name in doc.xpath("//table[1]//font/text()"):
        vote.yes(name.strip())
    # second table is nays
    for name in doc.xpath("//table[2]//font/text()"):
        vote.no(name.strip())
    yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield VoteEvents for every vote date in the given chamber's archive."""
    url = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes"),
    }[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)
    for date in dates:
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote["count"]
                # NOTE: rebinds the `chamber` parameter per-vote from the
                # vote's own metadata — the original argument is only used
                # to pick the archive URL above
                chamber = {"H": "lower", "S": "upper"}[vote["meta"]["chamber"]]
                try:
                    bill_id = self._bill_id_by_type[(chamber, vote["meta"]["bill"])]
                except KeyError:
                    self.warning(
                        "no such bill_id %s %s", chamber, vote["meta"]["bill"]
                    )
                    continue
                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote["time"].strftime("%Y-%m-%d"),
                    motion_text=vote["meta"]["extra"]["motion"],
                    result="pass" if count["passage"] else "fail",
                    classification="passage",
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count("yes", int(count["YEAS"]))
                v.set_count("no", int(count["NAYS"]))
                v.set_count("other", int(count["NOT VOTING"]))
                v.add_source(vote["source"])
                v.pupa_id = vote["source"]  # source URL is unique per roll call
                for vt in vote["votes"]:
                    key = {"Y": "yes", "N": "no"}.get(vt["vote"], "other")
                    v.vote(key, vt["name"])
                yield v
def scrape_votes(self, bill, page):
    """Yield VoteEvents from the AZ BillStatusFloorAction API for *bill*.

    *page* is the bill-status JSON; one API call is made per floor
    header, and each returned action (except "No Action") becomes a vote.
    """
    base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
    for header in page["FloorHeaders"]:
        params = {
            "billStatusId": page["BillId"],
            "billStatusActionId": header["BillStatusActionId"],
            "includeVotes": "true",
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode("utf-8"))
        for action in actions:
            if action["Action"] == "No Action":
                continue
            action_date = datetime.datetime.strptime(
                action["ReportDate"], "%Y-%m-%dT%H:%M:%S")
            vote = VoteEvent(
                chamber={
                    "S": "upper",
                    "H": "lower"
                }[header["LegislativeBody"]],
                motion_text=action["Action"],
                classification="passage",
                # unanimous adoptions may not carry counts, so check that flag first
                result=("pass" if action["UnanimouslyAdopted"]
                        or action["Ayes"] > action["Nays"] else "fail"),
                start_date=action_date.strftime("%Y-%m-%d"),
                bill=bill,
            )
            vote.add_source(resp.url)
            # counts may be JSON null — coerce to 0
            vote.set_count("yes", action["Ayes"] or 0)
            vote.set_count("no", action["Nays"] or 0)
            vote.set_count("other", (action["Present"] or 0))
            vote.set_count("absent", (action["Absent"] or 0))
            vote.set_count("excused", (action["Excused"] or 0))
            vote.set_count("not voting", (action["NotVoting"] or 0))
            for v in action["Votes"]:
                vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                vote.vote(vote_type, v["Legislator"]["FullName"])
            # API URL + referral number disambiguates multiple votes per header
            vote.pupa_id = resp.url + str(action["ReferralNumber"])
            yield vote
def asvote(self):
    """Materialize this parsed roll call as a VoteEvent."""
    outcome = "pass" if self.passed() else "fail"
    vote = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result=outcome,
        classification="passage",
        bill=self.bill,
    )
    # the URL contains the sequence number, making it a stable dedupe key
    vote.pupa_id = self.url

    vote.set_count("yes", self.yes_count())
    vote.set_count("no", self.no_count())
    vote.set_count("other", self.other_count())

    for name in self.yes_votes():
        vote.yes(name)
    for name in self.no_votes():
        vote.no(name)
    for name in self.other_votes():
        vote.vote("other", name)

    vote.add_source(self.url)
    return vote
def parse_vote(self, bill, link):
    """Fetch and parse one member-vote page, yielding a VoteEvent for *bill*.

    Skips rate-limit and missing-page responses. Member names are listed
    in yea/nay/other order, so they are bucketed by position against the
    parsed counts.
    """
    # Server sometimes sends proper error headers,
    # sometimes not
    try:
        self.info("Get {}".format(link))
        text = requests.get(link).text
    except requests.exceptions.HTTPError as err:
        self.warning("{} fetching vote {}, skipping".format(err, link))
        return

    if "Varnish cache server" in text:
        self.warning("Scrape rate is too high, try re-scraping with "
                     "The --rpm set to a lower number")
        return

    if "Page Not Found" in text or "Page Unavailable" in text:
        self.warning("missing vote, skipping")
        return

    member_doc = lxml.html.fromstring(text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    chamber_date_line = "".join(
        member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
    chamber_date_line_words = chamber_date_line.split()
    vote_chamber = chamber_date_line_words[0]
    vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                           "%m/%d/%Y")
    vote_status = " ".join(chamber_date_line_words[2:-2])
    opinions = member_doc.xpath(
        "//div[@id='main_content']/h3[position() > 1]/text()")

    if len(opinions) > 0:
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = "upper" if vote_chamber == "Senate" else "lower"

        # FIX: initialize all tallies — previously a missing category
        # (e.g. no "Absent" chunk on the page) left its variable unbound
        # and raised NameError when building the VoteEvent below
        yes_count = no_count = p_count = a_count = 0

        for i in opinions:
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # This is likely not a vote-count text chunk
                # It's probably '`On roll call the vote was:`
                pass
            else:
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count

        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime("%Y-%m-%d"),
            chamber=vote_chamber,
            motion_text=vote_status,
            result="pass" if yes_count > no_count else "fail",
            classification="passage",
        )
        vote.pupa_id = link  # page URL is unique per roll call
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("abstain", p_count)
        vote.set_count("absent", a_count)
        vote.add_source(link)

        # member links appear in yea, nay, other order; index 0 is not a
        # member link, so bucket by running position starting at 1
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote("yes", re.sub(",", "", a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote("no", re.sub(",", "", a_links[i]).split()[0])
            else:
                vote.vote("other", re.sub(",", "", a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_vote(self, bill, motion, url):
    """Scrape one roll-call page (yeas/nays/absent/excused) into a VoteEvent."""
    page = self.get(url, retry_on_404=True).text
    page = lxml.html.fromstring(page)

    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))
    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))
    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))
    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))
    # absent and excused are folded into a single "other" bucket
    other_count = abs_count + ex_count

    # NOTE(review): if the URL contains neither "chamber=House" nor
    # "chamber=Senate", `chamber` is unbound and the VoteEvent call below
    # raises NameError — presumably all vote URLs carry the parameter; verify
    if "chamber=House" in url:
        chamber = "lower"
    elif "chamber=Senate" in url:
        chamber = "upper"

    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    # the site uses both full and abbreviated month names
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        result="pass" if outcome == "PREVAILS" else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)
    vote.pupa_id = url  # URL uniquely identifies this roll call

    # member table: col 2 = name, col 4 = vote letter
    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]
        vtype = row.xpath("string(td[4])")
        if vtype == "Y":
            vote.vote("yes", name)
        elif vtype == "N":
            vote.vote("no", name)
        elif vtype == "X" or vtype == "E":
            vote.vote("other", name)
    yield vote
def scrape_vote(self, url, session):
    """Parse a vote PDF in two stages (preamble, then voter lists).

    Returns a VoteEvent, or None when the PDF is empty or lacks a
    bill id / motion.
    """
    fname, _ = self.urlretrieve(url)
    text = convert_pdf(fname, type="text").decode()
    lines = text.splitlines()
    chamber = "upper" if "senate" in url else "lower"

    if "Maryland" not in text:
        self.warning(f"empty vote from {url}")
        return

    date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]
    section = "preamble"
    motion = None
    bill_id = None
    how = None
    voters = defaultdict(list)

    for line in lines:
        if section == "preamble":
            possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
            if possible_bill_id:
                bill_id = possible_bill_id[0]
            # preamble has metadata, then motion, then counts. our process then is to
            # store the last line as the motion, but if the last line looks like a
            # continuation, append it to the prior line
            line = line.strip()
            counts = re.findall(
                r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                line,
            )
            if counts:
                yes_count, no_count, nv_count, excused_count, absent_count = counts[
                    0
                ]
                yes_count = int(yes_count)
                no_count = int(no_count)
                nv_count = int(nv_count)
                excused_count = int(excused_count)
                absent_count = int(absent_count)
                section = "votes"
            elif line and line != "(Const)":
                # questions seem to be split across two lines
                # FIX: guard against motion still being None, which made
                # `motion + " " + line` raise TypeError on a leading "?" line
                if motion and line.endswith("?"):
                    motion = motion + " " + line
                else:
                    motion = line
        elif section == "votes":
            if line.startswith("Voting Yea"):
                how = "yes"
            elif line.startswith("Voting Nay"):
                how = "no"
            elif line.startswith("Not Voting"):
                how = "not voting"
            elif line.startswith("Excused from Voting"):
                how = "excused"
            elif line.startswith("Excused (Absent)"):
                how = "absent"
            elif how:
                names = re.split(r"\s{2,}", line)
                voters[how].extend(names)

    if not bill_id and not motion:
        return
    elif bill_id and not motion:
        self.warning(f"got {bill_id} but no motion, not registering as a vote")
        # FIX: this branch previously fell through and registered a vote
        # with motion_text=None despite the warning saying otherwise
        return
    elif motion and not bill_id:
        self.warning(f"got {motion} but no bill_id, not registering as a vote")
        return

    # bleh - result not indicated anywhere
    result = "pass" if yes_count > no_count else "fail"
    bill_chamber = "upper" if bill_id.startswith("S") else "lower"
    date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        result=result,
        classification="passage",
        motion_text=motion,
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    # URL includes sequence ID, will be unique
    vote.pupa_id = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for how, names in voters.items():
        for name in names:
            name = name.strip().replace("*", "")
            if name and "COPY" not in name and "Indicates Vote Change" not in name:
                vote.vote(how, name)

    check_counts(vote, raise_error=True)
    return vote
def scrape_votes(self, session):
    """Yield VoteEvents built from NH's two pipe-delimited roll-call files.

    RollCallSummary.txt provides one line per vote (counts/metadata);
    RollCallHistory.txt provides one line per member ballot, which is
    matched back to the summary votes by body + vote number.
    """
    votes = {}
    other_counts = defaultdict(int)
    last_line = []
    vote_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt"
    lines = self.get(vote_url).content.decode("utf-8").splitlines()

    for line in lines:
        if len(line) < 2:
            continue
        if line.strip() == "":
            continue

        line = line.split("|")
        if len(line) < 14:
            # a record was split across physical lines: try to stitch the
            # stored fragment to this one
            if len(last_line + line[1:]) == 14:
                # FIX: was `line = last_line`, which silently dropped this
                # line's fields instead of completing the record
                line = last_line + line[1:]
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
                # FIX: skip the still-short record — falling through
                # previously raised IndexError on line[5]/line[11]
                continue

        session_yr = line[0].replace("\xef\xbb\xbf", "")  # strip UTF-8 BOM
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            time = pytz.timezone("America/New_York").localize(time).isoformat()
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(vote_url)
            vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
            votes[body + vote_num] = vote

    for line in (self.get(
            "http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt"
    ).content.decode("utf-8").splitlines()):
        if len(line) < 2:
            continue
        # 2016|H|2|330795||Yea|
        # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = line.split("|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = " ".join(self.legislators[employee]["name"].split())
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue

            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[body + v_num].yes(leg)
            elif vote == "Nay":
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote("other", leg)
                # hack-ish, but will keep the vote count sync'd
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count("other", other_counts[body + v_num])

    for vote in votes.values():
        yield vote
def scrape_action_page(self, bill, page):
    """Walk the bill's action table, recording actions and (currently
    disabled) House/Senate roll-call votes parsed from the action text.
    """
    action_rows = page.xpath("//tbody/tr")
    for row in action_rows:
        action_date = row.xpath("td[1]/text()")[0]
        action_date = datetime.strptime(action_date, "%m/%d/%Y")
        action_year = action_date.year
        action_date = action_date.strftime("%Y-%m-%d")

        # actor cell may be empty; when it is, the previous row's value is
        # reused (action_actor keeps its last assignment)
        if row.xpath("td[2]/text()"):
            action_actor = row.xpath("td[2]/text()")[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]

        action_name = row.xpath("string(td[3])")

        # House votes
        if "Supplement" in action_name:
            actor = "lower"
            vote_action = re.findall(r"(.+)-\s*\d+\s*YEAS", action_name)[0].strip()
            y = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
            n = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])

            # get supplement number
            n_supplement = int(re.findall(r"No\.\s*(\d+)", action_name)[0])
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result="pass" if y > n else "fail",
                classification="passage",
                bill=bill,
            )
            cached_vote.set_count("yes", y)
            cached_vote.set_count("no", n)

            housevote_pdf = "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                bill.legislative_session, action_year
            )
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)
            # journal URL + supplement number disambiguates the roll call
            cached_vote.pupa_id = "{}#{}".format(housevote_pdf, n_supplement)

            # XXX: disabled house votes on 8/1 to try to get MA importing again
            # will leaving this in and commented out once we resolve the ID issue
            # yield cached_vote

        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(" -")[0]
            # 2019 H86 Breaks our regex,
            # Ordered to a third reading --
            # see Senate Roll Call #25 and House Roll Call 56
            if "yeas" in action_name and "nays" in action_name:
                try:
                    y, n = re.search(
                        r"(\d+) yeas .*? (\d+) nays", action_name.lower()
                    ).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    # counts appear after the keywords instead of before
                    y = int(
                        re.search(r"yeas\s+(\d+)", action_name.lower()).group(1)
                    )
                    n = int(
                        re.search(r"nays\s+(\d+)", action_name.lower()).group(1)
                    )

                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=bill,
                )
                cached_vote.set_count("yes", y)
                cached_vote.set_count("no", n)
                rollcall_pdf = "http://malegislature.gov" + row.xpath(
                    "string(td[3]/a/@href)"
                )
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                cached_vote.pupa_id = rollcall_pdf
                # XXX: also disabled, see above note
                # yield cached_vote

        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs["classification"],
        )
        for com in attrs.get("committees", []):
            com = com.strip()
            action.add_related_entity(com, entity_type="organization")
def scrape_house_vote(self, bill, url):
    """Parse a house vote PDF into a VoteEvent.

    The PDF carries a date line, a YEAS/NAYS totals line (two lines
    below the motion), and per-category name sections; PAIRED entries
    embed each member's side as "Name(YEA)" / "Name(NAY)".
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    text = convert_pdf(filename, "text")
    os.remove(filename)

    lines = text.splitlines()
    vote_type = None
    votes = collections.defaultdict(list)
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode("utf-8")

        match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        match = re.match(
            r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
        if match:
            # motion text sits two lines above the totals line
            motion = (lines[idx - 2].strip()).decode("utf-8")
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [
                int(g) for g in match.groups()
            ]
            exc_match = re.search(r"EXCUSED: (\d+)", line)
            if exc_match:
                other_count += int(exc_match.group(1))
            if line.endswith("ADOPTED") or line.endswith("PASSED"):
                passed = True
            else:
                passed = False
            continue

        match = re.match(
            r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
        if match:
            vote_type = {
                "YEAS": "yes",
                "NAYS": "no",
                "NOT VOTING": "other",
                "EXCUSED": "other",
                "PAIRED": "paired",
            }[match.group(1)]
            continue

        if vote_type == "paired":
            for part in line.split(" "):
                part = part.strip()
                if not part:
                    continue
                # FIX: match against `part`, not the whole `line` — the old
                # code recorded the first name on the line for every part
                name, pair_type = re.match(
                    r"([^\(]+)\((YEA|NAY)\)", part).groups()
                name = name.strip()
                if pair_type == "YEA":
                    votes["yes"].append(name)
                elif pair_type == "NAY":
                    votes["no"].append(name)
        elif vote_type:
            for name in line.split(" "):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = VoteEvent(
            chamber="lower",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url
        for key, values in votes.items():
            for value in values:
                if "Committee" in value:
                    continue
                if "*" in value:
                    value = value.replace("*", "")
                vote.vote(key, value)
        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def scrape_vote_history(self, bill, vurl):
    """
    Obtain the information on a vote and link it to the related Bill
    :param bill: related bill
    :param vurl: source for the voteEvent information.
    :return: voteEvent object
    """
    html = self.get(vurl).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(vurl)

    # skip first two rows
    for row in doc.xpath("//table/tr")[2:]:
        tds = row.getchildren()
        if len(tds) != 11:
            self.warning("irregular vote row: %s" % vurl)
            continue
        (
            timestamp,
            motion,
            vote,
            yeas,
            nays,
            nv,
            exc,
            pres,
            abst,
            total,
            result,
        ) = tds

        timestamp = timestamp.text.replace(u"\xa0", " ")
        # NOTE(review): "%H:%M %p" mixes 24-hour %H with AM/PM %p —
        # presumably the site emits 24-hour times plus a decorative
        # AM/PM marker; confirm against a live page
        timestamp = datetime.datetime.strptime(timestamp, "%m/%d/%Y %H:%M %p")

        yeas = int(yeas.text)
        nays = int(nays.text)
        # everything that isn't a yea/nay is folded into "other"
        others = int(nv.text) + int(exc.text) + int(abst.text) + int(
            pres.text)
        assert yeas + nays + others == int(total.text)

        if result.text == "Passed":
            passed = "pass"
        else:
            passed = "fail"

        vote_link = vote.xpath("a")[0]
        if "[H]" in vote_link.text:
            chamber = "lower"
        else:
            chamber = "upper"

        vote = VoteEvent(
            chamber=chamber,  # 'upper' or 'lower'
            start_date=timestamp.strftime(
                "%Y-%m-%d"),  # 'YYYY-MM-DD' format
            motion_text=motion.text,
            result=passed,
            classification="passage",  # Can also be 'other'
            # Provide a Bill instance to link with the VoteEvent...
            bill=bill,
        )
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("other", others)
        vote.add_source(vurl)

        # obtain vote rollcall from pdf and add it to the VoteEvent object
        rollcall_pdf = vote_link.get("href")
        self.scrape_rollcall(vote, rollcall_pdf)
        vote.add_source(rollcall_pdf)

        # rollcall URL is the dedupe key; skip votes already seen this run
        if rollcall_pdf in self._seen_vote_ids:
            self.warning("duplicate usage of %s, skipping", rollcall_pdf)
            continue
        else:
            self._seen_vote_ids.add(rollcall_pdf)
        vote.pupa_id = rollcall_pdf  # distinct KEY for each one

        yield vote
def process_vote(self, votes, url, base_url, bill, legislators,
                 chamber_dict, vote_results):
    """Yield VoteEvents for each vote item in the *votes* JSON payload.

    Handles several quirks of the upstream JSON API: vote details may be
    nested one request deeper, dates may live under "date" or "occurred",
    and the motion may be under "action" or "motiontype".

    :param votes: JSON dict whose "items" list holds raw vote records
    :param url: source URL recorded on each VoteEvent
    :param base_url: prefix used to fetch nested vote detail links
    :param bill: Bill the votes attach to
    :param legislators: mapping of voter id -> legislator name
    :param chamber_dict: mapping of API chamber codes -> 'upper'/'lower'
    :param vote_results: mapping of result strings (lowercased) -> bool
    """
    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url + v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue

        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            # fall back to inferring the chamber from the "apn" field
            chamber = "lower" if "house" in v["apn"] else "upper"

        # date can appear under "date" or "occurred"; skip if neither
        try:
            date = self._tz.localize(
                datetime.datetime.strptime(v["date"], "%m/%d/%y"))
            date = "{:%Y-%m-%d}".format(date)
        except KeyError:
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                self.logger.warning("No date found for vote, skipping")
                continue

        try:
            motion = v["action"]
        except KeyError:
            motion = v["motiontype"]

        # translate the raw motion code to human-readable text if known
        if motion in self._vote_motion_dict:
            motion_text = self._vote_motion_dict[motion]
        else:
            self.warning(
                "Unknown vote code {}, please add to _vote_motion_dict".
                format(motion))
            motion_text = v["results"]

        # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
        # (note: "waringText" typo is in the local variable name only)
        if not motion and isinstance(v["yeas"], str) and isinstance(
                v["nays"], str):
            waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(waringText.format(v["revno"]))
            continue

        result = v.get("results") or v.get("passed")
        if result is None:
            # no explicit result: infer from the relative tally sizes
            if len(v["yeas"]) > len(v["nays"]):
                result = "passed"
            else:
                result = "failed"
        passed = vote_results[result.lower()]

        if "committee" in v:
            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion_text,
                result="pass" if passed else "fail",
                # organization=v["committee"],
                bill=bill,
                classification="committee-passage",
            )
        else:
            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion_text,
                result="pass" if passed else "fail",
                classification="passage",
                bill=bill,
            )

        # Concatenate the bill identifier and vote identifier to avoid collisions
        vote.pupa_id = "{}:{}".format(bill.identifier.replace(" ", ""),
                                      v["revno"])

        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        yes_count = 0
        no_count = 0
        absent_count = 0
        excused_count = 0
        for voter_id in v["yeas"]:
            vote.yes(legislators[voter_id])
            yes_count += 1
        for voter_id in v["nays"]:
            vote.no(legislators[voter_id])
            no_count += 1
        if "absent" in v:
            for voter_id in v["absent"]:
                vote.vote("absent", legislators[voter_id])
                absent_count += 1
        if "excused" in v:
            for voter_id in v["excused"]:
                vote.vote("excused", legislators[voter_id])
                excused_count += 1

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("absent", absent_count)
        vote.set_count("excused", excused_count)

        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            if (type(val) == list and len(val) > 0
                    and key not in ["yeas", "nays", "absent", "excused"]):
                if val[0] in legislators:
                    self.logger.warning(
                        "{k} looks like a vote type that's not being counted."
                        " Double check it?".format(k=key))

        vote.add_source(url)

        yield vote
def parse_roll_call(self, bill, link, chamber, date):
    """Parse a PA roll-call page and return a populated VoteEvent.

    :param bill: Bill the vote belongs to
    :param link: lxml anchor element pointing at the roll-call page
    :param chamber: 'upper' or 'lower'
    :param date: naive datetime of the vote (localized below)
    :return: VoteEvent with counts and individual votes recorded
    """
    url = link.attrib["href"]
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
    motion = page.xpath(xpath).strip()
    motion = re.sub(r"\s+", " ", motion)

    if motion == "FP":
        motion = "FINAL PASSAGE"

    if motion == "FINAL PASSAGE":
        type = "passage"
    elif re.match(r"CONCUR(RENCE)? IN \w+ AMENDMENTS", motion):
        type = "amendment"
    else:
        # unknown motion: no classification, and fall back to the
        # link's own text as the motion
        type = []
        motion = link.text_content()

    # Looks like for "YEAS" and "NAYS" counts, PA has multiple HTML
    # formats: one where the "YEAS" text node is nested within a span
    # element, and another where the text node is a direct child of the div
    # element
    yeas_elements = page.xpath("//div/span[text() = 'YEAS']/..")
    if len(yeas_elements) == 0:
        yeas_elements = page.xpath(
            "//div[text()[normalize-space() = 'YEAS']]")
    yeas = int(yeas_elements[0].getnext().text)

    nays_elements = page.xpath("//div/span[text() = 'NAYS']/..")
    if len(nays_elements) == 0:
        nays_elements = page.xpath(
            "//div[text()[normalize-space() = 'NAYS']]")
    nays = int(nays_elements[0].getnext().text)

    # "LVE" and "N/V" have been moved up as direct children of the div
    # element
    other = 0
    lve_elements = page.xpath('//div[text()[normalize-space() = "LVE"]]')
    if lve_elements:
        other += int(lve_elements[0].getnext().text)
    nv_elements = page.xpath('//div[text()[normalize-space() = "N/V"]]')
    if nv_elements:
        other += int(nv_elements[0].getnext().text)

    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=type,
        # a vote passes only with a majority of all cast + other
        result="pass" if yeas > (nays + other) else "fail",
        bill=bill,
    )
    # pupa_id situation here is a bit weird, same vote can be used for
    # multiple bills see:
    # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11  # noqa
    # so we toss the bill id onto the end of the URL
    vote.pupa_id = url + "#" + bill.identifier
    vote.add_source(url)
    vote.set_count("yes", yeas)
    vote.set_count("no", nays)
    vote.set_count("other", other)

    # individual roll-call entries carry their value in the CSS class
    for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = div[0].tail.strip()
        name = re.sub(r"^[\s,]+", "", name)
        name = re.sub(r"[\s,]+$", "", name)
        class_attr = div.attrib["class"].lower()
        if "yea" in class_attr:
            voteval = "yes"
        elif "nay" in class_attr:
            voteval = "no"
        elif "nvote" in class_attr:
            voteval = "other"
        elif "lve" in class_attr:
            voteval = "other"
        else:
            msg = "Unrecognized vote val: %s" % class_attr
            raise Exception(msg)
        vote.vote(voteval, name)

    return vote
def scrape_votes(self, url, motion, date, chamber, bill):
    """Parse an MS vote PDF and yield a single VoteEvent.

    The PDF text is scanned with a small state machine: "precursor"
    phrases (e.g. "yeas--") select which list subsequent names are
    appended to, until a terminator ("None." or a "Total--" line) or the
    next precursor.

    :param url: URL of the vote PDF
    :param motion: raw motion key, looked up in self._vote_mapping
    :param date: naive datetime of the vote
    :param chamber: 'upper' or 'lower'
    :param bill: Bill the vote belongs to
    """
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, "text")
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []
    absent_votes = []
    not_voting_votes = []

    # point at array to add names to
    cur_array = None

    # phrase -> target list; "disclaimer" switches collection off
    precursors = (
        ("yeas--", yes_votes),
        ("nays--", no_votes),
        ("absent or those not voting--", absent_votes),
        ("absent and those not voting--", absent_votes),
        ("not voting--", not_voting_votes),
        ("voting present--", other_votes),
        ("present--", other_votes),
        ("disclaimer", None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.decode().split("\n"))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line.lower():
                cur_array = arr
                line = line.replace(pc, "")

        # split names
        for name in line.split(","):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if "None." in name:
                cur_array = None
            match = re.match(r"(.+?)\. Total--.*", name)
            if match:
                # the name fragment before ". Total--" still belongs
                # to the current section
                cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in (
                    "on final passage",
                    "Necessary",
                    "who would have",
                    "being a tie",
                    "therefore",
                    "Vacancies",
                    "a pair",
                    "Total-",
                    "ATTORNEY",
                    "on final passage",
                    "SPEAKER",
                    "BOARD",
                    "TREASURER",
                    "GOVERNOR",
                    "ARCHIVES",
                    "SECRETARY",
            ):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == ".":
                    name = name[:-1]
                name = self.clean_voter_name(name)
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    absent_count = len(absent_votes)
    not_voting_count = len(not_voting_votes)
    other_count = len(other_votes)

    vote = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if passed else "fail",
        classification="passage",
        bill=bill,
    )
    vote.pupa_id = url + "#" + bill.identifier
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("absent", absent_count)
    vote.set_count("not voting", not_voting_count)
    vote.set_count("other", other_count)
    vote.add_source(url)
    for yes_vote in yes_votes:
        vote.vote("yes", self.clean_voter_name(yes_vote))
    for no_vote in no_votes:
        vote.vote("no", self.clean_voter_name(no_vote))
    for absent_vote in absent_votes:
        vote.vote("absent", self.clean_voter_name(absent_vote))
    for not_voting_vote in not_voting_votes:
        vote.vote("not voting", self.clean_voter_name(not_voting_vote))
    for other_vote in other_votes:
        vote.vote("other", self.clean_voter_name(other_vote))
    yield vote
def scrape_bills(self, session, year_abr):
    """Scrape NJ bills, sponsors, documents, votes, actions, and subjects.

    Reads several exported Access tables (via self.access_to_csv) plus
    zipped vote CSVs from the legislature's FTP site, and yields VoteEvent
    objects followed by Bill objects.

    :param session: legislative session identifier
    :param year_abr: session start year, used to build file/URL names
    """
    # Main Bill information
    main_bill_csv = self.access_to_csv("MainBill")

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        # bill types starting with "A" are Assembly (lower chamber)
        if bill_type[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec["IdenticalBillNumber"].strip():
            bill.add_related_bill(
                rec["IdenticalBillNumber"].split()[0],
                legislative_session=session,
                relation_type="companion",
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv("BillSpon")

    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in sponsor database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == "P":
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(
            name,
            classification=sponsor_type,
            entity_type="person",
            primary=sponsor_type == "primary",
        )

    # Documents
    bill_document_csv = self.access_to_csv("BillWP")

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in document database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        # the table stores a Windows path; keep only the last two parts
        document = document.split("\\")
        document = document[-2] + "/" + document[-1]

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = "http://www.njleg.state.nj.us/{}/Bills/{}".format(
            year_abr,
            document.replace(".DOC", ".HTM"))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec["DocType"]]
        except KeyError:
            raise Exception("unknown doctype %s on %s" %
                            (rec["DocType"], bill_id))
        if rec["Comment"]:
            doc_name += " " + rec["Comment"]

        # Clean HTMX links.
        if htm_url.endswith("HTMX"):
            htm_url = re.sub("X$", "", htm_url)

        if rec["DocType"] in self._version_types:
            # NOTE(review): mimetype is only assigned for .htm/.wpd URLs —
            # any other extension would raise NameError below; presumably
            # those are the only two extensions in the data. Confirm.
            if htm_url.lower().endswith("htm"):
                mimetype = "text/html"
            elif htm_url.lower().endswith("wpd"):
                mimetype = "application/vnd.wordperfect"
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                self.warning(
                    "Couldn't find a document for bill {}".format(bill_id))
                pass
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        "A%s" % year_abr,
        "A%s" % next_year,
        "S%s" % year_abr,
        "S%s" % next_year,
        "CA%s-%s" % (year_abr, next_year),
        "CS%s-%s" % (year_abr, next_year),
    ]

    # keep votes clean globally, a few votes show up in multiple files
    votes = {}

    for filename in vote_info_list:
        s_vote_url = "ftp://www.njleg.state.nj.us/votes/%s.zip" % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning("could not find %s" % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, "r"),
                                             encoding="latin-1")
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            # file name prefix encodes chamber (A/CA = Assembly)
            if filename.startswith("A") or filename.startswith("CA"):
                chamber = "lower"
            else:
                chamber = "upper"

            # "C" prefix marks committee vote files, which use a
            # different CSV schema than floor (chamber) votes
            if filename.startswith("C"):
                vote_file_type = "committee"
            else:
                vote_file_type = "chamber"

            for rec in vdict_file:
                if vote_file_type == "chamber":
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                    leg = rec["Name"]
                    # drop time portion
                    date = rec["Agenda_Date"].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec["BillAction"]]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec["LegislatorVote"][0:1]
                    committee = rec["Committee_House"]
                    vote_parts = (bill_id, chamber, action, committee)

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = "_".join(vote_parts).replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification="passage",
                        result=None,
                        bill=bill_dict[bill_id],
                    )
                    votes[vote_id].pupa_id = vote_id

                if leg_vote == "Y":
                    votes[vote_id].vote("yes", leg)
                elif leg_vote == "N":
                    votes[vote_id].vote("no", leg)
                else:
                    votes[vote_id].vote("other", leg)

        # remove temp file
        os.remove(s_vote_zip)

    # Counts yes/no/other votes and saves overall vote
    for vote in votes.values():
        counts = collections.defaultdict(int)
        for count in vote.votes:
            counts[count["option"]] += 1
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])

        # Veto override.
        if vote.motion_text == "OVERRIDE":
            # Per the NJ leg's glossary, a veto override requires
            # 2/3ds of each chamber. 27 in the senate, 54 in the house.
            # http://www.njleg.state.nj.us/legislativepub/glossary.asp
            # NOTE(review): membership test on vote.bill — verify this
            # string containment actually detects the chamber.
            if "lower" in vote.bill:
                vote.result = "pass" if counts["yes"] >= 54 else "fail"
            elif "upper" in vote.bill:
                vote.result = "pass" if counts["yes"] >= 27 else "fail"
        else:
            # Regular vote.
            vote.result = "pass" if counts["yes"] > counts[
                "no"] else "fail"

        vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
        yield vote

    # Actions
    bill_action_csv = self.access_to_csv("BillHist")
    actor_map = {"A": "lower", "G": "executive", "S": "upper"}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in action database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += " " + comment
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.access_to_csv("BillSubj")
    for rec in subject_csv:
        bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in subject database" % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.subject.append(rec["SubjectKey"])
        else:
            self.warning("invalid bill id in BillSubj: %s" % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning("probable phony bill detected %s", bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
            yield bill

    if phony_bill_count:
        self.warning("%s total phony bills detected", phony_bill_count)
def _parse_votes(self, url, vote, bill):
    """Given a vote url and a vote object, extract the voters and the
    vote counts from the vote page and update the vote object.

    PDF links are delegated to PDFCommitteeVote; HTML pages are parsed
    here. The passage/failure decision deliberately consults indicator
    phrase lists rather than comparing counts, because some actions need
    a supermajority.

    :param url: vote page (HTML) or committee-vote PDF URL
    :param vote: dict of raw vote metadata (chamber, date, action, ...)
    :param bill: Bill the vote belongs to
    :return: a VoteEvent, or None if the page could not be parsed
    """
    if url.lower().endswith(".pdf"):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = "No document found at url %r" % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath("//table")[1].xpath("tr/td/text()")
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return
    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion.
    try:
        motion = doc.xpath("//br")[-1].tail.strip()
    except IndexError:
        # Some of them mysteriously have no motion listed.
        motion = vote["action"]
    if not motion:
        motion = vote["action"]
    vote["motion"] = motion

    action = vote["action"]
    vote_url = vote["vote_url"]

    # rebind the name `vote` from the raw dict to a VoteEvent
    vote = VoteEvent(
        chamber=vote["chamber"],
        start_date=vote["date"],
        motion_text=vote["motion"],
        result="fail",  # placeholder
        classification="passage",
        bill=bill,
        bill_action=vote["action"],
    )
    vote.pupa_id = vote_url  # URL contains sequence number
    vote.add_source(vote_url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for text in doc.xpath("//table")[2].xpath("tr/td/text()"):
        if not text.strip(u"\xa0"):
            continue
        # cells look like "Y\xa0Name"; unpack vote value and name
        v, name = filter(None, text.split(u"\xa0"))
        # Considering Name is brackets as short name
        regex = re.compile(r".*?\((.*?)\)")
        short_name = re.findall(regex, name)
        if len(short_name) > 0:
            note = "Short Name: " + short_name[0]
        else:
            note = ""
        # Name without brackets like 'Kary, Douglas'
        name = re.sub(r"[\(\[].*?[\)\]]", "", name)
        if v == "Y":
            vote.yes(name, note=note)
        elif v == "N":
            vote.no(name, note=note)
        elif v == "E":
            vote.vote("excused", name, note=note)
        elif v == "A":
            vote.vote("absent", name, note=note)

    # code to deterimine value of `passed`
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote.result = "pass" if passed else "fail"
    return vote
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape all CA bills of one measure type from the mirrored DB.

    Queries the CABill table for *session*/*type_abbr*, builds Bill
    objects with versions, sponsors, actions, and votes, and yields
    VoteEvents followed by each Bill. Pre-2010 sessions ("archive"
    years) fall back to scraping votes from the public website.

    :param chamber: 'upper' or 'lower'
    :param session: e.g. "20172018"
    :param bill_type: openstates classification (e.g. "bill")
    :param type_abbr: CA measure type abbreviation (e.g. "AB")
    :param committee_abbr_regex: regex matching committee abbreviations
        in action text (default built once at def time)
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))

    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        # known-bad record in the upstream data
        if bill_id.strip() == "SB77" and session == "20052006":
            continue

        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        if (bill_id.startswith("S")
                and chamber == "lower") or (bill_id.startswith("A")
                                            and chamber == "upper"):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest test (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"

            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == "Yes":
                type_.append("appropriation")

            # NOTE(review): tags/version leak out of this loop and are
            # used below, so they reflect the *last* version only.
            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # sponsors come from the last version iterated above
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        self.warning(msg % args)

                # NOTE(review): filter() gives a one-shot iterator; the
                # list() in the assert below consumes it, so the zip()
                # afterwards sees it empty. Verify intended behavior.
                committees = filter(None, committees)
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                if not_archive_year:
                    assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            # dedupe identical (actor, text, date) actions
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        )
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue
                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc) # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                # normalize the motion text: strip session qualifiers,
                # chamber prefixes, and bill-id prefixes/suffixes
                motion = motion.strip()
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ",
                                "", motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        # archive years: votes aren't in the DB, scrape the website
        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += (
                f"bill_id={session}{bill.session_num}{fsbill.identifier}")

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(
                        vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill
    self.session.expire_all()
def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):
    """Scrape NY Assembly floor votes for *bill* and yield VoteEvents.

    Each vote is one HTML table; date, motion, and the yes/no tally come
    from the table caption, individual votes from the table cells.

    :param session: legislative session (unused here beyond signature)
    :param bill: Bill the votes attach to
    :param assembly_url: bill page URL; floor-votes view appended below
    :param bill_id: bill identifier (unused here beyond signature)
    """
    # parse the bill data page, finding the latest html text
    url = assembly_url + "&Floor%26nbspVotes=Y"
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    if "Votes:" in doc.text_content():
        vote_motions = []
        additional_votes_on_motion = 2

        for table in doc.xpath("//table"):
            date = table.xpath('caption/span[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            date = eastern.localize(date)
            date = date.isoformat()

            # caption spans: presumably [1]=sequence, [2]+[3]=motion
            # text halves — confirm against a live page
            spanText = table.xpath("caption/span/text()")
            motion = spanText[2].strip() + spanText[3].strip()

            # repeated motions get a " - Vote N" suffix to stay unique
            if motion in vote_motions:
                motion = motion + f" - Vote {additional_votes_on_motion}"
                additional_votes_on_motion += 1
            else:
                vote_motions.append(motion)

            # tally is rendered as "...: YES/NO"
            votes = (table.xpath("caption/span/span")[0].text.split(":")
                     [1].split("/"))
            yes_count, no_count = map(int, votes)
            passed = yes_count > no_count
            vote = VoteEvent(
                chamber="lower",
                start_date=date,
                motion_text=motion,
                bill=bill,
                result="pass" if passed else "fail",
                classification="passage",
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            absent_count = 0
            excused_count = 0

            # cells alternate name, vote-code; pair them up
            tds = table.xpath("tr/td/text()")
            votes = [tds[i:i + 2] for i in range(0, len(tds), 2)]

            vote_dictionary = {
                "Y": "yes",
                "NO": "no",
                "ER": "excused",
                "AB": "absent",
                "NV": "not voting",
                "EL": "other",
            }

            for vote_pair in votes:
                name, vote_val = vote_pair
                vote.vote(vote_dictionary[vote_val], name)
                if vote_val == "AB":
                    absent_count += 1
                elif vote_val == "ER":
                    excused_count += 1

            vote.set_count("absent", absent_count)
            vote.set_count("excused", excused_count)
            vote.add_source(url)
            vote.pupa_id = url + motion + spanText[1]

            yield vote
def scrape_vote(self, session, bill, vote_url, chamber, date):
    """Scrape one CO vote summary page and yield a VoteEvent.

    Counts are pulled from the summary table; individual roll-call
    entries are read from the rows following the 'Vote' header.

    :param session: legislative session (unused beyond signature)
    :param bill: Bill the vote belongs to
    :param vote_url: URL of the vote summary page (also the pupa_id)
    :param chamber: 'upper' or 'lower'
    :param date: naive datetime of the vote, localized below
    """
    doc = self.lxmlize(vote_url)

    motion_nodes = doc.xpath("//font/text()")
    if len(motion_nodes) < 3:
        self.warning("Vote Summary Page Broken ")
        return
    motion = motion_nodes[2]

    # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
    looks_like_timestamp = ("AM" in motion or "PM" in motion) and "/" in motion
    if looks_like_timestamp:
        motion = "Motion not given."

    if "withdrawn" in motion:
        return

    yes_no_counts = doc.xpath(
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'Aye')]]/font/text()"
    )
    other_counts = doc.xpath(
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'Absent')]]/font/text()"
    )
    abstain_counts = doc.xpath(
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'17C')]]/font/text()"
    )

    n_yes = int(yes_no_counts[0])
    n_no = int(yes_no_counts[2])
    n_absent = int(other_counts[0])
    n_excused = int(other_counts[2])
    n_abstain = int(abstain_counts[0]) if abstain_counts else 0

    # fix for
    # http://leg.colorado.gov/content/hb19-1029vote65e72e
    if n_absent == -1:
        n_absent = 0

    event = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if n_yes > n_no else "fail",
        bill=bill,
        classification="passage",
    )
    event.pupa_id = vote_url
    event.set_count("yes", n_yes)
    event.set_count("no", n_no)
    event.set_count("excused", n_excused)
    event.set_count("absent", n_absent)
    event.set_count("abstain", n_abstain)
    event.add_source(vote_url)

    roll_rows = doc.xpath(
        "//tr[preceding-sibling::tr/descendant::"
        "td/div/b/font[contains(text(),'Vote')]]"
    )
    code_to_option = {
        "Y": "yes",
        "N": "no",
        "E": "excused",
        "A": "absent",
        "-": "absent",
        "17C": "abstain",
    }
    for row in roll_rows:
        marks = row.xpath(".//td/div/font/text()")
        if not marks:
            continue
        mark = marks[0].strip()
        member = row.xpath(".//td/font/text()")[0].strip()
        # "V" rows carry no usable vote value
        if mark == "V":
            continue
        event.vote(code_to_option[mark], member)

    yield event
def scrape_votes(self, bill, url):
    """Scrape Oklahoma roll-call votes for *bill* from the journal at *url*.

    Yields one Vote per distinct RCS# (roll-call sequence number); repeated
    RCS numbers on the same page are skipped.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))
    seen_rcs = set()  # RCS numbers already emitted
    # EXSLT regex namespace lets the XPath match the chamber headers.
    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber = "lower"
            motion_index = 8
        else:
            chamber = "upper"
            motion_index = 13
        motion = header.xpath("string(following-sibling::p[%d])" %
                              motion_index).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            # Unexpected page layout: abandon the whole page.
            self.warning("Motion text not found")
            return
        # A trailing PASSED/FAILED in the motion gives the result directly.
        match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == "PASSED"
        else:
            passed = None  # decided later from the tallies
        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)
        # Vote date sits in the paragraph immediately after the RCS line.
        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        vtype = None  # current tally section: "yes" / "no" / "other"
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        seen_yes = False
        # NOTE(review): the [13:] offset assumes the tallies start 13
        # paragraphs after the header — confirm against a sample journal.
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                break  # end-of-vote separator
            regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                     r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)")
            match = re.match(regex, line)
            if match:
                if match.group(1) == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif match.group(1) == "NAYS" and seen_yes:
                    vtype = "no"
                elif match.group(1) == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and match.group(3).strip():
                    # Names on the same line as the count: unparseable layout.
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # Name lines: split on single spaces; empty tokens from runs
                # of spaces are skipped by the guard below.
                # NOTE(review): the delimiter was garbled in extraction —
                # a single space fits the empty-token guard; confirm.
                for name in line.split(" "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())
        if bad_vote:
            continue
        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])
        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        # RCS number keeps pupa_ids unique when one page holds several votes.
        vote.pupa_id = url + "#" + rcs
        vote.add_source(url)
        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            # A colon in a "no" name indicates a mis-parse; fail loudly.
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)
        yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland roll-call PDF and return a populated VoteEvent.

    Downloads the PDF at *vote_url*, extracts its text, locates the tally
    line ("Yeas ... Nays ..."), recovers the motion text from the lines
    above it, then reads the member names below it.

    Raises:
        ValueError: when the tally line is missing, or when the number of
            names read does not match the per-type counts.
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()

    # Chamber is encoded in the URL path, not the PDF body.
    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"
    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # Index of the line holding the vote totals ("Yeas ... Nays ...").
    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]

    # NOTE(review): a match on line 0 would be treated as missing here, but
    # line 0 holds "Calendar Date:" so that cannot happen in practice.
    if page_index:
        # Each column is "<count> <label>"; columns are separated by 2+ spaces.
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion

    # Walk candidate lines above the tally until one contains a keyword.
    for i in motion_lines:
        if any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        # for/else: no candidate line matched a motion keyword.
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
            if not any(
                motion_keyword in motion.lower()
                for motion_keyword in motion_keywords
            ):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])

    # Names start two lines below the tally line.
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()
        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue
        # A section header: advance to the next vote type.
        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue
        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[vote_index] += len(names)
        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    # Cross-check: header tallies must agree with the names we collected.
    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_vote(self, bill, vote_id, session):
    """Fetch one Delaware roll call by id from the JSON API and yield a VoteEvent.

    Yields nothing when the API returns an empty payload.
    """
    vote_url = (
        "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId"
    )
    form = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}
    self.info(f"Fetching vote {vote_id} for {bill.identifier}")
    payload = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if not payload:
        return

    roll = payload["Model"]
    vote_chamber = self.chamber_map[roll["ChamberName"]]

    # Timestamps look like "7/1/16 01:00 AM"; keep only the date part.
    vote_date = dt.datetime.strptime(
        roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p"
    ).strftime("%Y-%m-%d")

    # TODO: What does this code mean?
    vote_motion = roll["RollCallVoteType"]

    # Everything that is not an explicit yes/no is folded into "other".
    other_count = sum(
        int(roll[key])
        for key in (
            "NotVotingCount",
            "VacantVoteCount",
            "AbsentVoteCount",
            "ConflictVoteCount",
        )
    )

    vote = VoteEvent(
        chamber=vote_chamber,
        start_date=vote_date,
        motion_text=vote_motion,
        result="pass" if roll["RollCallStatus"] == "Passed" else "fail",
        bill=bill,
        legislative_session=session,
        classification=[],
    )

    vote_pdf_url = (
        "https://legis.delaware.gov"
        "/json/RollCallController/GenerateRollCallPdf"
        "?rollCallId={}&chamberId={}".format(
            vote_id, self.chamber_codes[vote_chamber]
        )
    )
    # Vote URL is just a generic search URL with POSTed data,
    # so provide a different link
    vote.add_source(vote_pdf_url)
    vote.pupa_id = vote_pdf_url

    vote.set_count("yes", roll["YesVoteCount"])
    vote.set_count("no", roll["NoVoteCount"])
    vote.set_count("other", other_count)

    for member_row in roll["AssemblyMemberVotes"]:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        short = member_row["ShortName"]
        try:
            name = self.legislators_by_short[str(short)]["DisplayName"]
        except KeyError:
            self.warning("could not find legislator short name %s", short)
            name = short

        code = member_row["SelectVoteTypeCode"]
        if code == "Y":
            vote.yes(name)
        elif code == "N":
            vote.no(name)
        else:
            vote.vote("other", name)

    yield vote