def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Build a ``VoteEvent`` from an already-parsed roll-call record.

    ``vote_record`` must provide a ``"date"`` entry (its ``strftime`` is
    called) and one voter collection under each of ``"yes"``, ``"no"``,
    ``"excused"``, ``"absent"`` and ``"other"``.  The vote is marked as
    passed only when there are strictly more "yes" voters than "no" voters.
    ``url`` serves both as the dedupe key and as the source of the event.
    """
    tally_options = ("yes", "no", "excused", "absent", "other")

    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace("XHB", "HB")

    if len(vote_record["yes"]) > len(vote_record["no"]):
        outcome = "pass"
    else:
        outcome = "fail"
    start_date = vote_record["date"].strftime("%Y-%m-%d")
    # Bill ids starting with "S" are attributed to the upper chamber.
    bill_chamber = "upper" if bill_id[0] == "S" else "lower"

    vote_event = VoteEvent(
        result=outcome,
        chamber=chamber,
        start_date=start_date,
        motion_text=motion_text,
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    vote_event.dedupe_key = url

    # Counts first, then the individual votes, in the same option order.
    for option in tally_options:
        vote_event.set_count(option, len(vote_record[option]))
    for option in tally_options:
        for voter in vote_record[option]:
            vote_event.vote(option, voter)

    vote_event.add_source(url)
    return vote_event
def record_votes(root, session, chamber):
    """Yield one ``VoteEvent`` per valid vote element found under ``root``.

    Candidate elements are selected with an XPath built from the module-level
    ``vote_selectors`` and wrapped in ``MaybeVote``; wrappers whose
    ``is_valid`` is falsy are skipped.  Only the first two characters of
    ``session`` are used as the legislative session.
    """
    selector = "//div{}".format("".join(vote_selectors))
    for element in root.xpath(selector):
        candidate = MaybeVote(element)
        if not candidate.is_valid:
            continue

        if candidate.passed:
            motion_text, outcome, classification = "passage", "pass", "passage"
        else:
            motion_text, outcome, classification = "other", "fail", None

        event = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=motion_text,
            result=outcome,
            classification=classification,
            legislative_session=session[0:2],
            bill=candidate.bill_id,
            bill_chamber=candidate.chamber,
        )

        # Missing tallies are recorded as zero.
        event.set_count("yes", candidate.yeas or 0)
        event.set_count("no", candidate.nays or 0)
        event.set_count("not voting", candidate.present or 0)

        for raw_name in candidate.votes["yeas"]:
            event.yes(clean_vote_name(raw_name))
        for raw_name in candidate.votes["nays"]:
            event.no(clean_vote_name(raw_name))
        for raw_name in candidate.votes["present"]:
            event.vote("not voting", clean_vote_name(raw_name))
        for raw_name in candidate.votes["absent"]:
            event.vote("absent", clean_vote_name(raw_name))

        yield event
def _parse_senate_votes(self, vote_data, bill, url):
    """Build an upper-chamber ``VoteEvent`` from one vote record.

    Args:
        vote_data: mapping read for ``voteDate`` (``%Y-%m-%d``), ``voteType``
            (``"FLOOR"`` or ``"COMMITTEE"``), ``committee["name"]`` (committee
            votes only), ``version`` and ``memberVotes["items"]``.
        bill: passed through to ``VoteEvent(bill=...)``.
        url: added as the source of the vote.

    Returns:
        The populated ``VoteEvent``.  Its result is "pass" only when the yes
        count is strictly greater than the no count.

    Raises:
        ValueError: if ``voteType`` is neither ``"FLOOR"`` nor ``"COMMITTEE"``.
    """
    vote_datetime = datetime.datetime.strptime(vote_data["voteDate"], "%Y-%m-%d")

    if vote_data["voteType"] == "FLOOR":
        motion = "Floor Vote"
    elif vote_data["voteType"] == "COMMITTEE":
        motion = "{} Vote".format(vote_data["committee"]["name"])
    else:
        raise ValueError("Unknown vote type encountered.")

    if vote_data["version"]:
        motion += " - Version: " + vote_data["version"]

    vote = VoteEvent(
        chamber="upper",
        start_date=vote_datetime.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        # Placeholder; the real result is set once the roll is tallied.
        result="fail",
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data["memberVotes"]["items"]

    def roll_names(roll_key):
        # One guarded lookup for every roll type: a roll that is missing,
        # empty, or lacks an "items" list simply contributes no names
        # (previously the EXC/ABS/ABD rolls could raise KeyError here).
        roll = vote_rolls.get(roll_key) or {}
        return [legislator["fullName"] for legislator in roll.get("items") or []]

    yes_count, no_count, other_count = 0, 0, 0

    # Count all yea votes.
    for roll_key in ("AYE", "AYEWR"):
        for name in roll_names(roll_key):
            vote.yes(name)
            yes_count += 1

    # Count all nay votes.
    for name in roll_names("NAY"):
        vote.no(name)
        no_count += 1

    # Count all other types of votes.
    for roll_key in ("EXC", "ABS", "ABD"):
        for name in roll_names(roll_key):
            vote.vote("other", name)
            other_count += 1

    vote.result = "pass" if yes_count > no_count else "fail"
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    return vote
def scrape_senate_vote(self, bill, url, date):
    """Download a senate vote PDF and yield the ``VoteEvent`` parsed from it.

    If the download fails with ``scrapelib.HTTPError`` a warning is logged and
    nothing is yielded.  When the text contains a ``Yea: N  Nay: N  Absent: N``
    summary, parsing is delegated to ``self.scrape_senate_vote_3col``.
    Otherwise the text is cut at each ``Yea(s):`` / ``Nay(s):`` /
    ``Absent(s):`` label and the names following each label are recorded under
    "yes", "no" and "other" respectively.  The result is "pass" only when the
    yes count exceeds the combined no and other counts.
    """
    try:
        filename, _ = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now.
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url

    # Remove the downloaded file even when the PDF conversion fails.
    try:
        text = convert_pdf(filename, "text").decode("utf-8")
    finally:
        os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # re.split with a capturing group returns
    #   [preamble, label, names, label, names, ...]
    # so labels sit at the odd indexes and their name blocks right after.
    # (The previous implementation called .pop() on a filter() iterator,
    # which raises AttributeError on Python 3.)
    sections = re.split(r"(Yea|Nay|Absent)s?:", text)
    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}

    for vote_val, values in zip(sections[1::2], sections[2::2]):
        key = keymap.get(vote_val.lower(), "other")
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            # Strip stray tallies, dashes and whitespace around the name.
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        "pass"
        if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
        else "fail"
    )

    yield vote
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a ``Vote`` parsed from the first table of a committee vote page.

    The row holding the votes is the table's first row, or its third row when
    the first cell of the first row reads ``"Votes:"``.  With fewer than three
    cells only a "yes" column is read; otherwise cells 1, 3 and 5 are the
    yes / no / other columns.  Each non-blank text node of a column is one
    voter, and the vote passes when yes voters outnumber no voters.
    """
    table = page.xpath("//table")[0]
    vote_row = table.xpath(".//tr")[0]
    if vote_row[0].text_content() == "Votes:":
        # New website
        vote_row = table.xpath(".//tr")[2]

    cells = vote_row.xpath(".//td")
    if len(cells) < 3:
        yes, no, other = cells[0], None, None
    else:
        yes, _, no, _, other = cells[:5]

    def proc_block(cell, option):
        # A missing column yields an empty block with no type or count.
        if cell is None:
            return {"type": None, "count": None, "votes": []}
        names = [text.strip() for text in cell.xpath("./text()") if text.strip()]
        return {"type": option, "count": len(names), "votes": names}

    vote_dict = {
        "yes": proc_block(yes, "yes"),
        "no": proc_block(no, "no"),
        "other": proc_block(other, "other"),
    }

    yes_count = vote_dict["yes"]["count"]
    no_count = vote_dict["no"]["count"] or 0
    other_count = vote_dict["other"]["count"] or 0

    outcome = "pass" if (yes_count > no_count) else "fail"
    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        identifier=str(uniqid),
        result=outcome,
        classification="passage",
        bill=bill,
    )
    vote.extras = {"_vote_id": uniqid}
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    for option, block in vote_dict.items():
        for voter in block["votes"]:
            vote.vote(option, voter)

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a plain-text roll call from ``url`` and yield it as a ``Vote``.

    The page is expected to contain ``YEAS n ... NAYS n ... ABSENT [OR NOT
    VOTING] n ...``; the numbers become the counts and the text after each
    number is split on runs of two or more whitespace characters to obtain
    voter names.  ``url`` is also added to ``bill`` as a source.  The vote
    chamber is ``actor`` when it is "upper" or "lower", otherwise "".
    """
    page = self.get(url).text
    bill.add_source(url)

    pattern = (
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)"
    )
    match = re.compile(pattern, re.MULTILINE | re.DOTALL).search(page)

    # Groups 1, 3 and 6 are the tallies; 2, 4 and 7 are the name blocks.
    yes_count, no_count, other_count = (int(match.group(i)) for i in (1, 3, 6))
    outcome = "pass" if yes_count > no_count else "fail"
    vote_chamber = actor if actor in ("upper", "lower") else ""

    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=motion,
        result=outcome,
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    def names_in(group_index):
        return re.split(r"\s{2,}", match.group(group_index).strip())

    yes_votes, no_votes, other_votes = names_in(2), names_in(4), names_in(7)

    for name in filter(None, yes_votes):
        vote.yes(name)
    for name in filter(None, no_votes):
        vote.no(name)
    for name in filter(None, other_votes):
        vote.vote("other", name)

    yield vote
def scrape_votes(self, bill):
    """Yield a ``Vote`` for each roll call the web service lists for ``bill``.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``; it is sent together with ``self.biennium`` to the
    ``GetRollCalls`` endpoint.  Absent and excused counts are merged into a
    single "other" count, and a vote passes only when the yes count exceeds
    the no and other counts combined.
    """
    bill_num = bill.identifier.split()[1]
    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium)
    )
    response = self.get(url)
    tree = lxml.etree.fromstring(response.content)

    for roll_call in xpath(tree, "//wa:RollCall"):
        motion = xpath(roll_call, "string(wa:Motion)")
        seq_no = xpath(roll_call, "string(wa:SequenceNumber)")

        # Keep only the date part of the timestamp.
        day = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        agency = xpath(roll_call, "string(wa:Agency)")
        chamber = {"House": "lower", "Senate": "upper"}[agency]

        if yes_count > (no_count + other_count):
            outcome = "pass"
        else:
            outcome = "fail"

        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text="{} (#{})".format(motion, seq_no),
            result=outcome,
            bill=bill,
            classification=[],
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)

        for member_vote in xpath(roll_call, "wa:Votes/wa:Vote"):
            name = xpath(member_vote, "string(wa:Name)")
            cast = xpath(member_vote, "string(wa:VOte)")
            if cast == "Yea":
                vote.yes(name)
            elif cast == "Nay":
                vote.no(name)
            else:
                vote.vote("other", name)

        yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Yield a ``VoteEvent`` built from the actor, date and row element.

    The motion is the row's own text with non-breaking spaces and dashes
    removed ("passage" when nothing is left).  The first ``<span>`` holds
    ``<outcome>-<yes>-<no>-<other>``; the text after the second and third
    spans lists the yes and no voters, and later spans starting with
    "Absent" or "Excused" contribute the absent voters.
    """
    spans = row.xpath(".//span")

    motion = row.text.replace("\u00a0", " ").replace("-", "").strip() or "passage"

    summary = spans[0].text_content()
    passed, yes_count, no_count, other_count = summary.rsplit("-", 3)

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(("Absent", "Excused")):
            other_votes.extend(self.get_names(span.tail))

    # Map the free-text outcome to pass/fail; the first keyword found wins and
    # the text is left untouched when none of them occurs.
    outcome_text = passed.lower()
    for keyword, outcome in (("adopted", "pass"), ("passed", "pass"), ("failed", "fail")):
        if keyword in outcome_text:
            passed = outcome
            break

    vote = VoteEvent(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        bill=bill_id,
        bill_chamber=bill_chamber,
        result=passed,
        classification="passage",
        legislative_session=session,
    )
    vote.add_source(source)
    vote.set_count("yes", int(yes_count))
    vote.set_count("no", int(no_count))
    vote.set_count("absent", int(other_count))

    def real_names(names):
        # Skip blanks and the literal "None" placeholder.
        return (name for name in names if name and name != "None")

    for name in real_names(yes_votes):
        vote.yes(name)
    for name in real_names(no_votes):
        vote.no(name)
    for name in real_names(other_votes):
        vote.vote("absent", name)

    yield vote
def add_archived_votes(self, bill, bill_id):
    """Yield a ``VoteEvent`` for every archived vote stored for ``bill_id``.

    The lookup key is built from ``bill_id``: the first token is cut down to
    its first character, a two- or three-character last token is left-padded
    with zeros to four characters, and the tokens are joined without spaces.
    Nothing is yielded when that key is not in ``self.archived_votes``.
    """
    parts = bill_id.split()
    parts[0] = parts[0][0]
    number_length = len(parts[-1])
    if number_length in (2, 3):
        parts[-1] = "0" * (4 - number_length) + parts[-1]
    archive_key = "".join(parts)

    if archive_key not in self.archived_votes:
        return

    for vote_key, legislator_votes in self.archived_votes[archive_key].items():
        (
            vote_date,
            r_number,
            action_number,
            action_vote_result,
            archive_url,
            cod,
            _,
        ) = vote_key

        # A URL ending in "S" is attributed to the upper chamber.
        chamber = "upper" if archive_url[-1] == "S" else "lower"
        start_date = eastern.localize(vote_date).isoformat()

        motion_text = action_number + r_number + cod + action_vote_result
        motion_text = motion_text.replace(" ", "_")

        ve = VoteEvent(
            chamber=chamber,  # TODO: check this
            start_date=start_date,
            motion_text=motion_text,
            bill=bill,
            # No indication on classification for archived votes
            classification="other",
            result=action_vote_result,
        )
        ve.add_source(archive_url)

        for legislator_vote in legislator_votes:
            ve.vote(legislator_vote["how_voted"], legislator_vote["leg"])

        yield ve
def scrape_votes(self, bill, bill_page, chamber):
    """Yield a ``VoteEvent`` for each ``view_votes`` link in the history table.

    The first two cells of the link's row give the date and the motion text;
    a motion containing "Passed" or "Advanced" is recorded as a pass.  Counts
    come from ``self.process_count`` on the linked page, and the page's table
    cells are consumed in (name, vote type) pairs.  The ``KeyID`` query
    parameter of the vote URL is used as the dedupe key.
    """
    vote_links = bill_page.xpath(
        '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
    )
    count_labels = (
        ("yes", "Yes:"),
        ("no", "No:"),
        ("excused", "Excused - Not Voting:"),
        ("absent", "Absent - Not Voting:"),
        ("abstain", "Present - Not Voting:"),
    )

    for link in vote_links:
        vote_url = link.attrib["href"]
        date_td, motion_td, *_ = link.xpath("ancestor::tr/td")
        date = datetime.strptime(date_td.text, "%b %d, %Y")
        motion_text = motion_td.text_content()

        vote_page = self.lxmlize(vote_url)
        passed = "Passed" in motion_text or "Advanced" in motion_text
        cells = vote_page.xpath(
            '//div[contains(@class,"table-responsive")]/table//td'
        )

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=motion_text,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Read every tally before recording any of them.
        tallies = [
            (option, self.process_count(vote_page, label))
            for option, label in count_labels
        ]
        for option, tally in tallies:
            vote.set_count(option, tally)

        query = urllib.parse.urlparse(vote_url).query
        vote.dedupe_key = urllib.parse.parse_qs(query)["KeyID"][0]
        vote.add_source(vote_url)

        # Cells alternate: member name, then how that member voted.
        for offset in range(0, len(cells), 2):
            name = cells[offset].text
            vote_type = cells[offset + 1].text
            if name and vote_type:
                option = VOTE_TYPE_MAP.get(vote_type.lower(), "other")
                vote.vote(option, name)

        yield vote
def parse_committee_votes(self, bill, url):
    """Yield a ``VoteEvent`` for each vote-summary link on a committee page.

    ``url`` is added to ``bill`` as a source.  The chamber is "upper" when the
    page's ``<h1>`` text contains "Senate", otherwise "lower".  Each link's
    roll call is obtained from ``self.parse_upper_committee_vote_rollcall``
    and supplies the result, the counts and the voter names.
    """
    bill.add_source(url)
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    heading = doc.xpath("string(//h1)")
    chamber = "upper" if "Senate" in heading else "lower"
    # The committee name is the second-to-last text node of the first <h2>.
    committee = list(doc.xpath("//h2")[0].itertext())[-2].strip()

    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):
        # Date: try each known format; the raw text is kept if none matches.
        date = link.xpath("../../td")[0].text_content()
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                pass
            else:
                break

        # Motion
        motion_name = link.text_content().split(" - ")[-1].strip()
        motion = "Committee vote (%s): %s" % (committee, motion_name)

        # Roll call
        vote_url = link.attrib["href"]
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        outcome = "pass" if rollcall["passed"] else "fail"
        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=[],
            result=outcome,
            bill=bill,
        )
        vote.dedupe_key = vote_url

        options = ("yes", "no", "other")
        for option in options:
            vote.set_count(option, rollcall[option + "_count"])
        for option in options:
            for name in rollcall.get(option + "_votes", []):
                vote.vote(option, name)

        vote.add_source(url)
        vote.add_source(vote_url)

        yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield a ``VoteEvent`` for every vote listed for ``chamber``.

    Vote dates come from ``self.get_vote_dates``; each date is posted to the
    chamber's ``votes.asp`` page and the response is parsed with
    ``self.parse_vote_page``.  Votes whose bill cannot be found in
    ``self._bill_id_by_type`` are logged and skipped.
    """
    base_urls = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes"),
    }
    url = base_urls[chamber]
    action = "%s/%s" % (url, "votes.asp")

    for date in self.get_vote_dates(url, session):
        listing = self.post_to(action, date)
        for vote_dict in self.parse_vote_page(listing, url, session):
            for vote in vote_dict.values():
                count = vote["count"]
                # The chamber recorded on the vote itself is the one used.
                vote_chamber = {"H": "lower", "S": "upper"}[vote["meta"]["chamber"]]

                try:
                    bill_id = self._bill_id_by_type[
                        (vote_chamber, vote["meta"]["bill"])
                    ]
                except KeyError:
                    self.warning(
                        "no such bill_id %s %s", vote_chamber, vote["meta"]["bill"]
                    )
                    continue

                outcome = "pass" if count["passage"] else "fail"
                event = VoteEvent(
                    chamber=vote_chamber,
                    start_date=vote["time"].strftime("%Y-%m-%d"),
                    motion_text=vote["meta"]["extra"]["motion"],
                    result=outcome,
                    classification="passage",
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=vote_chamber,
                )
                event.set_count("yes", int(count["YEAS"]))
                event.set_count("no", int(count["NAYS"]))
                event.set_count("other", int(count["NOT VOTING"]))
                event.add_source(vote["source"])
                event.dedupe_key = vote["source"]

                option_by_code = {"Y": "yes", "N": "no"}
                for member_vote in vote["votes"]:
                    option = option_by_code.get(member_vote["vote"], "other")
                    event.vote(option, member_vote["name"])

                yield event
def scrape_votes(self, bill, page):
    """Yield a ``VoteEvent`` for each floor action reported for ``bill``.

    For every entry of ``page["FloorHeaders"]`` the floor-action endpoint is
    queried with votes included.  Actions named "No Action" or without a
    ``ReportDate`` are skipped.  A vote passes when it is flagged
    ``UnanimouslyAdopted`` or when ayes outnumber nays; member votes other
    than "Y"/"N" are recorded as "other".

    Args:
        bill: passed through to ``VoteEvent(bill=...)``.
        page: mapping providing ``"FloorHeaders"`` and ``"BillId"``.
    """
    base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
    for header in page["FloorHeaders"]:
        params = {
            "billStatusId": page["BillId"],
            "billStatusActionId": header["BillStatusActionId"],
            "includeVotes": "true",
        }
        resp = self.get(base_url, timeout=80, params=params)
        actions = json.loads(resp.content.decode("utf-8"))

        for action in actions:
            if action["Action"] == "No Action":
                continue
            if action["ReportDate"] is None:
                continue

            # Drop fractional seconds before parsing the timestamp.
            cleaned_date = action["ReportDate"].split(".")[0]
            action_date = datetime.datetime.strptime(
                cleaned_date, "%Y-%m-%dT%H:%M:%S"
            )

            # Tallies may be null in the payload; normalise them once so the
            # pass/fail comparison cannot hit a None (TypeError on Python 3).
            ayes = action["Ayes"] or 0
            nays = action["Nays"] or 0
            passed = action["UnanimouslyAdopted"] or ayes > nays

            vote = VoteEvent(
                chamber={"S": "upper", "H": "lower"}[header["LegislativeBody"]],
                motion_text=action["Action"],
                classification="passage",
                result="pass" if passed else "fail",
                start_date=action_date.strftime("%Y-%m-%d"),
                bill=bill,
            )
            vote.add_source(resp.url)
            vote.set_count("yes", ayes)
            vote.set_count("no", nays)
            vote.set_count("other", (action["Present"] or 0))
            vote.set_count("absent", (action["Absent"] or 0))
            vote.set_count("excused", (action["Excused"] or 0))
            vote.set_count("not voting", (action["NotVoting"] or 0))

            for v in action["Votes"]:
                vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                vote.vote(vote_type, v["Legislator"]["FullName"])

            # The response URL plus the referral number identifies the vote.
            vote.dedupe_key = resp.url + str(action["ReferralNumber"])

            yield vote
def asvote(self):
    """Return this object's data as a ``VoteEvent``.

    Chamber, date, motion, outcome, counts and voter lists all come from the
    corresponding methods on ``self``; ``self.url`` is both the dedupe key and
    the source.
    """
    chamber = self.chamber()
    start_date = self.date()
    motion_text = self.motion()
    outcome = "pass" if self.passed() else "fail"

    event = VoteEvent(
        chamber=chamber,
        start_date=start_date,
        motion_text=motion_text,
        result=outcome,
        classification="passage",
        bill=self.bill,
    )
    event.dedupe_key = self.url  # URL contains sequence number

    event.set_count("yes", self.yes_count())
    event.set_count("no", self.no_count())
    event.set_count("other", self.other_count())

    for name in self.yes_votes():
        event.yes(name)
    for name in self.no_votes():
        event.no(name)
    for name in self.other_votes():
        event.vote("other", name)

    event.add_source(self.url)
    return event
def scrape_vote(self, bill, date, url):
    """Fetch the JSON vote at ``url`` and yield it as a ``VoteEvent``.

    The chamber is derived from ``actionLog.FullName`` ("House", "Senate" or
    "Joint"); any other or missing value logs a warning and yields nothing.
    Votes without an ``actionLog.StatusText`` motion are skipped as well.
    A vote passes when yeas outnumber nays.
    """
    page = self.get(url).json()

    location = page["actionLog"]["FullName"]
    chamber = None
    if location:
        for keyword, code in (
            ("House", "lower"),
            ("Senate", "upper"),
            ("Joint", "legislature"),
        ):
            if keyword in location:
                chamber = code
                break
    if chamber is None:
        self.warning("Bad Vote chamber: '%s', skipping" % location)
        return

    motion = page["actionLog"]["StatusText"]
    if not motion:
        # If we can't detect a motion, skip this vote
        return

    yes_count = page["Yeas"]
    no_count = page["Nays"]
    excused_count = page["Excused"]
    absent_count = page["Absent"]
    outcome = "pass" if yes_count > no_count else "fail"

    if motion.startswith("Do Pass"):
        vtype = "passage"
    elif motion == "Concurred in amendments":
        vtype = "amendment"
    # commenting out until we add these back to OS-core
    # elif motion == "Veto override":
    #     vtype = "veto-override"
    else:
        vtype = []

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result=outcome,
        classification=vtype,
        bill=bill,
    )
    # differentiate nearly identical votes
    vote.dedupe_key = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    # Members whose Vote1 is none of the known values are not recorded.
    for person in page["RollCalls"]:
        cast = person["Vote1"]
        if cast in ("Aye", "Yea"):
            vote.yes(person["UniqueName"])
        elif cast == "Nay":
            vote.no(person["UniqueName"])
        elif cast == "Excused":
            vote.vote("excused", person["UniqueName"])
        elif cast == "Absent":
            vote.vote("absent", person["UniqueName"])

    yield vote
def scrape_vote(self, url, session):
    """Parse one roll-call PDF into a ``VoteEvent``.

    The text is read in two phases.  The "preamble" supplies the bill id, the
    motion (the last non-empty line before the totals, with a trailing
    question line appended to the line before it) and the totals line
    ``N Yeas  N Nays  N Not Voting  N Excused  N Absent``.  The "votes" phase
    then groups member names under the heading that precedes them.

    Args:
        url: location of the PDF; a URL containing "senate" marks the vote as
            upper chamber.  It is also the dedupe key and the source.
        session: passed through as the legislative session.

    Returns:
        The ``VoteEvent``, or ``None`` when the document is empty, refers to
        a vetoed bill, or lacks a bill id, a motion or a totals line.

    Raises:
        Whatever ``check_counts(vote, raise_error=True)`` raises when the
        recorded votes do not match the counts.
    """
    fname, _ = self.urlretrieve(url)
    text = convert_pdf(fname, type="text").decode()
    lines = text.splitlines()

    chamber = "upper" if "senate" in url else "lower"
    if "Maryland" not in text:
        self.warning(f"empty vote from {url}")
        return
    date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

    section = "preamble"
    motion = None
    bill_id = None
    how = None
    voters = defaultdict(list)

    for line in lines:
        if section == "preamble":
            if "vetoed" in line.lower():
                self.warning(
                    f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                )
                return
            possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
            if possible_bill_id:
                bill_id = possible_bill_id[0]

            # preamble has metadata, then motion, then counts.  our process then is to
            # store the last line as the motion, but if the last line looks like a
            # continuation, append it to the prior line
            line = line.strip()
            counts = re.findall(
                r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                line,
            )
            if counts:
                yes_count, no_count, nv_count, excused_count, absent_count = (
                    int(count) for count in counts[0]
                )
                section = "votes"
            elif line and line != "(Const)":
                # questions seem to be split across two lines
                if line.endswith("?") and motion:
                    motion = motion + " " + line
                else:
                    # Also covers a question line with nothing before it,
                    # which used to fail on None + str.
                    motion = line
        elif section == "votes":
            if line.startswith("Voting Yea"):
                how = "yes"
            elif line.startswith("Voting Nay"):
                how = "no"
            elif line.startswith("Not Voting"):
                how = "not voting"
            elif line.startswith("Excused from Voting"):
                how = "excused"
            elif line.startswith("Excused (Absent)"):
                how = "absent"
            elif how:
                names = re.split(r"\s{2,}", line)
                voters[how].extend(names)

    if not bill_id and not motion:
        return
    elif bill_id and not motion:
        self.warning(f"got {bill_id} but no motion, not registering as a vote")
        # Previously execution fell through here and a vote without a motion
        # was built despite the warning.
        return
    elif motion and not bill_id:
        self.warning(f"got {motion} but no bill_id, not registering as a vote")
        return

    if section != "votes":
        # The totals line was never seen, so no counts are available.
        self.warning(f"no vote totals found in {url}, not registering as a vote")
        return

    # bleh - result not indicated anywhere
    result = "pass" if yes_count > no_count else "fail"
    bill_chamber = "upper" if bill_id.startswith("S") else "lower"
    date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")
    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        result=result,
        classification="passage",
        motion_text=motion,
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    # URL includes sequence ID, will be unique
    vote.dedupe_key = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for how, names in voters.items():
        for name in names:
            name = name.strip().replace("*", "")
            # Skip blanks and footer text that ends up in the name columns.
            if name and "COPY" not in name and "Indicates Vote Change" not in name:
                vote.vote(how, name)

    check_counts(vote, raise_error=True)
    return vote
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
    """Add every action row to ``bill`` and yield the votes found in them.

    The first child of ``action_table`` is skipped; each remaining row
    provides a date, an actor code (mapped through ``self._vote_type_map``)
    and the action text.  Committees returned by ``categorize_action`` are
    attached to the action when their short code is known in
    ``self.short_ids``.  When ``self.parse_vote`` finds a vote in the text a
    ``VoteEvent`` is yielded; its motion is prefixed with "Reconsider: " if
    that actor had a "reconsider" action since its previous vote.
    """
    # vote types that have been reconsidered since last vote of that type
    reconsiderations = set()

    for row in action_table.xpath("*")[1:]:
        raw_date = row[0].text_content()
        date = dt.datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
        actor_code = row[1].text_content().upper()
        string = row[2].text_content()
        actor = self._vote_type_map[actor_code]
        act_type, committees = categorize_action(string)

        # XXX: Translate short-code to full committee name for the
        #      matcher.
        real_committees = []
        for short_code in committees or []:
            try:
                real_committees.append(self.short_ids[short_code]["name"])
            except KeyError:
                # Unknown short codes are ignored.
                pass

        act = bill.add_action(string, date, chamber=actor, classification=act_type)
        for committee_name in real_committees:
            act.add_related_entity(name=committee_name, entity_type="organization")

        parsed = self.parse_vote(string)
        if not parsed:
            if re.search("reconsider", string, re.IGNORECASE):
                reconsiderations.add(actor)
            continue

        v, motion = parsed
        if actor in reconsiderations:
            motion_text = "Reconsider: " + motion
        else:
            motion_text = motion

        vote = VoteEvent(
            start_date=date,
            chamber=actor,
            bill=bill_id,
            bill_chamber=bill_chamber,
            legislative_session=session,
            motion_text=motion_text,
            result="pass" if "passed" in string.lower() else "fail",
            classification="passage",
        )
        reconsiderations.discard(actor)
        vote.add_source(url)
        vote.set_count("yes", int(v["n_yes"] or 0))
        vote.set_count("no", int(v["n_no"] or 0))
        vote.set_count("not voting", int(v["n_excused"] or 0))

        # "yes with reservations" voters are recorded as plain yes votes.
        for voter in split_specific_votes(v["yes"]):
            vote.yes(self.clean_voter_name(voter))
        for voter in split_specific_votes(v["yes_resv"]):
            vote.yes(self.clean_voter_name(voter))
        for voter in split_specific_votes(v["no"]):
            vote.no(self.clean_voter_name(voter))
        for voter in split_specific_votes(v["excused"]):
            vote.vote("not voting", self.clean_voter_name(voter))

        yield vote
def handle_page(self):
    """Yield a lower-chamber committee ``VoteEvent`` parsed from ``self.doc``.

    Nothing is yielded when the page has no ``lblTotal`` span.  Date, counts,
    committee and action are read from their labelled spans, and each
    non-empty item of the ``vote-list`` supplies one member vote.

    Raises:
        ValueError: if a member's vote code is not "Y", "N", "-" or a
            parenthesised "(Y)"/"(N)".
    """
    doc = self.doc

    # Checks to see if any vote totals are provided
    if not doc.xpath('//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()'):
        return

    (raw_date,) = doc.xpath('//span[contains(@id, "lblDate")]/text()')
    date = format_datetime(
        datetime.datetime.strptime(raw_date, "%m/%d/%Y %I:%M:%S %p"),
        "US/Eastern",
    )

    # ctl00_MainContent_lblTotal //span[contains(@id, "ctl00_MainContent_lblTotal")]
    yes_count = int(doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
    no_count = int(doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
    other_count = int(doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0])
    result = "pass" if yes_count > no_count else "fail"

    (committee,) = doc.xpath('//span[contains(@id, "lblCommittee")]/text()')
    (action,) = doc.xpath('//span[contains(@id, "lblAction")]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs["bill"],
        chamber="lower",
        motion_text=motion,
        result=result,
        classification="committee",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", other_count)

    for item in doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not item.text_content().strip():
            continue

        (member,) = item.xpath("span[2]//text()")
        (code,) = item.xpath("span[1]//text()")
        member = member.strip()

        if code == "Y":
            vote.yes(member)
        elif code == "N":
            vote.no(member)
        elif code == "-":
            vote.vote("not voting", member)
        elif re.search(r"\([YN]\)", code):
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            continue
        else:
            raise ValueError("Unknown vote type found: {}".format(code))

    yield vote
def handle_page(self):
    """Yield an upper-chamber committee ``VoteEvent`` parsed from ``self.lines``.

    The motion is whatever follows "FINAL ACTION:" on line index 5; an empty
    motion logs a warning and yields nothing.  Member rows start right after
    the first header line made of repeated "Yea  Nay" column pairs and stop at
    the first blank line.  A member's vote is decided by how far the row is
    indented relative to the first "Yea"/"Nay" header columns.  Yes/no totals
    come from the numbers preceding "TOTALS" in ``self.text``; the "other"
    count is the number of members without a vote mark.

    Raises:
        ValueError: if a vote mark sits between the yea and nay columns.
    """
    (_, motion) = self.lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.scraper.warning("Vote appears to be empty")
        return

    # Index of the first header line; [0] raises IndexError if there is none.
    vote_top_row = [
        self.lines.index(x) for x in self.lines
        if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", x)
    ][0]
    # Column just past the first "Yea" and column where the first "Nay" starts.
    yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = self.lines[vote_top_row].index("Nay")

    votes = {"yes": [], "no": [], "other": []}
    for line in self.lines[(vote_top_row + 1):]:
        if line.strip():
            # NOTE(review): .group(1) raises AttributeError if a non-blank row
            # does not match this pattern.
            member = re.search(
                r"""(?x)
                    ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                    ([A-Z][a-z]+            # Name must have lower-case characters
                    [\w\-\s]+)              # Continue looking for the rest of the name
                    (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                    (?:\s{2,}.*)?           # Name ends when many spaces are seen
                    """,
                line,
            ).group(1)
            # sometimes members have trailing X's from other motions in the
            # vote sheet we aren't collecting
            member = re.sub(r"(\s+X)+", "", member)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(re.search(r"^\s+(X|VA|VC)\s+[A-Z][a-z]", line))
            if did_vote:
                # Check where the "X" or vote code is on the page
                # (vote_column is the number of leading whitespace characters)
                vote_column = len(line) - len(line.lstrip())
                if vote_column <= yea_columns_end:
                    votes["yes"].append(member)
                elif vote_column >= nay_columns_begin:
                    votes["no"].append(member)
                else:
                    raise ValueError(
                        "Unparseable vote found for {0} in {1}:\n{2}".
                        format(member, self.url, line))
            else:
                votes["other"].append(member)
        # End loop as soon as no more members are found
        else:
            break

    # The two numbers captured ahead of "TOTALS" are the yes and no totals.
    totals = re.search(r"(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS",
                       self.text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    result = "pass" if (yes_count > no_count) else "fail"

    vote = VoteEvent(
        start_date=self.kwargs["date"],
        bill=self.kwargs["bill"],
        chamber="upper",
        motion_text=motion,
        classification="committee",
        result=result,
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", len(votes["other"]))

    # set voters
    for vtype, voters in votes.items():
        for voter in voters:
            voter = voter.strip()
            # Removes the few voter names with a ton of extra spaces with VA at the end.
            # Ex: Cruz VA
            # NOTE(review): split()[:-2] drops the last two whitespace-separated
            # tokens; confirm that matches the layout of these names.
            if " VA" in voter:
                voter = " ".join(voter.split()[:-2])
            # Names that end up empty are not recorded.
            if len(voter) > 0:
                vote.vote(vtype, voter)

    yield vote
def handle_page(self):
    """Yield a floor ``VoteEvent`` parsed from the text lines of a vote PDF.

    The motion, totals and first vote row are located by line index (4, 6 and
    9 by default); the indexes shift up by one when the motion line is empty
    and down by one for each extra motion line (at most two).  Totals come
    from the ``Yeas - N  Nays - N  Not Voting - N`` line.  Member votes follow
    the pattern ``[code] [name]-[district]`` and are read until the first
    blank line.  If the non-yes/no votes found do not add up to the totals,
    the rows are scanned again for members without a vote code, who are
    recorded as "not voting".

    Raises:
        ValueError: if the yes or no votes found do not match the totals.
    """
    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    if len(self.lines) < 2:
        self.scraper.warning("Bad PDF! " + self.url)
        return

    motion = self.lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if not motion and not self.lines[MOTION_INDEX -
                                     1].startswith("Calendar Page:"):
        motion = self.lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"

    # Up to two following non-empty lines are appended to the motion; each one
    # pushes the totals line and the first vote row down by one.
    for _extra_motion_line in range(2):
        MOTION_INDEX += 1
        if self.lines[MOTION_INDEX].strip():
            motion = "{}, {}".format(motion,
                                     self.lines[MOTION_INDEX].strip())
            TOTALS_INDEX += 1
            VOTE_START_INDEX += 1
        else:
            break

    # NOTE(review): .groups() raises AttributeError when the totals line does
    # not match this pattern.
    (yes_count, no_count, nv_count) = [
        int(x) for x in re.search(
            r"^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$",
            self.lines[TOTALS_INDEX],
        ).groups()
    ]
    result = "pass" if yes_count > no_count else "fail"

    vote = VoteEvent(
        start_date=self.kwargs["date"],
        chamber=self.kwargs["chamber"],
        bill=self.kwargs["bill"],
        motion_text=motion,
        result=result,
        classification="passage",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)

    for line in self.lines[VOTE_START_INDEX:]:
        if not line.strip():
            break

        # Drop a presiding-officer title so the row parses like any other.
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for vtype, member in re.findall(
                r"\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*", line):
            vtype = {
                "Y": "yes",
                "N": "no",
                "EX": "excused",
                "AV": "abstain"
            }[vtype]
            member = member.strip()
            vote.vote(vtype, member)

    # check totals line up
    # Start from the declared counts and subtract one per recorded vote; any
    # option other than yes/no is pooled into nv_count.
    yes_count = no_count = nv_count = 0
    for vc in vote.counts:
        if vc["option"] == "yes":
            yes_count = vc["value"]
        elif vc["option"] == "no":
            no_count = vc["value"]
        else:
            nv_count += vc["value"]

    for vr in vote.votes:
        if vr["option"] == "yes":
            yes_count -= 1
        elif vr["option"] == "no":
            no_count -= 1
        else:
            nv_count -= 1

    if yes_count != 0 or no_count != 0:
        raise ValueError("vote count incorrect: " + self.url)

    if nv_count != 0:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.scraper.info(
            "Votes don't add up; looking for additional ones")
        for line in self.lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            # A name preceded by eight or more whitespace characters is taken
            # to be a member without a vote code.
            for member in re.findall(r"\s{8,}([A-Z][a-z\'].*?)-\d{1,3}",
                                     line):
                member = member.strip()
                vote.vote("not voting", member)

    yield vote
def scrape(self, session=None):
    """Scrape DC Council legislation, yielding ``VoteEvent`` and ``Bill`` objects.

    For every category in ``self._categories`` the bulk listing is fetched,
    and for each piece of legislation the history, attachments, sponsors,
    committee documents, actions and roll-call votes are collected.  Votes
    found on a bill are yielded before the bill itself.

    Args:
        session: session identifier; when falsy, ``self.latest_session()``
            is used.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a failed API call.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    version_types = ("SignedAct", "Introduction", "Enrollment", "Engrossment")

    def classify_attachment(link):
        """Return ``(is_version, doc_type)`` inferred from an attachment URL.

        ``doc_type`` is ``None`` when the URL names neither a known version
        type nor an amendment.
        """
        lowered = link.lower()
        is_version = False
        doc_type = None
        for vt in version_types:
            if vt.lower() in lowered:
                is_version = True
                doc_type = vt
        if "amendment" in lowered:
            doc_type = "Amendment"
        return is_version, doc_type

    # API vote value -> option recorded on the VoteEvent; anything not listed
    # (in case something new pops up) is counted as "other".
    vote_options = {
        "Yes": "yes",
        "No": "no",
        "Absent": "absent",
        "Recused": "abstain",
        "Present": "other",
    }

    for category in self._categories:
        leg_listing_url = (
            self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
        )
        # NOTE(review): verify=False turns off TLS certificate validation for
        # this request; left unchanged here, but confirm it is still required.
        resp = requests.post(leg_listing_url, headers=self._headers, verify=False)
        resp.raise_for_status()
        leg_listing = resp.json()

        for leg in leg_listing:
            bill = Bill(
                leg["legislationNumber"],
                legislative_session=session,
                title=leg["title"],
                classification=category["name"],
            )
            bill.add_source(leg_listing_url)
            bill_url = (
                f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
            )
            bill.add_source(bill_url)

            if leg["lawNumber"]:
                bill.extras["lawNumber"] = leg["lawNumber"]

            # Actions
            for hist in leg["legislationHistory"]:
                hist_date = datetime.datetime.strptime(
                    hist["actionDate"], "%b %d, %Y"
                )
                hist_date = self._TZ.localize(hist_date)
                hist_action = hist["actionDescription"]
                words = hist_action.split()
                # Guard against an empty description before looking at the
                # first word; strip the leading "Other" from these two labels.
                if words and words[0] in ("OtherAmendment", "OtherMotion"):
                    hist_action = hist_action[5:]
                hist_class = self.classify_action(hist_action)

                actor = "executive" if "mayor" in hist_action.lower() else "legislature"
                bill.add_action(
                    hist_action, hist_date, classification=hist_class, chamber=actor
                )

                # Documents with download links
                download = hist["downloadURL"]
                if download and "download" in download:
                    if not download.startswith("http"):
                        download = "https://lims.dccouncil.us/" + download

                    mimetype = "application/pdf" if download.endswith("pdf") else None
                    is_version, doc_type = classify_attachment(download)

                    if is_version:
                        bill.add_version_link(
                            doc_type,
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        bill.add_document_link(
                            hist["actionDescription"],
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            # Grabs Legislation details
            leg_details_url = (
                self._API_BASE_URL
                + f"LegislationDetails/{leg['legislationNumber']}"
            )
            details_resp = requests.get(
                leg_details_url, headers=self._headers, verify=False
            )
            details_resp.raise_for_status()
            leg_details = details_resp.json()

            # Sponsors
            for introducer in leg_details["introducers"]:
                bill.add_sponsorship(
                    introducer["memberName"],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # Co-sponsors: read the co-sponsor's own record (previously the
            # last introducer's name was reused) and do not flag as primary.
            for cosponsor in leg_details["coSponsors"] or []:
                bill.add_sponsorship(
                    cosponsor["memberName"],
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

            # Committee Hearing Doc
            for hearing in leg_details["committeeHearing"]:
                if hearing["hearingRecord"]:
                    bill.add_document_link(
                        hearing["hearingType"],
                        hearing["hearingRecord"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            for markup in leg_details["committeeMarkup"]:
                if markup["committeeReport"]:
                    bill.add_document_link(
                        "Committee Markup",
                        markup["committeeReport"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            # Actions and Votes
            # To prevent duplicate votes
            vote_ids = set()
            for act in leg_details["actions"] or []:
                action_name = act["action"]
                action_date = datetime.datetime.strptime(
                    act["actionDate"][:10], "%Y-%m-%d"
                )
                action_date = self._TZ.localize(action_date)

                words = action_name.split()
                if words and words[0] == "Other":
                    action_name = " ".join(words[1:])

                actor = "executive" if "mayor" in action_name.lower() else "legislature"

                # Documents and Versions
                attachment = act["attachment"]
                if attachment:
                    mimetype = (
                        "application/pdf" if attachment.endswith("pdf") else None
                    )
                    is_version, doc_type = classify_attachment(attachment)

                    if is_version:
                        bill.add_version_link(
                            doc_type,
                            attachment,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        # Fall back to the action name when the URL gives no
                        # hint; previously doc_type could be unbound or left
                        # over from an earlier attachment.
                        bill.add_document_link(
                            doc_type or action_name,
                            attachment,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

                # Votes
                vote_details = act["voteDetails"]
                if not vote_details:
                    continue
                result = vote_details["voteResult"]
                if not result:
                    continue

                status = self._vote_statuses[result.lower()]
                id_text = (
                    str(leg["legislationNumber"]) + "-" + action_name + "-" + result
                )
                if id_text in vote_ids:
                    continue
                vote_ids.add(id_text)

                v = VoteEvent(
                    identifier=id_text,
                    chamber=actor,
                    start_date=action_date,
                    motion_text=action_name,
                    result=status,
                    classification=self.classify_action(action_name),
                    bill=bill,
                )
                v.add_source(leg_listing_url)

                tallies = {"yes": 0, "no": 0, "absent": 0, "abstain": 0, "other": 0}
                for leg_vote in vote_details["votes"]:
                    option = vote_options.get(leg_vote["vote"], "other")
                    tallies[option] += 1
                    v.vote(option, leg_vote["councilMember"])

                for option, total in tallies.items():
                    v.set_count(option, total)

                yield v

            yield bill
def scrape(self, session=None):
    """Scrape Vermont bills and resolutions, yielding votes and bills.

    Loads the released/introduced bill lists and the resolution list from
    the legislature's JSON endpoints, then for each item scrapes sponsors,
    text versions, actions and roll-call votes.  Each bill's ``VoteEvent``
    objects are yielded before the ``Bill`` itself.

    Args:
        session: session identifier; defaults to ``self.latest_session()``.

    Raises:
        AssertionError: on an unrecognised bill prefix, an amendment with no
            chamber, an action with an unknown chamber code, or a roll call
            whose outcome cannot be read from its status text.
    """
    HTML_TAGS_RE = r"<.*?>"
    site = "http://legislature.vermont.gov"
    # (prefix, classification, chamber) - order matters: the bare "H."/"S."
    # prefixes must be tested last.  A chamber of None means "read it from
    # the record's Body field".
    bill_prefixes = (
        ("J.R.H.", "joint resolution", "lower"),
        ("J.R.S.", "joint resolution", "upper"),
        ("H.C.R.", "concurrent resolution", "lower"),
        ("S.C.R.", "concurrent resolution", "upper"),
        ("H.R.", "resolution", "lower"),
        ("S.R.", "resolution", "upper"),
        ("PR.", "constitutional amendment", None),
        ("H.", "bill", "lower"),
        ("S.", "bill", "upper"),
    )

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = "{}/bill/loadBillsReleased/{}/".format(site, year_slug)
    bills = json.loads(self.get(bills_url).text)["data"] or []

    bills_url = "{}/bill/loadBillsIntroduced/{}/".format(site, year_slug)
    bills.extend(json.loads(self.get(bills_url).text)["data"] or [])

    resolutions_url = "{}/bill/loadAllResolutionsByChamber/{}/both".format(
        site, year_slug
    )
    bills.extend(json.loads(self.get(resolutions_url).text)["data"] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        number = info["BillNumber"]

        # Identify the bill type and chamber
        for prefix, bill_type, bill_chamber in bill_prefixes:
            if number.startswith(prefix):
                break
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(number))
        if bill_chamber is None:
            if info["Body"] == "H":
                bill_chamber = "lower"
            elif info["Body"] == "S":
                bill_chamber = "upper"
            else:
                raise AssertionError("Amendment not tied to chamber")

        bill_id_original_format = number.replace(".", "").replace(" ", "")
        # put one space back in between type and number
        bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id_original_format)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info["Title"],
            classification=bill_type,
        )
        if "resolution" in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = "{}/bill/status/{}/{}".format(site, year_slug, number)
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            "following-sibling::dd[1]/ul/li"
        )
        sponsor_type = "primary"
        for sponsor in sponsors:
            if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                sponsor_type = "cosponsor"
                continue

            sponsor_name = (
                sponsor.xpath("a/text()")[0]
                .replace("Rep.", "")
                .replace("Sen.", "")
                .strip()
            )
            # Skip a five-character "Less" + one trailing character entry,
            # which appears to be a list-toggle link rather than a person.
            # The old test compared the first FIVE characters with the
            # four-letter "Less", so it could never match.
            is_less_toggle = sponsor_name[:4] == "Less" and len(sponsor_name) == 5
            if sponsor_name and not is_less_toggle:
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type="person",
                    primary=(sponsor_type == "primary"),
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            "following-sibling::dd[1]/ul/li/a |"
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath("text()"):
                bill.add_version_link(
                    note=version.xpath("text()")[0],
                    url=version.xpath("@href")[0].replace(" ", "%20"),
                    media_type="application/pdf",
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        id_match = re.search(
            r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
            lxml.etree.tostring(doc).decode("utf-8"),
        )
        if id_match is None:
            self.warning("Bill {} appears to have no activity".format(number))
            yield bill
            continue
        internal_bill_id = id_match.group(1)

        # Capture actions
        actions_url = "{}/bill/loadBillDetailedStatus/{}/{}".format(
            site, year_slug, internal_bill_id
        )
        actions_resp = self.get(actions_url)
        # Checks if page actually has json posted, and that data is there.
        # A missing Content-Type header is treated like a non-JSON reply.
        actions = ""
        if "json" in (actions_resp.headers.get("Content-Type") or ""):
            actions = json.loads(actions_resp.text)["data"]
        if actions == "":
            # Still emit the bill: its metadata, sponsors and versions were
            # already collected, and it used to be dropped silently here.
            self.warning("Bill {} has no action data".format(number))
            yield bill
            continue

        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}
            status = action["FullStatus"]

            if "Signed by Governor" in status:
                actor = "executive"
            elif action["ChamberCode"] == "H":
                actor = "lower"
            elif action["ChamberCode"] == "S":
                actor = "upper"
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in status:
                action_type = "executive-signature"
            elif "Vetoed by the Governor" in status:
                action_type = "executive-veto"
            elif "Read first time" in status or "Read 1st time" in status:
                action_type = "introduction"
            elif "Reported favorably" in status:
                action_type = "committee-passage-favorable"
            elif actor == "lower" and any(
                x.lower().startswith("aspassed")
                for x in action["keywords"].split(";")
            ):
                action_type = "passage"
                chambers_passed.add("H")
            elif actor == "upper" and any(
                x.lower().startswith((" aspassed", "aspassed"))
                for x in action["keywords"].split(";")
            ):
                action_type = "passage"
                chambers_passed.add("S")
            else:
                action_type = None

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.511
            status_date = action["StatusDate"].replace("/0209", "/2019")

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.754
            if bill_id == "H 754" and session == "2019-2020":
                status_date = status_date.replace("/0202", "/2020")

            # https://legislature.vermont.gov/bill/status/2020/H.942
            if bill_id == "H 942" and session == "2019-2020":
                status_date = status_date.replace("/0200", "/2020")

            action_date = datetime.datetime.strptime(
                status_date, "%m/%d/%Y"
            ).strftime("%Y-%m-%d")
            # strftime doesn't always pad year value (%Y) (https://bugs.python.org/issue32195)
            # and sometimes this state has typos in year part of the StatusDate value
            # which can cause validation errors, so fix leading zeroes if they are missing
            year_width = action_date.find("-")
            if year_width < 4:
                action_date = ("0" * (4 - year_width)) + action_date

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", status),
                date=action_date,
                chamber=actor,
                classification=action_type,
            )

        # Capture votes
        votes_url = "{}/bill/loadBillRollCalls/{}/{}".format(
            site, year_slug, internal_bill_id
        )
        votes = json.loads(self.get(votes_url).text)["data"]
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_url = "{}/bill/loadBillRollCallDetails/{}/{}".format(
                site, year_slug, vote["VoteHeaderID"]
            )
            roll_call = json.loads(self.get(roll_call_url).text)["data"]

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # "<name> of <district>": keep the name.  partition() copes
                # with a missing separator or a district that itself
                # contains " of ", both of which used to raise ValueError.
                member_name = member["MemberName"].partition(" of ")[0].strip()

                if member["MemberVote"] == "Yea":
                    roll_call_yea.append(member_name)
                elif member["MemberVote"] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            full_status = vote["FullStatus"]
            if (
                "Passed -- " in full_status
                # seems like we've seen both
                or "Governor overridden" in full_status
                or "Governor overriden" in full_status
            ):
                did_pass = True
            elif (
                "Failed -- " in full_status
                or "Veto of the Governor sustained" in full_status
            ):
                did_pass = False
            else:
                raise AssertionError(
                    "Roll call vote result is unclear: " + full_status
                )

            # Check vote counts
            yea_count = int(re.search(r"Yeas = (\d+)", full_status).group(1))
            nay_count = int(re.search(r"Nays = (\d+)", full_status).group(1))

            vote_start_date = datetime.datetime.strptime(
                vote["StatusDate"], "%m/%d/%Y"
            ).strftime("%Y-%m-%d")
            motion_text = re.sub(HTML_TAGS_RE, "", full_status).strip()
            vote_identifer = (
                vote["StatusDate"] + "--" + motion_text + "--" + roll_call_url
            )
            vote_to_add = VoteEvent(
                identifier=vote_identifer,
                bill=bill,
                chamber=("lower" if vote["ChamberCode"] == "H" else "upper"),
                start_date=vote_start_date,
                motion_text=motion_text,
                result="pass" if did_pass else "fail",
                classification="passage",
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            vote_to_add.set_count("yes", yea_count)
            vote_to_add.set_count("no", nay_count)
            vote_to_add.set_count("not voting", len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote("not voting", member)

            yield vote_to_add

        # The printable pages are addressed by the session's year slug; a
        # hard-coded "2020" pointed every other session at the wrong year.
        print_url = "https://legislature.vermont.gov/bill/print/{}/{}/{}".format(
            year_slug, bill_id_original_format, "{}"
        )

        # Witnesses:
        #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        bill.add_document_link(
            note="Witness List",
            url=print_url.format("witnesses"),
            media_type="text/html",
        )

        # Conference committee members:
        #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        conferees_doc_link_url = print_url.format("conference")
        page = self.lxmlize(conferees_doc_link_url)
        no_data = page.xpath('//div[@class="no-data"]/text()')
        if not no_data:
            bill.add_document_link(
                note="Conference Committee Members",
                url=conferees_doc_link_url,
                media_type="text/html",
            )

        # Committee meetings:
        #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        bill.add_document_link(
            note="Committee Meetings",
            url=print_url.format("meetings"),
            media_type="text/html",
        )

        yield bill
def scrape_senate_vote(self, session, period, roll_call):
    """Fetch one Senate roll-call XML document and yield it as a ``VoteEvent``.

    Args:
        session: value substituted into the roll-call URL and stored as the
            vote's ``legislative_session``.
        period: value substituted into the roll-call URL alongside ``session``.
        roll_call: roll-call identifier substituted into the URL; the vote
            number read back from the document is what gets recorded.

    Raises:
        IndexError: if a required element is missing from the document.
        KeyError: if the result text or a member's vote code is not present
            in ``self.senate_statuses`` / ``self.vote_codes``.
    """
    url = (
        "https://www.senate.gov/legislative/LIS/roll_call_votes/vote{session}{period}/"
        "vote_{session}_{period}_{vote_id}.xml"
    ).format(session=session, period=period, vote_id=roll_call)

    page = lxml.html.fromstring(self.get(url).content)

    def texts(path):
        """Return the text nodes matched by ``path`` under the vote root."""
        return page.xpath("//roll_call_vote/" + path + "/text()")

    vote_date = texts("vote_date")[0].strip()
    # The timestamp carries an AM/PM marker, so the hour must be parsed with
    # %I: combined with %H, strptime ignores %p and every afternoon vote is
    # recorded twelve hours early.
    when = self._TZ.localize(
        datetime.datetime.strptime(vote_date, "%B %d, %Y, %I:%M %p")
    )

    roll_call = texts("vote_number")[0]
    vote_id = "us-{}-upper-{}".format(when.year, roll_call)

    # note: not everthing the senate votes on is a bill, this is OK
    # non bills include nominations and impeachments
    doc_type = texts("document/document_type")[0]

    # A vote on an amendment is filed under the document being amended.
    amended_document = texts("amendment/amendment_to_document_number")
    if amended_document:
        bill_id = amended_document[0].replace(".", "")
    else:
        bill_id = texts("document/document_name")[0].replace(".", "")

    motion = texts("vote_question_text")[0]
    result = self.senate_statuses[texts("vote_result")[0]]

    vote = VoteEvent(
        start_date=when,
        bill_chamber="lower" if doc_type[0] == "H" else "upper",
        motion_text=motion,
        classification="passage",  # TODO
        result=result,
        legislative_session=session,
        identifier=vote_id,
        bill=bill_id,
        chamber="upper",
    )
    vote.add_source(url)
    vote.extras["senate-rollcall-num"] = roll_call

    yeas = texts("count/yeas")[0]
    nays = texts("count/nays")[0]
    # The absent and present totals are optional; a missing one counts as 0.
    absents = (texts("count/absent") or [0])[0]
    presents = (texts("count/present") or [0])[0]

    vote.set_count("yes", int(yeas))
    vote.set_count("no", int(nays))
    vote.set_count("absent", int(absents))
    vote.set_count("abstain", int(presents))

    for row in page.xpath("//roll_call_vote/members/member"):
        lis_id = row.xpath("lis_member_id/text()")[0]
        name = row.xpath("member_full/text()")[0]
        choice = row.xpath("vote_cast/text()")[0]
        vote.vote(self.vote_codes[choice], name, note=lis_id)

    yield vote
def scrape_house_vote(self, url):
    """Fetch one House roll-call XML document and return it as a ``VoteEvent``.

    Args:
        url: address of the roll-call XML document.

    Returns:
        The populated ``VoteEvent`` (this function returns; it does not yield).

    Raises:
        IndexError: if a required element is missing from the document.
        KeyError: if a member's vote text is not in ``self.vote_codes``.
    """
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    def meta(path):
        """Return the first node matched by ``path`` under vote-metadata."""
        return page.xpath("//rollcall-vote/vote-metadata/" + path)[0]

    vote_date = meta("action-date/text()")
    vote_time = meta("action-time/@time-etz")
    when = self._TZ.localize(
        datetime.datetime.strptime(
            "{} {}".format(vote_date, vote_time), "%d-%b-%Y %H:%M"
        )
    )

    motion = meta("vote-question/text()")

    # Questions that are "agreed to" rather than "passed" also carried;
    # matching only the literal "Passed" recorded them as failures.
    result_text = meta("vote-result/text()").strip()
    result = "pass" if result_text in ("Passed", "Agreed to") else "fail"

    session = meta("congress/text()")
    bill_id = meta("legis-num/text()")
    # for some reason these are "H R 123" which nobody uses, so fix to "HR 123"
    bill_id = re.sub(r"([A-Z])\s([A-Z])", r"\1\2", bill_id)

    roll_call = meta("rollcall-num/text()")
    vote_id = "us-{}-lower-{}".format(when.year, roll_call)

    vote = VoteEvent(
        start_date=when,
        bill_chamber="lower" if bill_id[0] == "H" else "upper",
        motion_text=motion,
        classification="passage",  # TODO
        result=result,
        legislative_session=session,
        identifier=vote_id,
        bill=bill_id,
        chamber="lower",
    )
    vote.add_source(url)
    vote.extras["house-rollcall-num"] = roll_call

    totals = "vote-totals/totals-by-vote/{}/text()"
    yeas = meta(totals.format("yea-total"))
    nays = meta(totals.format("nay-total"))
    nvs = meta(totals.format("not-voting-total"))
    presents = meta(totals.format("present-total"))

    vote.set_count("yes", int(yeas))
    vote.set_count("no", int(nays))
    vote.set_count("not voting", int(nvs))
    vote.set_count("abstain", int(presents))

    for row in page.xpath("//rollcall-vote/vote-data/recorded-vote"):
        bioguide = row.xpath("legislator/@name-id")[0]
        name = row.xpath("legislator/@sort-field")[0]
        choice = row.xpath("vote/text()")[0]
        vote.vote(self.vote_codes[choice], name, note=bioguide)

    return vote
def scrape_votes(self, bill, url):
    """Parse every roll call on a vote page and yield one vote object each.

    A roll call starts at a paragraph matching the "OKLAHOMA HOUSE" /
    "OKLAHOMA STATE SENATE" header.  Its motion, RCS number, date, tallies
    and member names are read from the paragraphs that follow.  Roll calls
    whose RCS number was already seen, or whose tally lines carry trailing
    text, are skipped.  Scraping stops (with a warning) at the first header
    that has no motion text.

    Args:
        bill: the bill the votes are attached to.
        url: address of the vote page; also used as source and dedupe key.
    """
    html = self.get(url).text.replace(u"\xa0", " ")
    page = lxml.html.fromstring(html)

    header_path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    namespaces = {"re": "http://exslt.org/regular-expressions"}
    tally_regex = (
        r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
        r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
    )
    seen_rcs = set()

    for header in page.xpath(header_path, namespaces=namespaces):
        bad_vote = False

        # Each chamber has the motion name on a different line of the file
        is_house = "HOUSE" in header.xpath("string()")
        chamber = "lower" if is_house else "upper"
        motion_index = 8 if is_house else 13

        raw_motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index
        ).strip()
        motion = re.sub(r"\s+", " ", raw_motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return

        # A trailing PASSED/FAILED settles the outcome; otherwise it is
        # derived from the tallies further down.
        outcome = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        passed = None
        if outcome:
            motion = outcome.group(1)
            passed = outcome.group(2) == "PASSED"

        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date_text = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date_text, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                break

            tally = re.match(tally_regex, line)
            if tally:
                label = tally.group(1)
                if label == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif label == "NAYS" and seen_yes:
                    vtype = "no"
                elif label == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and tally.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(tally.group(2))
            elif seen_yes:
                # Name lines: every non-empty piece that is not part of a
                # chamber banner is a member under the current vote type.
                votes[vtype].extend(
                    piece.strip()
                    for piece in line.split(" ")
                    if piece and "HOUSE" not in piece and "SENATE " not in piece
                )

        if bad_vote:
            continue

        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        for option in ("yes", "no", "other"):
            vote.set_count(option, counts[option])
        vote.dedupe_key = url + "#" + rcs
        vote.add_source(url)

        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            # A colon means a tally line was misread as a member name.
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)

        yield vote
def scrape_vote(self, session, bill, vote_url, chamber, date):
    """Scrape a single vote summary page and yield its ``VoteEvent``.

    Nothing is yielded when the page has fewer than three ``<font>`` text
    nodes (a warning is logged), when the motion text contains "withdrawn",
    or when no yes/no counts are found (an info message is logged).

    Args:
        session: accepted for interface compatibility; not read here.
        bill: the bill the vote is attached to.
        vote_url: address of the vote summary page; also the dedupe key.
        chamber: chamber the vote took place in.
        date: naive datetime, localized with ``self._tz``.
    """
    page = self.lxmlize(vote_url)

    font_texts = page.xpath("//font/text()")
    if len(font_texts) < 3:
        self.warning("Vote Summary Page Broken ")
        return
    motion = font_texts[2]

    # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
    # (a timestamp sits where the motion text should be)
    looks_like_timestamp = ("AM" in motion or "PM" in motion) and "/" in motion
    if looks_like_timestamp:
        motion = "Motion not given."

    if "withdrawn" in motion:
        return

    count_path = (
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'{}')]]/font/text()"
    )
    yes_no_counts = page.xpath(count_path.format("Aye"))
    other_counts = page.xpath(count_path.format("Absent"))
    abstain_counts = page.xpath(count_path.format("17C"))

    if not yes_no_counts:
        self.info("Missing yes no count")
        return

    yes_count = int(yes_no_counts[0])
    no_count = int(yes_no_counts[2])
    exc_count = int(other_counts[2])
    absent_count = int(other_counts[0])
    abstain_count = int(abstain_counts[0]) if abstain_counts else 0

    # fix for
    # http://leg.colorado.gov/content/hb19-1029vote65e72e
    if absent_count == -1:
        absent_count = 0

    vote = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if yes_count > no_count else "fail",
        bill=bill,
        classification="passage",
    )
    vote.dedupe_key = vote_url
    for option, total in (
        ("yes", yes_count),
        ("no", no_count),
        ("excused", exc_count),
        ("absent", absent_count),
        ("abstain", abstain_count),
    ):
        vote.set_count(option, total)
    vote.add_source(vote_url)

    vote_abrv = {
        "Y": "yes",
        "N": "no",
        "E": "excused",
        "A": "absent",
        "-": "absent",
        "17C": "abstain",
    }
    rolls = page.xpath(
        "//tr[preceding-sibling::tr/descendant::"
        "td/div/b/font[contains(text(),'Vote')]]"
    )
    for roll in rolls:
        cast = roll.xpath(".//td/div/font/text()")
        if len(cast) == 0:
            continue
        voted = cast[0].strip()
        voter = roll.xpath(".//td/font/text()")[0].strip()
        # Rows coded "V" are not recorded.
        if voted == "V":
            continue
        vote.vote(vote_abrv[voted], voter)

    yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Download a roll-call PDF and return it as a ``VoteEvent``.

    The chamber is taken from the URL, the date from the first line, the
    five totals from the first line mentioning both "Yeas" and "Nays", and
    the motion from the lines just above that totals line.  Member names are
    read from the lines after it, moving to the next vote type each time a
    section keyword is met.

    Args:
        vote_url: address of the PDF.
        bill: bill the vote belongs to; its ``identifier`` is part of the
            vote's ``pupa_id``.

    Raises:
        ValueError: if no totals line is found, or if the number of names
            per vote type differs from the stated totals.
    """
    filename, _response = self.urlretrieve(vote_url)
    lines = convert_pdf(filename, type="text").decode().splitlines()

    chamber = "upper" if "Senate" in vote_url else "lower"

    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # First line carrying both labels is the totals line.
    page_index = next(
        (
            position
            for position, text in enumerate(lines)
            if "Yeas" in text and "Nays" in text
        ),
        None,
    )
    if not page_index:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    vote_types = ["yes", "no", "not voting", "excused", "absent"]
    vote_counts = [0] * 5
    for slot, count in enumerate(re.split(r"\s{2,}", lines[page_index].strip())):
        number, _label = count.split(" ", 1)
        vote_counts[slot] = int(number)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        "Consent Calendar" in text for text in lines[:page_index]
    )
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]

    def has_motion_keyword(text):
        """Tell whether ``text`` mentions any motion keyword (case-insensitive)."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in motion_keywords)

    # Relative LineNumbers to be checked for existence of motion
    for offset in (3, 2, 4, 5):
        if has_motion_keyword(motion):
            break
        motion = re.split(r"\s{2,}", lines[page_index - offset].strip())[0]
    else:
        if not has_motion_keyword(motion):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not has_motion_keyword(motion):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )
    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)
    for vote_type, total in zip(vote_types, vote_counts):
        vote.set_count(vote_type, total)

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    section = 0
    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = [0] * 5

    for raw_line in lines[page_index + 2:]:
        current_line = raw_line.strip()
        if not current_line or "Voting Yea" in current_line:
            continue
        if any(marker in current_line for marker in show_stoppers):
            section += 1
            continue
        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[section] += len(names)
        for name in names:
            vote.vote(vote_types[section], name)

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_votes(self, bill, doc):
    """Yield a ``VoteEvent`` for every row of the bill page's vote table.

    Totals come from the row itself; member names come from the roll-call
    page linked in the row, which is fetched with ``self.get``.

    Args:
        bill: the bill the votes are attached to.
        doc: parsed bill page exposing ``xpath``.

    Raises:
        ValueError: if a row does not hold exactly nine data cells, or its
            timestamp does not match the expected format.
    """
    vote_tr_path = (
        '//h6[@id="vote-header"]'
        '/ancestor::div[contains(@class, "gray-card")]'
        '//div[contains(@class, "card-body")]'
        '//div[@class="row"]'
    )
    # Section label on the roll-call page -> option recorded per member.
    # NOTE(review): "Not Voting" members are recorded as "abstain" while the
    # matching total is stored under "not voting"; kept as-is, confirm intent.
    section_options = (
        ("Ayes (", "yes"),
        ("Noes (", "no"),
        ("Excused Absence (", "absent"),
        ("Not Voting (", "abstain"),
    )

    for vote_row in doc.xpath(vote_tr_path):
        entries = [each.text_content() for each in vote_row.xpath("div")[1:-1:2]]
        # "absent" rather than "abs", so the builtin is not shadowed
        date, subject, rcs, aye, no, nv, absent, exc, _total = entries

        result = vote_row.xpath("div/a")[0]
        result_text = result.text
        result_link = result.get("href")

        # NOTE(review): when rcs contains neither letter, chamber keeps the
        # previous row's value (or is unbound on the first row).
        if "H" in rcs:
            chamber = "lower"
        elif "S" in rcs:
            chamber = "upper"

        # The timestamp carries an AM/PM marker, so the hour must be parsed
        # with %I: combined with %H, strptime ignores %p and afternoon votes
        # are recorded twelve hours early.
        date = eastern.localize(
            dt.datetime.strptime(date.replace(".", ""), "%m/%d/%Y %I:%M %p")
        )
        date = date.isoformat()

        ve = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=subject,
            result="pass" if "PASS" in result_text else "fail",
            bill=bill,
            classification="passage",  # TODO: classify votes
        )
        ve.set_count("yes", int(aye))
        ve.set_count("no", int(no))
        ve.set_count("not voting", int(nv))
        ve.set_count("absent", int(absent))
        ve.set_count("excused", int(exc))
        ve.add_source(result_link)

        data = self.get(result_link).text
        vdoc = lxml.html.fromstring(data)

        # only one table that looks like this
        vote_table = vdoc.xpath("//div[@class='row ncga-row-no-gutters']")

        # Grabs names for how people voted
        for row in vote_table:
            row = row.text_content()
            # A section containing "None" lists no members.
            if "None" in row:
                continue

            for label, vote_type in section_options:
                if label in row:
                    break
            else:
                continue  # not a vote section

            row = row.replace("\n", ";")
            votes_names = row.replace(" ", "").strip().split(";")[2:-1]

            for name in votes_names:
                name = name.replace("\r", "")
                # Resolves names that have '(Chair)' in them
                if "(" in name:
                    name = name[:name.find("(")]
                # Empty pieces (e.g. from blank lines) are not members.
                if not name:
                    continue
                # Adds a space to names inbetween initial and last name
                # eg: L.Johnson -> L. Johnson
                # The length guard keeps very short names from raising
                # IndexError.
                if len(name) > 2 and name[1] == "." and name[2] != " ":
                    name = name[:2] + " " + name[2:]
                ve.vote(vote_type, name)

        yield ve
def scrape_votes(self, vote_url, bill, chamber):
    """Yield a ``VoteEvent`` for each roll call found in a vote PDF.

    The PDF at ``vote_url`` is downloaded, converted to text and scanned line
    by line. Date, time, motion, the printed totals and the name sections
    are accumulated as they are encountered; a ``VoteEvent`` is emitted each
    time a ``NOT VOTING : `` name section is reached, after which the
    collected names are reset for the next roll call.

    Args:
        vote_url: URL of the PDF; also recorded as the vote's source.
        bill: Passed through unchanged as the ``bill`` of each ``VoteEvent``.
        chamber: Passed through unchanged as the ``chamber``.

    Yields:
        VoteEvent: one per roll call found. Nothing is yielded when the PDF
        cannot be downloaded.
    """
    try:
        filename, response = self.urlretrieve(vote_url)
    except scrapelib.HTTPError:
        self.logger.warning("PDF not posted or available")
        return

    # Grabs text from pdf
    pdflines = [
        raw.decode("utf-8") for raw in convert_pdf(filename, "text").splitlines()
    ]
    os.remove(filename)

    def collect_names(start, stop_markers, header_word=None):
        """Gather name tokens from ``pdflines[start:]`` up to a stop marker.

        Every line is read exactly once and scanning ends quietly at the end
        of the document. The old loops re-read their first line (so the
        first line of abstaining names was recorded twice) and raised
        IndexError when the stop marker was missing.
        """
        names = []
        for text in pdflines[start:]:
            if any(marker in text for marker in stop_markers):
                break
            tokens = text.split(" ")
            if header_word is not None and header_word in tokens:
                # A line carrying the bare header word is skipped entirely.
                continue
            names.extend(
                token.strip()
                for token in tokens
                if token and (header_word is None or header_word not in token)
            )
        return names

    vote_date = 0
    # Defaults so a PDF missing one of the markers no longer triggers an
    # UnboundLocalError further down.
    initial_date = ""
    motion = "No Motion Provided"
    yeas = nays = abstained = not_voting = 0
    voters = defaultdict(list)

    for x, line in enumerate(pdflines):
        if re.search(r"(\d+/\d+/\d+)", line):
            initial_date = line.strip()

        if "AM" in line or "PM" in line:
            words = line.split()
            for word in words:
                if ":" not in word:
                    continue
                # Everything before the clock token is the motion text,
                # everything from it onwards is the time of day.
                time_location = words.index(word)
                motion = " ".join(words[:time_location])
                clock = "".join(words[time_location:])
                stamp = datetime.strptime(
                    initial_date + " " + clock, "%m/%d/%Y %I:%M:%S%p"
                )
                vote_date = central.localize(stamp).isoformat()
                # In rare case that no motion is provided
                if not motion:
                    motion = "No Motion Provided"

        # Printed totals (no space before the colon).
        if "YEAS:" in line:
            yeas = int(line.split()[-1])
        if "NAYS:" in line:
            nays = int(line.split()[-1])
        if "ABSTAINED:" in line:
            abstained = int(line.split()[-1])
        if "PASSES:" in line:
            # "PASSES" is handled as the same bucket as "ABSTAINED".
            abstained = int(line.split()[-1])
        if "NOT VOTING:" in line:
            not_voting = int(line.split()[-1])

        # Name sections (space before the colon).
        if "YEAS :" in line:
            voters["yes"].extend(collect_names(x, ("NAYS : ",), "YEAS"))
        if line and "NAYS :" in line:
            voters["no"].extend(
                collect_names(x, ("ABSTAINED : ", "PASSES :"), "NAYS")
            )
        if line and ("ABSTAINED :" in line or "PASSES :" in line):
            # Names start two lines below the header.
            voters["abstain"].extend(collect_names(x + 2, ("NOT VOTING :",)))

        if line and "NOT VOTING : " in line:
            # Estimate how many lines of names follow from the printed
            # total (heuristic kept from the original implementation).
            lines_to_go_through = math.ceil(not_voting / len(line.split()))
            for offset in range(lines_to_go_through):
                if len(pdflines) > (x + offset + 2):
                    for token in pdflines[x + offset + 2].split(" "):
                        if token:
                            voters["not voting"].append(token.strip())

            passed = yeas > (nays + abstained + not_voting)

            ve = VoteEvent(
                chamber=chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            ve.add_source(vote_url)
            for how_voted, how_voted_voters in voters.items():
                for voter in how_voted_voters:
                    if len(voter) > 0:
                        ve.vote(how_voted, voter)
            # Resets voters dictionary before going onto next page in pdf
            voters = defaultdict(list)
            yield ve
def scrape_journal(self, url, chamber, session, date):
    """Yield a ``VoteEvent`` for each recorded vote found in a journal PDF.

    The PDF is downloaded and converted to text; page headers and empty
    lines are dropped. Every line starting with "On the question" that
    mentions "shall" opens a motion, which is extended with following lines
    until the chamber-specific end-of-motion pattern matches. The tallies
    are then read from the same line iterator by ``self.parse_votes``.

    Args:
        url: Journal PDF URL; also recorded as the vote's source.
        chamber: "upper" or "lower"; selects the motion patterns.
        session: Passed through as ``legislative_session``.
        date: Passed through as ``start_date``.

    Yields:
        VoteEvent: one per motion that references a bill.

    Raises:
        ValueError: if a vote is reported as passed without more yes than no
            votes, or if a motion is met for an unknown ``chamber``.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info("Saved journal to %r", filename)
    all_text = convert_pdf(filename, type="text")

    # Normalise typographic dashes and quote look-alikes in one pass.
    punctuation = str.maketrans(
        {"–": "-", "―": '"', "‖": '"', "“": '"', "”": '"'}
    )
    cleaned = [
        raw.decode("utf-8").strip().translate(punctuation)
        for raw in all_text.split(b"\n")
    ]

    # Do not process headers or completely empty lines
    header_res = (
        re.compile(r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"),
        re.compile(r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"),
    )
    lines = iter(
        [
            text
            for text in cleaned
            if text and not any(header.match(text) for header in header_res)
        ]
    )

    bill_re = r"\(\s*([A-Z\.]+\s\d+)\s*\)"

    # The Senate ends its motion text with a vote announcement
    if chamber == "upper":
        end_of_motion = re.compile(r".* the vote was:\s*", re.IGNORECASE)
    # The House may or may not end motion text with a bill name
    elif chamber == "lower":
        end_of_motion = re.compile(
            r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(bill_re), re.IGNORECASE
        )
    else:
        end_of_motion = None

    # in at least one case [SF 457 from 2020] the bill number is followed by )0
    # seemingly just a typo, the optional trailing digit gets around that
    motion_re = re.compile(
        r"""
        ^On\sthe\squestion\s  # Precedes any motion
        "+  # Motion is preceded by a quote mark (or two)
        (Shall\s.+?\??)  # The motion text begins with "Shall"
        \s*(?:\?"?|"|’)\s+  # Motion is followed by a question mark and/or a quote mark
        (?:{})?  # If the vote regards a bill, its number is listed
        {}  # Senate has trailing text
        \s*$
        """.format(
            bill_re,
            r",?.*?the\svote\swas:" if chamber == "upper" else r"\d?",
        ),
        re.VERBOSE | re.IGNORECASE,
    )

    # bill_id -> motion -> count
    motions_per_bill = collections.defaultdict(collections.Counter)

    for line in lines:
        # Only lines that open a "Shall ..." question start a vote.
        if not line.startswith("On the question") or "shall" not in line.lower():
            continue

        if end_of_motion is None:
            # Used to surface as a NameError on the pattern variable.
            raise ValueError("Unknown chamber: {!r}".format(chamber))

        # Motions can wrap over several lines; keep appending until the
        # end-of-motion pattern matches.
        while not end_of_motion.match(line):
            continuation = next(lines, None)
            if continuation is None:
                # A bare next() here would raise StopIteration, which a
                # generator turns into RuntimeError (PEP 479).
                self.warning(
                    "Journal ended in the middle of a motion: {}".format(line)
                )
                return
            line += " " + continuation

        # Get the bill_id
        bill_match = re.search(bill_re, line)
        if bill_match is None:
            self.warning(
                "This motion did not pertain to legislation: {}".format(line)
            )
            continue
        bill_id = bill_match.group(1)

        # Get the motion text
        motion_match = motion_re.search(line)
        if motion_match is None:
            # Previously this crashed with a TypeError further down, after
            # the vote lines had already been consumed.
            self.warning("Could not parse motion text: {}".format(line))
            continue
        motion = motion_match.group(1)

        for word, letter in (("Senate", "S"), ("House", "H"), ("File", "F")):
            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace(".", "")

        bill_chamber = dict(h="lower", s="upper")[bill_id.lower()[0]]
        votes, passed = self.parse_votes(lines)
        has_majority = votes["yes_count"] > votes["no_count"]

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if passed and passed != has_majority:
            self.error("The bill passed without a majority?")
            raise ValueError("invalid vote")

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and has_majority:
            self.logger.warning(
                "The bill got a majority but did not pass. "
                "Could be worth confirming."
            )

        result = "pass" if passed else "fail"

        # check for duplicate motions and number second and up if needed
        motion_text = motion.replace("\xad", "-")
        motions_per_bill[bill_id][motion_text] += 1
        new_count = motions_per_bill[bill_id][motion_text]
        if new_count > 1:
            motion_text += f" #{new_count}"

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion_text,
            result=result,
            classification="passage",
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )

        # add votes and counts
        for vtype in ("yes", "no", "absent", "abstain"):
            vcount = votes["{}_count".format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes["{}_votes".format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)
        yield vote