def asvote(self):
    """Build and return a VoteEvent from this parsed vote record."""
    vote = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result="pass" if self.passed() else "fail",
        classification="passage",
        bill=self.bill,
    )
    # The source URL doubles as a stable identifier (it contains the
    # vote's sequence number).
    vote.pupa_id = self.url
    vote.set_count("yes", self.yes_count())
    vote.set_count("no", self.no_count())
    vote.set_count("other", self.other_count())
    # Record the individual voters under each option.
    for name in self.yes_votes():
        vote.yes(name)
    for name in self.no_votes():
        vote.no(name)
    for name in self.other_votes():
        vote.vote("other", name)
    vote.add_source(self.url)
    return vote
def parse_vote(
    self, bill, journal_entry_number, action, act_chamber, act_date, url
):
    """Yield one VoteEvent parsed from an action string such as
    "PASSED Y38 N2 E0".

    Count tokens are a letter followed by digits: Y=yes, N=no, and
    E/A are pooled into the "other" tally.
    """
    yes = no = other = 0
    for token in action.split():
        # Only tokens shaped like <non-digit><digit>... carry counts.
        if not re.match(r"[\D][\d]", token):
            continue
        if "Y" in token:
            yes = int(token[1:])
        elif "N" in token:
            no = int(token[1:])
        elif "E" in token or "A" in token:
            other += int(token[1:])
    # Prefer the explicit PASSED/FAILED wording; otherwise infer from tallies.
    if "PASSED" in action:
        result = "pass"
    elif "FAILED" in action:
        result = "fail"
    else:
        result = "pass" if yes > no else "fail"
    vote = VoteEvent(
        bill=bill,
        start_date=act_date.strftime("%Y-%m-%d"),
        chamber=act_chamber,
        motion_text=action + " #" + journal_entry_number,
        result=result,
        classification="passage",
    )
    vote.set_count("yes", yes)
    vote.set_count("no", no)
    vote.set_count("other", other)
    vote.add_source(url)
    yield vote
def process_committee_vote(self, committee_action, bill):
    """Build a VoteEvent from a committee action dict, or None on bad data.

    ``committee_action`` must contain "ActionDate" and a "Vote" list of
    {"VoteType": ..., "VoteCount": ...} entries; any type other than
    Yes/No is pooled into the "other" tally.
    """
    try:
        date = committee_action["ActionDate"]
        vote_info = committee_action["Vote"]
    except KeyError:
        self.logger.warning("Committee vote has no data. Skipping.")
        return
    date = self.date_format(date)
    # FIX: initialize every tally — a feed may omit the Yes or No entry
    # entirely, which previously left yes_count/no_count unbound and
    # raised UnboundLocalError at the comparison below.
    yes_count = no_count = other_count = 0
    for entry in vote_info:
        count = 0 if entry["VoteCount"] == "" else int(entry["VoteCount"])
        if entry["VoteType"] == "Yes":
            yes_count = count
        elif entry["VoteType"] == "No":
            no_count = count
        else:
            other_count += count
    result = "pass" if yes_count > no_count else "fail"
    vote = VoteEvent(
        chamber="legislature",
        start_date=date,
        motion_text="Committee Vote",
        result=result,
        classification="committee",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    return vote
def scrape_vote(self, bill, date, url):
    """Yield a VoteEvent scraped from the JSON roll-call record at ``url``."""
    page = self.get(url).json()
    action_log = page["actionLog"]
    location = action_log["FullName"]
    # Map the location name onto an openstates chamber code; anything
    # unrecognized (or empty) is skipped with a warning.
    chamber = None
    if location:
        if "House" in location:
            chamber = "lower"
        elif "Senate" in location:
            chamber = "upper"
        elif "Joint" in location:
            chamber = "legislature"
    if chamber is None:
        self.warning("Bad Vote chamber: '%s', skipping" % location)
        return
    motion = action_log["StatusText"]
    # If we can't detect a motion, skip this vote
    if not motion:
        return
    yes_count = page["Yeas"]
    no_count = page["Nays"]
    excused_count = page["Excused"]
    absent_count = page["Absent"]
    if motion.startswith("Do Pass"):
        vtype = "passage"
    elif motion == "Concurred in amendments":
        vtype = "amendment"
    # commenting out until we add these back to OS-core
    # elif motion == "Veto override":
    #     vtype = "veto-override"
    else:
        vtype = []
    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if yes_count > no_count else "fail",
        classification=vtype,
        bill=bill,
    )
    # differentiate nearly identical votes
    vote.dedupe_key = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)
    for person in page["RollCalls"]:
        option = person["Vote1"]
        member = person["UniqueName"]
        if option in ("Aye", "Yea"):
            vote.yes(member)
        elif option == "Nay":
            vote.no(member)
        elif option == "Excused":
            vote.vote("excused", member)
        elif option == "Absent":
            vote.vote("absent", member)
    yield vote
def parse_bill_actions_table(
    self, bill, action_table, bill_id, session, url, bill_chamber
):
    """Walk the rows of a bill's action table, adding actions to ``bill``
    and yielding a VoteEvent for each row that parses as a vote.

    Each row is expected to hold [date, actor code, action text]; the
    actor code is resolved through self._vote_type_map.
    """
    # vote types that have been reconsidered since last vote of that type
    reconsiderations = set()
    # First child row is a header; skip it.
    for action in action_table.xpath("*")[1:]:
        date = action[0].text_content()
        date = dt.datetime.strptime(date, "%m/%d/%Y").strftime("%Y-%m-%d")
        actor_code = action[1].text_content().upper()
        string = action[2].text_content()
        actor = self._vote_type_map[actor_code]
        act_type, committees = categorize_action(string)
        # XXX: Translate short-code to full committee name for the
        # matcher.
        real_committees = []
        if committees:
            for committee in committees:
                try:
                    committee = self.short_ids[committee]["name"]
                    real_committees.append(committee)
                except KeyError:
                    # Unknown short code: drop the committee silently.
                    pass
        act = bill.add_action(string, date, chamber=actor, classification=act_type)
        for committee in real_committees:
            act.add_related_entity(name=committee, entity_type="organization")
        # parse_vote returns a falsy value for non-vote rows, else a
        # (counts-dict, motion-text) pair.
        vote = self.parse_vote(string)
        if vote:
            v, motion = vote
            # Prefix the motion when this actor's previous vote of the
            # same kind was reconsidered.
            motion_text = (
                ("Reconsider: " + motion) if actor in reconsiderations else motion
            )
            vote = VoteEvent(
                start_date=date,
                chamber=actor,
                bill=bill_id,
                bill_chamber=bill_chamber,
                legislative_session=session,
                motion_text=motion_text,
                result="pass" if "passed" in string.lower() else "fail",
                classification="passage",
            )
            reconsiderations.discard(actor)
            vote.add_source(url)
            vote.set_count("yes", int(v["n_yes"] or 0))
            vote.set_count("no", int(v["n_no"] or 0))
            # NOTE: excused tallies are recorded under "not voting".
            vote.set_count("not voting", int(v["n_excused"] or 0))
            for voter in split_specific_votes(v["yes"]):
                voter = self.clean_voter_name(voter)
                vote.yes(voter)
            # "yes with reservations" voters still count as yes votes.
            for voter in split_specific_votes(v["yes_resv"]):
                voter = self.clean_voter_name(voter)
                vote.yes(voter)
            for voter in split_specific_votes(v["no"]):
                voter = self.clean_voter_name(voter)
                vote.no(voter)
            for voter in split_specific_votes(v["excused"]):
                voter = self.clean_voter_name(voter)
                vote.vote("not voting", voter)
            yield vote
        elif re.search("reconsider", string, re.IGNORECASE):
            # Remember that this actor's next vote is a reconsideration.
            reconsiderations.add(actor)
def handle_page(self):
    """Parse a committee roll-call sheet into a single VoteEvent.

    Votes are read positionally: the column where a member's vote mark
    ("X", "VA", "VC") appears is compared against the Yea/Nay header
    columns to decide which way they voted.
    """
    # Line 6 of the sheet holds "FINAL ACTION: <motion>".
    (_, motion) = self.lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.scraper.warning("Vote appears to be empty")
        return
    # Find the header row that repeats "Yea  Nay" column pairs.
    vote_top_row = [
        self.lines.index(x)
        for x in self.lines
        if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", x)
    ][0]
    # Column boundaries used to classify each member's mark position.
    yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = self.lines[vote_top_row].index("Nay")
    votes = {"yes": [], "no": [], "other": []}
    for line in self.lines[(vote_top_row + 1):]:
        if line.strip():
            member = re.search(
                r"""(?x)
                ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
                ([A-Z][a-z]+          # Name must have lower-case characters
                [\w\-\s]+)            # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)?       # Leadership has an all-caps title
                (?:\s{2,}.*)?         # Name ends when many spaces are seen
                """,
                line,
            ).group(1)
            # sometimes members have trailing X's from other motions in the
            # vote sheet we aren't collecting
            member = re.sub(r"(\s+X)+", "", member)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(re.search(r"^\s+(X|VA|VC)\s+[A-Z][a-z]", line))
            if did_vote:
                # Check where the "X" or vote code is on the page
                vote_column = len(line) - len(line.lstrip())
                if vote_column <= yea_columns_end:
                    votes["yes"].append(member)
                elif vote_column >= nay_columns_begin:
                    votes["no"].append(member)
                else:
                    raise ValueError(
                        "Unparseable vote found for {0} in {1}:\n{2}".format(
                            member, self.url, line
                        )
                    )
            else:
                votes["other"].append(member)
        # End loop as soon as no more members are found
        else:
            break
    # The TOTALS line carries the authoritative yes/no counts.
    totals = re.search(
        r"(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS", self.text
    ).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    result = "pass" if (yes_count > no_count) else "fail"
    vote = VoteEvent(
        start_date=self.kwargs["date"],
        bill=self.kwargs["bill"],
        chamber="upper",
        motion_text=motion,
        classification="committee",
        result=result,
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", len(votes["other"]))
    # set voters
    for vtype, voters in votes.items():
        for voter in voters:
            voter = voter.strip()
            # Removes the few voter names with a ton of extra spaces with VA at the end.
            # Ex: Cruz                                 VA
            if " VA" in voter:
                voter = " ".join(voter.split()[:-2])
            if len(voter) > 0:
                vote.vote(vtype, voter)
    yield vote
def scrape(self, session=None):
    """Scrape DC Council bills for ``session``: actions, documents,
    versions, sponsors, and roll-call votes.

    Yields VoteEvent objects as they are found, then the Bill itself.
    """
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    for category in self._categories:
        leg_listing_url = (
            self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
        )
        resp = requests.post(leg_listing_url, headers=self._headers, verify=False)
        resp.raise_for_status()
        leg_listing = resp.json()

        for leg in leg_listing:
            bill = Bill(
                leg["legislationNumber"],
                legislative_session=session,
                title=leg["title"],
                classification=category["name"],
            )
            bill.add_source(leg_listing_url)
            bill_url = (
                f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
            )
            bill.add_source(bill_url)

            if leg["lawNumber"]:
                bill.extras["lawNumber"] = leg["lawNumber"]

            # Actions
            for hist in leg["legislationHistory"]:
                hist_date = datetime.datetime.strptime(
                    hist["actionDate"], "%b %d, %Y"
                )
                hist_date = self._TZ.localize(hist_date)
                hist_action = hist["actionDescription"]
                # Strip the fused "Other" prefix from e.g. "OtherAmendment".
                if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                    hist_action = hist_action[5:]
                hist_class = self.classify_action(hist_action)

                if "mayor" in hist_action.lower():
                    actor = "executive"
                else:
                    actor = "legislature"
                bill.add_action(
                    hist_action, hist_date, classification=hist_class, chamber=actor
                )

                # Documents with download links
                if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                    download = hist["downloadURL"]
                    if not download.startswith("http"):
                        download = "https://lims.dccouncil.us/" + download

                    mimetype = (
                        "application/pdf" if download.endswith("pdf") else None
                    )

                    is_version = False
                    # figure out if it's a version from type/name
                    possible_version_types = [
                        "SignedAct",
                        "Introduction",
                        "Enrollment",
                        "Engrossment",
                    ]
                    for vt in possible_version_types:
                        if vt.lower() in download.lower():
                            is_version = True
                            doc_type = vt

                    if "amendment" in download.lower():
                        doc_type = "Amendment"

                    if is_version:
                        bill.add_version_link(
                            doc_type,
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )
                    else:
                        bill.add_document_link(
                            hist["actionDescription"],
                            download,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            # Grabs Legislation details
            leg_details_url = (
                self._API_BASE_URL
                + f"LegislationDetails/{leg['legislationNumber']}"
            )
            details_resp = requests.get(
                leg_details_url, headers=self._headers, verify=False
            )
            details_resp.raise_for_status()
            leg_details = details_resp.json()

            # Sponsors
            for i in leg_details["introducers"]:
                name = i["memberName"]
                bill.add_sponsorship(
                    name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # Co-sponsors
            if leg_details["coSponsors"]:
                for cs in leg_details["coSponsors"]:
                    # FIX: read the name from the co-sponsor row; the
                    # original read `i["memberName"]`, the stale loop
                    # variable left over from the introducers loop above.
                    name = cs["memberName"]
                    bill.add_sponsorship(
                        name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=True,
                    )

            # Committee Hearing Doc
            for commHearing in leg_details["committeeHearing"]:
                if commHearing["hearingRecord"]:
                    bill.add_document_link(
                        commHearing["hearingType"],
                        commHearing["hearingRecord"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            for committeeMarkup in leg_details["committeeMarkup"]:
                if committeeMarkup["committeeReport"]:
                    bill.add_document_link(
                        "Committee Markup",
                        committeeMarkup["committeeReport"],
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            # Actions and Votes
            if leg_details["actions"]:
                # To prevent duplicate votes
                vote_ids = []
                for act in leg_details["actions"]:
                    action_name = act["action"]
                    action_date = datetime.datetime.strptime(
                        act["actionDate"][:10], "%Y-%m-%d"
                    )
                    action_date = self._TZ.localize(action_date)

                    if action_name.split()[0] == "Other":
                        action_name = " ".join(action_name.split()[1:])

                    if "mayor" in action_name.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"

                    # Documents and Versions
                    if act["attachment"]:
                        mimetype = (
                            "application/pdf"
                            if act["attachment"].endswith("pdf")
                            else None
                        )

                        is_version = False
                        # FIX: give doc_type a per-attachment default;
                        # previously it could be unbound (NameError) or
                        # stale from an earlier attachment when neither a
                        # version type nor "amendment" matched.
                        doc_type = "Attachment"
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in act["attachment"].lower():
                                is_version = True
                                doc_type = vt

                        if "amendment" in act["attachment"].lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                doc_type,
                                act["attachment"],
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                    # Votes
                    if act["voteDetails"]:
                        result = act["voteDetails"]["voteResult"]
                        if result:
                            status = self._vote_statuses[result.lower()]
                            id_text = (
                                str(leg["legislationNumber"])
                                + "-"
                                + action_name
                                + "-"
                                + result
                            )
                            if id_text not in vote_ids:
                                vote_ids.append(id_text)
                                action_class = self.classify_action(action_name)
                                v = VoteEvent(
                                    identifier=id_text,
                                    chamber=actor,
                                    start_date=action_date,
                                    motion_text=action_name,
                                    result=status,
                                    classification=action_class,
                                    bill=bill,
                                )
                                v.add_source(leg_listing_url)

                                yes_count = no_count = 0
                                absent_count = abstain_count = other_count = 0
                                for leg_vote in act["voteDetails"]["votes"]:
                                    mem_name = leg_vote["councilMember"]
                                    if leg_vote["vote"] == "Yes":
                                        yes_count += 1
                                        v.yes(mem_name)
                                    elif leg_vote["vote"] == "No":
                                        no_count += 1
                                        v.no(mem_name)
                                    elif leg_vote["vote"] == "Absent":
                                        absent_count += 1
                                        v.vote("absent", mem_name)
                                    elif leg_vote["vote"] == "Recused":
                                        v.vote("abstain", mem_name)
                                        abstain_count += 1
                                    elif leg_vote["vote"] == "Present":
                                        v.vote("other", mem_name)
                                        other_count += 1
                                    else:
                                        # In case anything new pops up
                                        other_count += 1
                                        v.vote("other", mem_name)

                                v.set_count("yes", yes_count)
                                v.set_count("no", no_count)
                                v.set_count("absent", absent_count)
                                v.set_count("abstain", abstain_count)
                                v.set_count("other", other_count)
                                yield v

            yield bill
def scrape_senate_vote(self, session, period, roll_call):
    """Scrape one US Senate roll-call XML record and yield a VoteEvent."""
    url = (
        "https://www.senate.gov/legislative/LIS/roll_call_votes/vote{session}{period}/"
        "vote_{session}_{period}_{vote_id}.xml"
    )
    url = url.format(session=session, period=period, vote_id=roll_call)
    page = lxml.html.fromstring(self.get(url).content)
    vote_date = page.xpath("//roll_call_vote/vote_date/text()")[0].strip()
    # The feed uses a 12-hour clock (e.g. "December 19, 2018, 05:21 PM").
    # FIX: use %I (12-hour) instead of %H — with %H the AM/PM marker is
    # matched by %p but ignored, so afternoon votes were 12 hours off.
    when = self._TZ.localize(
        datetime.datetime.strptime(vote_date, "%B %d, %Y, %I:%M %p")
    )
    roll_call = page.xpath("//roll_call_vote/vote_number/text()")[0]
    vote_id = "us-{}-upper-{}".format(when.year, roll_call)

    # note: not everything the senate votes on is a bill, this is OK
    # non bills include nominations and impeachments
    doc_type = page.xpath("//roll_call_vote/document/document_type/text()")[0]

    # Amendment votes point at the underlying document's number.
    if page.xpath(
        "//roll_call_vote/amendment/amendment_to_document_number/text()"
    ):
        bill_id = page.xpath(
            "//roll_call_vote/amendment/amendment_to_document_number/text()"
        )[0].replace(".", "")
    else:
        bill_id = page.xpath(
            "//roll_call_vote/document/document_name/text()"
        )[0].replace(".", "")

    motion = page.xpath("//roll_call_vote/vote_question_text/text()")[0]
    result_text = page.xpath("//roll_call_vote/vote_result/text()")[0]
    result = self.senate_statuses[result_text]

    vote = VoteEvent(
        start_date=when,
        bill_chamber="lower" if doc_type[0] == "H" else "upper",
        motion_text=motion,
        classification="passage",  # TODO
        result=result,
        legislative_session=session,
        identifier=vote_id,
        bill=bill_id,
        chamber="upper",
    )
    vote.add_source(url)
    vote.extras["senate-rollcall-num"] = roll_call

    yeas = page.xpath("//roll_call_vote/count/yeas/text()")[0]
    nays = page.xpath("//roll_call_vote/count/nays/text()")[0]
    # absent/present counts are optional in the XML; default to 0.
    if page.xpath("//roll_call_vote/count/absent/text()"):
        absents = page.xpath("//roll_call_vote/count/absent/text()")[0]
    else:
        absents = 0
    if page.xpath("//roll_call_vote/count/present/text()"):
        presents = page.xpath("//roll_call_vote/count/present/text()")[0]
    else:
        presents = 0

    vote.set_count("yes", int(yeas))
    vote.set_count("no", int(nays))
    vote.set_count("absent", int(absents))
    vote.set_count("abstain", int(presents))

    for row in page.xpath("//roll_call_vote/members/member"):
        lis_id = row.xpath("lis_member_id/text()")[0]
        name = row.xpath("member_full/text()")[0]
        choice = row.xpath("vote_cast/text()")[0]
        vote.vote(self.vote_codes[choice], name, note=lis_id)

    yield vote
def scrape_votes(self, bill, url):
    """Parse Oklahoma journal pages at ``url`` and yield a Vote per
    roll call (deduplicated on the journal's RCS# number).

    Chamber is inferred from the "OKLAHOMA HOUSE / STATE SENATE" header;
    the motion text sits a fixed number of paragraphs below the header
    (a different offset per chamber).
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))
    seen_rcs = set()
    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber = "lower"
            motion_index = 8
        else:
            chamber = "upper"
            motion_index = 13
        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index
        ).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            # NOTE(review): this ends the whole generator, not just this
            # header — confirm that is intended vs. a `continue`.
            self.warning("Motion text not found")
            return
        match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == "PASSED"
        else:
            # No explicit outcome; infer from tallies later.
            passed = None
        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)
        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        seen_yes = False
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            # "*****" marks the end of the roll-call section.
            if "*****" in line:
                break
            regex = (
                r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
            )
            match = re.match(regex, line)
            if match:
                if match.group(1) == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif match.group(1) == "NAYS" and seen_yes:
                    vtype = "no"
                elif match.group(1) == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                # Trailing text after the count means the page layout is
                # unexpected; bail out on this vote.
                if seen_yes and match.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # Member names are separated by runs of spaces.
                for name in line.split("  "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())
        if bad_vote:
            continue
        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])
        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        # RCS number makes the key unique per roll call on this page.
        vote.dedupe_key = url + "#" + rcs
        vote.add_source(url)
        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)
        yield vote
def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):
    """Scrape Assembly floor votes for a bill and yield a VoteEvent per
    vote table found on the page.

    Repeat votes on the same motion get a " - Vote N" suffix so their
    motion texts stay distinct.
    """
    # parse the bill data page, finding the latest html text
    url = assembly_url + "&Floor%26nbspVotes=Y"
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    if "Votes:" in doc.text_content():
        vote_motions = []
        additional_votes_on_motion = 2

        for table in doc.xpath("//table"):
            date = table.xpath('caption/span[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            date = eastern.localize(date)
            date = date.isoformat()

            # spanText layout: [?, identifier, motion part 1, motion part 2, ...]
            spanText = table.xpath("caption/span/text()")
            motion = spanText[2].strip() + spanText[3].strip()

            # Disambiguate repeated motions with an incrementing suffix.
            if motion in vote_motions:
                motion = motion + f" - Vote {additional_votes_on_motion}"
                additional_votes_on_motion += 1
            else:
                vote_motions.append(motion)

            # Tallies are formatted "label: <yes>/<no>".
            votes = (
                table.xpath("caption/span/span")[0].text.split(":")[1].split("/")
            )
            yes_count, no_count = map(int, votes)
            passed = yes_count > no_count
            vote = VoteEvent(
                chamber="lower",
                start_date=date,
                motion_text=motion,
                bill=bill,
                result="pass" if passed else "fail",
                classification="passage",
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)

            absent_count = 0
            excused_count = 0
            # Table cells alternate name / vote code; pair them up.
            tds = table.xpath("tr/td/text()")
            votes = [tds[i : i + 2] for i in range(0, len(tds), 2)]
            # Map the page's vote codes onto openstates vote options.
            vote_dictionary = {
                "Y": "yes",
                "NO": "no",
                "ER": "excused",
                "AB": "absent",
                "NV": "not voting",
                "EL": "other",
            }

            for vote_pair in votes:
                name, vote_val = vote_pair
                vote.vote(vote_dictionary[vote_val], name)
                if vote_val == "AB":
                    absent_count += 1
                elif vote_val == "ER":
                    excused_count += 1

            vote.set_count("absent", absent_count)
            vote.set_count("excused", excused_count)
            vote.add_source(url)
            # Identifier span keeps near-identical votes distinct.
            vote.dedupe_key = url + motion + spanText[1]

            yield vote
def scrape_votes(self, session):
    """Scrape NH roll-call summary and history files and yield Vote objects.

    The summary file supplies one pipe-delimited row per roll call; the
    history file supplies one row per legislator ballot, joined back to
    summary rows by chamber letter + vote number.
    """
    votes = {}
    other_counts = defaultdict(int)
    last_line = []
    vote_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt"
    lines = self.get(vote_url).content.decode("utf-8").splitlines()
    for line in lines:
        if len(line) < 2:
            continue
        if line.strip() == "":
            continue
        line = line.split("|")
        if len(line) < 14:
            # A record was split by a stray newline; try to stitch this
            # fragment onto the previously-saved partial row.
            if len(last_line + line[1:]) == 14:
                # FIX: actually use the stitched row. The original
                # assigned only ``last_line``, discarding the fragment it
                # had just validated (and the short row then broke the
                # positional indexing below).
                line = last_line + line[1:]
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
        session_yr = line[0].replace("\xef\xbb\xbf", "")  # strip UTF-8 BOM
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            time = pytz.timezone("America/New_York").localize(time).isoformat()
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(vote_url)
            vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
            votes[body + vote_num] = vote

    for line in (
        self.get(
            "http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt"
        )
        .content.decode("utf-8")
        .splitlines()
    ):
        if len(line) < 2:
            continue
        # 2016|H|2|330795||Yea|
        # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = line.split("|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = " ".join(self.legislators[employee]["name"].split())
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue
            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[body + v_num].yes(leg)
            elif vote == "Nay":
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote("other", leg)
                # hack-ish, but will keep the vote count sync'd
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count(
                    "other", other_counts[body + v_num]
                )
    for vote in votes.values():
        yield vote
def scrape_vote_history(self, bill, vurl):
    """
    Obtain the information on a vote and link it to the related Bill

    :param bill: related bill
    :param vurl: source for the voteEvent information.
    :return: voteEvent object (yielded, one per qualifying table row)
    """
    html = self.get(vurl).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(vurl)

    # skip first two rows
    for row in doc.xpath("//table/tr")[2:]:
        tds = row.getchildren()
        if len(tds) != 11:
            self.warning("irregular vote row: %s" % vurl)
            continue
        (
            timestamp,
            motion,
            vote,
            yeas,
            nays,
            nv,
            exc,
            pres,
            abst,
            total,
            result,
        ) = tds

        timestamp = timestamp.text.replace("\xa0", " ")
        # NOTE(review): "%H" combined with "%p" means the trailing AM/PM
        # marker is parsed but ignored. Harmless here because only the
        # date portion is used below — confirm before reusing the time.
        timestamp = datetime.datetime.strptime(timestamp, "%m/%d/%Y %H:%M %p")

        yeas = int(yeas.text)
        nays = int(nays.text)
        # Pool not-voting, excused, abstain, and present into "other".
        others = int(nv.text) + int(exc.text) + int(abst.text) + int(pres.text)
        # Sanity check: all options must account for the stated total.
        assert yeas + nays + others == int(total.text)

        if result.text == "Passed":
            passed = "pass"
        else:
            passed = "fail"

        # Chamber is encoded in the rollcall link text ("[H]" = House).
        vote_link = vote.xpath("a")[0]
        if "[H]" in vote_link.text:
            chamber = "lower"
        else:
            chamber = "upper"

        vote = VoteEvent(
            chamber=chamber,  # 'upper' or 'lower'
            start_date=timestamp.strftime("%Y-%m-%d"),  # 'YYYY-MM-DD' format
            motion_text=motion.text,
            result=passed,
            classification="passage",  # Can also be 'other'
            # Provide a Bill instance to link with the VoteEvent...
            bill=bill,
        )
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("other", others)
        vote.add_source(vurl)

        # obtain vote rollcall from pdf and add it to the VoteEvent object
        rollcall_pdf = vote_link.get("href")
        self.scrape_rollcall(vote, rollcall_pdf)
        vote.add_source(rollcall_pdf)

        # The rollcall PDF URL uniquely identifies a vote; skip repeats.
        if rollcall_pdf in self._seen_vote_ids:
            self.warning("duplicate usage of %s, skipping", rollcall_pdf)
            continue
        else:
            self._seen_vote_ids.add(rollcall_pdf)
        vote.dedupe_key = rollcall_pdf  # distinct KEY for each one

        yield vote
def scrape_action_page(self, bill, page):
    """Parse a Massachusetts bill's action table: records each action on
    ``bill`` and builds (currently not yielded) VoteEvents for House
    "Supplement" rows and Senate "Roll Call" rows.
    """
    action_rows = page.xpath("//tbody/tr")
    for row in action_rows:
        action_date = row.xpath("td[1]/text()")[0]
        action_date = datetime.strptime(action_date, "%m/%d/%Y")
        action_year = action_date.year
        action_date = action_date.strftime("%Y-%m-%d")

        # NOTE(review): if td[2] is empty on the first row,
        # ``action_actor`` is unbound at add_action below — confirm the
        # page always supplies an actor on row one.
        if row.xpath("td[2]/text()"):
            action_actor = row.xpath("td[2]/text()")[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]

        action_name = row.xpath("string(td[3])")

        # House votes
        if "Supplement" in action_name:
            actor = "lower"

            if not re.findall(r"(.+)-\s*\d+\s*YEAS", action_name):
                self.warning(
                    "vote {} did not match regex, skipping".format(action_name)
                )
                continue

            vote_action = re.findall(
                r"(.+)-\s*\d+\s*YEAS", action_name
            )[0].strip()
            y = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
            n = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])

            # get supplement number
            n_supplement = int(
                re.findall(r"No\.\s*(\d+)", action_name, re.IGNORECASE)[0]
            )
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result="pass" if y > n else "fail",
                classification="passage",
                bill=bill,
            )
            cached_vote.set_count("yes", y)
            cached_vote.set_count("no", n)

            housevote_pdf = (
                "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                    bill.legislative_session, action_year
                )
            )
            # Fills in individual voter names from the journal PDF.
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)

            cached_vote.dedupe_key = "{}#{}".format(housevote_pdf, n_supplement)

            # XXX: disabled house votes on 8/1 to try to get MA importing again
            # will leaving this in and commented out once we resolve the ID issue
            # yield cached_vote

        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(" -")[0]

            # 2019 H86 Breaks our regex,
            # Ordered to a third reading --
            # see Senate Roll Call #25 and House Roll Call 56
            if "yeas" in action_name and "nays" in action_name:
                try:
                    y, n = re.search(
                        r"(\d+) yeas .*? (\d+) nays", action_name.lower()
                    ).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    # Counts can also appear as "yeas NN ... nays NN".
                    y = int(
                        re.search(
                            r"yeas\s+(\d+)", action_name.lower()
                        ).group(1)
                    )
                    n = int(
                        re.search(
                            r"nays\s+(\d+)", action_name.lower()
                        ).group(1)
                    )

                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=bill,
                )
                cached_vote.set_count("yes", y)
                cached_vote.set_count("no", n)
                rollcall_pdf = "http://malegislature.gov" + row.xpath(
                    "string(td[3]/a/@href)"
                )
                # Fills in individual voter names from the rollcall PDF.
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                cached_vote.dedupe_key = rollcall_pdf
                # XXX: also disabled, see above note
                # yield cached_vote

        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs["classification"],
        )
        for com in attrs.get("committees", []):
            com = com.strip()
            action.add_related_entity(com, entity_type="organization")
def scrape_vote(self, bill, vote_id, session):
    """Fetch one Delaware roll call by id and yield it as a VoteEvent."""
    vote_url = (
        "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId"
    )
    form = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}
    self.info("Fetching vote {} for {}".format(vote_id, bill.identifier))
    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if not page:
        return
    roll = page["Model"]
    chamber = self.chamber_map[roll["ChamberName"]]
    # "7/1/16 01:00 AM"
    date = dt.datetime.strptime(
        roll["TakenAtDateTime"], "%m/%d/%y %I:%M %p"
    ).strftime("%Y-%m-%d")

    # TODO: What does this code mean?
    motion = roll["RollCallVoteType"]

    result = "pass" if roll["RollCallStatus"] == "Passed" else "fail"
    # Everything that isn't an explicit yes/no is pooled into "other".
    other_count = sum(
        int(roll[key])
        for key in (
            "NotVotingCount",
            "VacantVoteCount",
            "AbsentVoteCount",
            "ConflictVoteCount",
        )
    )
    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result=result,
        bill=bill,
        legislative_session=session,
        classification=[],
    )
    vote_pdf_url = (
        "https://legis.delaware.gov"
        "/json/RollCallController/GenerateRollCallPdf"
        "?rollCallId={}&chamberId={}".format(
            vote_id, self.chamber_codes[chamber]
        )
    )
    # Vote URL is just a generic search URL with POSTed data,
    # so provide a different link
    vote.add_source(vote_pdf_url)
    vote.dedupe_key = vote_pdf_url
    vote.set_count("yes", roll["YesVoteCount"])
    vote.set_count("no", roll["NoVoteCount"])
    vote.set_count("other", other_count)

    for row in roll["AssemblyMemberVotes"]:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        try:
            voter = self.legislators_by_short[str(row["ShortName"])]
            name = voter["DisplayName"]
        except KeyError:
            self.warning(
                "could not find legislator short name %s", row["ShortName"]
            )
            name = row["ShortName"]
        code = row["SelectVoteTypeCode"]
        if code == "Y":
            vote.yes(name)
        elif code == "N":
            vote.no(name)
        else:
            vote.vote("other", name)

    yield vote
def scrape_chamber(self, chamber, session):
    """Scrape North Dakota floor votes for one chamber/session.

    Downloads every journal PDF linked from the session's journal index
    page, converts each to text, and walks the text line-by-line with a
    small state machine (``in_motion`` / ``in_vote``) to recover the
    motion name, the per-category voter lists, and the affected bill
    ids, yielding one Vote per bill referenced by each roll call.
    """
    chamber_name = "house" if chamber == "lower" else "senate"
    session_slug = {
        "62": "62-2011",
        "63": "63-2013",
        "64": "64-2015",
        "65": "65-2017",
        "66": "66-2019",
    }[session]

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug,
        chamber_name,
    )
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:
        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib["href"]

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type="text").decode("utf-8")
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:
            # Ignore lines with no information
            if (re.search(chamber_re, line) or re.search(date_re, line)
                    or re.search(page_re, line) or line.strip() == ""):
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion)

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if line.strip().endswith(
                        "VOTING") or line.strip().endswith("VOTING."):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [
                        x.strip() for x in who.split(";") if x.strip() != ""
                    ]
                    results[cur_vote] = who
                    name_may_be_continued = False if line.endswith(
                        ";") else True

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif (cur_vote is not None
                      and re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line)
                      and not any(x in line.lower() for x in [
                          "passed",
                          "adopted",
                          "sustained",
                          "prevailed",
                          "lost",
                          "failed",
                      ])):
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                elif cur_vote is not None and not any(x in line.lower()
                                                      for x in [
                                                          "passed",
                                                          "adopted",
                                                          "sustained",
                                                          "prevailed",
                                                          "lost",
                                                          "failed",
                                                      ]):
                    who = [
                        x.strip() for x in line.split(";") if x.strip() != ""
                    ]
                    # A name split across two physical lines is re-joined here.
                    if name_may_be_continued:
                        results[cur_vote][-1] = (results[cur_vote][-1] + " " +
                                                 who.pop(0))
                    name_may_be_continued = False if line.endswith(
                        ";") else True
                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif any(x in line.lower() for x in [
                        "passed",
                        "adopted",
                        "sustained",
                        "prevailed",
                        "lost",
                        "failed",
                ]):
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if improper information found
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                    if bills == [] or cur_motion.strip() == "":
                        results = {}
                        cur_motion = ""
                        self.warning("No motion or bill name found: " +
                                     "motion name: " + cur_motion + "; " +
                                     "decision text: " + line.strip())
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " + cur_motion)

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other",
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = results[key]
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = (
                        len(res["yes"]),
                        len(res["no"]),
                        len(res["other"]),
                    )
                    chambers = {
                        "H": "lower",
                        "S": "upper",
                        "J": "legislature"
                    }

                    # Almost all of the time, a vote only applies to one bill and this loop
                    # will only be run once.
                    # Some exceptions exist.
                    for bill in bills:
                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        try:
                            bc = chambers[cur_bill_id[0]]
                        except KeyError:
                            bc = "other"

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            VETO_SUPERMAJORITY = 2 / 3
                            passed = yes / (yes + no) > VETO_SUPERMAJORITY
                        else:
                            passed = yes > no

                        # Create a Vote object based on the scraped information
                        vote = Vote(
                            chamber=chamber,
                            start_date=cur_date.strftime("%Y-%m-%d"),
                            motion_text=cur_motion,
                            result="pass" if passed else "fail",
                            legislative_session=session,
                            classification="passage",
                            bill=cur_bill_id,
                            bill_chamber=bc,
                        )

                        vote.add_source(pdf_url)
                        vote.add_source(url)
                        vote.set_count("yes", yes)
                        vote.set_count("no", no)
                        vote.set_count("other", other)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            for voter in res[key]:
                                vote.vote(key, voter)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys.keys():
                            # Need to search for the singular, not plural, in the text
                            # so it can find, for example, " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(category_name[:-1])
                            motion_count = int(
                                re.findall(vote_re, cur_motion)[0])

                            for item in vote.counts:
                                if item["option"] == keys[category_name]:
                                    vote_count = item["value"]

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(
                                        motion_count) +
                                    "differed from roll call counts ({}) ".
                                    format(vote_count) +
                                    "for {0} on {1}".format(
                                        category_name, cur_bill_id))

                                # Trust the motion-text count over the roll call.
                                for item in vote.counts:
                                    if item["option"] == keys[category_name]:
                                        vote_count = motion_count

                        yield vote

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def scrape_journal(self, url, chamber, session, date):
    """Scrape Iowa floor votes from a single chamber journal PDF.

    Converts the PDF to text, strips page headers, then scans for
    "On the question ..." motion paragraphs (which may span several
    physical lines).  For each motion the bill id and motion text are
    extracted with regexes, the roll call is delegated to
    ``self.parse_votes`` (which consumes lines from the same iterator),
    and a VoteEvent is yielded.  Duplicate motions on the same bill are
    disambiguated with a " #N" suffix.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info("Saved journal to %r" % filename)
    all_text = convert_pdf(filename, type="text")

    lines = all_text.split(b"\n")
    lines = [line.decode("utf-8") for line in lines]
    # Normalize typographic dashes/quotes so the regexes below match.
    lines = [
        line.strip()
        .replace("–", "-")
        .replace("―", '"')
        .replace("‖", '"')
        .replace("“", '"')
        .replace("”", '"')
        for line in lines
    ]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter(
        [
            line
            for line in lines
            if not (
                line == ""
                or re.match(header_date_re, line)
                or re.match(header_journal_re, line)
            )
        ]
    )

    # bill_id -> motion -> count
    motions_per_bill = collections.defaultdict(collections.Counter)

    for line in lines:
        # Go through with vote parse if any of
        # these conditions match.
        if not line.startswith("On the question") or "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r"\(\s*([A-Z\.]+\s\d+)\s*\)"

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r".* the vote was:\s*"
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(bill_re)

        # Accumulate wrapped lines until the motion sentence is complete.
        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning(
                "This motion did not pertain to legislation: {}".format(line)
            )
            continue

        # Get the motion text
        motion_re = r"""
            ^On\sthe\squestion\s  # Precedes any motion
            "+  # Motion is preceded by a quote mark (or two)
            (Shall\s.+?\??)  # The motion text begins with "Shall"
            \s*(?:\?"?|"|’)\s+  # Motion is followed by a question mark and/or a quote mark
            (?:{})?  # If the vote regards a bill, its number is listed
            {}  # Senate has trailing text
            \s*$
            """.format(
            # in at least one case [SF 457 from 2020] the bill number is followed by )0
            # seemingly just a typo, this gets around that
            bill_re,
            r",?.*?the\svote\swas:" if chamber == "upper" else r"\d?",
        )
        # print("motion candidate line:", line)
        motion = re.search(motion_re, line, re.VERBOSE | re.IGNORECASE)
        if motion:
            motion = motion.group(1)

        # Normalize e.g. "(S.F. 123)" to "SF 123".
        for word, letter in (("Senate", "S"), ("House", "H"), ("File", "F")):
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace(".", "")

        bill_chamber = dict(h="lower", s="upper")[bill_id.lower()[0]]
        votes, passed = self.parse_votes(lines)

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if not (
            (passed == (votes["yes_count"] > votes["no_count"])) or (not passed)
        ):
            self.error("The bill passed without a majority?")
            raise ValueError("invalid vote")

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and votes["yes_count"] > votes["no_count"]:
            self.logger.warning(
                "The bill got a majority but did not pass. "
                "Could be worth confirming."
            )

        result = ""
        if passed:
            result = "pass"
        else:
            result = "fail"

        # check for duplicate motions and number second and up if needed
        motion_text = re.sub("\xad", "-", motion)
        motions_per_bill[bill_id][motion_text] += 1
        new_count = motions_per_bill[bill_id][motion_text]
        if new_count > 1:
            motion_text += f" #{new_count}"

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion_text,
            result=result,
            classification="passage",
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )

        # add votes and counts
        for vtype in ("yes", "no", "absent", "abstain"):
            vcount = votes["{}_count".format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes["{}_votes".format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)
        yield vote
def scrape_vote(self, session, bill, vote_url, chamber, date):
    """Scrape a Colorado vote-summary page and yield a VoteEvent.

    Counts are read positionally from xpath results ("Aye"/"Absent"
    header rows), so the page layout is load-bearing here.  Withdrawn
    motions are skipped entirely; individual votes marked "V" are
    ignored.
    """
    page = self.lxmlize(vote_url)

    try:
        motion = page.xpath("//font/text()")[2]
    except IndexError:
        self.warning("Vote Summary Page Broken ")
        return

    # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
    if ("AM" in motion or "PM" in motion) and "/" in motion:
        motion = "Motion not given."

    if "withdrawn" not in motion:
        yes_no_counts = page.xpath(
            "//tr/td[preceding-sibling::td/descendant::"
            "font[contains(text(),'Aye')]]/font/text()")
        other_counts = page.xpath(
            "//tr/td[preceding-sibling::td/descendant::"
            "font[contains(text(),'Absent')]]/font/text()")
        abstain_counts = page.xpath(
            "//tr/td[preceding-sibling::td/descendant::"
            "font[contains(text(),'17C')]]/font/text()")

        if not yes_no_counts:
            self.info("Missing yes no count")
            return

        # Positional layout: [0]=ayes, [2]=noes; [0]=absent, [2]=excused.
        yes_count = int(yes_no_counts[0])
        no_count = int(yes_no_counts[2])
        exc_count = int(other_counts[2])
        absent_count = int(other_counts[0])
        abstain_count = 0
        if abstain_counts:
            abstain_count = int(abstain_counts[0])

        # fix for
        # http://leg.colorado.gov/content/hb19-1029vote65e72e
        if absent_count == -1:
            absent_count = 0

        passed = yes_count > no_count
        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.dedupe_key = vote_url
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", exc_count)
        vote.set_count("absent", absent_count)
        vote.set_count("abstain", abstain_count)
        vote.add_source(vote_url)

        rolls = page.xpath("//tr[preceding-sibling::tr/descendant::"
                           "td/div/b/font[contains(text(),'Vote')]]")

        vote_abrv = {
            "Y": "yes",
            "N": "no",
            "E": "excused",
            "A": "absent",
            "-": "absent",
            "17C": "abstain",
        }
        for roll in rolls:
            if len(roll.xpath(".//td/div/font/text()")) > 0:
                voted = roll.xpath(".//td/div/font/text()")[0].strip()
                voter = roll.xpath(".//td/font/text()")[0].strip()
                if voted == "V":
                    continue
                vote.vote(vote_abrv[voted], voter)
        yield vote
def scrape_votes(self, url, motion, date, chamber, bill):
    """Scrape a Mississippi roll-call PDF and yield a VoteEvent.

    Downloads the vote PDF, converts it to text, and walks the lines,
    switching the "current" voter list whenever a section precursor
    (e.g. "Yeas--") is seen.  The motion string is mapped through
    ``self._vote_mapping`` to a (motion_text, passed) pair — a missing
    mapping raises KeyError on purpose so new vote types get noticed.

    Fixes over the previous revision:
    - Precursor detection was case-insensitive (``pc in line.lower()``)
      but removal used case-sensitive ``str.replace``, so capitalized
      headers such as "Yeas--" were never stripped and leaked into
      voter names.  Removal is now case-insensitive too.
    - A line like "None. Total--0" set ``cur_array = None`` and then
      unconditionally called ``cur_array.append``, raising
      AttributeError; the append is now guarded.
    """
    try:
        vote_pdf, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("Can't find vote file {}, skipping".format(url))
        return
    text = convert_pdf(vote_pdf, "text")
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []
    absent_votes = []
    not_voting_votes = []

    # point at array to add names to
    cur_array = None

    precursors = (
        ("yeas--", yes_votes),
        ("nays--", no_votes),
        ("absent or those not voting--", absent_votes),
        ("absent and those not voting--", absent_votes),
        ("not voting--", not_voting_votes),
        ("voting present--", other_votes),
        ("present--", other_votes),
        ("disclaimer", None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.decode().split("\n"))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line.lower():
                cur_array = arr
                # strip the precursor case-insensitively; the PDF text
                # capitalizes headers (e.g. "Yeas--") while pc is lowercase
                line = re.sub(re.escape(pc), "", line, flags=re.IGNORECASE)

        # split names
        for name in line.split(","):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if "None." in name:
                cur_array = None
            match = re.match(r"(.+?)\. Total--.*", name)
            if match:
                # guard: cur_array may already be None (e.g. "None. Total--0")
                if cur_array is not None:
                    cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in (
                "on final passage",
                "Necessary",
                "who would have",
                "being a tie",
                "therefore",
                "Vacancies",
                "a pair",
                "Total-",
                "ATTORNEY",
                "SPEAKER",
                "BOARD",
                "TREASURER",
                "GOVERNOR",
                "ARCHIVES",
                "SECRETARY",
            ):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == ".":
                    name = name[:-1]
                name = self.clean_voter_name(name)
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    absent_count = len(absent_votes)
    not_voting_count = len(not_voting_votes)
    other_count = len(other_votes)

    vote = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if passed else "fail",
        classification="passage",
        bill=bill,
    )
    vote.dedupe_key = url + "#" + bill.identifier
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("absent", absent_count)
    vote.set_count("not voting", not_voting_count)
    vote.set_count("other", other_count)
    vote.add_source(url)
    for yes_vote in yes_votes:
        vote.vote("yes", self.clean_voter_name(yes_vote))
    for no_vote in no_votes:
        vote.vote("no", self.clean_voter_name(no_vote))
    for absent_vote in absent_votes:
        vote.vote("absent", self.clean_voter_name(absent_vote))
    for not_voting_vote in not_voting_votes:
        vote.vote("not voting", self.clean_voter_name(not_voting_vote))
    for other_vote in other_votes:
        vote.vote("other", self.clean_voter_name(other_vote))
    yield vote
def scrape_house_vote(self, url):
    """Parse a US House roll-call XML page and return a VoteEvent.

    Reads the vote metadata (date/time, question, result, congress,
    bill number, roll-call number) via xpath, sets the four published
    totals, then records each member's vote keyed by ``self.vote_codes``
    with the bioguide id attached as a note.
    """
    page = lxml.html.fromstring(self.get(url).content)
    page.make_links_absolute(url)

    vote_date = page.xpath(
        "//rollcall-vote/vote-metadata/action-date/text()")[0]
    vote_time = page.xpath(
        "//rollcall-vote/vote-metadata/action-time/@time-etz")[0]
    when = self._TZ.localize(
        datetime.datetime.strptime("{} {}".format(vote_date, vote_time),
                                   "%d-%b-%Y %H:%M"))

    motion = page.xpath(
        "//rollcall-vote/vote-metadata/vote-question/text()")[0]
    result = page.xpath(
        "//rollcall-vote/vote-metadata/vote-result/text()")[0]
    if result == "Passed":
        result = "pass"
    else:
        result = "fail"

    session = page.xpath(
        "//rollcall-vote/vote-metadata/congress/text()")[0]

    bill_id = page.xpath(
        "//rollcall-vote/vote-metadata/legis-num/text()")[0]
    # for some reason these are "H R 123" which nobody uses, so fix to "HR 123"
    bill_id = re.sub(r"([A-Z])\s([A-Z])", r"\1\2", bill_id)

    roll_call = page.xpath(
        "//rollcall-vote/vote-metadata/rollcall-num/text()")[0]
    vote_id = "us-{}-lower-{}".format(when.year, roll_call)

    vote = VoteEvent(
        start_date=when,
        bill_chamber="lower" if bill_id[0] == "H" else "upper",
        motion_text=motion,
        classification="passage",  # TODO
        result=result,
        legislative_session=session,
        identifier=vote_id,
        bill=bill_id,
        chamber="lower",
    )
    vote.add_source(url)
    vote.extras["house-rollcall-num"] = roll_call

    yeas = page.xpath(
        "//rollcall-vote/vote-metadata/vote-totals/totals-by-vote/yea-total/text()"
    )[0]
    nays = page.xpath(
        "//rollcall-vote/vote-metadata/vote-totals/totals-by-vote/nay-total/text()"
    )[0]
    nvs = page.xpath(
        "//rollcall-vote/vote-metadata/vote-totals/totals-by-vote/not-voting-total/text()"
    )[0]
    presents = page.xpath(
        "//rollcall-vote/vote-metadata/vote-totals/totals-by-vote/present-total/text()"
    )[0]

    vote.set_count("yes", int(yeas))
    vote.set_count("no", int(nays))
    vote.set_count("not voting", int(nvs))
    vote.set_count("abstain", int(presents))

    # vote.yes vote.no vote.vote
    for row in page.xpath("//rollcall-vote/vote-data/recorded-vote"):
        bioguide = row.xpath("legislator/@name-id")[0]
        name = row.xpath("legislator/@sort-field")[0]
        choice = row.xpath("vote/text()")[0]
        vote.vote(self.vote_codes[choice], name, note=bioguide)

    return vote
def scrape(self, session=None):
    """Scrape Virginia bills (and their votes) for one session.

    Loads bulk CSV data over SFTP into ``self._*`` caches, then builds
    a Bill per entry: sponsors, summaries, amendments, fiscal notes,
    actions (classified via ACTION_CLASSIFIERS), roll-call VoteEvents
    tallied from ``self._votes``, and version links.  Special sessions
    skip the data files that are not published for them.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["identifier"]
        self.info("no session specified, using %s", session)
    chamber_types = {
        "H": "lower",
        "S": "upper",
        "G": "executive",
        "C": "legislature",
    }
    # pull the current session's details to tell if it's a special
    session_details = next(
        each for each in self.jurisdiction.legislative_sessions
        if each["identifier"] == session)
    is_special = False
    if ("classification" in session_details
            and session_details["classification"] == "special"):
        is_special = True
    session_id = SESSION_SITE_IDS[session]
    self.init_sftp(session_id)
    bill_url_base = "https://lis.virginia.gov/cgi-bin/"
    if not is_special:
        self.load_members()
        self.load_sponsors()
        self.load_fiscal_notes()
        self.load_summaries()
    self.load_history()
    self.load_votes()
    self.load_bills()
    if not is_special:
        self.load_amendments()
    for bill in self._bills:
        bill = self._bills[bill][0]
        bill_id = bill["bill_id"]
        chamber = chamber_types[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        b = Bill(
            bill_id,
            session,
            bill["bill_description"],
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
        b.add_source(bill_url)
        # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
        # Fill in blanks with 0s
        long_bill_id = bill_id
        if len(bill_id) == 3:
            long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
        elif len(bill_id) == 4:
            long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
        elif len(bill_id) == 5:
            long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]
        # Sponsors
        if long_bill_id not in self._sponsors:
            if "patron_name" in bill and bill["patron_name"].strip() != "":
                b.add_sponsorship(
                    bill["patron_name"],
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for spon in self._sponsors[long_bill_id]:
            if spon["member_name"].strip() == "":
                continue
            sponsor_type = spon["patron_type"]
            if sponsor_type.endswith("Chief Patron"):
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            b.add_sponsorship(
                spon["member_name"],
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )
        # Summary
        summary_texts = self._summaries[long_bill_id]
        for sum_text in summary_texts:
            b.add_abstract(sum_text["summary_text"], sum_text["summary_type"])
        # Amendment docs
        amendments = self._amendments[bill_id]
        for amend in amendments:
            doc_link = (
                bill_url_base +
                f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
            b.add_document_link("Amendment: " + amend["txt_docid"],
                                doc_link,
                                media_type="text/html")
        # fiscal notes
        for fn in self._fiscal_notes[long_bill_id]:
            doc_link = bill_url_base + f"legp604.exe?{session_id}+oth+{fn['refid']}"
            b.add_document_link(
                "Fiscal Impact Statement: " + fn["refid"],
                doc_link.replace(".PDF", "+PDF"),
                media_type="application/pdf",
            )
        # actions with 8-digit number followed by D are version titles too
        doc_actions = defaultdict(list)
        # History and then votes
        for hist in self._history[bill_id]:
            action = hist["history_description"]
            action_date = hist["history_date"]
            date = datetime.datetime.strptime(action_date, "%m/%d/%y").date()
            chamber = chamber_types[action[0]]
            vote_id = hist["history_refid"]
            cleaned_action = action[2:]
            if re.findall(r"\d{8}D", cleaned_action):
                doc_actions[action_date].append(cleaned_action)
            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, cleaned_action):
                    break
            else:
                atype = None
            if atype != SKIP:
                b.add_action(cleaned_action,
                             date,
                             chamber=chamber,
                             classification=atype)
            if len(vote_id) > 0:
                total_yes = 0
                total_no = 0
                total_not_voting = 0
                total_abstain = 0
                for v in self._votes[vote_id]:
                    if v["vote_result"] == "yes":
                        total_yes += 1
                    elif v["vote_result"] == "no":
                        total_no += 1
                    elif v["vote_result"] == "not voting":
                        total_not_voting += 1
                    elif v["vote_result"] == "abstain":
                        total_abstain += 1
                vote = VoteEvent(
                    identifier=vote_id,
                    start_date=date,
                    chamber=chamber,
                    motion_text=cleaned_action,
                    result="pass" if total_yes > total_no else "fail",
                    classification="passage",
                    bill=b,
                )
                vote.set_count("yes", total_yes)
                vote.set_count("no", total_no)
                vote.set_count("not voting", total_not_voting)
                vote.set_count("abstain", total_abstain)
                vote_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                )
                vote.add_source(vote_url)
                for v in self._votes[vote_id]:
                    vote.vote(v["vote_result"], v["member_id"])
                yield vote
        # Versions
        for version in bill["text_docs"]:
            # Checks if abbr is blank as not every bill has multiple versions
            if version["doc_abbr"]:
                version_url = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                version_date = datetime.datetime.strptime(
                    version["doc_date"], "%m/%d/%y").date()
                # version text will default to abbreviation provided in CSV
                # but if there is an unambiguous action from that date with
                # a version, we'll use that as the document title
                version_text = version["doc_abbr"]
                if len(doc_actions[version["doc_date"]]) == 1:
                    version_text = doc_actions[version["doc_date"]][0]
                b.add_version_link(
                    version_text,
                    version_url,
                    date=version_date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
        yield b
def scrape(self, session=None):
    """Scrape Vermont bills, resolutions, and their roll-call votes.

    Pulls bill lists from the legislature's JSON endpoints, classifies
    each by its number prefix, scrapes the status page for sponsors and
    text versions, then fetches detailed-status (actions) and roll-call
    JSON for each bill, yielding VoteEvents and finally the Bill.
    Bills with no internal id (no activity) are yielded immediately.
    """
    HTML_TAGS_RE = r"<.*?>"

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)["data"] or []

    bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
        year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)["data"] or [])

    resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
        year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)["data"] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber
        # (prefix checks are ordered longest-first so e.g. "J.R.H." is
        # tested before "H.")
        if info["BillNumber"].startswith("J.R.H."):
            bill_type = "joint resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("J.R.S."):
            bill_type = "joint resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.C.R."):
            bill_type = "concurrent resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("H.R."):
            bill_type = "resolution"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S.R."):
            bill_type = "resolution"
            bill_chamber = "upper"
        elif info["BillNumber"].startswith("PR."):
            bill_type = "constitutional amendment"
            if info["Body"] == "H":
                bill_chamber = "lower"
            elif info["Body"] == "S":
                bill_chamber = "upper"
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info["BillNumber"].startswith("H."):
            bill_type = "bill"
            bill_chamber = "lower"
        elif info["BillNumber"].startswith("S."):
            bill_type = "bill"
            bill_chamber = "upper"
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info["BillNumber"]))

        bill_id_original_format = (info["BillNumber"].replace(".", "").replace(
            " ", ""))
        bill_id = bill_id_original_format
        # put one space back in between type and number
        bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info["Title"],
            classification=bill_type,
        )
        if "resolution" in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
            year_slug, info["BillNumber"])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            "following-sibling::dd[1]/ul/li")
        sponsor_type = "primary"
        for sponsor in sponsors:
            if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                sponsor_type = "cosponsor"
                continue
            sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                "Rep.", "").replace("Sen.", "").strip())
            if sponsor_name and not (sponsor_name[:5] == "Less"
                                     and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type="person",
                    primary=(sponsor_type == "primary"),
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            "following-sibling::dd[1]/ul/li/a |"
            '//ul[@class="bill-path"]//a')
        for version in versions:
            if version.xpath("text()"):
                bill.add_version_link(
                    note=version.xpath("text()")[0],
                    url=version.xpath("@href")[0].replace(" ", "%20"),
                    media_type="application/pdf",
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode("utf-8"),
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info["BillNumber"]))
            yield bill
            continue

        # Capture actions
        actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
            year_slug, internal_bill_id)
        actions_json = self.get(actions_url)
        # Checks if page actually has json posted
        if "json" in actions_json.headers.get("Content-Type"):
            actions = json.loads(actions_json.text)["data"]
            # Checks to see if any data is actually there
            if actions == "":
                continue
        else:
            continue
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}

            if "Signed by Governor" in action["FullStatus"]:
                actor = "executive"
            elif action["ChamberCode"] == "H":
                actor = "lower"
            elif action["ChamberCode"] == "S":
                actor = "upper"
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action["FullStatus"]:
                # assert chambers_passed == set("HS")
                action_type = "executive-signature"
            elif "Vetoed by the Governor" in action["FullStatus"]:
                action_type = "executive-veto"
            elif ("Read first time" in action["FullStatus"]
                  or "Read 1st time" in action["FullStatus"]):
                action_type = "introduction"
            elif "Reported favorably" in action["FullStatus"]:
                action_type = "committee-passage-favorable"
            elif actor == "lower" and any(
                    x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("H")
            elif actor == "upper" and any(
                    x.lower().startswith(" aspassed")
                    or x.lower().startswith("aspassed")
                    for x in action["keywords"].split(";")):
                action_type = "passage"
                chambers_passed.add("S")
            else:
                action_type = None

            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.511
            action["StatusDate"] = action["StatusDate"].replace(
                "/0209", "/2019")
            # Manual fix for data error in
            # https://legislature.vermont.gov/bill/status/2020/H.754
            if bill_id == "H 754" and session == "2019-2020":
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0202", "/2020")
            # https://legislature.vermont.gov/bill/status/2020/H.942
            if bill_id == "H 942" and session == "2019-2020":
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0200", "/2020")

            action_date = datetime.datetime.strftime(
                datetime.datetime.strptime(action["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            # strftime doesn't always pad year value (%Y) (https://bugs.python.org/issue32195)
            # and sometimes this state has typos in year part of the StatusDate value
            # which can cause validation errors, so fix leading zeroes if they are missing
            if action_date.find("-") < 4:
                action_date = ("0" * (4 - action_date.find("-"))) + action_date

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                date=action_date,
                chamber=actor,
                classification=action_type,
            )

        # Capture votes
        votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)["data"]
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote["VoteHeaderID"]
            roll_call_url = ("http://legislature.vermont.gov/bill/"
                             "loadBillRollCallDetails/{0}/{1}".format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)["data"]

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                (member_name, _district) = member["MemberName"].split(" of ")
                member_name = member_name.strip()

                if member["MemberVote"] == "Yea":
                    roll_call_yea.append(member_name)
                elif member["MemberVote"] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote["FullStatus"]
                    # seems like we've seen both
                    or "Governor overridden" in vote["FullStatus"]
                    or "Governor overriden" in vote["FullStatus"]):
                did_pass = True
            elif ("Failed -- " in vote["FullStatus"]
                  or "Veto of the Governor sustained" in vote["FullStatus"]):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear: " +
                                     vote["FullStatus"])

            # Check vote counts
            yea_count = int(
                re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
            nay_count = int(
                re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

            vote_start_date = datetime.datetime.strftime(
                datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                "%Y-%m-%d",
            )
            motion_text = re.sub(HTML_TAGS_RE, "", vote["FullStatus"]).strip()
            vote_identifer = (vote["StatusDate"] + "--" + motion_text + "--" +
                              roll_call_url)
            vote_to_add = VoteEvent(
                identifier=vote_identifer,
                bill=bill,
                chamber=("lower" if vote["ChamberCode"] == "H" else "upper"),
                start_date=vote_start_date,
                motion_text=motion_text,
                result="pass" if did_pass else "fail",
                classification="passage",
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            vote_to_add.set_count("yes", yea_count)
            vote_to_add.set_count("no", nay_count)
            vote_to_add.set_count("not voting", len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote("not voting", member)

            yield vote_to_add

        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/witnesses".format(
            bill_id_original_format)
        bill.add_document_link(note="Witness List",
                               url=witnesses_doc_link_url,
                               media_type="text/html")

        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/conference".format(
            bill_id_original_format)
        page = self.lxmlize(conferees_doc_link_url)
        no_data = page.xpath('//div[@class="no-data"]/text()')
        if not no_data:
            bill.add_document_link(
                note="Conference Committee Members",
                url=conferees_doc_link_url,
                media_type="text/html",
            )

        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/meetings".format(
            bill_id_original_format)
        bill.add_document_link(
            note="Committee Meetings",
            url=meetings_doc_link_url,
            media_type="text/html",
        )

        yield bill
def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                action_text):
    """Scrape one Alabama roll-call results page and yield a VoteEvent.

    The page renders as a flat list of text nodes which the loop below
    consumes pairwise: a member name followed by that member's vote code
    (Y/N/P/A).

    :param bill: Bill object the vote attaches to
    :param vote_chamber: chamber code; first char keys self.CHAMBERS
    :param bill_id: instrument id used in the query string
    :param vote_id: roll-call vote number used in the query string
    :param vote_date: start date for the VoteEvent
    :param action_text: motion text for the VoteEvent
    """
    url = ("http://alisondb.legislature.state.al.us/Alison/"
           "GetRollCallVoteResults.aspx?"
           "VOTE={0}&BODY={1}&INST={2}&SESS={3}".format(
               vote_id, vote_chamber, bill_id, self.session_id))
    doc = lxml.html.fromstring(self.get(url=url).text)

    # Vote code -> list of member names.
    voters = {"Y": [], "N": [], "P": [], "A": []}

    voters_and_votes = doc.xpath("//table/tr/td/font/text()")
    capture_vote = False
    name = ""
    for item in voters_and_votes:
        if capture_vote:
            # This node is the vote code for the name captured last pass.
            capture_vote = False
            if name:
                voters[item].append(name)
        else:
            capture_vote = True
            name = item
            # Discard placeholder rows (vacant seats, totals, blanks).
            if (name.endswith(", Vacant")
                    or name.startswith("Total ")
                    or not name.strip()):
                name = ""

    # Check name counts against totals listed on the site
    total_yea = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
    if total_yea:
        total_yea = int(total_yea[0].split(":")[-1])
        assert total_yea == len(voters["Y"]), "Yea count incorrect"
    else:
        total_yea = len(voters["Y"])

    total_nay = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
    if total_nay:
        total_nay = int(total_nay[0].split(":")[-1])
        assert total_nay == len(voters["N"]), "Nay count incorrect"
    else:
        total_nay = len(voters["N"])

    total_absent = doc.xpath(
        '//*[starts-with(text(), "Total Absent")]/text()')
    if total_absent:
        total_absent = int(total_absent[0].split(":")[-1])
        assert total_absent == len(voters["A"]), "Absent count incorrect"
    # "Other" lumps pass (P) and absent (A) votes together.
    total_other = len(voters["P"]) + len(voters["A"])

    vote = VoteEvent(
        chamber=self.CHAMBERS[vote_chamber[0]],
        start_date=vote_date,
        motion_text=action_text,
        result="pass" if total_yea > total_nay else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", total_yea)
    vote.set_count("no", total_nay)
    vote.set_count("other", total_other)
    vote.add_source(url)
    for member in voters["Y"]:
        vote.vote("yes", member)
    for member in voters["N"]:
        vote.vote("no", member)
    for member in voters["A"] + voters["P"]:
        vote.vote("other", member)
    yield vote
def handle_page(self):
    """Parse one Florida floor-vote PDF page (self.lines) and yield a VoteEvent.

    The layout is positional: the motion, the totals line, and the first
    voter line sit at fixed offsets from the top of the page; the offsets
    are shifted when the motion spills onto neighboring lines.
    """
    # Default line offsets within the page.
    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    if len(self.lines) < 2:
        self.scraper.warning("Bad PDF! " + self.url)
        return

    motion = self.lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if not motion and not self.lines[MOTION_INDEX - 1].startswith("Calendar Page:"):
        # Motion actually lives one line earlier; shift every offset up.
        motion = self.lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"
        # The motion may continue on up to two following lines; append
        # them and push the later offsets down accordingly.
        for _extra_motion_line in range(2):
            MOTION_INDEX += 1
            if self.lines[MOTION_INDEX].strip():
                motion = "{}, {}".format(motion, self.lines[MOTION_INDEX].strip())
                TOTALS_INDEX += 1
                VOTE_START_INDEX += 1
            else:
                break

    # Totals line, e.g. "  Yeas - 112  Nays - 3  Not Voting - 2".
    # NOTE(review): re.search() returning None here would raise
    # AttributeError — presumably the layout guarantees a match.
    (yes_count, no_count, nv_count) = [
        int(x) for x in re.search(
            r"^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$",
            self.lines[TOTALS_INDEX],
        ).groups()
    ]
    result = "pass" if yes_count > no_count else "fail"

    vote = VoteEvent(
        start_date=self.kwargs["date"],
        chamber=self.kwargs["chamber"],
        bill=self.kwargs["bill"],
        motion_text=motion,
        result=result,
        classification="passage",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)

    for line in self.lines[VOTE_START_INDEX:]:
        if not line.strip():
            break  # a blank line ends the voter list

        # Strip officer titles so the name regex lines up.
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for vtype, member in re.findall(
                r"\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*", line):
            vtype = {
                "Y": "yes",
                "N": "no",
                "EX": "excused",
                "AV": "abstain"
            }[vtype]
            member = member.strip()
            vote.vote(vtype, member)

    # check totals line up
    yes_count = no_count = nv_count = 0
    for vc in vote.counts:
        if vc["option"] == "yes":
            yes_count = vc["value"]
        elif vc["option"] == "no":
            no_count = vc["value"]
        else:
            nv_count += vc["value"]
    # Subtract one per recorded name; a perfect parse zeroes everything.
    for vr in vote.votes:
        if vr["option"] == "yes":
            yes_count -= 1
        elif vr["option"] == "no":
            no_count -= 1
        else:
            nv_count -= 1
    if yes_count != 0 or no_count != 0:
        raise ValueError("vote count incorrect: " + self.url)

    if nv_count != 0:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.scraper.info(
            "Votes don't add up; looking for additional ones")
        for line in self.lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            # Uncoded names are pushed far right (8+ spaces of padding).
            for member in re.findall(r"\s{8,}([A-Z][a-z\'].*?)-\d{1,3}",
                                     line):
                member = member.strip()
                vote.vote("not voting", member)
    yield vote
def _scrape_upper_chamber(self, session):
    """Scrape Missouri Senate journal PDFs and yield floor VoteEvents.

    Journal links come from the session's journal-list page; each PDF is
    converted to text and scanned line by line with a small state machine
    (``in_vote``) that collects member names and per-category tallies.

    :param session: session string; first four chars are the year
    """
    if int(session[:4]) >= 2016:
        if len(session) == 4:
            # regular session
            url = "http://www.senate.mo.gov/%sinfo/jrnlist/default.aspx" % (
                session[-2:],
            )
        else:
            # special session
            url = "http://www.senate.mo.gov/%sinfo/jrnlist/%sJournals.aspx" % (
                session[-4:-2],
                session[-2:],
            )
    else:
        url = "http://www.senate.mo.gov/%sinfo/jrnlist/journals.aspx" % (
            session[-2:])

    # Journal section headers mapped to vote options.
    vote_types = {
        "YEAS": "yes",
        "NAYS": "no",
        "Absent with leave": "other",
        "Absent": "other",
        "Vacancies": "other",
    }

    page = self.lxmlize(url)
    journs = page.xpath("//table")[0].xpath(".//a")
    for a in journs:
        pdf_url = a.attrib["href"]
        data = self._get_pdf(pdf_url).decode()
        lines = data.split("\n")

        # Per-journal parser state.
        in_vote = False
        cur_date = None
        vote_type = "other"
        cur_bill = ""
        cur_motion = ""
        bc = None  # bill chamber: lower / upper / legislature
        vote = {}  # vote option -> list of member names
        counts = collections.defaultdict(int)  # vote option -> tally

        for line in lines:
            line = line.strip()

            # The first date line found in the journal fixes the vote date.
            if cur_date is None:
                matches = re.findall(date_re, line)
                if matches != []:
                    date = matches[0]
                    date = "%s, %s %s, %s" % date
                    date = dt.datetime.strptime(date, "%A, %B %d, %Y")
                    cur_date = date

            # A motion line opens a new vote block.
            matches = re.findall(motion_re, line)
            if matches != []:
                cont = False
                for x in matches:
                    if "vote" in x.lower():
                        cur_motion = x
                        bill = re.findall(bill_re, x)
                        if bill != []:
                            bc = {
                                "H": "lower",
                                "S": "upper",
                                "J": "legislature"
                            }[bill[0][0]]
                            cur_bill = "%s%s%s %s" % bill[0]
                        in_vote = True
                        cont = True
                if cont:
                    continue

            if in_vote:
                if is_vote_end(line):
                    # Emit the collected vote, then reset for the next one.
                    in_vote = False
                    yes, no, other = counts["yes"], counts["no"], counts[
                        "other"]
                    if bc is None:
                        continue
                    v = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        motion_text=cur_motion,
                        result="pass" if yes > no else "fail",
                        legislative_session=session,
                        classification="passage",
                        bill=cur_bill,
                        bill_chamber=bc,
                    )
                    v.add_source(url)
                    v.add_source(pdf_url)
                    v.set_count("yes", yes)
                    v.set_count("no", no)
                    v.set_count("other", other)
                    for key in vote:
                        for person in vote[key]:
                            v.vote(key, person)
                    yield v
                    vote = {}
                    counts = collections.defaultdict(int)
                    continue

                # Skip page furniture (running header and date lines).
                if "Journal of the Senate" in line:
                    continue
                if re.match(
                        r".*(Monday|Tuesday|Wednesday|Thursday|Friday|"
                        r"Saturday|Sunday), .* \d+, \d+.*",
                        line,
                ):
                    continue

                # A category header ("YEAS", "NAYS", ...) switches the
                # bucket that subsequent name lines are appended to.
                found = False
                rl = None
                for vote_type in list(vote_types):
                    if line.lower().startswith(vote_type.lower()):
                        if "none" in line.lower():
                            continue
                        if "Senator" in line and "Senators" not in line:
                            # Header and a single name share the line; keep
                            # the remainder for the name parsing below.
                            line = self._clean_line(line)
                            line = line[len(vote_type):]
                            line = line.replace("-Senator ", "")
                            rl = line
                        vote_category = vote_types[vote_type]
                        found = True
                        if vote_category not in vote:
                            vote[vote_category] = []
                if found and rl is None:
                    continue
                elif rl:
                    line = rl

                # NOTE(review): vote_category is only assigned once a
                # header line has been seen; a name line arriving first
                # would raise NameError — verify journals always lead
                # with a category header.
                names = [self._clean_line(x) for x in line.strip().split()]
                if names == []:
                    continue

                # A trailing "Name-<count>" token carries the tally for
                # this category.
                lname = names[-1]
                lname = lname.rsplit("-", 1)
                if len(lname) > 1:
                    person, count = lname
                    if count.isdigit() is False:
                        continue
                    names.pop(-1)
                    names.append(person)
                    counts[vote_category] += int(count)

                for name in names:
                    vote[vote_category].append(name)
def handle_page(self):
    """Yield a committee VoteEvent parsed from a vote-detail page.

    Pages that carry no vote totals are skipped entirely; otherwise the
    header spans supply date, totals, committee and action, and the
    vote-list items supply individual member votes.
    """
    # Totals only appear on pages that actually record a vote.
    totals_present = self.doc.xpath(
        '//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()')
    if not totals_present:
        return

    (raw_date, ) = self.doc.xpath('//span[contains(@id, "lblDate")]/text()')
    vote_date = format_datetime(
        datetime.datetime.strptime(raw_date, "%m/%d/%Y %I:%M:%S %p"),
        "US/Eastern")

    yea_total = int(
        self.doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
    nay_total = int(
        self.doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
    missed_total = int(
        self.doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0])

    (committee, ) = self.doc.xpath(
        '//span[contains(@id, "lblCommittee")]/text()')
    (action, ) = self.doc.xpath('//span[contains(@id, "lblAction")]/text()')

    vote = VoteEvent(
        start_date=vote_date,
        bill=self.kwargs["bill"],
        chamber="lower",
        motion_text="{} ({})".format(action, committee),
        result="pass" if yea_total > nay_total else "fail",
        classification="committee",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yea_total)
    vote.set_count("no", nay_total)
    vote.set_count("not voting", missed_total)

    for row in self.doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not row.text_content().strip():
            continue
        (member, ) = row.xpath("span[2]//text()")
        (cast, ) = row.xpath("span[1]//text()")
        member = member.strip()
        if cast == "Y":
            vote.yes(member)
        elif cast == "N":
            vote.no(member)
        elif cast == "-":
            vote.vote("not voting", member)
        elif re.search(r"\([YN]\)", cast):
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            continue
        else:
            raise ValueError("Unknown vote type found: {}".format(cast))
    yield vote
def scrape_votes(self, session, zip_url):
    """Parse New Hampshire roll-call dumps from the bill-status zip file.

    Vote totals come from ``tblrollcallsummary.txt`` and individual
    legislator votes from ``tblrollcallhistory.txt`` (both pipe-delimited);
    one vote object is yielded per roll call whose bill is known.

    :param session: session year string; rows from other sessions skipped
    :param zip_url: source URL recorded on each vote
    """
    votes = {}
    # Running tally of "other" (neither Yea nor Nay) votes per roll call.
    # BUGFIX: the counter was previously reset to 0 for every history
    # line, so the recorded "other" count was always 0 or 1 no matter
    # how many members actually voted "other".
    other_counts = collections.defaultdict(int)
    last_line = []
    # NOTE(review): assumes self.zf.open() yields decoded text lines —
    # a raw zipfile.ZipFile.open() would yield bytes; confirm the wrapper.
    for line in self.zf.open("tblrollcallsummary.txt"):
        if line.strip() == "":
            continue
        line = line.split("|")
        if len(line) < 14:
            # Record split across physical lines; try to stitch it back
            # together with the previously-seen partial record.
            if len(last_line + line[1:]) == 14:
                line = last_line
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
        session_yr = line[0]
        body = line[1]  # "H" or "S"
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    for line in self.zf.open("tblrollcallhistory.txt"):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date = line.split("|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]["name"]
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            key = body + v_num
            if key not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % key)
                continue

            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[key].yes(leg)
            elif vote == "Nay":
                votes[key].no(leg)
            else:
                votes[key].other(leg)
                other_counts[key] += 1
            # Keep the recorded total in sync with the accumulated tally
            # (0 when the roll call has no "other" votes at all).
            votes[key].set_count("other", other_counts[key])

    for vote in votes.values():
        yield vote
def scrape_vote(self, url, session):
    """Parse a Maryland floor-vote PDF and return a populated VoteEvent.

    Returns None (after logging where appropriate) when the PDF does not
    describe a usable vote: an empty/foreign document, a vetoed
    prior-session item, or a page missing either the bill id or motion.

    :param url: PDF URL; also used as the dedupe key
    :param session: legislative session identifier for the VoteEvent
    """
    fname, _ = self.urlretrieve(url)
    text = convert_pdf(fname, type="text").decode()
    lines = text.splitlines()
    chamber = "upper" if "senate" in url else "lower"

    if "Maryland" not in text:
        self.warning(f"empty vote from {url}")
        return

    date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

    section = "preamble"
    motion = None
    bill_id = None
    how = None
    voters = defaultdict(list)

    for line in lines:
        if section == "preamble":
            if "vetoed" in line.lower():
                self.warning(
                    f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                )
                return
            possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
            if possible_bill_id:
                bill_id = possible_bill_id[0]

            # preamble has metadata, then motion, then counts. our process then is to
            # store the last line as the motion, but if the last line looks like a
            # continuation, append it to the prior line
            line = line.strip()
            counts = re.findall(
                r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                line,
            )
            if counts:
                yes_count, no_count, nv_count, excused_count, absent_count = counts[
                    0]
                yes_count = int(yes_count)
                no_count = int(no_count)
                nv_count = int(nv_count)
                excused_count = int(excused_count)
                absent_count = int(absent_count)
                section = "votes"
            elif line and line != "(Const)":
                # questions seem to be split across two lines
                if line.endswith("?"):
                    motion = motion + " " + line
                else:
                    motion = line
        elif section == "votes":
            if line.startswith("Voting Yea"):
                how = "yes"
            elif line.startswith("Voting Nay"):
                how = "no"
            elif line.startswith("Not Voting"):
                how = "not voting"
            elif line.startswith("Excused from Voting"):
                how = "excused"
            elif line.startswith("Excused (Absent)"):
                how = "absent"
            elif how:
                names = re.split(r"\s{2,}", line)
                voters[how].extend(names)

    if not bill_id and not motion:
        return
    elif bill_id and not motion:
        self.warning(
            f"got {bill_id} but no motion, not registering as a vote")
        # BUGFIX: previously this branch fell through and registered the
        # vote anyway (with motion_text=None), contradicting the warning
        # above; return like the sibling branch below.
        return
    elif motion and not bill_id:
        self.warning(
            f"got {motion} but no bill_id, not registering as a vote")
        return

    # bleh - result not indicated anywhere
    result = "pass" if yes_count > no_count else "fail"
    bill_chamber = "upper" if bill_id.startswith("S") else "lower"
    date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        result=result,
        classification="passage",
        motion_text=motion,
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    # URL includes sequence ID, will be unique
    vote.dedupe_key = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)
    for how, names in voters.items():
        for name in names:
            name = name.strip().replace("*", "")
            # Skip footer artifacts that get swept into the name columns.
            if name and "COPY" not in name and "Indicates Vote Change" not in name:
                vote.vote(how, name)
    check_counts(vote, raise_error=True)
    return vote
def handle_page(self):
    """Parse a Virginia bill detail page: abstract, versions, amendments,
    actions, and roll-call votes embedded in the action history.

    Mutates self.obj (the Bill) for everything except votes, which are
    yielded (via add_pupa_id).
    """
    summary = self.doc.xpath("/".join([
        '//h4[starts-with(text(), "SUMMARY")]',
        "/following-sibling::p",
        "text()",
    ]))
    if summary and summary[0].strip():
        self.obj.add_abstract(abstract=summary[0].strip(), note="summary")

    # versions
    for va in self.doc.xpath(
            '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(u" \xa0")
        desc.rsplit(" ", 1)[0]  # chop off last part
        # NOTE(review): the rsplit result above is discarded — desc is
        # used unmodified below; confirm whether the trim was intended.
        link = va.get("href")
        if "http" not in link:
            link = "{}{}".format(BASE_URL, link)
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()

        # budget bills in VA are searchable but no full text available
        if "+men+" in link:
            logging.getLogger("va").warning(
                "not adding budget version, bill text not available")
        else:
            # VA duplicates reprinted bills, lets keep the original name
            self.obj.add_version_link(desc,
                                      link,
                                      date=date,
                                      media_type="text/html",
                                      on_duplicate="ignore")

    # amendments
    for va in self.doc.xpath(
            '//h4[text()="AMENDMENTS"]/following-sibling::ul[1]/li/a[1]'):
        version_name = va.xpath("string(.)")
        # Only keep amendments that were actually adopted or engrossed.
        if (("adopted" in version_name.lower()
             or "engrossed" in version_name.lower())
                and "not adopted" not in version_name.lower()
                and "not engrossed" not in version_name.lower()):
            version_url = va.xpath("@href")[0]
            self.obj.add_version_link(
                version_name,
                version_url,
                media_type="text/html",
                on_duplicate="ignore",
            )

    # actions
    seen_next = False
    for ali, next_ali in pairwise(
            self.doc.xpath(
                '//h4[text()="HISTORY"]/following-sibling::ul[1]/li')):
        # If we've used this action text before, we don't need to parse it again
        if seen_next:
            seen_next = False
            continue

        date, action = ali.text_content().split(u" \xa0")
        try:
            actor, action = action.split(": ", 1)
        except ValueError:
            assert any([
                action.startswith("{}:".format(x))
                for x in self.actor_map.keys()
            ]), "Unparseable action text found: '{}'".format(action)
            logging.getLogger("va").warning(
                "Skipping apparently-null action: '{}'".format(action))
            continue

        # Bill history entries purely in parentheses tend to be
        # notes and not actions, so we'll skip them.
        if action.startswith("(") and action.endswith(")"):
            continue

        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(), "%m/%d/%y").date()

        # if action ends in (##-Y ##-N) remove that part
        vrematch = self.vote_strip_re.match(action)
        # The following conditional logic is messy to handle
        # Virginia's crazy and inconsistently formatted bill
        # histories. Someone less harried and tired than me
        # could probably make this much cleaner. - alo
        if vrematch:
            vote_action, y, n, o = vrematch.groups()
            y = int(y)
            n = int(n)
            # Set default count for "other" votes to 0. We have to
            # do this explicitly as it's excluded from the action
            # text when there were no abstentions (the only type of
            # "other" vote encountered thus far).
            o = int(o) if o else 0
            vote_url = ali.xpath("a/@href")

            # Finds relevant information from the current action if
            # vote count encountered, then searches for the presence
            # of identical counts in the next entry (we assume that
            # it's probably there). If matching votes are found, it
            # merges data in both to create a unified vote record.
            #
            # This is because Virginia usually publishes two lines
            # of history data for a single vote, without guaranteed
            # order, so we unsafely attempt to match on identical
            # vote counts in the next line.
            vote = VoteEvent(
                start_date=date,
                chamber=actor,
                motion_text=vote_action.strip(),
                result="pass" if y > n else "fail",
                classification="passage",
                bill=self.obj,
            )
            vote.set_count("yes", y)
            vote.set_count("no", n)
            vote.set_count("other", o)
            try:
                # next_ali may lack the expected structure; the except
                # below also covers an AttributeError from it being
                # unusable here.
                next_action = (
                    next_ali.text_content().split(" \xa0")[1].split(
                        ": ", 1)[1])
            except (AttributeError, ValueError):
                next_action = ""
            vrematch_next = self.vote_strip_re.match(next_action)
            if vrematch_next:
                vote_action_next, y_next, n_next, o_next = vrematch_next.groups(
                )
                y_next = int(y_next)
                n_next = int(n_next)
                o_next = int(o_next) if o_next else 0
                vote_url_next = next_ali.xpath("a/@href")
                # Check that the vote counts match and that only one action
                # has a URL (otherwise, they're probably different votes).
                if [y_next, n_next, o_next
                    ] == [y, n, o] and len(vote_url) != len(vote_url_next):
                    seen_next = True
                    if not vote_url:
                        vote_url = vote_url_next
                    else:
                        vote.motion_text = vote_action_next.strip()
                        action = next_action
            if vote_url:
                # Pull individual member votes from the roll-call page.
                list(
                    self.scrape_page_items(VotePage,
                                           url=vote_url[0],
                                           obj=vote))
                vote.add_source(vote_url[0])
            else:
                vote.add_source(self.url)
            yield from add_pupa_id(vote)

        # categorize actions
        for pattern, atype in ACTION_CLASSIFIERS:
            if re.match(pattern, action):
                break
        else:
            atype = None

        # if matched a 'None' atype, don't add the action
        if atype != SKIP:
            self.obj.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland vote-tally PDF into a VoteEvent.

    Locates the Yeas/Nays totals line, works backwards from it to find
    the motion text, then reads the member-name sections that follow.

    :raises ValueError: when the totals line is missing, or when the
        number of names collected disagrees with the printed totals
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()

    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # Find the totals line ("... Yeas ... Nays ...").
    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]

    # NOTE(review): `if page_index:` treats a match on line 0 as "not
    # found" — presumably totals never appear on the very first line.
    if page_index:
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            # Each column looks like "<number> <label>".
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}",
                                          lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion

    # Probe candidate lines above the totals until one contains a
    # recognizable motion keyword; the for/else handles fallbacks when
    # none of them do.
    for i in motion_lines:
        if any(
            motion_keyword in motion.lower()
            for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        if not any(
            motion_keyword in motion.lower()
            for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(
            motion_keyword in motion.lower()
            for motion_keyword in motion_keywords
        ):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])

    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue

        # A section delimiter advances to the next vote-option bucket.
        if any(show_stopper in current_line
               for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue

        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_votes(self, bill, doc):
    """Scrape North Carolina vote rows for a bill and yield VoteEvents.

    Each row in the bill page's vote card carries the totals plus a link
    to a detail page, which is fetched to attribute individual members.
    """
    vote_tr_path = ('//h6[@id="vote-header"]'
                    '/ancestor::div[contains(@class, "gray-card")]'
                    '//div[contains(@class, "card-body")]'
                    '//div[@class="row"]')
    for vote_row in doc.xpath(vote_tr_path):
        # Every other div cell in the row carries a value.
        entries = [
            each.text_content() for each in vote_row.xpath("div")[1:-1:2]
        ]
        date, subject, rcs, aye, no, nv, abs, exc, total = entries
        result = vote_row.xpath("div/a")[0]
        result_text = result.text
        result_link = result.get("href")

        # NOTE(review): chamber stays unbound if rcs contains neither
        # "H" nor "S" — presumably one of them always appears.
        if "H" in rcs:
            chamber = "lower"
        elif "S" in rcs:
            chamber = "upper"

        date = eastern.localize(
            dt.datetime.strptime(date.replace(".", ""),
                                 "%m/%d/%Y %H:%M %p"))
        date = date.isoformat()

        ve = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=subject,
            result="pass" if "PASS" in result_text else "fail",
            bill=bill,
            classification="passage",  # TODO: classify votes
        )
        ve.set_count("yes", int(aye))
        ve.set_count("no", int(no))
        ve.set_count("not voting", int(nv))
        ve.set_count("absent", int(abs))
        ve.set_count("excused", int(exc))
        ve.add_source(result_link)

        data = self.get(result_link).text
        vdoc = lxml.html.fromstring(data)

        # only one table that looks like this
        vote_table = vdoc.xpath("//div[@class='row ncga-row-no-gutters']")

        # Grabs names for how people voted
        for row in vote_table:
            votes_names = []
            row = row.text_content()
            if "None" in row:
                vote_type = "Nope"
            elif "Ayes (" in row:
                row = row.replace("\n", ";")
                votes_names = row.replace(" ", "").strip().split(";")[2:-1]
                vote_type = "yes"
            elif "Noes (" in row:
                row = row.replace("\n", ";")
                votes_names = row.replace(" ", "").strip().split(";")[2:-1]
                vote_type = "no"
            elif "Excused Absence (" in row:
                row = row.replace("\n", ";")
                votes_names = row.replace(" ", "").strip().split(";")[2:-1]
                vote_type = "absent"
            elif "Not Voting (" in row:
                row = row.replace("\n", ";")
                votes_names = row.replace(" ", "").strip().split(";")[2:-1]
                vote_type = "abstain"
            else:
                vote_type = "Not a vote"

            if votes_names:
                for name in votes_names:
                    name = name.replace("\r", "")
                    # Resolves names that have '(Chair)' in them
                    if "(" in name:
                        name = name[:name.find("(")]
                    # Adds a space to names inbetween initial and last name
                    # eg: L.Johnson -> L. Johnson
                    # NOTE(review): a one- or two-character name would
                    # raise IndexError on name[1]/name[2] — confirm names
                    # are always initial + surname.
                    if name[1] == "." and name[2] != " ":
                        name = name[:2] + " " + name[2:]
                    ve.vote(vote_type, name)
        yield ve