def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield one Vote built from the first table of a committee vote page.

    Voter names are taken from the text nodes of table cells: the first
    cell of the chosen row holds the "yes" names and, when the row has at
    least three cells, its third and fifth cells hold the "no" and "other"
    names.

    Args:
        bill: bill the vote is attached to.
        actor: passed through as the vote's chamber.
        date: passed through as the vote's start date.
        motion: motion text of the vote.
        page: parsed page exposing ``xpath``.
        url: source URL recorded on the vote.
        uniqid: stored as the vote identifier and in ``extras``.

    Yields:
        A single Vote; it is a "pass" when there are more yes names than
        no names.
    """
    table = page.xpath("//table")[0]
    row = table.xpath(".//tr")[0]
    if row[0].text_content() == "Votes:":
        # New website layout: the names live in the third row instead.
        row = table.xpath(".//tr")[2]

    cells = row.xpath(".//td")
    if len(cells) < 3:
        yes_cell, no_cell, other_cell = cells[0], None, None
    else:
        yes_cell, _, no_cell, _, other_cell = cells[:5]

    def tally(cell, option):
        """Collect the non-blank text nodes of *cell* as voter names."""
        if cell is None:
            return {"type": None, "count": None, "votes": []}
        names = [text.strip() for text in cell.xpath("./text()") if text.strip()]
        return {"type": option, "count": len(names), "votes": names}

    tallies = {
        "yes": tally(yes_cell, "yes"),
        "no": tally(no_cell, "no"),
        "other": tally(other_cell, "other"),
    }

    yes_count = tallies["yes"]["count"]
    # A missing cell reports a count of None; treat that as zero.
    no_count = tallies["no"]["count"] or 0
    other_count = tallies["other"]["count"] or 0

    outcome = "pass" if (yes_count > no_count) else "fail"
    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        identifier=str(uniqid),
        result=outcome,
        classification="passage",
        bill=bill,
    )
    vote.extras = {"_vote_id": uniqid}
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    for option, block in tallies.items():
        for voter in block["votes"]:
            vote.vote(option, voter)

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Yield one Vote parsed from a plain-text YEAS/NAYS/ABSENT roll call.

    The page at *url* is fetched and searched with a single regular
    expression that captures the three counts and the text following each
    heading; voter names are obtained by splitting that text on runs of two
    or more whitespace characters.

    Args:
        bill: bill the vote belongs to; *url* is also added as a bill source.
        actor: used as the vote chamber when it is "upper" or "lower",
            otherwise the chamber is the empty string.
        date: passed through as the vote's start date.
        motion: motion text of the vote.
        url: roll call page to fetch.
        uniqid: stored (as a string) as the vote identifier.

    Yields:
        A single Vote; it is a "pass" when yeas outnumber nays.
    """
    text = self.get(url).text
    bill.add_source(url)

    roll_call_re = re.compile(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    found = roll_call_re.search(text)

    yes_count = int(found.group(1))
    no_count = int(found.group(3))
    other_count = int(found.group(6))

    vote = Vote(
        chamber=actor if actor in ("upper", "lower") else "",
        start_date=date,
        motion_text=motion,
        result="pass" if yes_count > no_count else "fail",
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    def names_in(group_index):
        """Return the non-empty names found in one captured text group."""
        pieces = re.split(r"\s{2,}", found.group(group_index).strip())
        return [piece for piece in pieces if piece]

    yes_names = names_in(2)
    no_names = names_in(4)
    other_names = names_in(7)

    for voter in yes_names:
        vote.yes(voter)
    for voter in no_names:
        vote.no(voter)
    for voter in other_names:
        vote.vote("other", voter)

    yield vote
def scrape_votes(self, bill):
    """Yield a Vote for every roll call the web service lists for *bill*.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``. Absent and excused counts are added together and
    reported as "other".

    Args:
        bill: bill whose roll calls are requested.

    Yields:
        One Vote per ``wa:RollCall`` element; it is a "pass" only when the
        yea count exceeds the nay and other counts combined.
    """
    number = bill.identifier.split()[1]
    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (number, self.biennium)
    )
    response = self.get(url)
    doc = lxml.etree.fromstring(response.content)

    agency_to_chamber = {"House": "lower", "Senate": "upper"}

    for roll_call in xpath(doc, "//wa:RollCall"):
        motion = xpath(roll_call, "string(wa:Motion)")
        seq_no = xpath(roll_call, "string(wa:SequenceNumber)")

        # Only the date part of the timestamp (before the "T") is kept.
        day = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
        day = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
        absent = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
        excused = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = absent + excused

        # An agency other than House/Senate raises KeyError here.
        chamber = agency_to_chamber[xpath(roll_call, "string(wa:Agency)")]

        passed = yes_count > (no_count + other_count)
        vote = Vote(
            chamber=chamber,
            start_date=day,
            motion_text="{} (#{})".format(motion, seq_no),
            result="pass" if passed else "fail",
            bill=bill,
            classification=[],
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)

        for member in xpath(roll_call, "wa:Votes/wa:Vote"):
            who = xpath(member, "string(wa:Name)")
            # The element name really is spelled "VOte" in the query.
            how = xpath(member, "string(wa:VOte)")
            if how == "Yea":
                vote.yes(who)
            elif how == "Nay":
                vote.no(who)
            else:
                vote.vote("other", who)

        yield vote
def parse_vote(self, chamber, bill, row, action_text, action_date, url):
    """Yield one Vote built from the labelled tallies inside *row*.

    Four counts are read from ``div`` elements whose label contains
    "A Favor", "En Contra", "Abstenido" and "Ausente".

    Args:
        chamber: chamber hint handed to ``self.parse_vote_chamber``.
        bill: bill the vote belongs to.
        row: parsed element exposing ``xpath`` that holds the tallies.
        action_text: used as the motion text; the vote is classified
            "passage" when it contains "Votación Final", else "other".
        action_date: passed through as the vote's start date.
        url: source URL recorded on the vote.

    Yields:
        A single Vote; it is a "pass" when yes outnumbers no.
    """
    # Insertion order matters: the lookups run yes, no, abstain, absent.
    tally_paths = {
        "yes": './/div[label[contains(text(), "A Favor")]]/span[contains(@class,"smalltxt")]/text()',
        "no": './/div[label[contains(text(), "En Contra")]]/span[contains(@class,"smalltxt")]/text()',
        "abstain": './/div[label[contains(text(), "Abstenido")]]/span[contains(@class,"smalltxt")]/text()',
        "absent": './/div[label[contains(text(), "Ausente")]]/span[contains(@class,"smalltxt")]/text()',
    }
    tallies = {}
    for option, path in tally_paths.items():
        tallies[option] = int(row.xpath(path)[0])

    vote_chamber = self.parse_vote_chamber(chamber, action_text)

    if u"Votación Final" in action_text:
        classification = "passage"
    else:
        classification = "other"

    outcome = "pass" if (tallies["yes"] > tallies["no"]) else "fail"
    vote = Vote(
        chamber=vote_chamber,
        start_date=action_date,
        motion_text=action_text,
        result=outcome,
        bill=bill,
        classification=classification,
    )
    vote.add_source(url)
    for option in ("yes", "no", "absent", "abstain"):
        vote.set_count(option, tallies[option])

    # We don't want to add the attached vote PDF as a version,
    # so add it as a document.
    # TODO: maybe this should be set as the source?
    self.parse_version(bill, row, is_document=True)

    yield vote
def scrape_votes(self, bill, url):
    """Yield a Vote for each distinct roll call found on the page at *url*.

    Every ``<p>`` whose text matches "OKLAHOMA HOUSE" or "OKLAHOMA STATE
    SENATE" starts a roll call. The motion, RCS number, date, tallies and
    voter names are read from the ``<p>`` siblings that follow it.

    Args:
        bill: bill the votes belong to.
        url: page containing the roll calls; also recorded as the source.

    Yields:
        One Vote per RCS number not seen before on this page. Roll calls
        whose tally lines carry unexpected trailing text are skipped with a
        warning.

    Raises:
        Exception: if a name recorded under "no" contains a colon.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

    seen_rcs = set()

    # Compiled once up front; both patterns are applied for every roll call.
    outcome_re = re.compile(r"^(.*) (PASSED|FAILED)$")
    tally_re = re.compile(
        r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
        r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
    )

    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False

        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber = "lower"
            motion_index = 8
        else:
            chamber = "upper"
            motion_index = 13

        motion = header.xpath("string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            # Note: this stops the whole page, not just this roll call.
            self.warning("Motion text not found")
            return

        outcome = outcome_re.match(motion)
        if outcome:
            motion = outcome.group(1)
            passed = outcome.group(2) == "PASSED"
        else:
            # Decided from the tallies further down.
            passed = None

        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        # Nothing is recorded as a voter name until the YEAS tally is seen.
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()

            if "*****" in line:
                break

            tally = tally_re.match(line)
            if tally:
                if tally.group(1) == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif tally.group(1) == "NAYS" and seen_yes:
                    vtype = "no"
                elif tally.group(1) == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and tally.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(tally.group(2))
            elif seen_yes:
                # NOTE(review): this separator literal was broken across two
                # physical lines in the source, leaving an unterminated
                # string. It is reconstructed here as three spaces; the
                # '"SENATE " in name' test below shows pieces may contain a
                # single space, so it cannot be one space. Confirm against a
                # real roll call page.
                for name in line.split("   "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        vote.dedupe_key = url + "#" + rcs
        vote.add_source(url)

        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            # A colon means a tally line was mistaken for a name.
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)

        yield vote
def scrape_votes(self, session, zip_url):
    """Yield a Vote for each roll call in the archive that matches a known bill.

    ``tblrollcallsummary.txt`` supplies one pipe-delimited record per roll
    call (session, body, vote number, timestamp, bill id, yeas, nays, ...,
    motion at index 11). ``tblrollcallhistory.txt`` supplies one record per
    legislator per roll call.

    Args:
        session: only records whose first field equals this are used.
        zip_url: recorded as the source of every vote.

    Yields:
        The votes collected from the summary file, after the individual
        votes from the history file have been attached.
    """

    def as_text(raw):
        # Members opened from a zipfile.ZipFile yield bytes on Python 3;
        # str input is passed through untouched.
        # NOTE(review): UTF-8 is assumed, matching the HTTP variant of this
        # scraper -- confirm the archive's encoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8")
        return raw

    votes = {}
    # "Other" tallies per roll call. A single counter reset on every row
    # could never exceed 1.
    other_counts = {}
    last_line = []

    for raw in self.zf.open("tblrollcallsummary.txt"):
        line = as_text(raw)
        if line.strip() == "":
            continue
        line = line.split("|")
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # This fragment completes the previous short line. Use the
                # stitched record the length check describes, and forget the
                # pending fragment so it cannot be reused.
                line = last_line + line[1:]
                last_line = []
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
                if len(line) < 12:
                    # Too short to hold the motion field; wait for the
                    # continuation instead of failing on a missing index.
                    continue

        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    for raw in self.zf.open("tblrollcallhistory.txt"):
        line = as_text(raw)
        if not line.strip():
            continue
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date = line.split("|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]["name"]
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            key = body + v_num
            if key not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % key)
                continue

            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[key].yes(leg)
            elif vote == "Nay":
                votes[key].no(leg)
            else:
                # Same call the other vote scrapers in this file use.
                votes[key].vote("other", leg)
                other_counts[key] = other_counts.get(key, 0) + 1
                votes[key].set_count("other", other_counts[key])

    for vote in votes.values():
        yield vote
def scrape_votes(self, session):
    """Yield a Vote for each roll call in the published text files.

    ``RollCallSummary.txt`` supplies one pipe-delimited record per roll call
    (session, body, vote number, timestamp, bill id, yeas, nays, ..., motion
    at index 11). ``RollCallHistory.txt`` supplies one eight-field record
    per legislator per roll call.

    Args:
        session: only records whose first field equals this are used.

    Yields:
        The votes collected from the summary file, after the individual
        votes from the history file have been attached.
    """
    votes = {}
    other_counts = defaultdict(int)
    last_line = []
    vote_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt"
    lines = self.get(vote_url).content.decode("utf-8").splitlines()

    for line in lines:
        if len(line) < 2:
            continue

        if line.strip() == "":
            continue

        line = line.split("|")
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # This fragment completes the previous short line. Use the
                # stitched record the length check describes, and forget the
                # pending fragment so it cannot be reused.
                line = last_line + line[1:]
                last_line = []
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
                if len(line) < 12:
                    # Too short to hold the motion field; wait for the
                    # continuation instead of failing on a missing index.
                    continue

        # After decoding as UTF-8 a byte-order mark shows up as U+FEFF, not
        # as the three raw bytes, so strip both spellings.
        session_yr = line[0].replace("\xef\xbb\xbf", "").replace("\ufeff", "")
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            time = pytz.timezone("America/New_York").localize(time).isoformat()
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(vote_url)
            vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
            votes[body + vote_num] = vote

    history_lines = (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt")
        .content.decode("utf-8")
        .splitlines()
    )
    for line in history_lines:
        if len(line) < 2:
            continue

        # 2016|H|2|330795||Yea|
        # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = line.split("|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                # Collapse repeated whitespace inside the stored name.
                leg = " ".join(self.legislators[employee]["name"].split())
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            key = body + v_num
            if key not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % key)
                continue

            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[key].yes(leg)
            elif vote == "Nay":
                votes[key].no(leg)
            else:
                votes[key].vote("other", leg)
                # hack-ish, but will keep the vote count sync'd
                other_counts[key] += 1
                votes[key].set_count("other", other_counts[key])

    for vote in votes.values():
        yield vote
def scrape_chamber(self, chamber, session):
    """Yield Votes parsed from every journal PDF linked for one chamber.

    Each PDF is converted to text and read line by line with a small state
    machine: a "ROLL CALL" line starts the motion text, a line ending in
    "VOTING" (or "VOTING.") ends it and starts the name lists, and a line
    containing an outcome word (passed, adopted, ...) closes the roll call.
    One Vote is yielded per bill referenced in the closing text.

    Args:
        chamber: "lower" selects the house journal; any other value selects
            the senate journal. Also used as each vote's chamber.
        session: key into the session slug table; an unknown value raises
            KeyError.

    Yields:
        Vote objects, one per bill per roll call.

    Raises:
        AssertionError: if motion and vote capture are active at the same
            time, or if "YEAS:"/"NAYS:" ends up inside the motion text.
    """
    chamber_name = "house" if chamber == "lower" else "senate"
    session_slug = {
        "62": "62-2011",
        "63": "63-2013",
        "64": "64-2015",
        "65": "65-2017",
        "66": "66-2019",
    }[session]

    # Words that mark the closing ("decision") line of a roll call.
    outcome_words = (
        "passed",
        "adopted",
        "sustained",
        "prevailed",
        "lost",
        "failed",
    )
    # Bill references in the closing text; findall yields 4-tuples of groups.
    bill_re = re.compile(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)")
    # Journal heading -> vote option.
    keys = {
        "YEAS": "yes",
        "NAYS": "no",
        "ABSENT AND NOT VOTING": "other",
    }
    chambers = {"H": "lower", "S": "upper", "J": "legislature"}
    VETO_SUPERMAJORITY = 2 / 3

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug,
        chamber_name,
    )
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:

        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib["href"]

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text; remove the downloaded file even when the
        # conversion fails.
        try:
            data = convert_pdf(path, type="text").decode("utf-8")
        finally:
            os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        for line in data.splitlines():
            stripped = line.strip()
            lowered = line.lower()
            has_outcome = any(word in lowered for word in outcome_words)

            # Ignore lines with no information
            if (
                re.search(chamber_re, line)
                or re.search(date_re, line)
                or re.search(page_re, line)
                or stripped == ""
            ):
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing "
                    + "motion name and votes, as it is for this motion: "
                    + cur_motion
                )

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if stripped == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = stripped
                else:
                    cur_motion = cur_motion + " " + stripped

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if stripped.endswith("VOTING") or stripped.endswith("VOTING."):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [x.strip() for x in who.split(";") if x.strip() != ""]
                    results[cur_vote] = who

                    # Without a trailing ";" the last name may wrap onto
                    # the next line.
                    name_may_be_continued = not line.endswith(";")

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif (
                    cur_vote is not None
                    and bill_re.findall(line)
                    and not has_outcome
                ):
                    bills.extend(bill_re.findall(line))

                elif cur_vote is not None and not has_outcome:
                    who = [x.strip() for x in line.split(";") if x.strip() != ""]

                    if name_may_be_continued:
                        results[cur_vote][-1] = (
                            results[cur_vote][-1] + " " + who.pop(0)
                        )

                    name_may_be_continued = not line.endswith(";")

                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif has_outcome:
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if improper information found
                    bills.extend(bill_re.findall(line))

                    if bills == [] or cur_motion.strip() == "":
                        # Log before clearing so the message still carries
                        # the motion text, and drop the collected bills so
                        # they cannot leak into the next roll call.
                        self.warning(
                            "No motion or bill name found: "
                            + "motion name: "
                            + cur_motion
                            + "; "
                            + "decision text: "
                            + stripped
                        )
                        results = {}
                        cur_motion = ""
                        bills = []
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " + cur_motion
                        )

                    # Use the collected results to determine who voted how
                    res = {
                        option: results.get(heading, [])
                        for heading, option in keys.items()
                    }

                    # Count the number of members voting each way
                    yes, no, other = (
                        len(res["yes"]),
                        len(res["no"]),
                        len(res["other"]),
                    )

                    # Almost all of the time, a vote only applies to one bill and this loop
                    # will only be run once.
                    # Some exceptions exist.
                    for bill in bills:
                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        bc = chambers.get(cur_bill_id[0], "other")

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            # With no yes/no names at all the ratio is
                            # undefined; treat that as not passed rather
                            # than dividing by zero.
                            decided = yes + no
                            passed = (
                                decided > 0
                                and yes / decided > VETO_SUPERMAJORITY
                            )
                        else:
                            passed = yes > no

                        # Create a Vote object based on the scraped information
                        vote = Vote(
                            chamber=chamber,
                            start_date=cur_date.strftime("%Y-%m-%d"),
                            motion_text=cur_motion,
                            result="pass" if passed else "fail",
                            legislative_session=session,
                            classification="passage",
                            bill=cur_bill_id,
                            bill_chamber=bc,
                        )
                        vote.add_source(pdf_url)
                        vote.add_source(url)
                        vote.set_count("yes", yes)
                        vote.set_count("no", no)
                        vote.set_count("other", other)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            for voter in res[key]:
                                vote.vote(key, voter)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name, option in keys.items():
                            # Need to search for the singular, not plural, in the text
                            # so it can find, for example, " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(category_name[:-1])
                            motion_count = int(re.findall(vote_re, cur_motion)[0])

                            for item in vote.counts:
                                if item["option"] == option:
                                    vote_count = item["value"]

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(motion_count)
                                    + "differed from roll call counts ({}) ".format(
                                        vote_count
                                    )
                                    + "for {0} on {1}".format(
                                        category_name, cur_bill_id
                                    )
                                )
                                # NOTE(review): the previous code followed
                                # this warning with a loop that only rebound
                                # a local variable, so the counts stored on
                                # the vote were never changed. That no-op is
                                # dropped and the stored counts are left as
                                # they were; decide whether they should be
                                # overwritten with the motion-text figures.

                        yield vote

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def scrape_vote(self, bill, name, url):
    """Yield one Vote parsed from a tabular roll call page.

    URLs containing "VOTE/h" are read with the lower-chamber column layout,
    everything else with the upper-chamber layout. Pages containing
    "BUDGET ADDRESS" produce nothing.

    Args:
        bill: bill the vote belongs to; its ``legislative_session`` supplies
            the year for the vote date.
        name: motion text of the vote.
        url: roll call page to fetch (fetched with ``verify=False``).

    Yields:
        A single Vote.
    """
    # (chamber, starting columns of each voter group, offsets to the name,
    # yes-mark and no-mark cells within a group)
    if "VOTE/h" in url:
        layout = ("lower", (1, 5, 9, 13), 3, 0, 1)
    else:
        layout = ("upper", (1, 6), 4, 1, 2)
    vote_chamber, cols, name_offset, yes_offset, no_offset = layout

    html = self.get(url, verify=False).text
    if "BUDGET ADDRESS" in html:
        return

    doc = lxml.html.fromstring(html)

    def first_number(expr):
        """Return the first run of digits in the string value of *expr*."""
        return int(re.match(r"[^\d]*(\d+)[^\d]*", doc.xpath(expr)).group(1))

    yes_count = first_number("string(//span[contains(., 'Those voting Yea')])")
    no_count = first_number("string(//span[contains(., 'Those voting Nay')])")
    other_count = first_number("string(//span[contains(., 'Those absent')])")
    need_count = first_number("string(//span[contains(., 'Necessary for')])")

    taken = doc.xpath("string(//span[contains(., 'Taken on')])")
    taken = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", taken).group(1)
    taken = taken.replace(" ", "")
    # The page gives only month/day; the session string is used as the year.
    when = datetime.datetime.strptime(
        taken + " " + bill.legislative_session, "%m/%d %Y"
    ).date()

    # not sure about classification.
    # NOTE(review): the comparison is strict, so a vote that exactly meets
    # the "Necessary for" figure is recorded as "fail" -- confirm whether
    # that figure is a minimum (>=) or must be exceeded.
    outcome = "pass" if yes_count > need_count else "fail"
    vote = Vote(
        chamber=vote_chamber,
        start_date=when,
        motion_text=name,
        result=outcome,
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)

    table = doc.xpath("//table")[0]
    for row in table.xpath("tr"):
        for start in cols:
            voter = row.xpath("string(td[%d])" % (start + name_offset)).strip()
            if not voter or voter == "VACANT":
                continue
            voter = string.capwords(voter)

            if "Y" in row.xpath("string(td[%d])" % (start + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" % (start + no_offset)):
                vote.no(voter)
            else:
                vote.vote("other", voter)

    yield vote
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Yield the vote(s) described by the HTML vote page at *url*.

    The page description (first ``<b>``, or the first ``<center>`` when that
    is empty) decides what happens: voice votes and placeholder pages yield
    nothing, committee pages are delegated to ``scrape_committee_vote``, and
    "Passed"/"Failed" pages are parsed from the bold headings paired with
    the tables that follow.

    Args:
        bill: bill the vote belongs to.
        actor: passed through as the vote's chamber.
        date: passed through as the vote's start date.
        motion: motion text of the vote.
        url: vote page to fetch; also recorded as the source.
        uniqid: stored (as a string) as the vote identifier.

    Yields:
        Vote objects; nothing when the page cannot be fetched or carries no
        recorded vote.

    Raises:
        NotImplementedError: if the description matches none of the known
            forms.
    """
    try:
        html = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".format(bill.identifier))
        return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    descr = doc.xpath("//b")[0].text_content()
    if descr == "":
        # New page method
        descr = doc.xpath("//center")[0].text

    if "on voice vote" in descr:
        return

    if "committee" in descr.lower():
        yield from self.scrape_committee_vote(
            bill, actor, date, motion, doc, url, uniqid
        )
        return

    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr or descr.strip() == "-":
        # Placeholder pages without a recorded vote.
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    headings = doc.xpath("//b")[1:]
    tables = doc.xpath("//table")

    tallies = {}
    for heading, table in zip(headings, tables):
        # Headings look like "<label> - <count>"; anything else is skipped.
        label, dash, number = heading.text_content().partition("-")
        if not dash:
            continue
        tallies[label.strip()] = {
            "count": int(number.strip()),
            "people": [
                cell.text_content().strip()
                for cell in table.xpath(".//font[@face='Arial']")
            ],
        }

    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        bill=bill,
        classification="passage",
        identifier=str(uniqid),
    )
    vote.set_count("yes", tallies["Yeas"]["count"])
    vote.set_count("no", tallies["Nays"]["count"])
    vote.set_count("other", tallies["Absent or not voting"]["count"])
    vote.add_source(url)

    for person in tallies["Yeas"]["people"]:
        vote.yes(person)
    for person in tallies["Nays"]["people"]:
        vote.no(person)
    for person in tallies["Absent or not voting"]["people"]:
        vote.vote("other", person)

    yield vote