def scrape_vote_text(self, filelocation, local=False):
    """Retrieves or uses local copy of vote pdf and converts into XML."""
    if not local:
        try:
            filename, response = self.urlretrieve(url=filelocation)
            vote_text = convert_pdf(filename, type="xml")
            os.remove(filename)
        except scrapelib.HTTPError:
            self.warning("Request failed: {}".format(filelocation))
            return
    else:
        vote_text = convert_pdf(filelocation, type="xml")
        os.remove(filelocation)
    return vote_text
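# Illustrative calls (not part of the original source); the URL and the path
# below are made up. Either way, scrape_vote_text() returns convert_pdf's XML
# output and removes the file it read.
vote_xml = self.scrape_vote_text("https://example.gov/votes/hb1.pdf")
local_xml = self.scrape_vote_text("/tmp/hb1.pdf", local=True)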
def _fix_house_text(self, filename):
    """
    TLDR: throw out bad text, replace it using different parser settings.

    When using `pdftotext` on the 2015 House committee list, the second and
    third columns of the second page get mixed up, which makes it very
    difficult to parse. Adding the `--layout` option fixes this, but isn't
    worth switching all parsing to that, since the standard
    `pdftotext --nolayout` is easier in all other cases.

    The best solution to this is to throw out the offending text, and
    replace it with the correct text. The third and fourth columns are
    joint committees that are scraped from the Senate document, so the
    only column that needs to be inserted this way is the second.
    """
    # Take the usable text from the normally-working parsing settings
    text = convert_pdf(filename, type="text-nolayout")
    assert "Revised: January 23, 2015" in text, (
        "House committee list has changed; check that the special-case"
        " fix is still necessary, and that the result is still correct")
    text = re.sub(r"(?sm)Appropriations/F&C.*$", "", text)

    # Take the usable column from the alternate parser
    alternate_text = convert_pdf(filename, type="text")
    alternate_lines = alternate_text.split("\n")

    HEADER_OF_COLUMN_TO_REPLACE = "State Administration (cont.) "
    (text_of_line_to_replace, ) = [
        x for x in alternate_lines if HEADER_OF_COLUMN_TO_REPLACE in x
    ]
    first_line_to_replace = alternate_lines.index(text_of_line_to_replace)
    first_character_to_replace = (alternate_lines[first_line_to_replace].
                                  index(HEADER_OF_COLUMN_TO_REPLACE) - 1)
    last_character_to_replace = first_character_to_replace + len(
        HEADER_OF_COLUMN_TO_REPLACE)

    column_lines_to_add = [
        x[first_character_to_replace:last_character_to_replace]
        for x in alternate_lines[first_line_to_replace + 1:]
    ]
    column_text_to_add = "\n".join(column_lines_to_add)

    text = text + column_text_to_add
    return text
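# A self-contained demo (with invented data) of the column-splice technique
# used above: find the header in the layout-preserving output, then slice
# that column's character range out of every following line.
HEADER = "State Administration (cont.) "
layout_lines = [
    "Agriculture".ljust(29) + HEADER,
    "Rep. Alice Example".ljust(29) + "Rep. Carol Sample",
    "Rep. Bob Placeholder".ljust(29) + "Rep. Dave Specimen",
]
(header_line,) = [x for x in layout_lines if HEADER in x]
start = header_line.index(HEADER) - 1
end = start + len(HEADER)
# prints just the second column, one cell per line
print("\n".join(x[start:end] for x in layout_lines[1:]))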
def __init__(self, url, resp, bill):
    self.url = url
    self.bill = bill

    # Fetch the document and put it into tempfile.
    fd, filename = tempfile.mkstemp()
    with open(filename, "wb") as f:
        f.write(resp)

    # Convert it to text.
    try:
        text = convert_pdf(filename, type="text")
    except Exception:
        msg = "couldn't convert pdf."
        raise PDFCommitteeVoteParseError(msg)

    # Get rid of the temp file.
    os.close(fd)
    os.remove(filename)

    if not text.strip():
        msg = "PDF file was empty."
        raise PDFCommitteeVoteParseError(msg)

    self.text = "\n".join(
        [line.decode() for line in text.splitlines() if line])
def _scrape_from_pdf(self):
    # FIXME: change for other years (2019 URL still valid for 2020)
    pdf_url = (
        "https://www.elections.ny.gov/NYSBOE/Elections/2019/ElectedOfficials.pdf"
    )
    filename, response = self.urlretrieve(pdf_url)
    text = convert_pdf(filename, type="text")

    columns = []
    lines = iter(text.decode().split("\n"))
    for ln in lines:
        if "ELECTED REPRESENTATIVES FOR NEW YORK STATE" in ln:
            next(lines)
            next(lines)
        else:
            columns.append(ln)

    serial = chain((ln[:40].strip() for ln in columns),
                   (ln[40:].strip() for ln in columns))
    for ln in serial:
        if not ln:
            continue
        section = []
        while ln:
            section.append(ln)
            ln = next(serial)
        yield section
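# Minimal demo (with invented data) of the fixed-width trick used above: emit
# the left 40 characters of every line first, then the right remainder, so
# two printed columns become one serial stream.
from itertools import chain

page = [
    "GOVERNOR".ljust(40) + "ATTORNEY GENERAL",
    "Jane Doe".ljust(40) + "John Roe",
]
serial = chain((ln[:40].strip() for ln in page), (ln[40:].strip() for ln in page))
print([item for item in serial if item])
# ['GOVERNOR', 'Jane Doe', 'ATTORNEY GENERAL', 'John Roe']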
def scrape_senate_vote(self, bill, url, date):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now.
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url

    text = convert_pdf(filename, "text").decode("utf-8")
    os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
    # filter() returns an iterator in Python 3; materialize it so .pop() works
    data = list(filter(None, data))
    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}

    while True:
        if not data:
            break
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), "other")
        values = data.pop()
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = ("pass" if vote_count["yes"] >
                   (vote_count["no"] + vote_count["other"]) else "fail")

    yield vote
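# Standalone demo of the reversed-split trick in scrape_senate_vote above:
# splitting on a captured header regex and reversing yields a stack that
# pops out as (header, names) pairs. The sample text is invented.
import re

sample = "Yeas: Smith, Jones. Nays: Brown. Absent: None."
data = list(filter(None, re.split(r"(Yea|Nay|Absent)s?:", sample)[::-1]))
while data:
    header = data.pop()   # "Yea", "Nay", or "Absent"
    names = data.pop()    # the text that followed that header
    print(header, "->", names.strip())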
def get_house_pdf(self, vurl):
    """cache house PDFs since they are done by year"""
    if vurl not in self.house_pdf_cache:
        (path, resp) = self.urlretrieve(vurl)
        pdflines = convert_pdf(path, "text")
        os.remove(path)
        self.house_pdf_cache[vurl] = pdflines.decode("utf-8").replace(
            "\u2019", "'")
    return self.house_pdf_cache[vurl]
def fetch_pdf_lines(self, href):
    # download the file
    try:
        fname, resp = self.urlretrieve(href)
        pdflines = [
            line.decode("utf-8")
            for line in convert_pdf(fname, "text").splitlines()
        ]
        os.remove(fname)
        return pdflines
    except scrapelib.HTTPError as e:
        assert "404" in e.args[0], "File not found: {}".format(e)
        self.warning("404 error for vote; skipping vote")
        return False
def _load_emails_from_directory_pdf(self):
    """
    Load the house PDF directory and convert to LXML - needed to find
    email addresses which are gone from the website.
    """
    with tempfile.NamedTemporaryFile() as temp:
        self.scraper.urlretrieve(self.directory_pdf_url, temp.name)
        directory = convert_pdf(temp.name, "xml").decode('latin1')

        # pull out member email addresses from the XML salad produced
        # above - there's no obvious way to match these to names, but
        # fortunately they have names in them
        return set(re.findall(r'[\w.]+@myfloridahouse\.gov', directory))
def scrape_rollcall(self, vote, vurl):
    """
    Get the text of the vote roll call from the pdf and add the
    information obtained to the related voteEvent object.

    :param vote: related voteEvent object
    :param vurl: pdf source url
    """
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, "text")
    os.remove(path)

    current_vfunc = None
    option = None

    for line in pdflines.split(b"\n"):
        line = line.strip().decode()

        # change what is being recorded
        if line.startswith("YEAS") or line.startswith("AYES"):
            current_vfunc = vote.yes
        elif line.startswith("NAYS"):
            current_vfunc = vote.no
        elif line.startswith("EXCUSED"):
            current_vfunc = vote.vote
            option = "excused"
        # NOT VOTING and ABSTAIN are also folded into "excused" here
        elif line.startswith("NOT VOTING"):
            current_vfunc = vote.vote
            option = "excused"
        elif line.startswith("ABSTAIN"):
            current_vfunc = vote.vote
            option = "excused"
        elif line.startswith("PAIRED"):
            current_vfunc = vote.vote
            option = "paired"

        # skip these
        elif not line or line.startswith("Page "):
            continue

        # if a vfunc is active
        elif current_vfunc:
            # split names apart by 3 or more spaces
            names = re.split(r"\s{3,}", line)
            for name in names:
                if name:
                    if not option:
                        current_vfunc(name.strip())
                    else:
                        current_vfunc(option=option, voter=name.strip())
def _load_emails_from_directory_pdf(self):
    """
    Load the house PDF directory and convert to LXML - needed to find
    email addresses which are gone from the website.
    """
    with tempfile.NamedTemporaryFile() as temp:
        self.scraper.urlretrieve(self.directory_pdf_url, temp.name)
        directory = lxml.etree.fromstring(convert_pdf(temp.name, "xml"))

        # pull out member email addresses from the XML salad produced
        # above - there's no obvious way to match these to names, but
        # fortunately they have names in them
        return set(
            directory.xpath(
                '//text[contains(text(), "@myfloridahouse.gov")]/text()'))
def scrape_senate_vote(self, vote, vurl):
    # download file to server
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, "text")
    os.remove(path)

    # for y, n
    mode = None
    lines = pdflines.splitlines()

    # handle individual lines in pdf to id legislator votes
    for line in lines:
        line = line.strip()
        line = line.decode("utf-8").replace("\u2212", "-")
        if line == "":
            continue
        # change mode accordingly
        elif line.startswith("YEAS"):
            mode = "y"
        elif line.startswith("NAYS"):
            mode = "n"
        elif line.startswith("ABSENT OR"):
            mode = "o"
        # else parse line with names
        else:
            nameline = line.split(" ")
            for raw_name in nameline:
                raw_name = raw_name.strip()
                if raw_name == "":
                    continue
                # handles vote count lines
                cut_name = raw_name.split("-")
                clean_name = ""
                if cut_name[-1].strip(" .").isdigit():
                    del cut_name[-1]
                    clean_name = "".join(cut_name)
                else:
                    clean_name = raw_name.strip()
                # update vote object with names
                if mode == "y":
                    vote.yes(clean_name)
                elif mode == "n":
                    vote.no(clean_name)
                elif mode == "o":
                    vote.vote("other", clean_name)
def scrape_votes(self, url, motion, date, chamber, bill):
    try:
        vote_pdf, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("Can't find vote file {}, skipping".format(url))
        return
    text = convert_pdf(vote_pdf, "text")
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []
    absent_votes = []
    not_voting_votes = []

    # point at array to add names to
    cur_array = None
    precursors = (
        ("yeas--", yes_votes),
        ("nays--", no_votes),
        ("absent or those not voting--", absent_votes),
        ("absent and those not voting--", absent_votes),
        ("not voting--", not_voting_votes),
        ("voting present--", other_votes),
        ("present--", other_votes),
        ("disclaimer", None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.decode().split("\n"))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line.lower():
                cur_array = arr
                line = line.replace(pc, "")

        # split names
        for name in line.split(","):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if "None." in name:
                cur_array = None
            match = re.match(r"(.+?)\. Total--.*", name)
            if match:
                cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in (
                    "on final passage",
                    "Necessary",
                    "who would have",
                    "being a tie",
                    "therefore",
                    "Vacancies",
                    "a pair",
                    "Total-",
                    "ATTORNEY",
                    "on final passage",
                    "SPEAKER",
                    "BOARD",
                    "TREASURER",
                    "GOVERNOR",
                    "ARCHIVES",
                    "SECRETARY",
            ):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == ".":
                    name = name[:-1]
                name = self.clean_voter_name(name)
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    absent_count = len(absent_votes)
    not_voting_count = len(not_voting_votes)
    other_count = len(other_votes)

    vote = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if passed else "fail",
        classification="passage",
        bill=bill,
    )
    vote.dedupe_key = url + "#" + bill.identifier
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("absent", absent_count)
    vote.set_count("not voting", not_voting_count)
    vote.set_count("other", other_count)
    vote.add_source(url)

    for yes_vote in yes_votes:
        vote.vote("yes", self.clean_voter_name(yes_vote))
    for no_vote in no_votes:
        vote.vote("no", self.clean_voter_name(no_vote))
    for absent_vote in absent_votes:
        vote.vote("absent", self.clean_voter_name(absent_vote))
    for not_voting_vote in not_voting_votes:
        vote.vote("not voting", self.clean_voter_name(not_voting_vote))
    for other_vote in other_votes:
        vote.vote("other", self.clean_voter_name(other_vote))
    yield vote
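# _combine_lines() is referenced above but not defined in this section. A
# minimal sketch of what it plausibly does, based only on the inline comment
# ("recombine lines that don't end in punctuation"); the terminator set is
# an assumption.
def _combine_lines(lines):
    combined = []
    buffer = ""
    for line in lines:
        line = line.strip()
        if not line:
            continue
        buffer = (buffer + " " + line) if buffer else line
        # flush once the accumulated text ends in sentence punctuation
        if buffer[-1] in ".:-":
            combined.append(buffer)
            buffer = ""
    if buffer:
        combined.append(buffer)
    return combined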
def scrape_committees_pdf(self, year, chamber, filename, url):
    if chamber == "lower" and year == "2015":
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type="text-nolayout").decode()

    for hotgarbage, replacement in (
        (
            r"Judicial Branch, Law Enforcement,\s+and\s+Justice",
            "Judicial Branch, Law Enforcement, and Justice",
        ),
        (
            r"Natural Resources and\s+Transportation",
            "Natural Resources and Transportation",
        ),
        (
            r"(?u)Federal Relations, Energy,?\s+and\s+Telecommunications",
            "Federal Relations, Energy, and Telecommunications",
        ),
    ):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: "Agriculture" not in s, lines)

    comm = None
    for line in lines:
        # Replace Unicode variants with ASCII equivalents
        line = line.replace("\xa0", " ").replace("‐", "-")

        if "Subcommittees" in line:
            self.warning("Currently, we're skipping subcommittees")
            # https://github.com/openstates/openstates/issues/2099
            break

        if is_committee_name(line):
            if comm and comm._related:
                yield comm
            committee = line.strip()
            comm = Organization(name=committee,
                                chamber=chamber,
                                classification="committee")
            comm.add_source(url)
        elif is_legislator_name(line):
            name, party = line.rsplit("(", 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            if re.search(" Ch", party):
                role = "chair"
            elif " VCh" in party:
                role = "vice chair"
            elif " MVCh" in party:
                role = "minority vice chair"
            else:
                role = "member"
            comm.add_member(name, role)

    if comm._related:
        yield comm
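# is_committee_name() and is_legislator_name() are used above but not defined
# in this section. Plausible heuristics, inferred from how the parsed lines
# are consumed (committee headers are bare names; member lines look like
# "Rep. Jane Doe (R Ch)"); the exact rules are assumptions.
def is_committee_name(line):
    line = line.strip()
    return bool(line) and "(" not in line and not line.startswith(("Rep.", "Sen."))

def is_legislator_name(line):
    return bool(re.match(r"(Rep|Sen)\.\s+.+\(", line.strip()))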
def parse_vote_pdf(self, vote_url, bill):
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()

    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]

    if page_index:
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    # Relative line numbers to be checked for existence of motion
    motion_lines = [3, 2, 4, 5]

    for i in motion_lines:
        if any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formatting in SB 1260
            motion = lines[page_index - 3]
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts (extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue

        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue

        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_journal(self, url, chamber, session, date):
    filename, response = self.urlretrieve(url)
    self.logger.info("Saved journal to %r" % filename)
    all_text = convert_pdf(filename, type="text")

    lines = all_text.split(b"\n")
    lines = [line.decode("utf-8") for line in lines]
    lines = [
        line.strip()
        .replace("–", "-")
        .replace("―", '"')
        .replace("‖", '"')
        .replace("“", '"')
        .replace("”", '"')
        for line in lines
    ]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter(
        [
            line
            for line in lines
            if not (
                line == ""
                or re.match(header_date_re, line)
                or re.match(header_journal_re, line)
            )
        ]
    )

    # bill_id -> motion -> count
    motions_per_bill = collections.defaultdict(collections.Counter)

    for line in lines:
        # Go through with vote parse if any of these conditions match.
        if not line.startswith("On the question") or "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r"\(\s*([A-Z\.]+\s\d+)\s*\)"

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r".* the vote was:\s*"
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(bill_re)

        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning(
                "This motion did not pertain to legislation: {}".format(line)
            )
            continue

        # Get the motion text
        motion_re = r"""
            ^On\sthe\squestion\s    # Precedes any motion
            "+                      # Motion is preceded by a quote mark (or two)
            (Shall\s.+?\??)         # The motion text begins with "Shall"
            \s*(?:\?"?|"|’)\s+      # Motion is followed by a question mark and/or a quote mark
            (?:{})?                 # If the vote regards a bill, its number is listed
            {}                      # Senate has trailing text
            \s*$
            """.format(
            # in at least one case [SF 457 from 2020] the bill number is followed by )0
            # seemingly just a typo, this gets around that
            bill_re,
            r",?.*?the\svote\swas:" if chamber == "upper" else r"\d?",
        )
        # print("motion candidate line:", line)
        motion = re.search(motion_re, line, re.VERBOSE | re.IGNORECASE)
        if motion:
            motion = motion.group(1)

        for word, letter in (("Senate", "S"), ("House", "H"), ("File", "F")):
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace(".", "")

        bill_chamber = dict(h="lower", s="upper")[bill_id.lower()[0]]
        votes, passed = self.parse_votes(lines)

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if not (
            (passed == (votes["yes_count"] > votes["no_count"])) or (not passed)
        ):
            self.error("The bill passed without a majority?")
            raise ValueError("invalid vote")

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and votes["yes_count"] > votes["no_count"]:
            self.logger.warning(
                "The bill got a majority but did not pass. "
                "Could be worth confirming."
            )

        result = ""
        if passed:
            result = "pass"
        else:
            result = "fail"

        # check for duplicate motions and number second and up if needed
        motion_text = re.sub("\xad", "-", motion)
        motions_per_bill[bill_id][motion_text] += 1
        new_count = motions_per_bill[bill_id][motion_text]
        if new_count > 1:
            motion_text += f" #{new_count}"

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion_text,
            result=result,
            classification="passage",
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )

        # add votes and counts
        for vtype in ("yes", "no", "absent", "abstain"):
            vcount = votes["{}_count".format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes["{}_votes".format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)

        yield vote
def scrape_journal(self, url, chamber, session, date):
    filename = self.urlretrieve(url)[0]
    self.logger.info("Saved journal to %r", filename)
    all_text = convert_pdf(filename, type="text")

    lines = all_text.split(b"\n")
    lines = [line.decode("utf-8") for line in lines]
    lines = [line.strip() for line in lines]

    for index, line in enumerate(lines):
        if "Resultado de la Votación para la Medida" not in line:
            continue

        name_line = lines[index + 1]
        result_line = lines[index + 2]
        nomination_result_line = lines[index + 3]

        name_match = re.match(r"^(?P<type>.*) (?P<num>\d*) (?P<ref>.*)$",
                              name_line).groupdict()
        bill = self.classify_measure_type(name_match)
        if not bill:
            continue

        if re.match("^NM", bill):
            # Nomination
            if re.match(r"(.*)Confirmado", nomination_result_line):
                result = "pass"
            else:
                msg = "Unhandled nomination result of: {}. Skipping.".format(
                    nomination_result_line)
                self.logger.warning(msg)
                continue
            name_line = result_line
        else:
            # Not a Nomination
            if re.match(r"(.*)Recibido", result_line):
                msg = "Result was 'Recibido': {}. Skipping.".format(
                    result_line)
                self.logger.warning(msg)
                continue

            try:
                vote_result = re.match(
                    r".* (?P<yes>\d*)X(?P<no>\d*)X(?P<abstain>\d*)X(?P<absent>\d*) (?P<result>\w*)",
                    result_line,
                ).groupdict()
            except AttributeError:
                msg = "Could not determine voting result of: {}. Skipping.".format(
                    result_line)
                self.logger.warning(msg)
                continue

            if vote_result["result"] == "Aprobada":
                result = "pass"
            else:
                result = "fail"
                msg = "Voting result {} not guaranteed to be 'fail'. Take a look.".format(
                    vote_result["result"])
                self.logger.warning(msg)

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=name_line,
            result=result,
            classification="passage",
            legislative_session=session,
            bill=bill,
            bill_chamber=chamber,
        )

        vote_index = index + 3
        while not re.match("^Votante", lines[vote_index]):
            vote_index = vote_index + 1
        vote_index = vote_index + 1

        votes = {
            "yes": 0,
            "no": 0,
            "absent": 0,
            "abstain": 0,
        }
        while lines[vote_index].strip() and not re.match(
                r"Senado de", lines[vote_index]):
            name, vtype = parse_vote(lines[vote_index])
            votes[vtype] += 1
            vote.vote(vtype, name)
            vote_index = vote_index + 1

        for vtype in ("yes", "no", "absent", "abstain"):
            vote.set_count(vtype, votes[vtype])

        vote.add_source(url)
        yield vote
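# parse_vote() is called above but not defined in this section. A sketch
# under the assumption that each roll-call line ends with a Spanish vote
# label; the label-to-type mapping below is a guess, not confirmed by the
# source.
VOTE_LABELS = {
    "Sí": "yes",
    "No": "no",
    "Ausente": "absent",
    "Abstenido": "abstain",
}

def parse_vote(line):
    # split a line like "Doe Pérez, Juan    Sí" into name and vote label
    name, _, label = line.strip().rpartition(" ")
    return name.strip(), VOTE_LABELS.get(label, "abstain")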
def scrape_house_vote(self, bill, url):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, "text")
    os.remove(filename)

    lines = text.splitlines()

    vote_type = None
    votes = collections.defaultdict(list)
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode("utf-8")
        match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        match = re.match(
            r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
        if match:
            motion = (lines[idx - 2].strip()).decode("utf-8")
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [
                int(g) for g in match.groups()
            ]

            exc_match = re.search(r"EXCUSED: (\d+)", line)
            if exc_match:
                other_count += int(exc_match.group(1))

            if line.endswith("ADOPTED") or line.endswith("PASSED"):
                passed = True
            else:
                passed = False
            continue

        match = re.match(
            r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
        if match:
            vote_type = {
                "YEAS": "yes",
                "NAYS": "no",
                "NOT VOTING": "other",
                "EXCUSED": "other",
                "PAIRED": "paired",
            }[match.group(1)]
            continue

        if vote_type == "paired":
            for part in line.split(" "):
                part = part.strip()
                if not part:
                    continue
                name, pair_type = re.match(r"([^\(]+)\((YEA|NAY)\)",
                                           line).groups()
                name = name.strip()
                if pair_type == "YEA":
                    votes["yes"].append(name)
                elif pair_type == "NAY":
                    votes["no"].append(name)
        elif vote_type:
            for name in line.split(" "):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = VoteEvent(
            chamber="lower",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url

        for key, values in votes.items():
            for value in values:
                if "Committee" in value:
                    continue
                if "*" in value:
                    value = value.replace("*", "")
                vote.vote(key, value)
        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def _process_votes(self, rollcalls, bill_id, original_chamber, session):
    result_types = {
        "FAILED": False,
        "DEFEATED": False,
        "PREVAILED": True,
        "PASSED": True,
        "SUSTAINED": True,
        "NOT SECONDED": False,
        "OVERRIDDEN": True,
        "ADOPTED": True,
    }

    for r in rollcalls:
        proxy_link = PROXY_BASE_URL + r["link"]

        try:
            (path, resp) = self.urlretrieve(proxy_link)
        except scrapelib.HTTPError as e:
            self.warning(e)
            self.warning(
                "Unable to contact openstates proxy, skipping vote {}".format(
                    r["link"]
                )
            )
            continue

        text = convert_pdf(path, "text").decode("utf-8")
        lines = text.split("\n")
        os.remove(path)

        chamber = (
            "lower" if "house of representatives" in lines[0].lower() else "upper"
        )
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
        vote_date = pytz.timezone("America/Indiana/Indianapolis").localize(
            vote_date
        )
        vote_date = vote_date.isoformat()

        passed = None

        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break

        if passed is None:
            raise AssertionError("Missing bill passage type")

        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue

        vote = VoteEvent(
            chamber=chamber,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=original_chamber,
            start_date=vote_date,
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
        )

        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("excused", excused)
        vote.set_count("not voting", not_voting)

        vote.add_source(proxy_link)

        currently_counting = ""

        possible_vote_lines = lines[8:]
        for line in possible_vote_lines:
            line = line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            line = line.replace("\xc2\xa0", " -")
            if "yea-" in line.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in line.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in line.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in line.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                pass
            elif re.search(r"v\. \d\.\d", line):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = line.split(" ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())

        yield vote
def pdf_to_lxml(self):
    filename, resp = self.scraper.urlretrieve(self.url)
    text = convert_pdf(filename, "html")
    return lxml.html.fromstring(text)
def scrape_lower(self):
    PDF_URL = "http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf"
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type="text-nolayout").decode()
    os.remove(path)

    days = re.split(r"(\w+day, \w+ \d{1,2}, 20\d{2})", text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            date = day[1]
        else:
            events = re.split(r"\n((?:\w+\s?)+)\n", day[1])
            comm = ""
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r"""(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s                      # Potential extra text for meeting time
                            (.*?),\s                    # Location, usually a room
                            .*?\n                       # Chairman of committee holding event
                            (.*)                        # Description of event
                            """,
                            event[1],
                        ).groups()
                    except AttributeError:
                        continue
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date, "%I:%M %p_%A, %B %d, %Y")
                    time = self._tz.localize(time)
                    location = location.strip()
                    description = "\n".join([
                        x.strip() for x in description.split("\n")
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = "[No description provided by state]"
                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description,
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type="committee", note="host")
                    for line in description.split("\n"):
                        related_bill = re.search(
                            r"(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$", line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def _get_pdf(self, url):
    (path, response) = self.urlretrieve(url)
    data = convert_pdf(path, type="text")
    os.remove(path)
    return data
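# Hypothetical companion helper (not in the original) showing the usual way
# callers consume _get_pdf: convert_pdf returns bytes, so decode before
# splitting into lines.
def _get_pdf_lines(self, url):
    text = self._get_pdf(url).decode("utf-8")
    return [line for line in text.splitlines() if line.strip()]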
def pdf_to_lxml(self, filename, type="html"):
    text = convert_pdf(filename, type)
    return lxml.html.fromstring(text)
def scrape_vote(self, url, session):
    fname, _ = self.urlretrieve(url)
    text = convert_pdf(fname, type="text").decode()
    lines = text.splitlines()

    chamber = "upper" if "senate" in url else "lower"

    if "Maryland" not in text:
        self.warning(f"empty vote from {url}")
        return

    date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

    section = "preamble"
    motion = None
    bill_id = None
    how = None
    voters = defaultdict(list)

    for line in lines:
        if section == "preamble":
            if "vetoed" in line.lower():
                self.warning(
                    f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                )
                return
            possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
            if possible_bill_id:
                bill_id = possible_bill_id[0]

            # preamble has metadata, then motion, then counts.  our process then is to
            # store the last line as the motion, but if the last line looks like a
            # continuation, append it to the prior line
            line = line.strip()
            counts = re.findall(
                r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                line,
            )
            if counts:
                yes_count, no_count, nv_count, excused_count, absent_count = counts[
                    0]
                yes_count = int(yes_count)
                no_count = int(no_count)
                nv_count = int(nv_count)
                excused_count = int(excused_count)
                absent_count = int(absent_count)
                section = "votes"
            elif line and line != "(Const)":
                # questions seem to be split across two lines
                if line.endswith("?"):
                    motion = motion + " " + line
                else:
                    motion = line
        elif section == "votes":
            if line.startswith("Voting Yea"):
                how = "yes"
            elif line.startswith("Voting Nay"):
                how = "no"
            elif line.startswith("Not Voting"):
                how = "not voting"
            elif line.startswith("Excused from Voting"):
                how = "excused"
            elif line.startswith("Excused (Absent)"):
                how = "absent"
            elif how:
                names = re.split(r"\s{2,}", line)
                voters[how].extend(names)

    if not bill_id and not motion:
        return
    elif bill_id and not motion:
        self.warning(
            f"got {bill_id} but no motion, not registering as a vote")
    elif motion and not bill_id:
        self.warning(
            f"got {motion} but no bill_id, not registering as a vote")
        return

    # bleh - result not indicated anywhere
    result = "pass" if yes_count > no_count else "fail"
    bill_chamber = "upper" if bill_id.startswith("S") else "lower"
    date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")
    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        result=result,
        classification="passage",
        motion_text=motion,
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    # URL includes sequence ID, will be unique
    vote.dedupe_key = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)
    for how, names in voters.items():
        for name in names:
            name = name.strip().replace("*", "")
            if name and "COPY" not in name and "Indicates Vote Change" not in name:
                vote.vote(how, name)
    check_counts(vote, raise_error=True)
    return vote
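# check_counts() is imported from elsewhere; this sketch shows what it likely
# verifies: that each set_count() total matches the number of individual
# vote() calls recorded for that option. Signature and internals are assumed.
def check_counts(vote, raise_error=False):
    tallies = {}
    for v in vote.votes:
        tallies[v["option"]] = tallies.get(v["option"], 0) + 1
    for count in vote.counts:
        if count["value"] != tallies.get(count["option"], 0):
            message = "count mismatch for {}: {} != {}".format(
                count["option"], count["value"],
                tallies.get(count["option"], 0))
            if raise_error:
                raise ValueError(message)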
def scrape_chamber(self, chamber, session):
    chamber_name = "house" if chamber == "lower" else "senate"
    session_slug = {
        "62": "62-2011",
        "63": "63-2013",
        "64": "64-2015",
        "65": "65-2017",
        "66": "66-2019",
    }[session]

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug,
        chamber_name,
    )
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:

        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib["href"]

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type="text").decode("utf-8")
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:

            # Ignore lines with no information
            if (re.search(chamber_re, line) or re.search(date_re, line)
                    or re.search(page_re, line) or line.strip() == ""):
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion)

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if line.strip().endswith(
                        "VOTING") or line.strip().endswith("VOTING."):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [
                        x.strip() for x in who.split(";") if x.strip() != ""
                    ]
                    results[cur_vote] = who
                    name_may_be_continued = False if line.endswith(
                        ";") else True

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif (cur_vote is not None
                      and re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line)
                      and not any(x in line.lower() for x in [
                          "passed",
                          "adopted",
                          "sustained",
                          "prevailed",
                          "lost",
                          "failed",
                      ])):
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                elif cur_vote is not None and not any(
                        x in line.lower() for x in [
                            "passed",
                            "adopted",
                            "sustained",
                            "prevailed",
                            "lost",
                            "failed",
                        ]):
                    who = [
                        x.strip() for x in line.split(";") if x.strip() != ""
                    ]
                    if name_may_be_continued:
                        results[cur_vote][-1] = (results[cur_vote][-1] + " " +
                                                 who.pop(0))
                    name_may_be_continued = False if line.endswith(
                        ";") else True
                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif any(x in line.lower() for x in [
                        "passed",
                        "adopted",
                        "sustained",
                        "prevailed",
                        "lost",
                        "failed",
                ]):
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if improper information found
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                    if bills == [] or cur_motion.strip() == "":
                        results = {}
                        cur_motion = ""
                        self.warning("No motion or bill name found: " +
                                     "motion name: " + cur_motion + "; " +
                                     "decision text: " + line.strip())
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " + cur_motion)

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other",
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = results[key]
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = (
                        len(res["yes"]),
                        len(res["no"]),
                        len(res["other"]),
                    )
                    chambers = {"H": "lower", "S": "upper", "J": "legislature"}

                    # Almost all of the time, a vote only applies to one bill and this loop
                    # will only be run once.
                    # Some exceptions exist.
                    for bill in bills:
                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        try:
                            bc = chambers[cur_bill_id[0]]
                        except KeyError:
                            bc = "other"

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            VETO_SUPERMAJORITY = 2 / 3
                            passed = yes / (yes + no) > VETO_SUPERMAJORITY
                        else:
                            passed = yes > no

                        # Create a Vote object based on the scraped information
                        vote = Vote(
                            chamber=chamber,
                            start_date=cur_date.strftime("%Y-%m-%d"),
                            motion_text=cur_motion,
                            result="pass" if passed else "fail",
                            legislative_session=session,
                            classification="passage",
                            bill=cur_bill_id,
                            bill_chamber=bc,
                        )

                        vote.add_source(pdf_url)
                        vote.add_source(url)
                        vote.set_count("yes", yes)
                        vote.set_count("no", no)
                        vote.set_count("other", other)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            for voter in res[key]:
                                vote.vote(key, voter)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys.keys():
                            # Need to search for the singular, not plural, in the text
                            # so it can find, for example, " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(category_name[:-1])
                            motion_count = int(
                                re.findall(vote_re, cur_motion)[0])

                            for item in vote.counts:
                                if item["option"] == keys[category_name]:
                                    vote_count = item["value"]

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(
                                        motion_count) +
                                    "differed from roll call counts ({}) ".
                                    format(vote_count) +
                                    "for {0} on {1}".format(
                                        category_name, cur_bill_id))
                                for item in vote.counts:
                                    if item["option"] == keys[category_name]:
                                        vote_count = motion_count

                        yield vote

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def scrape_votes(self, vote_url, bill, chamber):
    try:
        filename, response = self.urlretrieve(vote_url)
    except scrapelib.HTTPError:
        self.logger.warning("PDF not posted or available")
        return

    # Grabs text from pdf
    pdflines = [
        line.decode("utf-8")
        for line in convert_pdf(filename, "text").splitlines()
    ]
    os.remove(filename)

    vote_date = 0
    voters = defaultdict(list)

    for x in range(len(pdflines)):
        line = pdflines[x]
        if re.search(r"(\d+/\d+/\d+)", line):
            initial_date = line.strip()
        if ("AM" in line) or ("PM" in line):
            split_l = line.split()
            for y in split_l:
                if ":" in y:
                    time_location = split_l.index(y)
                    motion = " ".join(split_l[0:time_location])
                    time = split_l[time_location:]
                    if len(time) > 0:
                        time = "".join(time)
                        dt = initial_date + " " + time
                        dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                        vote_date = central.localize(dt)
                        vote_date = vote_date.isoformat()
                    # In rare case that no motion is provided
                    if len(motion) < 1:
                        motion = "No Motion Provided"

        if "YEAS:" in line:
            yeas = int(line.split()[-1])
        if "NAYS:" in line:
            nays = int(line.split()[-1])
        if "ABSTAINED:" in line:
            abstained = int(line.split()[-1])
        if "PASSES:" in line:
            abstained = int(line.split()[-1])
        if "NOT VOTING:" in line:
            not_voting = int(line.split()[-1])

        if "YEAS :" in line:
            y = 0
            next_line = pdflines[x + y]
            while "NAYS : " not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("YEAS" not in next_line):
                    for v in next_line:
                        if v and "YEAS" not in v:
                            voters["yes"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1

        if line and "NAYS :" in line:
            y = 0
            next_line = 0
            next_line = pdflines[x + y]
            while ("ABSTAINED : " not in next_line) and (
                    "PASSES :" not in next_line):
                next_line = next_line.split(" ")
                if next_line and "NAYS" not in next_line:
                    for v in next_line:
                        if v and "NAYS" not in v:
                            voters["no"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1

        if line and ("ABSTAINED :" in line or "PASSES :" in line):
            y = 2
            next_line = 0
            next_line = pdflines[x + y]
            while "NOT VOTING :" not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("ABSTAINED" not in next_line
                                  or "PASSES" not in next_line):
                    for v in next_line:
                        if v:
                            voters["abstain"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1

        if line and "NOT VOTING : " in line:
            lines_to_go_through = math.ceil(not_voting / len(line.split()))
            next_line = pdflines[x]
            for y in range(lines_to_go_through):
                if len(pdflines) > (x + y + 2):
                    next_line = pdflines[x + y + 2].split(" ")
                    for v in next_line:
                        if v:
                            voters["not voting"].append(v.strip())

            if yeas > (nays + abstained + not_voting):
                passed = True
            else:
                passed = False

            ve = VoteEvent(
                chamber=chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            ve.add_source(vote_url)
            for how_voted, how_voted_voters in voters.items():
                for voter in how_voted_voters:
                    if len(voter) > 0:
                        ve.vote(how_voted, voter)
            # Resets voters dictionary before going onto next page in pdf
            voters = defaultdict(list)
            yield ve
def scrape_upper(self):
    PDF_URL = "http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf"
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type="text").decode()
    os.remove(path)

    days = re.split(r"(\w+day, \w+ \d{1,2})", text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day[1] + ", " + str(datetime.datetime.now().year)
        else:
            events = re.split(r"\n\n((?:\w+\s?)+),\s", day[1])
            comm = ""
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r"""(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                            .*?,\s                  # Potential extra text for meeting time
                            (.*?)\n                 # Location, usually a room
                            .*?\n                   # Chairman of committee holding event
                            (.*)                    # Description of event
                            """,
                            event[1],
                        ).groups()
                    except AttributeError:
                        continue
                    time = datetime.datetime.strptime(
                        time + "_" + date, "%I:%M %p_%A, %B %d, %Y")
                    time = self._tz.localize(time)
                    location = location.strip()
                    description = "\n".join([
                        x.strip() for x in description.split("\n")
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = "[No description provided by state]"
                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description,
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type="committee", note="host")
                    for line in description.split("\n"):
                        related_bill = re.search(
                            r"(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$", line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event