def _fix_house_text(self, filename): """ TLDR: throw out bad text, replace it using different parser settings. When using `pdftotext` on the 2015 House committee list, the second and third columns of the second page get mixed up, which makes it very difficult to parse. Adding the `--layout` option fixes this, but isn't worth switching all parsing to that since the standard `pdftotext --nolayout` is easier in all other cases. The best solution to this is to throw out the offending text, and replace it with the correct text. The third and fourth columns are joint comittees that are scraped from the Senate document, so the only column that needs to be inserted this way is the second. """ # Take the usable text from the normally-working parsing settings text = convert_pdf(filename, type="text-nolayout") assert "Revised: January 23, 2015" in text, ( "House committee list has changed; check that the special-case" " fix is still necessary, and that the result is still correct" ) text = re.sub(r"(?sm)Appropriations/F&C.*$", "", text) # Take the usable column from the alternate parser alternate_text = convert_pdf(filename, type="text") alternate_lines = alternate_text.split("\n") HEADER_OF_COLUMN_TO_REPLACE = "State Administration (cont.) " (text_of_line_to_replace,) = [ x for x in alternate_lines if HEADER_OF_COLUMN_TO_REPLACE in x ] first_line_to_replace = alternate_lines.index(text_of_line_to_replace) first_character_to_replace = ( alternate_lines[first_line_to_replace].index(HEADER_OF_COLUMN_TO_REPLACE) - 1 ) last_character_to_replace = first_character_to_replace + len( HEADER_OF_COLUMN_TO_REPLACE ) column_lines_to_add = [ x[first_character_to_replace:last_character_to_replace] for x in alternate_lines[first_line_to_replace + 1 :] ] column_text_to_add = "\n".join(column_lines_to_add) text = text + column_text_to_add return text
def scrape_vote_text(self, filelocation, local=False):
    """Retrieves or uses local copy of vote pdf and converts into XML."""
    if local:
        # File is already on disk: convert it and clean up.
        vote_text = convert_pdf(filelocation, type='xml')
        os.remove(filelocation)
        return vote_text

    # Remote copy: download first, bail out quietly on HTTP failure.
    try:
        filename, response = self.urlretrieve(url=filelocation)
        vote_text = convert_pdf(filename, type='xml')
        os.remove(filename)
    except scrapelib.HTTPError:
        self.warning('Request failed: {}'.format(filelocation))
        return
    return vote_text
def _fix_house_text(self, filename): ''' TLDR: throw out bad text, replace it using different parser settings. When using `pdftotext` on the 2015 House committee list, the second and third columns of the second page get mixed up, which makes it very difficult to parse. Adding the `--layout` option fixes this, but isn't worth switching all parsing to that since the standard `pdftotext --nolayout` is easier in all other cases. The best solution to this is to throw out the offending text, and replace it with the correct text. The third and fourth columns are joint comittees that are scraped from the Senate document, so the only column that needs to be inserted this way is the second. ''' # Take the usable text from the normally-working parsing settings text = convert_pdf(filename, type='text-nolayout') assert "Revised: January 23, 2015" in text,\ "House committee list has changed; check that the special-case"\ " fix is still necessary, and that the result is still correct" text = re.sub(r'(?sm)Appropriations/F&C.*$', "", text) # Take the usable column from the alternate parser alternate_text = convert_pdf(filename, type='text') alternate_lines = alternate_text.split('\n') HEADER_OF_COLUMN_TO_REPLACE = "State Administration (cont.) " (text_of_line_to_replace, ) = [ x for x in alternate_lines if HEADER_OF_COLUMN_TO_REPLACE in x ] first_line_to_replace = alternate_lines.index(text_of_line_to_replace) first_character_to_replace = alternate_lines[ first_line_to_replace].index(HEADER_OF_COLUMN_TO_REPLACE) - 1 last_character_to_replace = (first_character_to_replace + len(HEADER_OF_COLUMN_TO_REPLACE)) column_lines_to_add = [ x[first_character_to_replace:last_character_to_replace] for x in alternate_lines[first_line_to_replace + 1:] ] column_text_to_add = '\n'.join(column_lines_to_add) text = text + column_text_to_add return text
def __init__(self, url, resp, bill):
    """Extract text from a committee-vote PDF already fetched as *resp*.

    :param url: source URL of the PDF (kept for reference)
    :param resp: raw PDF bytes
    :param bill: related bill object
    :raises PDFCommitteeVoteParseError: if conversion fails or the PDF
        contains no text.
    """
    self.url = url
    self.bill = bill

    # Fetch the document and put it into tempfile.
    fd, filename = tempfile.mkstemp()
    try:
        with open(filename, 'wb') as f:
            f.write(resp)

        # Convert it to text.
        try:
            text = convert_pdf(filename, type='text')
        except Exception:
            msg = "couldn't convert pdf."
            raise PDFCommitteeVoteParseError(msg)
    finally:
        # BUG FIX: previously the temp file (and fd) leaked whenever
        # convert_pdf raised; always clean up.
        os.close(fd)
        os.remove(filename)

    if not text.strip():
        msg = 'PDF file was empty.'
        raise PDFCommitteeVoteParseError(msg)

    # convert_pdf returns bytes; keep the non-empty lines, decoded.
    self.text = '\n'.join([line.decode() for line in text.splitlines() if line])
def scrape_committees_pdf(self, year, chamber, filename, url):
    """Yield committee Organizations parsed from a committee-list PDF."""
    if chamber == 'lower' and year == '2015':
        # 2015 House list needs the special column-splice fix.
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type='text-nolayout').decode()

    # Rejoin committee names that the PDF wrapped across lines.
    for hotgarbage, replacement in (
        (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
         'Judicial Branch, Law Enforcement, and Justice'),
        (r'Natural Resources and\s+Transportation',
         'Natural Resources and Transportation'),
        (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
         'Federal Relations, Energy, and Telecommunications')
    ):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

    comm = None
    for line in lines:
        # Replace Unicode variants with ASCII equivalents
        line = line.replace(" ", " ").replace("‐", "-")

        if 'Subcommittees' in line:
            self.warning("Currently, we're skipping subcommittees")
            # https://github.com/openstates/openstates/issues/2099
            break

        if is_committee_name(line):
            # Yield the previous committee only if it has members.
            if comm and comm._related:
                yield comm
            committee = line.strip()
            comm = Organization(name=committee, chamber=chamber,
                                classification='committee')
            comm.add_source(url)

        elif is_legislator_name(line):
            name, party = line.rsplit('(', 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            # Role abbreviations appear in the trailing parenthetical.
            if re.search(' Ch', party):
                role = 'chair'
            elif ' VCh' in party:
                role = 'vice chair'
            elif ' MVCh' in party:
                role = 'minority vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)

    # Flush the last committee (assumes at least one was found).
    if comm._related:
        yield comm
def scrape_committees_pdf(self, year, chamber, filename, url):
    """Yield committee Organizations parsed from a committee-list PDF."""
    if chamber == 'lower' and year == '2015':
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type='text-nolayout').decode()

    # Rejoin committee names that the PDF wrapped across lines.
    cleanups = (
        (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
         'Judicial Branch, Law Enforcement, and Justice'),
        (r'Natural Resources and\s+Transportation',
         'Natural Resources and Transportation'),
        (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
         'Federal Relations, Energy, and Telecommunications'),
    )
    for pattern, fixed in cleanups:
        text = re.sub(pattern, fixed, text)

    # Skip everything before the Agriculture committee.
    remaining = dropwhile(lambda s: 'Agriculture' not in s,
                          iter(text.splitlines()))

    comm = None
    for raw in remaining:
        # Replace Unicode variants with ASCII equivalents
        raw = raw.replace(" ", " ").replace("‐", "-")

        if 'Subcommittees' in raw:
            self.warning("Currently, we're skipping subcommittees")
            # https://github.com/openstates/openstates/issues/2099
            break

        if is_committee_name(raw):
            # Emit the previous committee if it collected any members.
            if comm and comm._related:
                yield comm
            comm = Organization(name=raw.strip(), chamber=chamber,
                                classification='committee')
            comm.add_source(url)
        elif is_legislator_name(raw):
            name, party = raw.rsplit('(', 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            if re.search(' Ch', party):
                role = 'chair'
            elif ' VCh' in party:
                role = 'vice chair'
            elif ' MVCh' in party:
                role = 'minority vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)

    if comm._related:
        yield comm
def scrape_rollcall(self, vote, vurl):
    """
    Get text information from the pdf, containing the vote roll call
    and add the information obtained to the related voteEvent object

    :param vote:  related voteEvent object
    :param vurl:  pdf source url
    """
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, 'text')
    os.remove(path)

    current_vfunc = None
    option = None

    for line in pdflines.split(b'\n'):
        line = line.strip().decode()

        # change what is being recorded
        if line.startswith('YEAS') or line.startswith('AYES'):
            current_vfunc = vote.yes
            # BUG FIX: clear any option left over from a preceding
            # EXCUSED/NOT VOTING/ABSTAIN/PAIRED section, so names in
            # this section are recorded as plain yes votes.
            option = None
        elif line.startswith('NAYS'):
            current_vfunc = vote.no
            # BUG FIX: clear any stale option (same reason as YEAS).
            option = None
        elif line.startswith('EXCUSED'):
            current_vfunc = vote.vote
            option = 'excused'
        elif line.startswith('NOT VOTING'):
            current_vfunc = vote.vote
            option = 'excused'
        elif line.startswith('ABSTAIN'):
            current_vfunc = vote.vote
            option = 'excused'
        elif line.startswith('PAIRED'):
            current_vfunc = vote.vote
            option = 'paired'

        # skip these
        elif not line or line.startswith('Page '):
            continue

        # if a vfunc is active
        elif current_vfunc:
            # split names apart by 3 or more spaces
            names = re.split(r'\s{3,}', line)
            for name in names:
                if name:
                    if not option:
                        current_vfunc(name.strip())
                    else:
                        current_vfunc(option=option, voter=name.strip())
def scrape_votes(self, url, motion, date, chamber, bill):
    """Yield a VoteEvent built from the roll-call PDF at *url*."""
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []
    absent_votes = []
    not_voting_votes = []

    # point at array to add names to
    cur_array = None

    # Section headers in the PDF mapped to the list that collects names;
    # 'disclaimer' maps to None to stop collecting.
    precursors = (
        ('yeas--', yes_votes),
        ('nays--', no_votes),
        ('absent or those not voting--', absent_votes),
        ('absent and those not voting--', absent_votes),
        ('not voting--', not_voting_votes),
        ('voting present--', other_votes),
        ('present--', other_votes),
        ('disclaimer', None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.decode().split('\n'))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line.lower():
                cur_array = arr
                line = line.replace(pc, '')

        # split names
        for name in line.split(','):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if 'None.' in name:
                cur_array = None
            match = re.match(r'(.+?)\. Total--.*', name)
            if match:
                cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in ('on final passage', 'Necessary', 'who would have',
                         'being a tie', 'therefore', 'Vacancies', 'a pair',
                         'Total-', 'ATTORNEY', 'on final passage', 'SPEAKER',
                         'BOARD', 'TREASURER', 'GOVERNOR', 'ARCHIVES',
                         'SECRETARY'):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == '.':
                    name = name[:-1]
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    absent_count = len(absent_votes)
    not_voting_count = len(not_voting_votes)
    other_count = len(other_votes)

    vote = VoteEvent(chamber=chamber,
                     start_date=self._tz.localize(date),
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification='passage',
                     bill=bill)
    vote.pupa_id = url + '#' + bill.identifier
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('absent', absent_count)
    vote.set_count('not voting', not_voting_count)
    vote.set_count('other', other_count)
    vote.add_source(url)
    for yes_vote in yes_votes:
        vote.vote('yes', yes_vote)
    for no_vote in no_votes:
        vote.vote('no', no_vote)
    for absent_vote in absent_votes:
        vote.vote('absent', absent_vote)
    for not_voting_vote in not_voting_votes:
        vote.vote('not voting', not_voting_vote)
    for other_vote in other_votes:
        vote.vote('other', other_vote)
    yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Parse a roll-call PDF at *vote_url* into a VoteEvent for *bill*.

    :raises ValueError: when the count header can't be located, the motion
        can't be identified, or the tallied names don't match the counts.
    """
    filename, response = self.urlretrieve(vote_url)

    text = convert_pdf(filename, type='text').decode()
    lines = text.splitlines()

    if 'Senate' in vote_url:
        chamber = 'upper'
    else:
        chamber = 'lower'

    date_string = lines[0].split('Calendar Date:')[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    page_index = None
    for index, line in enumerate(lines):
        if 'Yeas' in line and 'Nays' in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']

    # BUG FIX: compare against None — the previous truthiness check
    # (`if page_index:`) would have treated a found index of 0 as
    # "not found".
    if page_index is not None:
        counts = re.split(r'\s{2,}', lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(' ', 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # The motion usually sits a few lines above the counts header.
    motion = re.split(r'\s{2,}', lines[page_index - 3].strip())[0]
    motion_keywords = [
        'favorable', 'reading', 'amendment', 'motion', 'bill be introduced'
    ]
    if not any(motion_keyword in motion.lower()
               for motion_keyword in motion_keywords):
        # Fall back one line closer to the header.
        motion = re.split(r'\s{2,}', lines[page_index - 2].strip())[0]
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            self.error("Motion Extracted: %s" % motion)
            raise ValueError("No Motion or faulty Motion scraped.")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )
    vote.pupa_id = vote_url  # contains sequence number

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        'Voting Nay', 'Not Voting', 'COPY', 'Excused',
        'indicates vote change'
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        if not current_line or 'Voting Yea' in current_line:
            page_index += 1
            continue

        if any(show_stopper in current_line
               for show_stopper in show_stoppers):
            page_index += 1
            vote_index = (vote_index + 1)
            continue

        names = re.split(r'\s{2,}', current_line)
        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_votes(self, url, motion, date, chamber, bill):
    """Yield a VoteEvent built from the roll-call PDF at *url*."""
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, "text")
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []
    absent_votes = []
    not_voting_votes = []

    # point at array to add names to
    cur_array = None

    # Section headers in the PDF mapped to the list that collects names;
    # "disclaimer" maps to None to stop collecting.
    precursors = (
        ("yeas--", yes_votes),
        ("nays--", no_votes),
        ("absent or those not voting--", absent_votes),
        ("absent and those not voting--", absent_votes),
        ("not voting--", not_voting_votes),
        ("voting present--", other_votes),
        ("present--", other_votes),
        ("disclaimer", None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.decode().split("\n"))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line.lower():
                cur_array = arr
                line = line.replace(pc, "")

        # split names
        for name in line.split(","):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if "None." in name:
                cur_array = None
            match = re.match(r"(.+?)\. Total--.*", name)
            if match:
                cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in (
                "on final passage",
                "Necessary",
                "who would have",
                "being a tie",
                "therefore",
                "Vacancies",
                "a pair",
                "Total-",
                "ATTORNEY",
                "on final passage",
                "SPEAKER",
                "BOARD",
                "TREASURER",
                "GOVERNOR",
                "ARCHIVES",
                "SECRETARY",
            ):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == ".":
                    name = name[:-1]
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    absent_count = len(absent_votes)
    not_voting_count = len(not_voting_votes)
    other_count = len(other_votes)

    vote = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if passed else "fail",
        classification="passage",
        bill=bill,
    )
    vote.pupa_id = url + "#" + bill.identifier
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("absent", absent_count)
    vote.set_count("not voting", not_voting_count)
    vote.set_count("other", other_count)
    vote.add_source(url)
    for yes_vote in yes_votes:
        vote.vote("yes", yes_vote)
    for no_vote in no_votes:
        vote.vote("no", no_vote)
    for absent_vote in absent_votes:
        vote.vote("absent", absent_vote)
    for not_voting_vote in not_voting_votes:
        vote.vote("not voting", not_voting_vote)
    for other_vote in other_votes:
        vote.vote("other", other_vote)
    yield vote
def scrape_journal(self, url, chamber, session, date):
    """Yield VoteEvents parsed from a chamber journal PDF at *url*."""
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)
    all_text = convert_pdf(filename, type="text")

    lines = all_text.split(b'\n')
    lines = [line.decode('utf-8') for line in lines]
    # Normalize typographic dashes/quotes the PDF extractor emits.
    lines = [line.
             strip().
             replace('–', '-').
             replace('―', '"').
             replace('‖', '"').
             replace('“', '"').
             replace('”', '"')
             for line in lines]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter([line for line in lines if not(
        line == "" or
        re.match(header_date_re, line) or
        re.match(header_journal_re, line))])

    for line in lines:
        # Go through with vote parse if any of
        # these conditions match.
        if not line.startswith("On the question") or \
                "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

        # Motions wrap across physical lines; pull lines until complete.
        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning("This motion did not pertain to legislation: {}".
                         format(line))
            continue

        # Get the motion text
        motion_re = r'''
                ^On\sthe\squestion\s  # Precedes any motion
                "+  # Motion is preceded by a quote mark (or two)
                (Shall\s.+?\??)  # The motion text begins with "Shall"
                \s*"\s+  # Motion is followed by a quote mark
                (?:{})?  # If the vote regards a bill, its number is listed
                {}  # Senate has trailing text
                \s*$
                '''.format(
            bill_re,
            r',?.*?the\svote\swas:' if chamber == 'upper' else ''
        )
        # BUG FIX: removed stray debugging `print(line)` left in the scraper.
        motion = re.search(motion_re, line,
                           re.VERBOSE | re.IGNORECASE).group(1)

        for word, letter in (('Senate', 'S'),
                             ('House', 'H'),
                             ('File', 'F')):
            if bill_id is None:
                return

            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace('.', '')

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id
        votes, passed = self.parse_votes(lines)

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if not ((passed == (votes['yes_count'] > votes['no_count'])) or
                (not passed)):
            self.error("The bill passed without a majority?")
            raise ValueError('invalid vote')

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and votes['yes_count'] > votes['no_count']:
            self.logger.warning("The bill got a majority but did not pass. "
                                "Could be worth confirming.")

        result = ""
        if passed:
            result = "pass"
        else:
            result = "fail"

        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=re.sub('\xad', '-', motion),
                         result=result,
                         classification='passage',
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=bill_chamber
                         )

        # add votes and counts
        for vtype in ('yes', 'no', 'absent', 'abstain'):
            vcount = votes['{}_count'.format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes['{}_votes'.format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)
        yield vote
def scrape_journal(self, session, url):
    """Yield VoteEvents parsed from a legislative journal PDF at *url*.

    Runs a line-oriented state machine over the extracted text; vote
    construction is deferred until a complete quoted question is seen.
    """
    journal, resp = self.urlretrieve(url)
    text = convert_pdf(journal, type='text').decode()
    lines = text.splitlines()

    #  state machine:
    #      None - undefined state
    #      question_quote - in question, looking for end quote
    #      pre-yes - vote is active, haven't hit yes votes yet
    #      yes - yes votes
    #      no - no votes
    #      other - other votes
    state = None
    vote = None
    question = None
    date = None
    other_count = 0

    for line in lines:
        date_match = DATE_RE.findall(line)

        # skip headers
        if 'LEGISLATIVE JOURNAL' in line:
            continue
        elif date_match:
            date = datetime.datetime.strptime(' '.join(date_match[0]),
                                              '%B %d %Y')
            continue
        # keep adding lines to question while quotes are open
        elif state == 'question_quote':
            question += ' %s' % line
        elif state in ('pre-yes', 'yes', 'no', 'other'):
            yes_match = YES_RE.match(line)
            no_match = NO_RE.match(line)
            other_match = NOT_VOTING_RE.match(line)
            if yes_match:
                vote.set_count('yes', int(yes_match.group(1)))
                state = 'yes'
            elif no_match:
                vote.set_count('no', int(no_match.group(1)))
                state = 'no'
            elif other_match:
                # "other" counts can accumulate over several headers.
                other_count += int(other_match.group(1))
                state = 'other'
            elif 'having voted in the affirmative' in line:
                # Passage announcement finalizes the current vote.
                vote.set_count('other', other_count)
                vote.result = 'pass'
                state = None
                vote.validate()
                yield vote
                vote = None
                other_count = 0
            elif 'Having failed' in line:
                vote.set_count('other', other_count)
                vote.result = 'fail'
                state = None
                vote.validate()
                yield vote
                vote = None
                other_count = 0
            elif line:
                # Otherwise this is a row of voter names in the current
                # section, separated by wide whitespace.
                people = re.split('\s{3,}', line)
                # try:
                # except KeyError:
                #     self.warning('line showed up in pre-yes state: %s',
                #                  line)
                for p in people:
                    if p:
                        # special cases for long name w/ 1 space
                        if p.startswith(
                                ('Lautenbaugh ', 'Langemeier ',
                                 'McCollister ', 'Pansing Brooks ',
                                 'Schumacher ')):
                            p1, p2 = p.split(' ', 1)
                            vote.vote(state, p1)
                            vote.vote(state, p2)
                        else:
                            vote.vote(state, p)

        # check the text against our regexes
        bill_match = BILL_RE.match(line)
        veto_match = VETO_BILL_RE.findall(line)
        question_match = QUESTION_RE.findall(line)
        if bill_match:
            bill_type, bill_id = bill_match.groups()
            if bill_type == 'BILL':
                bill_id = 'LB ' + bill_id
            elif bill_type == 'RESOLUTION':
                bill_id = 'LR ' + bill_id
        elif question_match:
            question = question_match[0]
            state = 'question_quote'
        elif veto_match:
            bill_id = veto_match[0]

        # line just finished a question
        if state == 'question_quote' and QUESTION_MATCH_END in question:
            question = re.sub(
                '\s+', ' ',
                question.replace(QUESTION_MATCH_END, '').strip())

            if not bill_id:
                raise Exception('cannot save vote without bill_id')

            # save prior vote
            vtuple = (bill_id, date, question)
            if vtuple in self._seen:
                # Duplicate question for the same bill/date; skip it.
                vote = None
                continue
            else:
                self._seen.add(vtuple)

            vote = VoteEvent(
                bill=bill_id,
                bill_chamber='legislature',
                chamber='legislature',
                legislative_session=session,
                start_date=date.strftime('%Y-%m-%d'),
                motion_text=question,
                classification='passage',
                result='fail',
            )
            vote.add_source(url)
            state = 'pre-yes'

            # reset bill_id and question
            bill_id = question = None
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland roll-call PDF into a VoteEvent for *bill*.

    Handles consent-calendar votes (one PDF covering several bills) and
    several observed motion-line placements.

    :raises ValueError: when counts can't be found or names don't match.
    """
    filename, response = self.urlretrieve(vote_url)

    text = convert_pdf(filename, type='text').decode()
    lines = text.splitlines()

    if 'Senate' in vote_url:
        chamber = 'upper'
    else:
        chamber = 'lower'

    date_string = lines[0].split('Calendar Date:')[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    page_index = None
    for index, line in enumerate(lines):
        if 'Yeas' in line and 'Nays' in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']

    # BUG FIX: compare against None — the previous truthiness check
    # (`if page_index:`) would have treated a found index of 0 as
    # "not found".
    if page_index is not None:
        counts = re.split(r'\s{2,}', lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(' ', 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ['Consent Calendar' in line for line in lines[:page_index]])
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(
            r'\s{2,}', lines[page_index - 1].strip())
        assert consent_calendar_bills, \
            "Could not find bills for consent calendar vote"

    motion_keywords = [
        'favorable', 'reading', 'amendment', 'motion', 'introduced',
        'bill pass', 'committee'
    ]
    motion_lines = [
        3, 2, 4, 5
    ]  # Relative LineNumbers to be checked for existence of motion

    for i in motion_lines:
        if any(motion_keyword in motion.lower()
               for motion_keyword in motion_keywords):
            break
        motion = re.split(r'\s{2,}', lines[page_index - i].strip())[0]
    else:
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        'Voting Nay', 'Not Voting', 'COPY', 'Excused',
        'indicates vote change'
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        if not current_line or 'Voting Yea' in current_line:
            page_index += 1
            continue

        if any(show_stopper in current_line
               for show_stopper in show_stoppers):
            page_index += 1
            vote_index = (vote_index + 1)
            continue

        names = re.split(r'\s{2,}', current_line)
        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
    """Yield VoteEvents for each rollcall PDF fetched via the proxy.

    :param rollcalls: iterable of dicts with a 'link' key
    :param proxy: dict with a 'url' key prefixed onto each rollcall link
    """
    result_types = {
        "FAILED": False,
        "DEFEATED": False,
        "PREVAILED": True,
        "PASSED": True,
        "SUSTAINED": True,
        "NOT SECONDED": False,
        "OVERRIDDEN": True,
        "ADOPTED": True,
    }

    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]

        try:
            (path, resp) = self.urlretrieve(proxy_link)
        except scrapelib.HTTPError as e:
            self.warning(e)
            self.warning(
                "Unable to contact openstates proxy, skipping vote {}".format(
                    r["link"]
                )
            )
            continue

        text = convert_pdf(path, "text").decode("utf-8")
        lines = text.split("\n")
        os.remove(path)

        chamber = (
            "lower" if "house of representatives" in lines[0].lower() else "upper"
        )
        # Date and time are split across lines 1 and 2 of the PDF.
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
        vote_date = pytz.timezone("America/Indiana/Indianapolis").localize(
            vote_date
        )
        vote_date = vote_date.isoformat()

        passed = None

        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break

        if passed is None:
            raise AssertionError("Missing bill passage type")

        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue

        vote = VoteEvent(
            chamber=chamber,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=original_chamber,
            start_date=vote_date,
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
        )

        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("excused", excused)
        vote.set_count("not voting", not_voting)
        vote.add_source(proxy_link)

        currently_counting = ""

        possible_vote_lines = lines[8:]
        for line in possible_vote_lines:
            # Normalize non-breaking spaces emitted by the PDF extractor.
            line = line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            line = line.replace("\xc2\xa0", " -")
            if "yea-" in line.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in line.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in line.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in line.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                pass
            elif re.search(r"v\. \d\.\d", line):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = line.split("  ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())

        yield vote
def scrape_lower(self):
    """Yield Events for House committee meetings from the calendar PDF."""
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    # BUG FIX: the day-header pattern previously read r'(\wF+day, ...)',
    # which cannot match any English weekday name (no weekday has 'F' as
    # its second letter), so no day sections were ever split out. Use the
    # same r'(\w+day, ...)' shape as the Senate scraper.
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Even entries are the captured day headers.
            date = day[1]
        else:
            # Odd entries are that day's body: alternating committee
            # names and meeting details.
            events = re.split(r'\n((?:\w+\s?)+)\n', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        continue
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date,
                        '%I:%M %p_%A, %B %d, %Y'
                    )
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Strip page numbers / numeric junk from the description.
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape_lower(self):
    """Yield Events for House committee meetings from the calendar PDF."""
    PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text-nolayout').decode()
    os.remove(path)

    # BUG FIX: the day-header pattern previously read r'(\wF+day, ...)',
    # which cannot match any English weekday name (no weekday has 'F' as
    # its second letter), so no day sections were ever split out. Use the
    # same r'(\w+day, ...)' shape as the Senate scraper.
    days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Even entries are the captured day headers.
            date = day[1]
        else:
            # Odd entries are that day's body: alternating committee
            # names and meeting details.
            events = re.split(r'\n((?:\w+\s?)+)\n', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''', event[1]).groups()
                    except AttributeError:
                        continue
                    time = time.replace(".", "").upper()
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Strip page numbers / numeric junk from the description.
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip() and not x.strip()[0].isdigit()
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(name=description,
                                  start_date=time,
                                  location_name=location,
                                  description=description)
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape_committees_pdf(self, year, chamber, filename, url):
    """Parse a Montana committee-list PDF and yield Organization objects.

    :param year: session year string; '2015' triggers the House text fix-up.
    :param chamber: 'lower' or 'upper'; mutated to 'legislature' once the
        joint-subcommittee section of the Senate document is reached.
    :param filename: path to the downloaded PDF.
    :param url: source URL recorded on each yielded Organization.
    """
    if chamber == 'lower' and year == '2015':
        # 2015 House PDF has broken column ordering; see _fix_house_text.
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type='text-nolayout').decode()

    # Committee names that pdftotext wraps across lines; collapse them back
    # into their canonical single-line forms before line-by-line parsing.
    for hotgarbage, replacement in (
        (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
         'Judicial Branch, Law Enforcement, and Justice'),
        (r'Natural Resources and\s+Transportation',
         'Natural Resources and Transportation'),
        (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
         'Federal Relations, Energy, and Telecommunications')
    ):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())
    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

    comm = None
    in_senate_subcommittees = False
    for line in lines:
        # Replace Unicode variants with ASCII equivalents
        line = line.replace(" ", " ").replace("‐", "-")
        if 'Subcommittees' in line:
            # These appear in both chambers' lists, so de-dup the scraping
            if chamber == 'lower':
                break
            elif chamber == 'upper':
                self.info("Beginning scrape of joint subcommittees")
                in_senate_subcommittees = True
                chamber = 'legislature'
            continue
        if is_committee_name(line):
            # Flush the previously-built committee before starting a new one;
            # only yield it if it actually gained members (_related non-empty).
            if comm and comm._related:
                yield comm
            if in_senate_subcommittees:
                # Subcommittees hang off the joint Appropriations committee.
                committee = 'Joint Appropriations/Finance & Claims'
                subcommittee = line.strip()
                comm = Organization(
                    name=subcommittee,
                    parent_id={'name': committee, 'classification': 'joint'},
                    classification='committee',
                )
            else:
                committee = line.strip()
                comm = Organization(name=committee, chamber=chamber,
                                    classification='committee')
            comm.add_source(url)
        elif is_legislator_name(line):
            # Lines look like "Rep. Jane Doe (R) Ch"; the part after the
            # last '(' carries party plus an optional role suffix.
            name, party = line.rsplit('(', 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            # Order matters: ' Ch' also occurs inside ' VCh'/' MVCh', so the
            # plain-chair test uses re.search first on the raw suffix.
            if re.search(' Ch', party):
                role = 'chair'
            elif ' VCh' in party:
                role = 'vice chair'
            elif ' MVCh' in party:
                role = 'minority vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)
    # Yield the final committee (no trailing header to flush it).
    # NOTE(review): assumes at least one committee header was seen; comm
    # would be None otherwise — confirm inputs always contain one.
    if comm._related:
        yield comm
def scrape_upper(self):
    """Scrape Ohio Senate committee-hearing events from the weekly calendar PDF.

    Yields one ``Event`` per meeting, with the hosting committee attached as a
    participant and Senate bill references as agenda items.
    """
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text').decode()
    os.remove(path)

    # Capturing-group split => alternating [day header, day body, …] after
    # the preamble; headers here carry no year (added below).
    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day[1] + ", " + str(datetime.datetime.now().year)
        else:
            # Body splits into alternating committee-name / details chunks.
            events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?)\n  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''',
                            event[1]).groups()
                    except AttributeError:
                        continue  # chunk isn't a meeting
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Strip pagination/boilerplate lines from the description.
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(name=description,
                                  start_date=time,
                                  location_name=location,
                                  description=description)
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    # Attach "S.B. 123 …" style references as agenda items.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def _get_pdf(self, url):
    """Download the PDF at *url*, return its extracted text, and remove the temp file."""
    pdf_path, _resp = self.urlretrieve(url)
    extracted = convert_pdf(pdf_path, type="text")
    os.remove(pdf_path)
    return extracted
def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
    """Yield a VoteEvent for each roll-call PDF listed in *rollcalls*.

    :param rollcalls: dicts with a "link" key, joined onto proxy["url"].
    :param bill_id: identifier of the bill the votes belong to.
    :param original_chamber: chamber of the bill itself.
    :param session: legislative session identifier.
    :param proxy: dict whose "url" prefixes each roll-call link.
    """
    # Map of result keywords (as printed in the PDF) to pass/fail.
    result_types = {
        'FAILED': False,
        'DEFEATED': False,
        'PREVAILED': True,
        'PASSED': True,
        'SUSTAINED': True,
        'NOT SECONDED': False,
        'OVERRIDDEN': True,
        'ADOPTED': True,
    }
    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]
        (path, resp) = self.urlretrieve(proxy_link)
        text = convert_pdf(path, 'text').decode("utf-8")
        lines = text.split("\n")
        os.remove(path)
        # The PDF layout is fixed: line 0 names the chamber, lines 1-2 the
        # date/time, lines 4-7 motion and vote totals.
        chamber = ("lower"
                   if "house of representatives" in lines[0].lower()
                   else "upper")
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str,
                                               "%b %d, %Y %I:%M:%S %p")
        # NOTE(review): no timezone localization here, unlike the sibling
        # _process_votes that uses pytz — confirm which is intended.
        vote_date = vote_date.strftime("%Y-%m-%d %H:%M:%S")
        passed = None
        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break
        if passed is None:
            raise AssertionError("Missing bill passage type")
        # Line 4 ends with "<label> <count>"; everything before is the motion.
        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue
        vote = VoteEvent(chamber=chamber,
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=original_chamber,
                         start_date=vote_date,
                         motion_text=motion,
                         result="pass" if passed else "fail",
                         classification="passage")
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('excused', excused)
        vote.set_count('not voting', not_voting)
        vote.add_source(proxy_link)
        # Remaining lines carry the voter names, grouped under section
        # headers ("YEA-…", "NAY-…", etc.); track the current section.
        currently_counting = ""
        possible_vote_lines = lines[8:]
        for l in possible_vote_lines:
            # Mis-decoded UTF-8 NBSPs appear in the raw text; normalize them.
            l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            l = l.replace("\xc2\xa0", " -")
            if "yea-" in l.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in l.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in l.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in l.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                pass  # haven't reached a voter section yet
            elif re.search(r'v\. \d\.\d', l):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = l.split(" ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())
        yield vote
def scrape_upper(self):
    """Scrape Ohio Senate committee-hearing events from the weekly calendar PDF.

    Duplicate of the other ``scrape_upper`` in this file; yields one ``Event``
    per meeting with the hosting committee and any Senate bill references.
    """
    PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
    (path, _response) = self.urlretrieve(PDF_URL)
    text = convert_pdf(path, type='text').decode()
    os.remove(path)

    # Split on day headers; capturing group => alternating header/body pairs.
    days = re.split(r'(\w+day, \w+ \d{1,2})', text)
    date = None
    for day in enumerate(days[1:]):
        if day[0] % 2 == 0:
            # Calendar is put out for the current week, so use that year
            date = day[1] + ", " + str(datetime.datetime.now().year)
        else:
            events = re.split(r'\n\n((?:\w+\s?)+),\s', day[1])
            comm = ''
            for event in enumerate(events[1:]):
                if event[0] % 2 == 0:
                    comm = event[1].strip()
                else:
                    try:
                        (time, location, description) = re.search(
                            r'''(?mxs)
                            (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?)\n  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            ''',
                            event[1]).groups()
                    except AttributeError:
                        continue  # not a meeting chunk
                    time = datetime.datetime.strptime(
                        time + "_" + date, '%I:%M %p_%A, %B %d, %Y'
                    )
                    time = self._tz.localize(time)
                    location = location.strip()
                    # Drop pagination and boilerplate lines.
                    description = '\n'.join([
                        x.strip() for x in description.split('\n')
                        if x.strip()
                        and not x.strip().startswith("Page ")
                        and not x.strip().startswith("*Possible Vote")
                        and not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                    ])
                    if not description:
                        description = '[No description provided by state]'
                    event = Event(
                        name=description,
                        start_date=time,
                        location_name=location,
                        description=description
                    )
                    event.add_source(PDF_URL)
                    event.add_participant(comm, type='committee', note='host')
                    # Attach "S.B. 123 …" references as agenda items.
                    for line in description.split('\n'):
                        related_bill = re.search(
                            r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$', line)
                        if related_bill:
                            (related_bill, relation) = related_bill.groups()
                            relation = relation.strip()
                            related_bill = related_bill.replace(".", "")
                            item = event.add_agenda_item(relation)
                            item.add_bill(related_bill)
                    yield event
def scrape_journal(self, session, url):
    """Parse a Nebraska legislative-journal PDF and yield VoteEvents.

    Runs a line-oriented state machine over the extracted text; relies on
    module-level regexes (DATE_RE, YES_RE, NO_RE, NOT_VOTING_RE, BILL_RE,
    VETO_BILL_RE, QUESTION_RE) and the QUESTION_MATCH_END sentinel.
    """
    journal, resp = self.urlretrieve(url)
    text = convert_pdf(journal, type='text').decode()
    lines = text.splitlines()
    # state machine:
    # None - undefined state
    # question_quote - in question, looking for end quote
    # pre-yes - vote is active, haven't hit yes votes yet
    # yes - yes votes
    # no - no votes
    # other - other votes
    state = None
    vote = None
    question = None
    date = None
    for line_num, line in enumerate(lines):
        date_match = DATE_RE.findall(line)
        # skip headers
        if 'LEGISLATIVE JOURNAL' in line:
            continue
        elif date_match:
            date = datetime.datetime.strptime(' '.join(date_match[0]),
                                              '%B %d %Y')
            continue
        # keep adding lines to question while quotes are open
        elif state == 'question_quote':
            question += ' %s' % line
        elif state in ('pre-yes', 'yes', 'no', 'other'):
            yes_match = YES_RE.match(line)
            no_match = NO_RE.match(line)
            other_match = NOT_VOTING_RE.match(line)
            if yes_match:
                vote.set_count('yes', int(yes_match.group(1)))
                state = 'yes'
            elif no_match:
                vote.set_count('no', int(no_match.group(1)))
                state = 'no'
            elif other_match:
                vote.set_count('other', int(other_match.group(1)))
                state = 'other'
            elif 'having voted in the affirmative' in line:
                # Terminal phrase for a passing vote: emit and reset.
                vote.result = 'pass'
                state = None
                vote.validate()
                yield vote
                vote = None
            elif 'Having failed' in line:
                vote.result = 'fail'
                state = None
                vote.validate()
                yield vote
                vote = None
            elif line:
                # Otherwise the line lists voter names in columns separated
                # by 3+ spaces; record each under the current vote section.
                people = re.split('\s{3,}', line)
                for p in people:
                    if p:
                        # special cases for long name w/ 1 space
                        if p.startswith(('Lautenbaugh ', 'Langemeier ',
                                         'McCollister ', 'Pansing Brooks ',
                                         'Schumacher ')):
                            p1, p2 = p.split(' ', 1)
                            vote.vote(state, p1)
                            vote.vote(state, p2)
                        else:
                            vote.vote(state, p)
        # check the text against our regexes
        bill_match = BILL_RE.match(line)
        veto_match = VETO_BILL_RE.findall(line)
        question_match = QUESTION_RE.findall(line)
        if bill_match:
            bill_type, bill_id = bill_match.groups()
            if bill_type == 'BILL':
                bill_id = 'LB ' + bill_id
            elif bill_type == 'RESOLUTION':
                bill_id = 'LR ' + bill_id
        elif question_match:
            # Start accumulating question text until the closing sentinel.
            question = question_match[0]
            state = 'question_quote'
        elif veto_match:
            bill_id = veto_match[0]
        # line just finished a question
        if state == 'question_quote' and QUESTION_MATCH_END in question:
            question = re.sub('\s+', ' ',
                              question.replace(QUESTION_MATCH_END, '').strip())
            if not bill_id:
                raise Exception('cannot save vote without bill_id')
        # save prior vote
            # De-dup: the same (bill, date, question) can repeat in journals.
            vtuple = (bill_id, date, question)
            if vtuple in self._seen:
                vote = None
                continue
            else:
                self._seen.add(vtuple)
            # Result defaults to 'fail' and is flipped when the terminal
            # "voted in the affirmative" phrase is later encountered.
            vote = VoteEvent(
                bill=bill_id,
                bill_chamber='legislature',
                chamber='legislature',
                legislative_session=session,
                start_date=date.strftime('%Y-%m-%d'),
                motion_text=question,
                classification='passage',
                result='fail',
            )
            vote.add_source(url)
            state = 'pre-yes'
            # reset bill_id and question
            bill_id = question = None
def _get_pdf(self, url):
    """Fetch *url* to a temp file, return its pdftotext output, then delete the file."""
    local_file, _response = self.urlretrieve(url)
    text = convert_pdf(local_file, type='text')
    os.remove(local_file)
    return text
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland roll-call PDF into a single VoteEvent.

    :param vote_url: URL of the vote PDF (chamber inferred from the URL).
    :param bill: the Bill object the vote belongs to.
    :returns: a populated VoteEvent.
    :raises ValueError: if vote counts are missing or name totals disagree.
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()
    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"
    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")
    # Locate the totals line ("NN Yeas  NN Nays …"); everything else is
    # positioned relative to this index.
    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break
    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]
    if page_index:
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            # Each column looks like "47 Yeas": number first, label second.
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)
    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    # Search the lines above the totals for text that looks like a motion.
    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion
    for i in motion_lines:
        if any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        # for/else: no candidate line matched a keyword.
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )
    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)
    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])

    # Names start two lines after the totals; section headers below mark
    # the transition from one vote type to the next.
    page_index = page_index + 2
    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0
    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]
    while page_index < len(lines):
        current_line = lines[page_index].strip()
        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue
        if any(show_stopper in current_line for show_stopper in show_stoppers):
            # Section header: advance to the next vote type.
            page_index += 1
            vote_index = vote_index + 1
            continue
        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[vote_index] += len(names)
        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1
    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")
    return vote
def scrape_journal(self, url, chamber, session, date):
    """Parse an Iowa journal PDF and yield VoteEvents for each recorded vote.

    :param url: journal PDF URL.
    :param chamber: 'upper' or 'lower' (affects motion-terminator regexes).
    :param session: legislative session identifier.
    :param date: vote date passed through to the VoteEvent.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)
    all_text = convert_pdf(filename, type="text")
    lines = all_text.split(b'\n')
    lines = [line.decode('utf-8') for line in lines]
    # Normalize typographic dashes/quotes to ASCII before regex matching.
    lines = [
        line.strip().replace('–', '-').replace('―', '"').replace(
            '‖', '"').replace('“', '"').replace('”', '"') for line in lines
    ]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter([
        line for line in lines
        if not (line == "" or re.match(header_date_re, line)
                or re.match(header_journal_re, line))
    ])

    for line in lines:
        # Go through with vote parse if any of
        # these conditions match.
        if not line.startswith("On the question") or \
                "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

        # Motions wrap lines; keep consuming until the terminator matches.
        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning(
                "This motion did not pertain to legislation: {}".format(
                    line))
            continue

        # Get the motion text
        motion_re = r'''
                ^On\sthe\squestion\s  # Precedes any motion
                "+  # Motion is preceded by a quote mark (or two)
                (Shall\s.+?\??)  # The motion text begins with "Shall"
                \s*"\s+  # Motion is followed by a quote mark
                (?:{})?  # If the vote regards a bill, its number is listed
                {}  # Senate has trailing text
                \s*$
                '''.format(
            bill_re,
            r',?.*?the\svote\swas:' if chamber == 'upper' else '')
        # NOTE(review): leftover debug output — consider removing or
        # replacing with self.logger.debug.
        print(line)
        motion = re.search(motion_re, line,
                           re.VERBOSE | re.IGNORECASE).group(1)

        # Normalize "Senate File 123" style IDs to "SF 123".
        for word, letter in (('Senate', 'S'),
                             ('House', 'H'),
                             ('File', 'F')):
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace('.', '')

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        votes, passed = self.parse_votes(lines)

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if not ((passed == (votes['yes_count'] > votes['no_count'])) or
                (not passed)):
            self.error("The bill passed without a majority?")
            raise ValueError('invalid vote')

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and votes['yes_count'] > votes['no_count']:
            self.logger.warning(
                "The bill got a majority but did not pass. "
                "Could be worth confirming.")

        result = ""
        if passed:
            result = "pass"
        else:
            result = "fail"

        # Soft hyphens (U+00AD) in motion text are mapped to plain dashes.
        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=re.sub('\xad', '-', motion),
                         result=result,
                         classification='passage',
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=bill_chamber)

        # add votes and counts
        for vtype in ('yes', 'no', 'absent', 'abstain'):
            vcount = votes['{}_count'.format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes['{}_votes'.format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)
        yield vote
def scrape_votes(self, vote_url, bill, chamber):
    """Parse a roll-call PDF at *vote_url* and yield one VoteEvent per page.

    The PDF may contain several votes; the "NOT VOTING :" block marks the end
    of each one, at which point a VoteEvent is emitted and the voter
    accumulator reset.
    """
    # Grabs text from pdf
    pdflines = [
        line.decode("utf-8") for line in convert_pdf(vote_url, "text").splitlines()
    ]
    vote_date = 0
    voters = defaultdict(list)
    for x in range(len(pdflines)):
        line = pdflines[x]
        if re.search(r"(\d+/\d+/\d+)", line):
            # Remember the most recent date-bearing line for timestamping.
            initial_date = line.strip()
        if ("AM" in line) or ("PM" in line):
            # The time token contains ':'; words before it form the motion.
            split_l = line.split()
            for y in split_l:
                if ":" in y:
                    time_location = split_l.index(y)
                    motion = " ".join(split_l[0:time_location])
                    time = split_l[time_location:]
                    if len(time) > 0:
                        time = "".join(time)
                        dt = initial_date + " " + time
                        # NOTE(review): initial_date is unbound if no date
                        # line preceded the time line — confirm PDFs always
                        # put the date first.
                        dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                        vote_date = central.localize(dt)
                        vote_date = vote_date.isoformat()
                    # In rare case that no motion is provided
                    if len(motion) < 1:
                        motion = "No Motion Provided"
        if "YEAS:" in line:
            yeas = int(line.split()[-1])
        if "NAYS:" in line:
            nays = int(line.split()[-1])
        if "ABSTAINED:" in line:
            abstained = int(line.split()[-1])
        if "PASSES:" in line:
            # NOTE(review): this overwrites `abstained` with the PASSES
            # count — looks like a copy/paste slip; confirm intent.
            abstained = int(line.split()[-1])
        if "NOT VOTING:" in line:
            not_voting = int(line.split()[-1])

        # "YEAS :" (with space) headers introduce the voter-name blocks.
        if "YEAS :" in line:
            y = 0
            next_line = pdflines[x + y]
            while "NAYS : " not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("YEAS" not in next_line):
                    for v in next_line:
                        if v and "YEAS" not in v:
                            voters["yes"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1
        if line and "NAYS :" in line:
            y = 0
            next_line = 0
            next_line = pdflines[x + y]
            while ("ABSTAINED : " not in next_line) and ("PASSES :" not in next_line):
                next_line = next_line.split(" ")
                if next_line and "NAYS" not in next_line:
                    for v in next_line:
                        if v and "NAYS" not in v:
                            voters["no"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1
        if line and ("ABSTAINED :" in line or "PASSES :" in line):
            y = 2
            next_line = 0
            next_line = pdflines[x + y]
            while "NOT VOTING :" not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("ABSTAINED" not in next_line or "PASSES" not in next_line):
                    for v in next_line:
                        if v:
                            voters["abstain"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1
        if line and "NOT VOTING : " in line:
            # Estimate how many lines of names follow from the count and the
            # per-line name density of the header line.
            lines_to_go_through = math.ceil(not_voting / len(line.split()))
            next_line = pdflines[x]
            for y in range(lines_to_go_through):
                next_line = pdflines[x + y + 2].split(" ")
                for v in next_line:
                    if v:
                        voters["not voting"].append(v.strip())
            # End of one vote block: compute the result and emit.
            if yeas > (nays + abstained + not_voting):
                passed = True
            else:
                passed = False
            ve = VoteEvent(
                chamber=chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="bill",
                bill=bill,
            )
            ve.add_source(vote_url)
            for how_voted, how_voted_voters in voters.items():
                for voter in how_voted_voters:
                    if len(voter) > 0:
                        ve.vote(how_voted, voter)
            # Resets voters dictionary before going onto next page in pdf
            voters = defaultdict(list)
            yield ve
def scrape_digest(self, bill, chamber):
    """Scrape a Wyoming bill-digest PDF: abstract, sponsors, actions, votes.

    Mutates *bill* in place (source, abstract, sponsorships, actions) and
    yields a VoteEvent for each ROLL CALL block found in the action list.
    """
    digest_url = 'http://legisweb.state.wy.us/{}/Digest/{}.pdf'.format(
        bill.legislative_session,
        bill.identifier,
    )
    bill.add_source(digest_url)

    try:
        (filename, response) = self.urlretrieve(digest_url)
        all_text = convert_pdf(filename, type='text').decode()
    except scrapelib.HTTPError:
        self.warning('no digest for %s' % bill.identifier)
        return
    if all_text.strip() == "":
        self.warning(
            'Non-functional digest for bill {}'.
            format(bill.identifier)
        )
        return

    # Split the digest's text into sponsors, description, and actions
    SPONSOR_RE = r'(?sm)Sponsored By:\s+(.*?)\n\n'
    DESCRIPTION_RE = r'(?sm)\n\n((?:AN\s*?ACT|A JOINT RESOLUTION) .*?)\n\n'

    try:
        ext_title = re.search(DESCRIPTION_RE, all_text).group(1)
    except AttributeError:
        ext_title = ''
    bill_desc = ext_title.replace('\n', ' ')
    # Collapse runs of spaces left by the newline join above.
    bill_desc = re.sub("  *", " ", bill_desc)
    if bill_desc:
        bill.add_abstract(abstract=bill_desc, note='description')

    sponsor_span = re.search(SPONSOR_RE, all_text).group(1)
    sponsors = ''
    sponsors = sponsor_span.replace('\n', ' ')
    if sponsors:
        if 'Committee' in sponsors:
            # Committee sponsorship: keep the whole span as one organization.
            bill.add_sponsorship(sponsors, 'primary', primary=True,
                                 entity_type='organization')
        else:
            # Cross-chamber co-sponsors are listed after an "and …(s)" joiner.
            if chamber == 'lower':
                sp_lists = sponsors.split('and Senator(s)')
            else:
                sp_lists = sponsors.split('and Representative(s)')
            for spl in sp_lists:
                for sponsor in split_names(spl):
                    sponsor = sponsor.strip()
                    if sponsor != "":
                        bill.add_sponsorship(sponsor, 'primary', primary=True,
                                             entity_type='person')

    action_re = re.compile('(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
    vote_total_re = re.compile(
        '(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)'
        '(\s*)Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)')

    # initial actor is bill chamber
    actor = chamber

    lines = all_text.splitlines()
    # Skip forward to the first action line.
    for idx, line in enumerate(lines):
        if action_re.search(line):
            break
    action_lines = lines[idx:]

    for line in action_lines:
        line = clean_line(line)

        # skip blank lines
        if not line:
            continue

        amatch = action_re.match(line)
        if amatch:
            date, achamber, action = amatch.groups()

            # change actor if one is on this action
            if achamber == 'H ':
                actor = 'lower'
            elif achamber == 'S ':
                actor = 'upper'

            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            bill.add_action(action.strip(), TIMEZONE.localize(date),
                            chamber=actor,
                            classification=categorize_action(action))
        elif line == 'ROLL CALL':
            voters = defaultdict(str)
            # if we hit a roll call, use an inner loop to consume lines
            # in a psuedo-state machine manner, 3 types
            # Ayes|Nays|Excused|... - indicates next line is voters
            # : (Senators|Representatives): ... - voters
            # \d+ Nays \d+ Excused ... - totals
            voters_type = None
            # NOTE(review): this inner loop iterates action_lines from the
            # beginning, not from the current ROLL CALL line — confirm that
            # is intentional.
            for ainext in action_lines:
                nextline = clean_line(ainext)
                if not nextline:
                    continue

                breakers = ["Ayes:", "Nays:", "Nayes:",
                            "Excused:", "Absent:", "Conflicts:"]
                for breaker in breakers:
                    if nextline.startswith(breaker):
                        voters_type = breaker[:-1]
                        if voters_type == "Nayes":
                            voters_type = "Nays"
                            self.log("Fixed a case of 'Naye-itis'")
                        nextline = nextline[len(breaker) - 1:]

                if nextline.startswith(': '):
                    voters[voters_type] = nextline
                elif nextline in ('Ayes', 'Nays', 'Excused', 'Absent',
                                  'Conflicts'):
                    voters_type = nextline
                elif vote_total_re.match(nextline):
                    # Totals line ends the roll call; build the VoteEvent.
                    tup = vote_total_re.match(nextline).groups()
                    ayes = tup[1]
                    nays = tup[4]
                    exc = tup[7]
                    # NOTE(review): `abs` shadows the builtin within this
                    # branch.
                    abs = tup[10]
                    con = tup[13]

                    passed = (('Passed' in action or
                               'Do Pass' in action or
                               'Did Concur' in action or
                               'Referred to' in action) and
                              'Failed' not in action)
                    vote = VoteEvent(
                        chamber=chamber,
                        start_date=TIMEZONE.localize(date),
                        motion_text=action,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=bill,
                    )
                    vote.set_count('yes', int(ayes))
                    vote.set_count('no', int(nays))
                    vote.set_count('other', int(exc) + int(abs) + int(con))
                    vote.add_source(digest_url)

                    for vtype, voters in voters.items():
                        for voter in split_names(voters):
                            if voter:
                                if vtype == 'Ayes':
                                    vote.vote('yes', voter)
                                elif vtype == 'Nays':
                                    vote.vote('no', voter)
                                else:
                                    vote.vote('other', voter)
                    # done collecting this vote
                    yield vote
                    break
                else:
                    # if it is a stray line within the vote, is is a
                    # continuation of the voter list
                    # (sometimes has a newline)
                    voters[voters_type] += ' ' + nextline
def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
    """Yield a VoteEvent for each Indiana roll-call PDF in *rollcalls*.

    Like the sibling ``_process_votes`` but localizes the vote timestamp to
    America/Indiana/Indianapolis and emits it in ISO-8601 form.
    """
    # Map of result keywords (as printed in the PDF) to pass/fail.
    result_types = {
        'FAILED': False,
        'DEFEATED': False,
        'PREVAILED': True,
        'PASSED': True,
        'SUSTAINED': True,
        'NOT SECONDED': False,
        'OVERRIDDEN': True,
        'ADOPTED': True,
    }
    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]
        (path, resp) = self.urlretrieve(proxy_link)
        text = convert_pdf(path, 'text').decode("utf-8")
        lines = text.split("\n")
        os.remove(path)
        # Fixed layout: line 0 = chamber, lines 1-2 = date/time,
        # lines 4-7 = motion + vote totals.
        chamber = ("lower"
                   if "house of representatives" in lines[0].lower()
                   else "upper")
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str,
                                               "%b %d, %Y %I:%M:%S %p")
        vote_date = pytz.timezone(
            'America/Indiana/Indianapolis').localize(vote_date)
        vote_date = vote_date.isoformat()

        passed = None
        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break
        if passed is None:
            raise AssertionError("Missing bill passage type")

        # Line 4 ends with "<label> <count>"; the rest is the motion text.
        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue

        vote = VoteEvent(chamber=chamber,
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=original_chamber,
                         start_date=vote_date,
                         motion_text=motion,
                         result="pass" if passed else "fail",
                         classification="passage")
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('excused', excused)
        vote.set_count('not voting', not_voting)
        vote.add_source(proxy_link)

        # Remaining lines list voter names grouped under "YEA-…"/"NAY-…"
        # style headers; track which section we are in.
        currently_counting = ""
        possible_vote_lines = lines[8:]
        for l in possible_vote_lines:
            # Normalize mis-decoded UTF-8 non-breaking spaces.
            l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            l = l.replace("\xc2\xa0", " -")
            if "yea-" in l.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in l.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in l.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in l.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                pass  # haven't reached a voter section yet
            elif re.search(r'v\. \d\.\d', l):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = l.split(" ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())
        yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland roll-call PDF into a single VoteEvent.

    Duplicate of the black-formatted ``parse_vote_pdf`` in this file.
    :raises ValueError: if vote counts are missing or name totals disagree.
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type='text').decode()
    lines = text.splitlines()
    if 'Senate' in vote_url:
        chamber = 'upper'
    else:
        chamber = 'lower'
    date_string = lines[0].split('Calendar Date:')[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")
    # Find the totals line ("NN Yeas  NN Nays …"); all other positions are
    # computed relative to it.
    page_index = None
    for index, line in enumerate(lines):
        if 'Yeas' in line and 'Nays' in line:
            page_index = index
            break
    vote_counts = 5*[0]
    vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']
    if page_index:
        counts = re.split(r'\s{2,}', lines[page_index].strip())
        for index, count in enumerate(counts):
            # Each column looks like "47 Yeas": number then label.
            number, string = count.split(' ', 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)
    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(['Consent Calendar' in line
                               for line in lines[:page_index]])
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r'\s{2,}',
                                          lines[page_index-1].strip())
        assert consent_calendar_bills, \
            "Could not find bills for consent calendar vote"

    # Probe nearby lines above the totals for motion-like text.
    motion_keywords = ['favorable', 'reading', 'amendment', 'motion',
                       'introduced', 'bill pass', 'committee']
    motion_lines = [3, 2, 4, 5]
    # Relative LineNumbers to be checked for existence of motion
    for i in motion_lines:
        if any(motion_keyword in motion.lower()
               for motion_keyword in motion_keywords):
            break
        motion = re.split(r'\s{2,}', lines[page_index-i].strip())[0]
    else:
        # for/else: none of the probed lines matched a keyword.
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index-3]
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )
    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)
    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])

    # Names begin two lines after the totals; headers below mark the
    # transition between vote types.
    page_index = page_index + 2
    # Keywords for identifying where names are located in the pdf
    show_stoppers = ['Voting Nay', 'Not Voting', 'COPY', 'Excused',
                     'indicates vote change', 'Indicates Vote Change']
    vote_index = 0
    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5*[0]
    while page_index < len(lines):
        current_line = lines[page_index].strip()
        if not current_line or 'Voting Yea' in current_line:
            page_index += 1
            continue
        if any(show_stopper in current_line
               for show_stopper in show_stoppers):
            # Section header: move on to the next vote type.
            page_index += 1
            vote_index = (vote_index + 1)
            continue
        names = re.split(r'\s{2,}', current_line)
        vote_name_counts[vote_index] += len(names)
        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1
    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")
    return vote