def _fix_house_text(self, filename):
    '''
    TLDR: throw out bad text, replace it using different parser settings.

    When using `pdftotext` on the 2015 House committee list, the second and
    third columns of the second page get mixed up, which makes it very
    difficult to parse. Adding the `--layout` option fixes this, but isn't
    worth switching all parsing to that, since the standard
    `pdftotext --nolayout` is easier in all other cases.

    The best solution to this is to throw out the offending text, and
    replace it with the correct text. The third and fourth columns are
    joint committees that are scraped from the Senate document, so the
    only column that needs to be inserted this way is the second.
    '''
    # Take the usable text from the normally-working parsing settings
    text = convert_pdf(filename, type='text-nolayout')
    assert "Revised: January 23, 2015" in text,\
        "House committee list has changed; check that the special-case"\
        " fix is still necessary, and that the result is still correct"
    text = re.sub(r'(?sm)Appropriations/F&C.*$', "", text)

    # Take the usable column from the alternate parser
    alternate_text = convert_pdf(filename, type='text')
    alternate_lines = alternate_text.split('\n')

    HEADER_OF_COLUMN_TO_REPLACE = "State Administration (cont.) "
    (text_of_line_to_replace, ) = [
        x for x in alternate_lines
        if HEADER_OF_COLUMN_TO_REPLACE in x
    ]
    first_line_to_replace = alternate_lines.index(text_of_line_to_replace)
    first_character_to_replace = alternate_lines[
        first_line_to_replace].index(HEADER_OF_COLUMN_TO_REPLACE) - 1
    last_character_to_replace = (first_character_to_replace +
                                 len(HEADER_OF_COLUMN_TO_REPLACE))

    column_lines_to_add = [
        x[first_character_to_replace:last_character_to_replace]
        for x in alternate_lines[first_line_to_replace + 1:]
    ]
    column_text_to_add = '\n'.join(column_lines_to_add)

    text = text + column_text_to_add
    return text
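# For context, a minimal sketch of how a `convert_pdf` helper like the one
# used above is commonly implemented (billy's utility works along these
# lines; the exact flags here are an assumption, not the verified source):
import subprocess

def convert_pdf_sketch(filename, type='text'):
    commands = {
        'text': ['pdftotext', '-layout', filename, '-'],
        'text-nolayout': ['pdftotext', filename, '-'],
        'xml': ['pdftohtml', '-xml', '-stdout', filename],
    }
    # shell out to poppler-utils and capture the converted document
    return subprocess.check_output(commands[type])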
def scrape_senate_vote(self, bill, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, "text")
    os.remove(path)

    lines = text.split("\n")

    date_match = re.search(r"Date:\s+(\d+/\d+/\d+)", text)
    if not date_match:
        self.log("Couldn't find date on %s" % url)
        return

    time_match = re.search(r"Time:\s+(\d+:\d+:\d+)\s+(AM|PM)", text)
    date = "%s %s %s" % (date_match.group(1), time_match.group(1),
                         time_match.group(2))
    date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p")
    date = self._tz.localize(date)

    vote_type = None
    yes_count, no_count, other_count = None, None, 0
    votes = []
    for line in lines[21:]:
        line = line.strip()
        if not line:
            continue

        if line.startswith("YEAS"):
            yes_count = int(line.split(" - ")[1])
            vote_type = "yes"
        elif line.startswith("NAYS"):
            no_count = int(line.split(" - ")[1])
            vote_type = "no"
        elif line.startswith("EXCUSED") or line.startswith("NOT VOTING"):
            other_count += int(line.split(" - ")[1])
            vote_type = "other"
        else:
            votes.extend([(n.strip(), vote_type)
                          for n in re.split(r"\s{2,}", line)])

    if yes_count is None or no_count is None:
        self.log("Couldn't find vote counts in %s" % url)
        return

    passed = yes_count > no_count + other_count

    clean_bill_id = fix_bill_id(bill["bill_id"])
    motion_line = None
    for i, line in enumerate(lines):
        if line.strip() == clean_bill_id:
            motion_line = i + 2
    if motion_line is None:
        self.log("Couldn't find motion line for %s" % url)
        return
    motion = lines[motion_line]
    if not motion:
        self.log("Couldn't find motion for %s" % url)
        return

    vote = Vote("upper", date, motion, passed, yes_count,
                no_count, other_count)
    vote.add_source(url)

    insert_specific_votes(vote, votes)
    check_vote_counts(vote)

    bill.add_vote(vote)
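# `insert_specific_votes` and `check_vote_counts` are module-level helpers
# that the scraper above assumes; minimal sketches of what they likely do
# (hypothetical implementations, not the originals):
def insert_specific_votes(vote, votes):
    # `votes` is the list of (name, vote_type) pairs accumulated above
    for name, vote_type in votes:
        if vote_type == 'yes':
            vote.yes(name)
        elif vote_type == 'no':
            vote.no(name)
        elif vote_type == 'other':
            vote.other(name)

def check_vote_counts(vote):
    # sanity-check that the recorded names match the header tallies
    assert len(vote['yes_votes']) == vote['yes_count']
    assert len(vote['no_votes']) == vote['no_count']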
def scrape_rollcall(self, vote, vurl):
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, 'text')
    os.remove(path)

    current_vfunc = None

    for line in pdflines.split('\n'):
        line = line.strip()

        # change what is being recorded
        if line.startswith('YEAS') or line.startswith('AYES'):
            current_vfunc = vote.yes
        elif line.startswith('NAYS'):
            current_vfunc = vote.no
        elif (line.startswith('EXCUSED') or
              line.startswith('NOT VOTING') or
              line.startswith('ABSTAIN')):
            current_vfunc = vote.other
        # skip these
        elif not line or line.startswith('Page '):
            continue
        # if a vfunc is active
        elif current_vfunc:
            # split names apart by 3 or more spaces
            names = re.split(r'\s{3,}', line)
            for name in names:
                if name:
                    current_vfunc(name.strip())
def scrape_upper_committee(self, url):
    filename, resp = self.urlretrieve(url)
    root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))

    for link in root.xpath('/pdf2xml/page'):
        comm = None
        for line in link.findall('text'):
            text = line.findtext('b')
            if text is not None and text.startswith('Comisi'):
                comm = Committee('upper', text)
                comm.add_source(url)
            else:
                if line.text and line.text.startswith('Hon.'):
                    line_text = line.text.replace(u'–', '-')
                    name_split = line_text.split(u'-', 1)
                    title = 'member'
                    if len(name_split) >= 2:
                        name_split[1] = name_split[1].strip()
                        if name_split[1] in ('Presidenta', 'Presidente'):
                            title = 'chairman'
                        elif name_split[1] in ('Vicepresidente',
                                               'Vicepresidenta'):
                            title = 'vicechairman'
                        elif name_split[1] in ('Secretaria', 'Secretario'):
                            title = 'secretary'
                    if name_split[0] != 'VACANTE':
                        comm.add_member(name_split[0].replace('Hon.', ''),
                                        title)
        self.save_committee(comm)

    os.remove(filename)
def parse_subjects(self, url, chamber_letter):
    try:
        pdf, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("could not fetch subject index %s" % url)
        return
    lines = convert_pdf(pdf, 'text-nolayout').splitlines()
    os.remove(pdf)

    last_line = ''
    # initialized so a bill line before the first subject can't raise a
    # NameError
    title = None
    subject_re = re.compile(r'^[A-Z ]+$')
    bill_re = re.compile(r'(?:S|H)[A-Z]{1,2} \d+')

    for line in lines[1:]:
        if 'BILL INDEX' in line:
            pass
        elif subject_re.match(line):
            if subject_re.match(last_line):
                title += ' %s' % line
            elif last_line == '':
                title = line
        else:
            for bill_id in bill_re.findall(line):
                if bill_id.startswith(chamber_letter):
                    if bill_id not in self.all_bills:
                        self.warning("unknown bill %s" % bill_id)
                        continue
                    self.all_bills[bill_id].setdefault('subjects',
                                                       []).append(title)
        # sometimes we need to look back
        last_line = line
def add_house_votes(self, vote, filename):
    vcount_re = re.compile(
        r'AYES.* (\d+).*NAYS.* (\d+).*NOT VOTING.* (\d+).* PAIRED.*(\d+)')
    xml = convert_pdf(filename, 'xml')
    doc = lxml.html.fromstring(xml)    # use lxml.html for text_content()

    # function to call on next legislator name
    vfunc = None
    name = ''

    for textitem in doc.xpath('//text/text()'):
        if textitem.startswith('AYES'):
            ayes, nays, nv, paired = vcount_re.match(textitem).groups()
            vote['yes_count'] = int(ayes)
            vote['no_count'] = int(nays)
            vote['other_count'] = int(nv) + int(paired)
        elif textitem == 'N':
            vfunc = vote.no
            name = ''
        elif textitem == 'Y':
            vfunc = vote.yes
            name = ''
        elif textitem == 'x':
            vfunc = vote.other
            name = ''
        elif textitem in ('R', 'D', 'I'):
            vfunc(name)
        else:
            if name:
                name += ' ' + textitem
            else:
                name = textitem
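# Illustrative (assumed) token stream from the pdftohtml XML for the loop
# above -- a vote letter, then name fragments, then a party letter:
#
#   'Y', 'Smith,', 'John', 'R', 'N', 'Jones, Jane', 'D', ...
#
# so vfunc is set by 'Y'/'N'/'x', the name accumulates across items, and
# hitting a party code ('R', 'D', 'I') commits the accumulated name.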
def __init__(self, url, resp):
    self.url = url

    # Fetch the document and put it into tempfile.
    fd, filename = tempfile.mkstemp()
    with open(filename, "wb") as f:
        f.write(resp)

    # Convert it to text.
    try:
        text = convert_pdf(filename, type="text")
    except Exception:
        msg = "couldn't convert pdf."
        raise PDFCommitteeVoteParseError(msg)

    # Get rid of the temp file.
    os.close(fd)
    os.remove(filename)

    if not text.strip():
        msg = "PDF file was empty."
        raise PDFCommitteeVoteParseError(msg)

    self.text = "\n".join(filter(None, text.splitlines()))
def scrape_vote(self, session, rollcall_number):

    # Fetch this piece of garbage.
    url = (
        'http://www.mass.gov/legis/journal/RollCallPdfs/'
        '{session}/{rollcall}.pdf?Session={session}&RollCall={rollcall}')
    url_args = dict(
        session=re.findall(r'\d+', session).pop(),
        rollcall=str(rollcall_number).zfill(5))
    url = url.format(**url_args)

    try:
        vote_file, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        # We'll hit a 404 at the end of the votes.
        self.warning('Stopping; encountered a 404 at %s' % url)
        raise self.EndOfHouseVotes

    text = convert_pdf(vote_file, type='text')
    text = text.decode('utf8')

    # A hack to guess whether this PDF has embedded images or contains
    # machine readable text.
    if len(re.findall(r'[YNPX]', text)) > 157:
        vote = self.house_get_vote(text, vote_file, session)
    else:
        vote = self.house_get_vote_with_images(text, vote_file, session)
        self.house_add_votes_from_image(vote_file, vote)

    vote.add_source(url)

    if not self.house_check_vote(vote):
        self.logger.warning('Bad vote counts for %s' % vote)
        return

    self.save_vote(vote)
    os.remove(vote_file)
def fetch_pdf_lines(self, href):
    # download the file
    fname, resp = self.urlretrieve(href)
    pdflines = [line.decode('utf-8')
                for line in convert_pdf(fname, 'text').splitlines()]
    os.remove(fname)
    return pdflines
def extract_rollcall_from_pdf(self, chamber, vote, bill, url, bill_id):
    billnum = re.search(r"(\d+)", bill_id).group(1)
    self.debug("Scraping rollcall %s|%s|" % (billnum, url))
    bill_prefix = "vote_%s_%s_" % (chamber, re.sub(r'\s+', '_', bill_id))
    bill.add_source(url)

    # Save roll call pdf to a local file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf',
                                            prefix=bill_prefix)
    pdf_temp_name = temp_file.name
    self.debug("Parsing pdf votes, saving to tempfile [%s]" %
               temp_file.name)
    with self.urlopen(url) as pdata:
        # write in binary mode so the PDF bytes aren't mangled
        with open(pdf_temp_name, 'wb') as pdf_file:
            pdf_file.write(pdata)

    # Pdf is in pdf_temp_name
    rollcall_data = convert_pdf(pdf_temp_name, type='text')
    (valid_data, expected, areas, yays, nays,
     other) = self.count_votes(url, chamber, bill_id, rollcall_data)
    os.unlink(pdf_temp_name)

    if valid_data:
        self.debug("VOTE %s %s yays %d nays %d other %d pdf=%s" %
                   (bill_id, chamber, len(yays), len(nays), len(other),
                    pdf_temp_name))
        for legislator in yays:
            vote.yes(legislator)
        for legislator in nays:
            vote.no(legislator)
        for legislator in other:
            vote.other(legislator)
def scrape_joint_committee(self, url):
    filename, resp = self.urlretrieve(url)
    root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))

    for link in root.xpath('/pdf2xml/page'):
        comm = None
        self.log(lxml.etree.tostring(root))
    return
def parse_senate_vote(self, url):
    vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
    vote.add_source(url)

    fname, resp = self.urlretrieve(url)
    sv_text = convert_sv_text(convert_pdf(fname, 'text'))
    os.remove(fname)
    in_votes = False

    for line in sv_text:
        if not in_votes:
            dmatch = re.search(r'DATE:(\d{2}-\d{2}-\d{2})', line)
            if dmatch:
                date = dmatch.groups()[0]
                vote['date'] = datetime.strptime(date, '%m-%d-%y')
            if 'YES NO ABS EXC' in line:
                in_votes = True
            elif 'PASSED' in line:
                vote['passed'] = True
        else:
            if 'TOTALS' in line:
                # Lt. Governor voted
                if 'GOVERNOR' in line:
                    name, spaces, line = re.match(
                        r' ([A-Z,.]+)(\s+)X(.*)', line).groups()
                    if len(spaces) == 1:
                        vote.yes(name)
                    else:
                        vote.no(name)
                _, yes, no, abs, exc = line.split()
                vote['yes_count'] = int(yes)
                vote['no_count'] = int(no)
                vote['other_count'] = int(abs) + int(exc)
                # no longer in votes
                in_votes = False
                continue

            # pull votes out
            matches = re.match(
                r' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X', line).groups()
            name1, spaces1, name2, spaces2 = matches

            # vote can be determined by # of spaces
            if len(spaces1) == 1:
                vote.yes(name1)
            elif len(spaces1) == 2:
                vote.no(name1)
            else:
                vote.other(name1)

            if len(spaces2) == 1:
                vote.yes(name2)
            elif len(spaces2) == 2:
                vote.no(name2)
            else:
                vote.other(name2)

    return vote
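# `convert_sv_text` isn't defined in this file; judging from its use above
# (it must yield lines with the column spacing preserved, since the parser
# infers the vote column from the run of spaces before each 'X'), a
# plausible stand-in is just a line split -- a hypothetical sketch, not
# the original helper:
def convert_sv_text(text):
    # keep internal spacing intact; only break the page into lines
    return text.split('\n')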
def scrape_vote(self, bill, chamber, date, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    try:
        motion = text.split('\n')[4].strip()
    except IndexError:
        return
    try:
        yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    except AttributeError:
        return
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    y, n, o = 0, 0, 0
    break_outer = False

    for line in text.split('\n')[9:]:
        if break_outer:
            break
        if 'after roll call' in line:
            break
        if 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue

        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if match:
                if match.group(2) == "PAIR":
                    break_outer = True
                    break
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())

    vote.validate()
    bill.add_vote(vote)
def scrape_house(self, vote, vurl, supplement):
    # Point to PDF and read to memory
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, 'text')
    os.remove(path)
    pdflines = pdflines.decode('utf-8').replace(u'\u2019', "'")

    # get pdf data from supplement number
    try:
        vote_text = pdflines.split('No. ' + str(supplement))[1].split(
            'MASSACHUSETTS')[0]
    except IndexError:
        self.info("No vote found in supplement for vote #%s" % supplement)
        return

    # create list of independent items in vote_text
    rows = vote_text.splitlines()
    lines = []
    for row in rows:
        lines.extend(row.split(' '))

    # retrieving votes in columns
    vote_tally = []
    voters = []

    for line in lines:
        # removes whitespace and after-vote '*' tag
        line = line.strip().strip('*').strip()
        if 'NAYS' in line or 'YEAS' in line or '=' in line or '/' in line:
            continue
        elif line == '':
            continue
        elif line == 'N':
            vote_tally.append('n')
        elif line == 'Y':
            vote_tally.append('y')
        # Not Voting
        elif line == 'X':
            vote_tally.append('x')
        # Present
        elif line == 'P':
            vote_tally.append('p')
        # True for all records 2009 - 2017; brittle code, this will change
        # in the future
        # elif line == 'Mr. Speaker':
        #     voters.append('DeLeo')
        else:
            voters.append(line)

    house_votes = list(zip(voters, vote_tally))

    # iterate list and add individual names to vote.yes, vote.no
    for name, tally in house_votes:
        if tally == 'y':
            vote.yes(name)
        elif tally == 'n':
            vote.no(name)
        else:
            vote.other(name)
def ca_handler(filename, metadata):
    # NOTE: the original body referenced an undefined `file` name; the
    # first argument is assumed to be the path of the downloaded document
    if filename.endswith('.pdf'):
        # NOTE: this strips the summary, it'd be useful for search (but not SFM)
        lines = convert_pdf(filename, 'text').splitlines()
        return text_after_line_numbers(lines)
    elif filename.endswith('.html'):
        doc = lxml.html.fromstring(open(filename).read())
        text = doc.xpath('//pre')[0].text_content()
        return collapse_spaces(text)
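# `text_after_line_numbers` and `collapse_spaces` come from the project's
# shared utilities; minimal sketches consistent with how they're used above
# (assumed behavior, not the verified implementations):
import re

def text_after_line_numbers(lines):
    text = []
    for line in lines:
        # bill text lines start with a line number; keep what follows it
        match = re.match(r'\s*\d+\s+(.*)', line)
        if match:
            text.append(match.group(1))
    return '\n'.join(text)

def collapse_spaces(text):
    # squash runs of whitespace down to single spaces
    return re.sub(r'\s+', ' ', text).strip()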
def text(self):
    text = getattr(self, '_text', None)
    if text:
        return text
    (path, resp) = self.scraper.urlretrieve(self.url)
    text = convert_pdf(path, 'text')
    os.remove(path)
    self._text = text
    return text
def scrape_vote(self, url, local=False):
    """Retrieves or uses local copy of vote pdf and converts into XML."""
    if not local:
        try:
            url, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("Request failed: {}".format(url))
            return
    # at this point `url` holds the local path to the PDF, whether it was
    # passed in directly (local=True) or just downloaded
    v_text = convert_pdf(url, 'xml')
    os.remove(url)
    return v_text
def scrape_votes(self, bill, votes_url):
    html = self.urlopen(votes_url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(votes_url)

    EXPECTED_VOTE_CODES = ['Y', 'N', 'E', 'NV', 'A', 'P', '-']

    # vote indicator, a few spaces, a name, newline or multiple spaces
    VOTE_RE = re.compile(r'(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')

    for link in doc.xpath('//a[contains(@href, "votehistory")]'):
        pieces = link.text.split(' - ')
        date = pieces[-1]
        if len(pieces) == 3:
            motion = pieces[1]
        else:
            motion = 'Third Reading'

        chamber = link.xpath('../following-sibling::td/text()')[0]
        if chamber == 'HOUSE':
            chamber = 'lower'
        elif chamber == 'SENATE':
            chamber = 'upper'
        else:
            self.warning('unknown chamber %s' % chamber)

        date = datetime.datetime.strptime(date, "%A, %B %d, %Y")

        # download the file
        fname, resp = self.urlretrieve(link.get('href'))
        pdflines = convert_pdf(fname, 'text').splitlines()
        os.remove(fname)

        vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)

        for line in pdflines:
            for match in VOTE_RE.findall(line):
                vcode, name = match
                if vcode == 'Y':
                    vote.yes(name)
                elif vcode == 'N':
                    vote.no(name)
                else:
                    vote.other(name)

        # fake the counts
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])
        vote['passed'] = vote['yes_count'] > vote['no_count']

        vote.add_source(link.get('href'))
        bill.add_vote(vote)
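# Quick illustration of what VOTE_RE extracts from a line of the vote PDF
# (the input line is invented for demonstration):
#
#   >>> VOTE_RE.findall('Y   Adams   N   Baker\n')
#   [('Y', 'Adams'), ('N', 'Baker')]
#
# Each match is a (vote code, name) pair; a name is terminated by a
# newline or a run of two-plus spaces.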
def fetch_pdf_lines(self, href):
    # download the file
    try:
        fname, resp = self.urlretrieve(href)
        pdflines = [line.decode('utf-8')
                    for line in convert_pdf(fname, 'text').splitlines()]
        os.remove(fname)
        return pdflines
    except scrapelib.HTTPError as e:
        assert '404' in e.args[0], "File not found: {}".format(e)
        self.warning("404 error for vote; skipping vote")
        return False
def text(self):
    text = getattr(self, "_text", None)
    if text:
        return text
    try:
        (path, resp) = self.scraper.urlretrieve(self.url)
    except scrapelib.HTTPError as exc:
        self.scraper.warning("Got error %r while fetching %s" %
                             (exc, self.url))
        raise self.VoteParseError()
    text = convert_pdf(path, "text")
    os.remove(path)
    self._text = text
    return text
def scrape_senate(self, vote, vurl):
    # download file to server
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, 'text')
    os.remove(path)

    # for y, n
    mode = None
    lines = pdflines.splitlines()

    # handle individual lines in pdf to id legislator votes
    for line in lines:
        line = line.strip()
        line = line.decode('utf-8').replace(u'\u2212', '-')
        if line == '':
            continue
        # change mode accordingly
        elif line.startswith('YEAS'):
            mode = 'y'
        elif line.startswith('NAYS'):
            mode = 'n'
        elif line.startswith('ABSENT OR'):
            mode = 'o'
        # else parse line with names
        else:
            nameline = line.split(' ')
            for raw_name in nameline:
                raw_name = raw_name.strip()
                if raw_name == '':
                    continue
                # handles vote count lines
                cut_name = raw_name.split('-')
                clean_name = ''
                if cut_name[-1].strip(' .').isdigit():
                    del cut_name[-1]
                    clean_name = ''.join(cut_name)
                else:
                    clean_name = raw_name.strip()

                # update vote object with names
                if mode == 'y':
                    vote.yes(clean_name)
                elif mode == 'n':
                    vote.no(clean_name)
                elif mode == 'o':
                    vote.other(clean_name)
def scrape_senate_vote(self, bill, url, date):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = Vote('upper', date, 'Passage', passed=None,
                yes_count=0, no_count=0, other_count=0)
    vote.add_source(url)

    text = convert_pdf(filename, 'text')
    os.remove(filename)

    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        return self.scrape_senate_vote_3col(bill, vote, text, url, date)

    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    data = filter(None, data)
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    while True:
        if not data:
            break
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            getattr(vote, key)(name)
            actual_vote[vote_val] += 1
            vote[key + '_count'] += 1
        assert actual_vote[vote_val] == vote[key + '_count']

    vote['passed'] = vote['no_count'] < vote['yes_count']
    bill.add_vote(vote)
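# How the reversed re.split output acts as a stack in the loop above
# (illustrative values):
#
#   >>> text = 'Yeas: Smith, Jones Nays: Brown'
#   >>> re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
#   [' Brown', 'Nay', ' Smith, Jones ', 'Yea', '']
#
# After filtering out the empty string, each data.pop() yields a section
# label ('Yea', 'Nay', ...) followed by that section's run of names.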
def parse_house_vote(self, url):
    """ house votes are pdfs that can be converted to text, require some
    nasty regex to get votes out reliably """
    fname, resp = self.urlretrieve(url)
    text = convert_pdf(fname, 'text')
    if not text.strip():
        self.warning('image PDF %s' % url)
        return
    os.remove(fname)

    # get date
    if text.strip() == 'NEW MEXICO HOUSE OF REPRESENTATIVES':
        self.warning("What the heck: %s" % (url))
        return None
    date = re.findall(r'(\d+/\d+/\d+)', text)[0]
    date = datetime.strptime(date, '%m/%d/%Y')

    # get totals
    yea, nay, exc, absent = self.HOUSE_TOTAL_RE.findall(text)[0]

    # make vote (faked passage indicator)
    vote = Vote('lower', date, 'house passage', int(yea) > int(nay),
                int(yea), int(nay), int(absent) + int(exc))
    vote.add_source(url)

    # votes
    real_votes = False
    for v, name in HOUSE_VOTE_RE.findall(text):
        # our regex is a bit broad, wait until we see 'Nays' to start
        # and end when we see CERTIFIED or ____ signature line
        if 'Nays' in name or 'Excused' in name:
            real_votes = True
            continue
        elif 'CERTIFIED' in name or '___' in name:
            break
        elif real_votes and name.strip():
            if v == 'Y':
                vote.yes(name)
            elif v == 'N':
                vote.no(name)
            else:
                # excused/absent
                vote.other(name)

    return vote
def parse_vote(scraper, chamber, doc_meta):
    # Get the pdf text.
    try:
        (path, resp) = scraper.urlretrieve(doc_meta.url)
    except scrapelib.HTTPError as exc:
        scraper.warning("Got error %r while fetching %s" %
                        (exc, doc_meta.url))
        raise VoteParseError()
    text = convert_pdf(path, "text")
    text = text.replace("\xc2\xa0", " ")
    text = text.replace("\xc2\xad", " ")
    os.remove(path)

    # Figure out what type of vote this is.
    if "Roll Call" in text:
        return RollCallVote(text, scraper, chamber, doc_meta).vote()
    else:
        scraper.warning("Skipping a committee vote (See Jira issue DATA-80).")
        raise VoteParseError()
def scrape_vote(self, bill, chamber, date, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    motion = text.split('\n')[4].strip()
    yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in text.split('\n')[9:]:
        if 'after roll call' in line:
            break
        if 'Indication of Vote' in line:
            break
        if 'Presiding' in line:
            continue

        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)
            if match:
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                elif match.group(1) == '*':
                    # skip paired voters, don't factor into count
                    pass
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())

    vote.validate()
    bill.add_vote(vote)
def scrape_vote(self, bill, chamber, date, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, "text")
    os.remove(path)

    motion = text.split("\n")[4].strip()
    yes_count = int(re.search(r"Yeas - (\d+)", text).group(1))
    no_count = int(re.search(r"Nays - (\d+)", text).group(1))
    other_count = int(re.search(r"Not Voting - (\d+)", text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in text.split("\n")[9:]:
        if "after roll call" in line:
            break
        if "Indication of Vote" in line:
            break
        if "Presiding" in line:
            continue

        for col in re.split(r"-\d+", line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r"(Y|N|EX)\s+(.+)$", col)
            if match:
                if match.group(1) == "Y":
                    vote.yes(match.group(2))
                elif match.group(1) == "N":
                    vote.no(match.group(2))
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())

    vote.validate()
    bill.add_vote(vote)
def parse_house_vote(self, url):
    """ house votes are pdfs that can be converted to text, require some
    nasty regex to get votes out reliably """
    fname, resp = self.urlretrieve(url)
    text = convert_pdf(fname, "text")
    if not text.strip():
        self.warning("image PDF %s" % url)
        return
    os.remove(fname)

    # get date
    date = re.findall(r"(\d+/\d+/\d+)", text)[0]
    date = datetime.strptime(date, "%m/%d/%y")

    # get totals
    absent, yea, nay, exc = self.HOUSE_TOTAL_RE.findall(text)[0]

    # make vote (faked passage indicator)
    vote = Vote("lower", date, "house passage", int(yea) > int(nay),
                int(yea), int(nay), int(absent) + int(exc))
    vote.add_source(url)

    # votes
    real_votes = False
    for v, name in HOUSE_VOTE_RE.findall(text):
        # our regex is a bit broad, wait until we see 'Nays' to start
        # and end when we see CERTIFIED or ____ signature line
        if "Nays" in name or "Excused" in name:
            real_votes = True
            continue
        elif "CERTIFIED" in name or "___" in name:
            break
        elif real_votes and name.strip():
            if v == "Y":
                vote.yes(name)
            elif v == "N":
                vote.no(name)
            else:
                # excused/absent
                vote.other(name)

    return vote
def scrape_vote(self, bill, chamber, date, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    os.remove(path)

    motion = text.split('\n')[4].strip()
    yes_count = int(re.search(r'Yeas - (\d+)', text).group(1))
    no_count = int(re.search(r'Nays - (\d+)', text).group(1))
    other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in text.split('\n')[9:]:
        if 'after roll call' in line:
            break
        if 'Presiding' in line:
            continue

        for col in re.split(r'-\d+', line):
            col = col.strip()
            if not col:
                continue
            match = re.match(r'(Y|N|EX)\s+(.+)$', col)
            if match:
                if match.group(1) == 'Y':
                    vote.yes(match.group(2))
                elif match.group(1) == 'N':
                    vote.no(match.group(2))
                else:
                    vote.other(match.group(2))
            else:
                vote.other(col.strip())

    vote.validate()
    bill.add_vote(vote)
def main():
    # the xpaths below expect pdftohtml-style XML, so ask convert_pdf for
    # it (the original called convert_pdf with no type; 'xml' is assumed)
    html = convert_pdf('openstates/ny/scripts/assembly_parties.pdf', 'xml')
    doc = lxml.html.fromstring(html)

    dems = doc.xpath('//text[@font="4"]/b/text()')
    dems = map(getname, dems)

    repubs = doc.xpath('//text[@font="5"]/i/b/text()')
    repubs = map(getname, repubs)

    name_to_party = {}

    for list_, party in ((dems, 'Democratic'), (repubs, 'Republican')):
        for name in list_:
            print name, 'matched',
            try:
                full_name = difflib.get_close_matches(name, legs).pop(0)
            except IndexError:
                print 'NO MATCH FOUND'
                continue
            print full_name, ':: party = ', party
            name_to_party[full_name] = party
            print party

    # import pprint
    # pprint.pprint(name_to_party)

    print 'party_dict = {'
    it = iter(name_to_party.items())
    while True:
        try:
            # Col 1
            full_name1, party1 = next(it)
            full_name2, party2 = next(it)
            print ('\n    %r: %r,' % (full_name1, party1)).ljust(45),
            print ('%r: %r,' % (full_name2, party2))
        except StopIteration:
            break
    print '    }'
def add_senate_votes(self, vote, filename):
    xml = convert_pdf(filename, 'xml')
    doc = lxml.html.fromstring(xml)    # use lxml.html for text_content()

    # what to do with the pieces
    vfunc = None

    for textitem in doc.xpath('//text'):
        text = textitem.text_content().strip()

        if text.startswith('AYES'):
            vfunc = vote.yes
            vote['yes_count'] = int(text.split(u' \u2212 ')[1])
        elif text.startswith('NAYS'):
            vfunc = vote.no
            vote['no_count'] = int(text.split(u' \u2212 ')[1])
        elif text.startswith('NOT VOTING'):
            vfunc = vote.other
            vote['other_count'] = int(text.split(u' \u2212 ')[1])
        elif text.startswith('SEQUENCE NO'):
            vfunc = None
        elif vfunc:
            vfunc(text)
def scrape_house(self, session):
    url = journals % (session, 'House')
    page = self.lxmlize(url)
    hrefs = page.xpath("//font//a")

    for href in hrefs:
        (path, response) = self.urlretrieve(href.attrib['href'])
        data = convert_pdf(path, type='text')

        in_vote = False
        cur_vote = {}
        known_date = None
        cur_vote_count = None
        in_question = False
        cur_question = None
        cur_bill_id = None

        for line in data.split("\n"):
            if known_date is None:
                dt = date_re.findall(line)
                if dt != []:
                    dt, dow = dt[0]
                    known_date = datetime.datetime.strptime(
                        dt, "%A, %B %d, %Y")

            non_std = False
            if re.match(r"(\s+)?\d+.*", line) is None:
                non_std = True

            l = line.lower().strip()
            skip = False
            blacklist = [
                "house",
                "page",
                "general assembly",
                "state of colorado",
                "session",
                "legislative day"
            ]
            for thing in blacklist:
                if thing in l:
                    skip = True
            if skip:
                continue

            found = re.findall(
                r"(?P<bill_id>(H|S|SJ|HJ)(B|M|R)\d{2}-\d{3,4})", line)
            if found != []:
                found = found[0]
                cur_bill_id, chamber, typ = found

            try:
                if not non_std:
                    _, line = line.strip().split(" ", 1)
                    line = line.strip()
            except ValueError:
                in_vote = False
                in_question = False
                continue

            if in_question:
                cur_question += " " + line.strip()
                continue

            if ("The question being" in line) or \
                    ("On motion of" in line) or \
                    ("the following" in line) or \
                    ("moved that the" in line):
                cur_question = line.strip()
                in_question = True

            if in_vote:
                # NOTE: the original set likely_garbage twice, so the
                # blank-line check was immediately overwritten; ordered
                # here so that it takes effect
                likely_garbage = False
                if line == "":
                    likely_garbage = True
                if "co-sponsor" in line.lower():
                    likely_garbage = True
                if 'the speaker' in line.lower():
                    likely_garbage = True

                votes = re.findall(votes_re, line)
                if likely_garbage:
                    votes = []

                for person, _, v in votes:
                    cur_vote[person] = v

                last_line = False
                for who, _, vote in votes:
                    if who.lower() == "speaker":
                        last_line = True

                if votes == [] or last_line:
                    in_vote = False
                    # save vote
                    yes, no, other = cur_vote_count
                    if cur_bill_id is None or cur_question is None:
                        continue

                    bc = {
                        "H": "lower",
                        "S": "upper",
                        "J": "joint"
                    }[cur_bill_id[0].upper()]

                    vote = Vote('lower', known_date, cur_question,
                                (yes > no), yes, no, other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)
                    vote.add_source(href.attrib['href'])
                    vote.add_source(url)

                    for person in cur_vote:
                        if person is None:
                            continue

                        vot = cur_vote[person]

                        if person.endswith("Y"):
                            vot = "Y"
                            person = person[:-1]
                        if person.endswith("N"):
                            vot = "N"
                            person = person[:-1]
                        if person.endswith("E"):
                            vot = "E"
                            person = person[:-1]

                        if vot == 'Y':
                            vote.yes(person)
                        elif vot == 'N':
                            vote.no(person)
                        elif vot == 'E' or vot == '-':
                            vote.other(person)

                    self.save_vote(vote)
                    cur_vote = {}
                    in_question = False
                    cur_question = None
                    in_vote = False
                    cur_vote_count = None
                continue

            summ = vote_re.findall(line)
            if summ == []:
                continue
            summ = summ[0]
            yes, no, exc, ab = summ
            yes, no, exc, ab = int(yes), int(no), int(exc), int(ab)
            other = exc + ab
            cur_vote_count = (yes, no, other)
            in_vote = True
            continue

        os.unlink(path)
def scrape_house_vote(self, bill, url):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, 'text')
    os.remove(filename)

    lines = text.splitlines()

    vote_type = None
    votes = collections.defaultdict(list)

    for idx, line in enumerate(lines):
        line = line.rstrip()

        match = re.search(r'(\d+)/(\d+)/(\d{4})$', line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        match = re.match(
            r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
        if match:
            motion = lines[idx - 2].strip()
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [
                int(g) for g in match.groups()
            ]
            exc_match = re.search(r'EXCUSED: (\d+)', line)
            if exc_match:
                other_count += int(exc_match.group(1))

            if line.endswith('ADOPTED') or line.endswith('PASSED'):
                passed = True
            else:
                passed = False
            continue

        match = re.match(
            r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
        if match:
            vote_type = {
                'YEAS': 'yes',
                'NAYS': 'no',
                'NOT VOTING': 'other',
                'EXCUSED': 'other',
                'PAIRED': 'paired'
            }[match.group(1)]
            continue

        if vote_type == 'paired':
            for part in line.split(' '):
                part = part.strip()
                if not part:
                    continue
                name, pair_type = re.match(
                    r'([^\(]+)\((YEA|NAY)\)', line).groups()
                name = name.strip()
                if pair_type == 'YEA':
                    votes['yes'].append(name)
                elif pair_type == 'NAY':
                    votes['no'].append(name)
        elif vote_type:
            for name in line.split(' '):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    vote = Vote('lower', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)
    vote['yes_votes'] = votes['yes']
    vote['no_votes'] = votes['no']
    vote['other_votes'] = votes['other']

    assert len(vote['yes_votes']) == yes_count
    assert len(vote['no_votes']) == no_count
    assert len(vote['other_votes']) == other_count

    bill.add_vote(vote)
def scrape_digest(self, bill):
    digest_url = 'http://legisweb.state.wy.us/%(session)s/Digest/%(bill_id)s.pdf' % bill
    bill.add_source(digest_url)

    try:
        (filename, response) = self.urlretrieve(digest_url)
        all_text = convert_pdf(filename, type='text')
    except scrapelib.HTTPError:
        self.warning('no digest for %s' % bill['bill_id'])
        return
    if all_text.strip() == "":
        self.warning('Non-functional digest for bill {}'.format(
            bill['bill_id']))
        return

    # Split the digest's text into sponsors, description, and actions
    SPONSOR_RE = r'(?sm)Sponsored By:\s+(.*?)\n\n'
    DESCRIPTION_RE = r'(?sm)\n\n((?:AN\s*?ACT|A JOINT RESOLUTION) .*?)\n\n'
    ACTIONS_RE = r'(?sm)\n\n(\d{1,2}/\d{1,2}/\d{4}.*)'

    ext_title = re.search(DESCRIPTION_RE, all_text).group(1)
    bill_desc = ext_title.replace('\n', ' ')
    bill_desc = re.sub('  *', ' ',
                       bill_desc.decode('utf-8')).encode('utf-8')
    bill['description'] = bill_desc

    sponsor_span = re.search(SPONSOR_RE, all_text).group(1)
    sponsors = ''
    sponsors = sponsor_span.replace('\n', ' ')
    if sponsors:
        if 'Committee' in sponsors:
            bill.add_sponsor('primary', sponsors)
        else:
            if bill['chamber'] == 'lower':
                sp_lists = sponsors.split('and Senator(s)')
            else:
                sp_lists = sponsors.split('and Representative(s)')
            for spl in sp_lists:
                for sponsor in split_names(spl):
                    sponsor = sponsor.strip()
                    if sponsor != "":
                        bill.add_sponsor('primary', sponsor)

    action_re = re.compile(r'(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
    vote_total_re = re.compile(
        r'(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)(\s*)'
        r'Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)')

    # initial actor is bill chamber
    actor = bill['chamber']

    actions = []
    action_lines = re.search(ACTIONS_RE, all_text).group(1).split('\n')
    action_lines = iter(action_lines)
    for line in action_lines:
        line = clean_line(line)

        # skip blank lines
        if not line:
            continue

        amatch = action_re.match(line)
        if amatch:
            date, achamber, action = amatch.groups()

            # change actor if one is on this action
            if achamber == 'H ':
                actor = 'lower'
            elif achamber == 'S ':
                actor = 'upper'

            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            bill.add_action(actor, action.strip(), date,
                            type=categorize_action(action))
        elif line == 'ROLL CALL':
            voters = defaultdict(str)
            # if we hit a roll call, use an inner loop to consume lines
            # in a pseudo-state machine manner, 3 types:
            #   Ayes|Nays|Excused|... - indicates next line is voters
            #   : (Senators|Representatives): ... - voters
            #   \d+ Nays \d+ Excused ... - totals
            voters_type = None
            for ainext in action_lines:
                nextline = clean_line(ainext)
                if not nextline:
                    continue

                breakers = ["Ayes:", "Nays:", "Nayes:",
                            "Excused:", "Absent:", "Conflicts:"]

                for breaker in breakers:
                    if nextline.startswith(breaker):
                        voters_type = breaker[:-1]
                        if voters_type == "Nayes":
                            voters_type = "Nays"
                            self.log("Fixed a case of 'Naye-itis'")
                        nextline = nextline[len(breaker) - 1:]

                if nextline.startswith(': '):
                    voters[voters_type] = nextline
                elif nextline in ('Ayes', 'Nays', 'Excused', 'Absent',
                                  'Conflicts'):
                    voters_type = nextline
                elif vote_total_re.match(nextline):
                    tup = vote_total_re.match(nextline).groups()
                    ayes = tup[1]
                    nays = tup[4]
                    exc = tup[7]
                    abs = tup[10]
                    con = tup[13]

                    passed = (('Passed' in action or
                               'Do Pass' in action or
                               'Did Concur' in action or
                               'Referred to' in action) and
                              'Failed' not in action)
                    vote = Vote(actor, date, action, passed,
                                int(ayes), int(nays),
                                int(exc) + int(abs) + int(con))
                    vote.add_source(digest_url)

                    for vtype, voter_str in voters.iteritems():
                        for voter in split_names(voter_str):
                            if voter:
                                if vtype == 'Ayes':
                                    vote.yes(voter)
                                elif vtype == 'Nays':
                                    vote.no(voter)
                                else:
                                    vote.other(voter)

                    # done collecting this vote
                    bill.add_vote(vote)
                    break
                else:
                    # if it is a stray line within the vote, it is a
                    # continuation of the voter list
                    # (sometimes has a newline)
                    voters[voters_type] += ' ' + nextline
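# `clean_line` and `split_names` are helpers assumed by scrape_digest;
# hypothetical sketches consistent with their use above (not the
# originals):
import re

def clean_line(line):
    # normalize whitespace and strip carriage-return residue
    return line.replace('\r', '').strip()

def split_names(names):
    # voter lists look like ': Senator(s) Smith, Jones and Brown'
    names = names.lstrip(': ')
    return re.split(r',\s*|\s+and\s+', names)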
def scrape_upper_committee_vote(self, bill, date, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    lines = text.split("\n")
    os.remove(path)

    (_, motion) = lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.warning("Vote appears to be empty")
        return

    vote_top_row = [
        lines.index(x) for x in lines
        if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', x)
    ][0]
    yea_columns_end = lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = lines[vote_top_row].index("Nay")

    votes = {'yes': [], 'no': [], 'other': []}
    for line in lines[(vote_top_row + 1):]:
        if line.strip():
            member = re.search(
                r'''(?x)
                ^\s+(?:[A-Z\-]+)?\s+  # Possible vote indicator
                ([A-Z][a-z]+          # Name must have lower-case characters
                [\w\-\s]+)            # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)?       # Leadership has an all-caps title
                (?:\s{2,}.*)?         # Name ends when many spaces are seen
                ''', line).group(1)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', line))
            if did_vote:
                # Check where the "X" or vote code is on the page
                vote_column = len(line) - len(line.lstrip())
                if vote_column <= yea_columns_end:
                    votes['yes'].append(member)
                elif vote_column >= nay_columns_begin:
                    votes['no'].append(member)
                else:
                    raise AssertionError(
                        "Unparseable vote found for {0} in {1}:\n{2}".
                        format(member, url, line))
            else:
                votes['other'].append(member)

        # End loop as soon as no more members are found
        else:
            break

    totals = re.search(r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS',
                       text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    passed = (yes_count > no_count)
    other_count = len(votes['other'])

    vote = Vote('upper', date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)
    vote['yes_votes'] = votes['yes']
    vote['no_votes'] = votes['no']
    vote['other_votes'] = votes['other']

    vote.validate()
    bill.add_vote(vote)
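# The column arithmetic above assumes a committee-report layout roughly
# like this (invented sample, two member columns):
#
#        Yea   Nay                    Yea   Nay
#    X         Abruzzo            X         Bean
#          X   Benacquisto    VA           Braynon
#
# The indentation of the 'X'/'VA'/'VC' code relative to the 'Yea'/'Nay'
# headers decides which way each member voted.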
def scrape_floor_vote(self, chamber, bill, date, url):
    (path, resp) = self.urlretrieve(url)
    text = convert_pdf(path, 'text')
    lines = text.split("\n")
    os.remove(path)

    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    motion = lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if (not motion and
            not lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
        motion = lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"

    for _extra_motion_line in range(2):
        MOTION_INDEX += 1
        if lines[MOTION_INDEX].strip():
            motion = "{}, {}".format(motion, lines[MOTION_INDEX].strip())
            TOTALS_INDEX += 1
            VOTE_START_INDEX += 1
        else:
            break

    (yes_count, no_count, other_count) = [
        int(x) for x in re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            lines[TOTALS_INDEX]).groups()
    ]
    passed = (yes_count > no_count)

    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote.add_source(url)

    for line in lines[VOTE_START_INDEX:]:
        if not line.strip():
            break

        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")

        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
            vote.yes(member)
        for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
            vote.no(member)
        for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
            vote.other(member)

    try:
        vote.validate()
    except ValueError:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect this.
        self.logger.info("Votes don't add up; looking for additional ones")
        for line in lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            for member in re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                     line):
                vote.other(member)
        vote.validate()

    bill.add_vote(vote)
def scrape(self, chamber, session):
    chamber_name = 'house' if chamber == 'lower' else 'senate'
    session_slug = {
        '62': '62-2011',
        '63': '63-2013',
        '64': '64-2015',
        '65': '65-2017',
    }[session]

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug, chamber_name)
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:

        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib['href']

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type='text')
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:

            # Ignore lines with no information
            if re.search(chamber_re, line) or \
                    re.search(date_re, line) or \
                    re.search(page_re, line) or \
                    line.strip() == "":
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion)

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if line.strip().endswith("VOTING") or \
                        line.strip().endswith("VOTING."):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" \
                        in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [x.strip() for x in who.split(';')
                           if x.strip() != ""]
                    results[cur_vote] = who
                    name_may_be_continued = False if line.endswith(";") \
                        else True

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif cur_vote is not None and \
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line) and \
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed']):
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                elif cur_vote is not None and \
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed']):
                    who = [x.strip() for x in line.split(";")
                           if x.strip() != ""]
                    if name_may_be_continued:
                        results[cur_vote][-1] = results[cur_vote][-1] + \
                            " " + who.pop(0)
                    name_may_be_continued = False if line.endswith(";") \
                        else True
                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif any(x in line.lower() for x in
                         ['passed', 'adopted', 'sustained',
                          'prevailed', 'lost', 'failed']):
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if improper information is found
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                    if bills == [] or cur_motion.strip() == "":
                        results = {}
                        cur_motion = ""
                        self.warning(
                            "No motion or bill name found: " +
                            "motion name: " + cur_motion + "; " +
                            "decision text: " + line.strip())
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " +
                            cur_motion)

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other"
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = filter(
                                lambda a: a != "", results[key])
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = \
                        len(res['yes']), \
                        len(res['no']), \
                        len(res['other'])
                    chambers = {"H": "lower", "S": "upper", "J": "joint"}

                    # Almost all of the time, a vote only applies to one
                    # bill, and this loop will only run once.
                    # Some exceptions exist.
                    for bill in bills:

                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        try:
                            bc = chambers[cur_bill_id[0]]
                        except KeyError:
                            bc = 'other'

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            # NOTE: the original used `2 / 3`, which is 0
                            # under Python 2 integer division; float math
                            # is needed for the supermajority check
                            VETO_SUPERMAJORITY = 2.0 / 3.0
                            passed = (float(yes) / (yes + no) >
                                      VETO_SUPERMAJORITY)
                        else:
                            passed = (yes > no)

                        # Create a Vote object based on the scraped
                        # information
                        vote = Vote(chamber, cur_date, cur_motion, passed,
                                    yes, no, other,
                                    session=session,
                                    bill_id=cur_bill_id,
                                    bill_chamber=bc)

                        vote.add_source(pdf_url)
                        vote.add_source(url)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            obj = getattr(vote, key)
                            for person in res[key]:
                                obj(person)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys.keys():
                            # Need to search for the singular, not plural,
                            # in the text so it can find, for example,
                            # " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(
                                category_name[:-1])
                            motion_count = int(
                                re.findall(vote_re, cur_motion)[0])
                            vote_count = vote[keys[category_name] +
                                              "_count"]

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(
                                        motion_count) +
                                    "differed from roll call counts ({}) ".
                                    format(vote_count) +
                                    "for {0} on {1}".format(
                                        category_name, cur_bill_id))
                                vote[keys[category_name] + "_count"] = \
                                    motion_count

                        self.save_vote(vote)

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def scrape_votes(self, url, motion, date, chamber):
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []

    # point at array to add names to
    cur_array = None

    precursors = (
        ('Yeas--', yes_votes),
        ('Nays--', no_votes),
        ('Absent or those not voting--', other_votes),
        ('Absent and those not voting--', other_votes),
        ('Not Voting--', other_votes),
        ('Voting Present--', other_votes),
        ('Present--', other_votes),
        ('DISCLAIMER', None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.split('\n'))

    for line in lines:

        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line:
                cur_array = arr
                line = line.replace(pc, '')

        # split names
        for name in line.split(','):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if 'None.' in name:
                cur_array = None
            match = re.match(r'(.+?)\. Total--.*', name)
            if match:
                cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            for junk in ('on final passage', 'Necessary', 'who would have',
                         'being a tie', 'therefore', 'Vacancies', 'a pair',
                         'Total-', 'ATTORNEY', 'on final passage',
                         'SPEAKER', 'BOARD', 'TREASURER', 'GOVERNOR',
                         'ARCHIVES', 'SECRETARY'):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == '.':
                    name = name[:-1]
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    other_count = len(other_votes)
    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count)
    vote['yes_votes'] = yes_votes
    vote['no_votes'] = no_votes
    vote['other_votes'] = other_votes
    return vote
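# `_combine_lines` is a module-level helper assumed by scrape_votes; a
# minimal sketch of the behavior its comment describes (rejoining wrapped
# lines that don't end in punctuation) -- hypothetical, not the original:
def _combine_lines(lines):
    combined = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if combined and not combined[-1].endswith(('.', ';', ':')):
            # previous line didn't terminate; this one continues it
            combined[-1] += ' ' + line
        else:
            combined.append(line)
    return combined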