def scrape_senate_vote(self, bill, url, date):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # setting 'fail' for now.
        result='fail',
        classification='passage',
        bill=bill,
    )
    vote.add_source(url)

    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)

    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    data = list(filter(None, data))
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {'yes': 0, 'no': 0, 'other': 0}

    while True:
        if not data:
            break
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        'pass'
        if vote_count['yes'] > (vote_count['no'] + vote_count['other'])
        else 'fail'
    )
    yield vote
def scrape_senate_vote(self, bill, url, date):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now.
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url

    text = convert_pdf(filename, "text").decode("utf-8")
    os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
    data = list(filter(None, data))
    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}

    while True:
        if not data:
            break
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), "other")
        values = data.pop()
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        "pass"
        if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
        else "fail"
    )
    yield vote
def get_house_pdf(self, vurl):
    """ cache house PDFs since they are done by year """
    if vurl not in self.house_pdf_cache:
        (path, resp) = self.urlretrieve(vurl)
        pdflines = convert_pdf(path, 'text')
        os.remove(path)
        self.house_pdf_cache[vurl] = pdflines.decode('utf-8').replace(u'\u2019', "'")
    return self.house_pdf_cache[vurl]
def fetch_pdf_lines(self, href):
    # download the file
    try:
        fname, resp = self.urlretrieve(href)
        pdflines = [
            line.decode('utf-8')
            for line in convert_pdf(fname, 'text').splitlines()
        ]
        os.remove(fname)
        return pdflines
    except scrapelib.HTTPError as e:
        assert '404' in e.args[0], "File not found: {}".format(e)
        self.warning("404 error for vote; skipping vote")
        return False
def _load_emails_from_directory_pdf(self):
    """
    Load the house PDF directory and convert to LXML - needed to find
    email addresses which are gone from the website.
    """
    with tempfile.NamedTemporaryFile() as temp:
        self.scraper.urlretrieve(self.directory_pdf_url, temp.name)
        directory = lxml.etree.fromstring(convert_pdf(temp.name, 'xml'))

    # pull out member email addresses from the XML salad produced
    # above - there's no obvious way to match these to names, but
    # fortunately they have names in them
    return set(directory.xpath(
        '//text[contains(text(), "@myfloridahouse.gov")]/text()'))
def scrape_senate_vote(self, vote, vurl):
    # download file to server
    (path, resp) = self.urlretrieve(vurl)
    pdflines = convert_pdf(path, 'text')
    os.remove(path)

    # for y, n
    mode = None
    lines = pdflines.splitlines()

    # handle individual lines in pdf to id legislator votes
    for line in lines:
        line = line.strip()
        line = line.decode('utf-8').replace(u'\u2212', '-')
        if line == '':
            continue
        # change mode accordingly
        elif line.startswith('YEAS'):
            mode = 'y'
        elif line.startswith('NAYS'):
            mode = 'n'
        elif line.startswith('ABSENT OR'):
            mode = 'o'
        # else parse line with names
        else:
            nameline = line.split(' ')
            for raw_name in nameline:
                raw_name = raw_name.strip()
                if raw_name == '':
                    continue
                # handles vote count lines
                cut_name = raw_name.split('-')
                clean_name = ''
                if cut_name[len(cut_name) - 1].strip(' .').isdigit():
                    del cut_name[-1]
                    clean_name = ''.join(cut_name)
                else:
                    clean_name = raw_name.strip()
                # update vote object with names
                if mode == 'y':
                    vote.yes(clean_name)
                elif mode == 'n':
                    vote.no(clean_name)
                elif mode == 'o':
                    vote.vote('other', clean_name)
def pdf_to_lxml(self):
    filename, resp = self.scraper.urlretrieve(self.url)
    text = convert_pdf(filename, "html")
    return lxml.html.fromstring(text)
def pdf_to_lxml(self, filename, type='html'):
    text = convert_pdf(filename, type)
    return lxml.html.fromstring(text)
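# The scrapers in this file share one pattern: download a PDF, run convert_pdf()
# on the local path, parse the result, and delete the temporary file. A minimal
# standalone sketch of that flow, assuming only a scraper object that exposes
# urlretrieve() as the methods above do; the function name is illustrative, not
# part of the real codebase:
def pdf_text_lines(scraper, url):
    path, _resp = scraper.urlretrieve(url)        # download to a local temp file
    try:
        raw = convert_pdf(path, 'text')           # extracted text, as bytes
    finally:
        os.remove(path)                           # always clean up the download
    return raw.decode('utf-8').splitlines()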
def scrape_upper_committee(self, url):
    filename, resp = self.urlretrieve(url)
    lines = convert_pdf(filename, 'text').split('\n')

    comm = None
    comm_name = ''
    title = ''
    MINIMUM_NAME_LENGTH = len('Hon _ _')

    for line in (x.decode('utf8') for x in lines):
        line = line.strip()
        if not line.strip():
            continue

        if (line.startswith('Comisi') or
                line.startswith('COMISIONES') or
                line.startswith('SECRETAR')):
            if comm:
                # Joint committee rosters are not complete, unfortunately
                if "Conjunta" not in comm_name:
                    yield comm
                comm = None
                comm_name = ''

            if not (line.startswith('COMISIONES') or
                    line.startswith('SECRETAR')):
                comm_name = line

                # Remove "Committee" from committee names
                comm_name = (
                    comm_name.
                    replace(u"Comisión de ", "").
                    replace(u"Comisión Especial para el Estudio de ", "").
                    replace(u"Comisión Especial para ", "")
                )
                comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
                comm_name = comm_name[0].upper() + comm_name[1:]

        # Committee president is always listed right after committee name
        elif (not comm and comm_name and
                not re.search(r'^(?:Co.)?President', line) and
                not line.startswith('Miembr')):
            comm_name = comm_name + " " + line

        elif (not comm and
                (re.search(r'^(?:Co.)?President', line) or
                    line.startswith('Miembr')) and
                len(line) > len('Presidente ') + MINIMUM_NAME_LENGTH):
            comm = Organization(
                comm_name, chamber='upper', classification='committee')
            comm.add_source(url)

        if comm:
            assert re.search(r'(?u)Hon\.?\s\w', line)
            (temp_title, name) = line.split("Hon")
            name = name.strip(". ")
            if temp_title.strip():
                title = temp_title

                # Translate titles to English for parity with other states
                if "President" in title:
                    title = 'chairman'
                elif title.startswith("Vicepresident"):
                    title = 'vicechairman'
                elif title.startswith("Secretari"):
                    title = 'secretary'
                elif "Miembr" in title:
                    title = 'member'
                else:
                    raise AssertionError(
                        "Unknown member type: {}".format(title))

            # Many of the ex-officio members have appended titles
            if ", " in name:
                name = name.split(", ")[0]
            if name.lower() != 'vacante':
                comm.add_member(name, title)

    if comm and "Conjunta" not in comm_name:
        yield comm

    os.remove(filename)
def scrape_vote(self, url, session):
    fname, _ = self.urlretrieve(url)
    text = convert_pdf(fname, type="text").decode()
    lines = text.splitlines()

    chamber = "upper" if "senate" in url else "lower"
    if "Maryland" not in text:
        self.warning(f"empty vote from {url}")
        return
    date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

    section = "preamble"
    motion = None
    bill_id = None
    how = None
    voters = defaultdict(list)

    for line in lines:
        if section == "preamble":
            possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
            if possible_bill_id:
                bill_id = possible_bill_id[0]

            # preamble has metadata, then motion, then counts. our process then
            # is to store the last line as the motion, but if the last line
            # looks like a continuation, append it to the prior line
            line = line.strip()
            counts = re.findall(
                r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                line,
            )
            if counts:
                yes_count, no_count, nv_count, excused_count, absent_count = counts[0]
                yes_count = int(yes_count)
                no_count = int(no_count)
                nv_count = int(nv_count)
                excused_count = int(excused_count)
                absent_count = int(absent_count)
                section = "votes"
            elif line and line != "(Const)":
                # questions seem to be split across two lines
                if line.endswith("?"):
                    motion = motion + " " + line
                else:
                    motion = line
        elif section == "votes":
            if line.startswith("Voting Yea"):
                how = "yes"
            elif line.startswith("Voting Nay"):
                how = "no"
            elif line.startswith("Not Voting"):
                how = "not voting"
            elif line.startswith("Excused from Voting"):
                how = "excused"
            elif line.startswith("Excused (Absent)"):
                how = "absent"
            elif how:
                names = re.split(r"\s{2,}", line)
                voters[how].extend(names)

    if not bill_id and not motion:
        return
    elif bill_id and not motion:
        self.warning(f"got {bill_id} but no motion, not registering as a vote")
        return
    elif motion and not bill_id:
        self.warning(f"got {motion} but no bill_id, not registering as a vote")
        return

    # bleh - result not indicated anywhere
    result = "pass" if yes_count > no_count else "fail"
    bill_chamber = "upper" if bill_id.startswith("S") else "lower"
    date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        result=result,
        classification="passage",
        motion_text=motion,
        legislative_session=session,
        bill=bill_id,
        bill_chamber=bill_chamber,
    )
    # URL includes sequence ID, will be unique
    vote.pupa_id = url
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    vote.set_count("excused", excused_count)
    vote.set_count("absent", absent_count)

    for how, names in voters.items():
        for name in names:
            name = name.strip().replace("*", "")
            if name and "COPY" not in name and "Indicates Vote Change" not in name:
                vote.vote(how, name)

    check_counts(vote, raise_error=True)
    return vote
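# A self-contained check of the tally regex used in scrape_vote() above, applied
# to a fabricated sample line (the counts are illustrative, not from a real
# Maryland journal):
import re

sample = "46 Yeas    0 Nays    0 Not Voting    1 Excused    0 Absent"
counts = re.findall(
    r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
    sample,
)
assert counts == [("46", "0", "0", "1", "0")]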
def scrape_chamber(self, chamber, session):
    chamber_name = 'house' if chamber == 'lower' else 'senate'
    session_slug = {
        '62': '62-2011',
        '63': '63-2013',
        '64': '64-2015',
        '65': '65-2017',
        '66': '66-2019',
    }[session]

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug, chamber_name)
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")

    for pdf in pdfs:
        # Initialize information about the vote parsing
        results = {}
        in_motion = False
        cur_vote = None
        in_vote = False
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib['href']
        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type='text').decode('utf-8')
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:
            # Ignore lines with no information
            if (re.search(chamber_re, line) or
                    re.search(date_re, line) or
                    re.search(page_re, line) or
                    line.strip() == ""):
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion
                )

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if (line.strip().endswith("VOTING") or
                        line.strip().endswith("VOTING.")):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [x.strip() for x in who.split(';') if x.strip() != ""]
                    results[cur_vote] = who
                    name_may_be_continued = False if line.endswith(";") else True

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif (cur_vote is not None and
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line) and
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed'])):
                    bills.extend(re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                elif (cur_vote is not None and
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed'])):
                    who = [x.strip() for x in line.split(";") if x.strip() != ""]
                    if name_may_be_continued:
                        results[cur_vote][-1] = (
                            results[cur_vote][-1] + " " + who.pop(0))
                    name_may_be_continued = False if line.endswith(";") else True
                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif any(x in line.lower() for x in
                         ['passed', 'adopted', 'sustained',
                          'prevailed', 'lost', 'failed']):
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if improper information is found
                    bills.extend(re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                    if bills == [] or cur_motion.strip() == "":
                        self.warning(
                            "No motion or bill name found: " +
                            "motion name: " + cur_motion + "; " +
                            "decision text: " + line.strip()
                        )
                        results = {}
                        cur_motion = ""
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " + cur_motion
                        )

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other",
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = results[key]
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = (
                        len(res['yes']), len(res['no']), len(res['other']))
                    chambers = {
                        "H": "lower",
                        "S": "upper",
                        "J": "legislature",
                    }

                    # Almost all of the time, a vote only applies to one bill,
                    # and this loop will only be run once.
                    # Some exceptions exist.
                    for bill in bills:
                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        try:
                            bc = chambers[cur_bill_id[0]]
                        except KeyError:
                            bc = 'other'

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower():
                            VETO_SUPERMAJORITY = 2 / 3
                            passed = (yes / (yes + no) > VETO_SUPERMAJORITY)
                        else:
                            passed = (yes > no)

                        # Create a Vote object based on the scraped information
                        vote = Vote(
                            chamber=chamber,
                            start_date=cur_date.strftime('%Y-%m-%d'),
                            motion_text=cur_motion,
                            result='pass' if passed else 'fail',
                            legislative_session=session,
                            classification='passage',
                            bill=cur_bill_id,
                            bill_chamber=bc,
                        )
                        vote.add_source(pdf_url)
                        vote.add_source(url)
                        vote.set_count('yes', yes)
                        vote.set_count('no', no)
                        vote.set_count('other', other)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            for voter in res[key]:
                                vote.vote(key, voter)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys.keys():
                            # Need to search for the singular, not plural, in
                            # the text so it can find, for example, " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(category_name[:-1])
                            motion_count = int(re.findall(vote_re, cur_motion)[0])

                            for item in vote.counts:
                                if item['option'] == keys[category_name]:
                                    vote_count = item['value']

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(motion_count) +
                                    "differed from roll call counts ({}) ".format(vote_count) +
                                    "for {0} on {1}".format(category_name, cur_bill_id)
                                )
                                for item in vote.counts:
                                    if item['option'] == keys[category_name]:
                                        vote_count = motion_count

                        yield vote

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def scrape_house_vote(self, bill, url):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, "text")
    os.remove(filename)

    lines = text.splitlines()

    vote_type = None
    votes = collections.defaultdict(list)
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode("utf-8")

        match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        match = re.match(r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
        if match:
            motion = (lines[idx - 2].strip()).decode("utf-8")
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [int(g) for g in match.groups()]

            exc_match = re.search(r"EXCUSED: (\d+)", line)
            if exc_match:
                other_count += int(exc_match.group(1))

            if line.endswith("ADOPTED") or line.endswith("PASSED"):
                passed = True
            else:
                passed = False
            continue

        match = re.match(r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
        if match:
            vote_type = {
                "YEAS": "yes",
                "NAYS": "no",
                "NOT VOTING": "other",
                "EXCUSED": "other",
                "PAIRED": "paired",
            }[match.group(1)]
            continue

        if vote_type == "paired":
            for part in line.split(" "):
                part = part.strip()
                if not part:
                    continue
                name, pair_type = re.match(r"([^\(]+)\((YEA|NAY)\)", line).groups()
                name = name.strip()
                if pair_type == "YEA":
                    votes["yes"].append(name)
                elif pair_type == "NAY":
                    votes["no"].append(name)
        elif vote_type:
            for name in line.split(" "):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = VoteEvent(
            chamber="lower",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)

        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def scrape_house_vote(self, bill, url):
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, 'text')
    os.remove(filename)

    lines = text.splitlines()

    vote_type = None
    votes = collections.defaultdict(list)
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode('utf-8')

        match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        match = re.match(r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
        if match:
            motion = (lines[idx - 2].strip()).decode('utf-8')
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [int(g) for g in match.groups()]

            exc_match = re.search(r'EXCUSED: (\d+)', line)
            if exc_match:
                other_count += int(exc_match.group(1))

            if line.endswith('ADOPTED') or line.endswith('PASSED'):
                passed = True
            else:
                passed = False
            continue

        match = re.match(r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
        if match:
            vote_type = {
                'YEAS': 'yes',
                'NAYS': 'no',
                'NOT VOTING': 'other',
                'EXCUSED': 'other',
                'PAIRED': 'paired',
            }[match.group(1)]
            continue

        if vote_type == 'paired':
            for part in line.split(' '):
                part = part.strip()
                if not part:
                    continue
                name, pair_type = re.match(r'([^\(]+)\((YEA|NAY)\)', line).groups()
                name = name.strip()
                if pair_type == 'YEA':
                    votes['yes'].append(name)
                elif pair_type == 'NAY':
                    votes['no'].append(name)
        elif vote_type:
            for name in line.split(' '):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = Vote(
            chamber='lower',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result='pass' if passed else 'fail',
            classification='passage',
            bill=bill,
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)

        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
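# A quick standalone check of the roll-call header regex used by the two
# scrape_house_vote() variants above, run on a fabricated sample line (the
# counts and the trailing "ADOPTED" marker are illustrative only):
import re

header = "    YEAS: 96    NAYS: 2    NOT VOTING: 1    EXCUSED: 1    ADOPTED"
m = re.match(r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", header)
assert m is not None and [int(g) for g in m.groups()] == [96, 2, 1]
assert header.endswith("ADOPTED")  # this suffix is what marks the motion as passed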