def parse_vote(self, bill, action, act_chamber, act_date, url,
               re_vote_text=re.compile(
                   r'The question (?:being|to be reconsidered):\s*"(.*?\?)"',
                   re.S),
               re_header=re.compile(
                   r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')):
    """Parse roll-call votes out of a journal page and attach them to *bill*.

    The page's second <pre> element holds one or more vote reports, each
    introduced by a "The question being/to be reconsidered: ..." sentence.
    The compiled-regex defaults are an intentional hoist so the patterns are
    built once at import time, not per call.

    :param bill: Bill object to receive the parsed Vote objects
    :param action: action text (unused here; kept for caller compatibility)
    :param act_chamber: chamber of the action ('upper'/'lower')
    :param act_date: date of the action
    :param url: journal page URL to fetch
    """
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    # Find all chunks of text representing voting reports.
    votes_text = doc.xpath('//pre')[1].text_content()
    votes_text = re_vote_text.split(votes_text)
    # After split: odd indices are captured motions, even indices (from 2)
    # are the report bodies that follow each motion.
    votes_data = zip(votes_text[1::2], votes_text[2::2])

    # Process each.
    for motion, text in votes_data:
        yes = no = other = 0
        # Tally lines look like "YEAS:  12  NAYS:  3 ..."; capture the
        # leading letter and the count.
        tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
        for vtype, vcount in tally:
            vcount = int(vcount) if vcount != '-' else 0
            if vtype == 'Y':
                yes = vcount
            elif vtype == 'N':
                no = vcount
            else:
                # Excused and Absent both count as "other".
                other += vcount

        vote = Vote(act_chamber, act_date, motion, yes > no, yes, no, other)

        # In lengthy documents, the "header" can be repeated in the middle
        # of content. This regex gets rid of it.
        vote_lines = re_header.sub('', text)
        vote_lines = vote_lines.split('\r\n')

        vote_type = None
        for vote_list in vote_lines:
            if vote_list.startswith('Yeas: '):
                vote_list, vote_type = vote_list[6:], vote.yes
            elif vote_list.startswith('Nays: '):
                vote_list, vote_type = vote_list[6:], vote.no
            elif vote_list.startswith('Excused: '):
                vote_list, vote_type = vote_list[9:], vote.other
            elif vote_list.startswith('Absent: '):
                # BUGFIX: 'Absent: ' is 8 characters; the previous [9:]
                # slice clipped the first letter of the first absent name.
                vote_list, vote_type = vote_list[8:], vote.other
            elif vote_list.strip() == '':
                vote_type = None
            if vote_type:
                for name in vote_list.split(','):
                    name = name.strip()
                    if name:
                        vote_type(name)

        vote.add_source(url)
        bill.add_vote(vote)
def scrape_house(self, session):
    """Scrape roll-call votes from the House journal PDFs for *session*.

    Walks each linked journal PDF, converts it to text, and runs a small
    state machine over the lines: track the current bill id and question,
    pick up a totals line (entering vote mode), then collect per-member
    votes until a non-vote line ends the roll call and the Vote is saved.
    """
    url = journals % (session, 'House')
    page = self.lxmlize(url)
    hrefs = page.xpath("//font//a")
    for href in hrefs:
        (path, response) = self.urlretrieve(href.attrib['href'])
        data = convert_pdf(path, type='text')
        # Per-document parser state.
        in_vote = False
        cur_vote = {}
        known_date = None
        cur_vote_count = None
        in_question = False
        cur_question = None
        cur_bill_id = None
        for line in data.split("\n"):
            if known_date is None:
                dt = date_re.findall(line)
                if dt != []:
                    dt, dow = dt[0]
                    known_date = datetime.datetime.strptime(
                        dt, "%A, %B %d, %Y")
            # Lines that don't begin with a line number are "non-standard"
            # and keep their full text below.
            non_std = False
            if re.match("(\s+)?\d+.*", line) is None:
                non_std = True
            l = line.lower().strip()
            skip = False
            # Boilerplate lines we never want to parse.
            blacklist = [
                "house", "page", "general assembly", "state of colorado",
                "session", "legislative day"
            ]
            for thing in blacklist:
                if thing in l:
                    skip = True
            if skip:
                continue
            found = re.findall(
                "(?P<bill_id>(H|S|SJ|HJ)(B|M|R)\d{2}-\d{4})", line)
            if found != []:
                found = found[0]
                cur_bill_id, chamber, typ = found
            try:
                # Standard lines start with a line number; strip it off.
                if not non_std:
                    _, line = line.strip().split(" ", 1)
                    line = line.strip()
            except ValueError:
                in_vote = False
                in_question = False
                continue
            if in_question:
                cur_question += " " + line
                continue
            if ("The question being" in line) or \
               ("On motion of" in line) or \
               ("the following" in line) or \
               ("moved that the" in line):
                cur_question = line
                in_question = True
            if in_vote:
                if line == "":
                    in_vote = False
                    continue
                # Lines mentioning co-sponsors or the speaker contain names
                # that are not votes; ignore any matches on them.
                likely_garbage = False
                if "co-sponsor" in line.lower():
                    likely_garbage = True
                if 'the speaker' in line.lower():
                    likely_garbage = True
                votes = []
                votes = re.findall(votes_re, line)
                if likely_garbage:
                    votes = []
                for person, _, v in votes:
                    cur_vote[person] = v
                if votes == []:
                    in_vote = False
                    # save vote
                    yes, no, other = cur_vote_count
                    if cur_bill_id is None:
                        continue
                    bc = {
                        "H": "lower",
                        "S": "upper",
                        "J": "joint"
                    }[cur_bill_id[0].upper()]
                    # BUGFIX: this is the *House* journal scraper, so the
                    # vote chamber is 'lower' (it was hard-coded 'upper',
                    # which is correct only for the Senate counterpart).
                    vote = Vote('lower', known_date, cur_question,
                                (yes > no), yes, no, other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)
                    vote.add_source(href.attrib['href'])
                    vote.add_source(url)
                    for person in cur_vote:
                        if person is None:
                            continue
                        vot = cur_vote[person]
                        if vot == 'Y':
                            vote.yes(person)
                        elif vot == 'N':
                            vote.no(person)
                        elif vot == 'E' or vot == '-':
                            vote.other(person)
                    self.save_vote(vote)
                    # Reset per-vote state for the next roll call.
                    cur_vote = {}
                    in_question = False
                    cur_question = None
                    in_vote = False
                    cur_vote_count = None
                continue
            # Not in a vote: look for a totals summary line, which starts
            # the member-by-member roll-call section.
            summ = vote_re.findall(line)
            if summ == []:
                continue
            summ = summ[0]
            yes, no, exc, ab = summ
            yes, no, exc, ab = \
                int(yes), int(no), int(exc), int(ab)
            other = exc + ab
            cur_vote_count = (yes, no, other)
            in_vote = True
            continue
        os.unlink(path)
def parse_senate_vote(self, url):
    """ senate PDFs -> garbled text -> good text -> Vote

    Downloads a senate vote PDF, converts it to cleaned-up text, and walks
    the lines with an `in_votes` flag as a two-state machine: before the
    column header is seen we look for the date and PASSED marker; after it,
    every line is split on the table's column-separator character into two
    five-field member records plus a TOTALS row.

    Returns the populated Vote, or None if no date was ever parsed.
    """
    # Date is unknown until parsed from the text; '?' is a placeholder.
    vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0)
    vote.add_source(url)
    fname, resp = self.urlretrieve(url)
    # this gives us the cleaned up text
    sv_text = convert_sv_text(convert_pdf(fname, 'text'))
    os.remove(fname)
    in_votes = False
    # `flag` is the character the PDF uses as the table's column separator;
    # it is captured from the first character of the header line.
    flag = None
    # Name fixups: the PDF drops the apostrophe from O'NEILL.
    overrides = {"ONEILL": "O'NEILL"}
    # Per-document overrides for members whose mark doesn't parse cleanly.
    vote_override = {
        ("SB0112SVOTE.PDF", "RYAN"): vote.other,  # Recused
        ("HB0144SVOTE.PDF", "SOULES"): vote.other,  # Recused
        ("HJR15SVOTE.PDF", "KELLER"): vote.other,  # Recused
    }
    # use in_votes as a sort of state machine
    for line in sv_text:
        # not 'in_votes', get date or passage
        if "bT" in line:
            # Whatever generates this text renders the cross
            # in the table as a bT
            continue
        # GARBAGE_SPECIAL = ["'", "%", "$", "&"]
        # for x in GARBAGE_SPECIAL:
        #     for y in [" {} ", "{} ", " {}"]:
        #         line = line.replace(y.format(x), " ")
        if not in_votes:
            dmatch = re.search('DATE: (\d{2}-\d{2}-\d{2})', line)
            if dmatch:
                date = dmatch.groups()[0]
                vote['date'] = datetime.strptime(date, '%m-%d-%y')
            els = re.findall("YES.*NO.*ABS.*EXC", line)
            if els != []:
                # Header line found: its first char is the column separator.
                flag = line[0]
                in_votes = True
            if 'PASSED' in line:
                vote['passed'] = True
        # in_votes: totals & votes
        else:
            line = line.replace(flag, "|")
            # totals
            if 'TOTALS' in line:
                # Lt. Governor voted
                if 'GOVERNOR' in line:
                    _, name, y, n, a, e = [
                        x.strip() for x in line.split("|")
                    ][:6]
                    assert name == "LT. GOVERNOR"
                    if y == "X":
                        vote.yes(name)
                    elif n == "X":
                        vote.no(name)
                    elif a == "X" or e == "X":
                        vote.other(name)
                    else:
                        raise ValueError("Bad parse")
                # NOTE(review): `abs` shadows the builtin here; harmless in
                # this scope but worth renaming if this is ever touched.
                name, yes, no, abs, exc = [
                    x.strip() for x in line.split("|")
                ][6:-1]
                vote['yes_count'] = int(yes)
                vote['no_count'] = int(no)
                vote['other_count'] = int(abs) + int(exc)
                # no longer in votes
                in_votes = False
                continue
            # pull votes out
            # NOTE(review): `matches` is computed but never used below —
            # the split-on-separator path is what actually extracts votes.
            matches = re.match(
                ' ([A-Z,\'\-.]+)(\s+)X\s+([A-Z,\'\-.]+)(\s+)X', line)
            votes = [x.strip() for x in line.split("|")][1:-1]
            # Each line carries two member records of five columns each.
            vote1 = votes[:5]
            vote2 = votes[5:]
            for voted in [vote1, vote2]:
                # Some names split across two cells; rejoin for the lookup.
                name = "".join(voted[:2])
                if name in overrides:
                    name = overrides[name]
                    voted.pop(0)
                    voted[0] = name
                name, yes, no, abs, exc = voted
                if "District" in name:
                    continue
                if yes == "X":
                    vote.yes(name)
                elif no == "X":
                    vote.no(name)
                elif abs == "X" or exc == "X":
                    vote.other(name)
                else:
                    # No mark parsed: fall back to the per-document override
                    # table before giving up.
                    key = (os.path.basename(url), name)
                    if key in vote_override:
                        vote_override[key](name)
                    else:
                        raise ValueError("Bad parse")
    if not isinstance(vote['date'], datetime):
        # Date line was never found; caller treats None as "skip".
        return None
    return vote
def scrape_vote(self, bill, name, url):
    """Parse a single chamber vote PDF and attach the Vote to *bill*.

    *name* is the link text (e.g. "Senate Vote on ..., FINAL PASSAGE ...");
    it determines the chamber and motion. The PDF is downloaded to a temp
    file, converted to lxml, and its body text is scanned for YEAS/NAYS/
    ABSENT sections and Total-- lines.
    """
    match = re.match('^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    # Classify the motion. (Renamed from `type`, which shadowed the
    # builtin; the stored 'type' key is unchanged.)
    if motion.startswith('FINAL PASSAGE'):
        motion_type = 'passage'
    elif motion.startswith('AMENDMENT'):
        motion_type = 'amendment'
    elif 'ON 3RD READING' in motion:
        motion_type = 'reading:3'
    else:
        motion_type = 'other'

    vote = Vote(chamber, None, motion, None, None, None, None)
    vote['type'] = motion_type
    vote.add_source(url)

    (fd, temp_path) = tempfile.mkstemp()
    self.urlretrieve(url, temp_path)
    html = pdf_to_lxml(temp_path)
    os.close(fd)
    os.remove(temp_path)

    vote_type = None
    total_re = re.compile('^Total--(\d+)$')
    body = html.xpath('string(/html/body)')

    date_match = re.search('Date: (\d{1,2}/\d{1,2}/\d{4})', body)
    try:
        date = date_match.group(1)
    except AttributeError:
        self.warning("BAD VOTE: date error")
        return

    vote['date'] = dt.datetime.strptime(date, '%m/%d/%Y')

    for line in body.replace(u'\xa0', '\n').split('\n'):
        # NOTE(review): this strips *all* spaces from each line, which
        # joins multi-word names — presumably intentional for this PDF's
        # layout; confirm against a sample document.
        line = line.replace(' ', '').strip()
        if not line:
            continue

        if line in ('YEAS', 'NAYS', 'ABSENT'):
            vote_type = {
                'YEAS': 'yes',
                'NAYS': 'no',
                'ABSENT': 'other'
            }[line]
        elif line in ('Total', '--'):
            vote_type = None
        elif vote_type:
            match = total_re.match(line)
            if match:
                vote['%s_count' % vote_type] = int(match.group(1))
            elif vote_type == 'yes':
                vote.yes(line)
            elif vote_type == 'no':
                vote.no(line)
            elif vote_type == 'other':
                vote.other(line)

    # tally counts (recomputed from collected names; overrides the PDF's
    # own Total-- figures so counts always match the name lists)
    vote['yes_count'] = len(vote['yes_votes'])
    vote['no_count'] = len(vote['no_votes'])
    vote['other_count'] = len(vote['other_votes'])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    vote['passed'] = vote['yes_count'] > (vote['no_count'] +
                                          vote['other_count'])

    bill.add_vote(vote)
def scrape(self, chamber, session):
    """Scrape all bills for one chamber of a session.

    Builds the assembly/bill-list URLs from the session's start year,
    collects each bill link from the index page, then visits each bill's
    detail page for title, sponsors, actions, embedded vote tallies, and
    finally its versions page before saving.
    """
    #determining the start year of the term
    start_year = self.metadata['session_details'][session][
        'start_date'].year
    # URL building
    if chamber == 'upper':
        url_chamber_name = 'senate'
        norm_chamber_name = 'Senate'
        chamber_letter = 'S'
    else:
        url_chamber_name = 'house'
        norm_chamber_name = 'House'
        chamber_letter = 'H'
    assembly_url = urljoin(self.site_root,
                           '/assembly/%s-%s' % (session, start_year))
    chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)
    bill_list_url = assembly_url + chamber_url
    subject_url = assembly_url + '/subject-index/major-topic.html'
    # Subject index is scraped once and cached on the instance.
    if not self.subjects:
        self._scrape_subjects(subject_url)
    with self.urlopen(bill_list_url) as html:
        list_page = lxml.html.fromstring(html)
        # connects bill_num with bill details page
        bills_url_dict = {}
        #connects bill_num with bills to be accessed later.
        bills_id_dict = {}
        title = ''
        for bills in list_page.xpath('/html/body/table[3]/tr/th/a'):
            bill_num = bills.text
            # Relative href starts with './'; splice it onto the base URL.
            bill_url = bill_list_url[0:-26] + '/' + bills.attrib['href'][
                2:len(bills.attrib['href'])]
            bill_prefix, bill_type = self.bill_type_info(bill_num)
            bill_id = '%s%s %s' % (chamber_letter, bill_prefix, bill_num)
            # Title is filled in later from the detail page.
            bill = Bill(session, chamber, bill_id, title,
                        type=bill_type, subjects=self.subjects[bill_id])
            #versions
            versions_url = assembly_url + '/bill-index/bi' + bill_num + '.html'
            #sources
            bill.add_source(bill_url)
            bill.add_source(bill_list_url)
            #storing bills to be accessed
            bills_url_dict[bill_num] = bill_url
            bills_id_dict[bill_num] = bill
        #bill details page
        for bill_keys in bills_url_dict.keys():
            url = bills_url_dict[bill_keys]
            curr_bill = bills_id_dict[bill_keys]
            with self.urlopen(url) as bill_html:
                bill_page = lxml.html.fromstring(bill_html)
                for bill_info in bill_page.xpath(
                        '/html/body/table[4]/tr/td'):
                    # NOTE(review): assumes every cell has text; a bare
                    # cell (info is None) would raise here — confirm.
                    info = bill_info.text
                    #Sponsors
                    if "Introduced" in info:
                        # Fixed-offset slicing of the "Introduced by ..."
                        # sentence; `rep` is captured but unused below.
                        if ('Rep' in info) or ('Sen' in info):
                            rep = info[14:17]
                            info = info[18:len(info)]
                            sponsors = info.split(',')
                        else:
                            sponsors = [info[13:len(info)]]
                            rep = ''
                        for sponsor in sponsors:
                            # First listed sponsor is primary.
                            if sponsor == sponsors[0]:
                                sponsor_type = 'primary'
                            else:
                                sponsor_type = 'cosponsor'
                            curr_bill.add_sponsor(sponsor_type,
                                                  sponsor.strip())
                    else:
                        #title
                        title = info.strip()
                        curr_bill["title"] = title
                #actions
                # NOTE(review): last_date starts as the datetime *class*,
                # not a value; if the first action row has an empty date
                # cell this placeholder is recorded as-is — confirm.
                last_date = datetime
                actor = ''
                action_num = len(
                    bill_page.xpath('/html/body/table[5]//tr'))
                # Action rows are every other <tr> starting at index 2.
                for actions in range(2, action_num, 2):
                    path = '//table[5]/tr[%s]/' % (actions)
                    action = bill_page.xpath(path + 'td[4]')[0].text
                    raw_actor = bill_page.xpath(path + 'td[2]')[0].text
                    if not raw_actor:
                        # Blank actor cell: keep the previous actor.
                        pass
                    elif raw_actor.strip() == 'Senate':
                        actor = 'upper'
                    else:
                        actor = 'lower'
                    action_date = bill_page.xpath(
                        path + 'th')[0].text.strip() + '/' + str(start_year)
                    if action_date == ('/' + str(start_year)):
                        # Empty date cell: reuse the previous row's date.
                        action_date = last_date
                    else:
                        action_date = datetime.strptime(
                            action_date, '%m/%d/%Y')
                        last_date = action_date
                    atype = categorize_action(action)
                    curr_bill.add_action(actor, action, action_date, atype)
                    #votes
                    if "yeas" in action:
                        # Counts are embedded in the action text as
                        # "... yeas N, nays M ...".
                        yes_count = int(
                            action.split()[action.split().index('yeas') + 1])
                        no_count = action.split()[
                            action.split().index('nays') + 1]
                        # Strip a trailing comma from the nays token.
                        no_count = int(
                            no_count[0:-1]) if ',' in no_count else int(
                                no_count)
                        passed = True if yes_count > no_count else False
                        vote_type = self.vote_type_info(action)
                        vote = Vote(actor, action_date, action, passed,
                                    yes_count, no_count, 0, vote_type)
                        curr_bill.add_vote(vote)
                    #document within actions
                    # NOTE(review): doc_url is computed but never attached
                    # to the bill — confirm whether this is dead code.
                    doc_num_pos = len(bill_page.xpath(path + 'td'))
                    if doc_num_pos > 5:
                        doc_name = bill_page.xpath(
                            path + 'td[6]/a')[0].attrib['href']
                        doc_url = url[0:url.find('bill')].replace(
                            '///', '/') + doc_name[3:len(doc_name)]
                #versions
                bill_num = curr_bill['bill_id'].split()[1]
                versions_url = assembly_url + '/bill-index/bi' + bill_num + '.html'
                curr_bill.add_source(versions_url)
                with self.urlopen(versions_url) as versions_page:
                    versions_page = lxml.html.fromstring(versions_page)
                    # NOTE(review): version_count is unused.
                    version_count = 2
                    for versions in versions_page.xpath('//table[4]/tr'):
                        tds = versions.xpath("./*")
                        # Skip header/short rows.
                        if len(tds) < 3:
                            continue
                        link = tds[2]
                        link = link.xpath("./a")[0]
                        link_name = link.text_content().strip()
                        link = "%s/%s" % (assembly_url + '/bill-index',
                                          link.attrib['href'])
                        curr_bill.add_version(link_name, link,
                                              mimetype='application/pdf')
                self.save_bill(curr_bill)
def scrape_votes(self, session):
    """Scrape NH roll-call votes from the two pipe-delimited data files.

    RollCallSummary.txt yields one Vote per (body, vote number); the
    per-legislator records in RollCallHistory.txt are then matched back
    onto those Vote objects by the same key.
    """
    votes = {}
    # Holds the fields of a truncated record so the next physical line
    # (its continuation) can be glued back on.
    last_line = []

    lines = self.get(
        'http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt'
    ).content.splitlines()
    for line in lines:
        if len(line) < 2:
            continue
        if line.strip() == "":
            continue
        line = line.split('|')
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # BUGFIX: the continuation's own fields must be appended;
                # assigning just `last_line` dropped them and re-parsed the
                # truncated prefix as a full record.
                line = last_line + line[1:]
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
                # ROBUSTNESS: a short, unmergeable record cannot be field-
                # indexed below; wait for its continuation instead.
                continue
        session_yr = line[0].replace('\xef\xbb\xbf', '')
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        present = int(line[7])
        absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(actor, time, motion, passed, yeas, nays,
                        other_count=0)
            votes[body + vote_num] = vote
            self.bills_by_id[bill_id].add_vote(vote)

    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt'
    ).content.splitlines():
        if len(line) < 2:
            continue
        # 2016|H|2|330795||Yea|
        # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = \
            line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]['name']
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue

            #code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body + v_num].yes(leg)
            elif vote == 'Nay':
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].other(leg)
                votes[body + v_num]['other_count'] += 1
def asvote(self):
    """Convert this record into a Vote object.

    Builds the Vote from this object's dict form, copies over the three
    roll lists, attaches the source URL, and returns it.
    """
    result = Vote(**self.asdict())
    for roll_key in ('yes_votes', 'no_votes', 'other_votes'):
        result[roll_key] = getattr(self, roll_key)()
    result.add_source(self.url)
    return result
def scrape_journal(self, url, chamber, session, date):
    """Scrape votes from one journal PDF.

    Converts the PDF to XML, then streams its lines looking for "Shall ..."
    question sentences. For each one it accumulates text until a
    parenthesized bill id appears, cleans the motion text, parses the
    member votes that follow, and saves a Vote.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)
    xml = convert_pdf(filename)
    try:
        et = lxml.etree.fromstring(xml)
    except lxml.etree.XMLSyntaxError:
        self.logger.warning('Skipping invalid pdf: %r' % filename)
        return

    # Single shared iterator: the bill-id scan and parse_votes below
    # consume from the same stream, so position carries across sections.
    lines = self._journal_lines(et)
    while True:
        try:
            line = next(lines)
        except StopIteration:
            break

        text = gettext(line)

        # Go through with vote parse if any of
        # these conditions match.
        if 'Shall' in text:
            if 'bill pass?' in text:
                pass
            elif 'resolution' in text:
                pass
            elif 'amendment' in text:
                pass
            else:
                continue
        else:
            continue

        # Get the bill_id.
        bill_id = None
        for line in lines:
            text += gettext(line)
            m = re.search(r'\(\s*([A-Z\.]+\s+\d+)\s*\)', text)
            if m:
                bill_id = m.group(1)
                break

        # Normalize the accumulated motion text: collapse whitespace, drop
        # the trailing "(... bill id ...)" clause and stray quote marks.
        motion = text.strip()
        motion = re.sub(r'\s+', ' ', motion)
        motion, _ = motion.rsplit('(', 1)
        motion = motion.replace('"', '')
        motion = motion.replace(u'\u201c', '')
        motion = motion.replace(u'\u201d', '')
        motion = motion.replace(u' ,', ',')
        motion = motion.strip()
        # Re-space bill ids that got glued to neighboring words.
        motion = re.sub(r'[SH].\d+', lambda m: ' %s ' % m.group(), motion)
        motion = re.sub(r'On the question\s*', '', motion, flags=re.I)

        for word, letter in (('Senate', 'S'),
                             ('House', 'H'),
                             ('File', 'F')):
            # No bill id was ever found for this question: abandon the
            # whole journal (the line stream is already consumed).
            if bill_id is None:
                return
            bill_id = bill_id.replace(word, letter)

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id
        votes = self.parse_votes(lines)
        # Passed iff yes votes are at least half of all tallied counts.
        totals = filter(lambda x: isinstance(x, int), votes.values())
        passed = (1.0 * votes['yes_count'] / sum(totals)) >= 0.5
        # NOTE(review): votes is passed both as **kwargs and via update();
        # the update() appears redundant — confirm before removing.
        vote = Vote(motion=motion, passed=passed,
                    chamber=chamber, date=date,
                    session=session, bill_id=bill_id,
                    bill_chamber=bill_chamber, **votes)
        vote.update(votes)
        vote.add_source(url)
        self.save_vote(vote)
def scrape_votes(self, url, motion, date, chamber):
    """Parse a vote PDF into a Vote object and return it.

    The PDF text lists voters in comma-separated runs introduced by
    "Yeas--", "Nays--", etc.; names are collected into the matching list
    until a 'None.' marker, a 'Total--' line, or the DISCLAIMER footer
    ends the section.
    """
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')
    os.remove(vote_pdf)

    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]

    yes_votes = []
    no_votes = []
    other_votes = []

    # point at array to add names to
    cur_array = None

    precursors = (
        ('Yeas--', yes_votes),
        ('Nays--', no_votes),
        ('Absent or those not voting--', other_votes),
        ('Absent and those not voting--', other_votes),
        ('Voting Present--', other_votes),
        ('Present--', other_votes),
        ('DISCLAIMER', None),
    )

    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.split('\n'))

    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line:
                cur_array = arr
                line = line.replace(pc, '')

        # split names
        for name in line.split(','):
            name = name.strip()

            # move on if that's all there was
            if not name:
                continue

            # None or a Total indicate the end of a section
            if 'None.' in name:
                cur_array = None
            match = re.match(r'(.+?)\. Total--.*', name)
            if match:
                # ROBUSTNESS: the section may already have been closed
                # (cur_array is None); only append when it is open.
                if cur_array is not None:
                    cur_array.append(match.groups()[0])
                cur_array = None

            # append name if it looks ok
            junk_in_name = False
            # (deduplicated: 'on final passage' was listed twice)
            for junk in ('on final passage', 'Necessary', 'who would have',
                         'being a tie', 'therefore', 'Vacancies', 'a pair',
                         'Total-', 'ATTORNEY', 'SPEAKER', 'BOARD',
                         'TREASURER', 'GOVERNOR', 'ARCHIVES', 'SECRETARY'):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == '.':
                    name = name[:-1]
                cur_array.append(name)

    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    other_count = len(other_votes)
    vote = Vote(chamber, date, motion, passed,
                yes_count, no_count, other_count)
    vote['yes_votes'] = yes_votes
    vote['no_votes'] = no_votes
    vote['other_votes'] = other_votes
    return vote
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files

    Each DBF/zip download contributes one facet: MAINBILL creates the Bill
    objects, BILLSPON adds sponsors, BILLWP adds documents/versions, the
    votes zip files add roll calls, BILLHIST adds actions, and BILLSUBJ
    adds subjects. Bills are keyed by "<billtype><number>" in bill_dict
    and saved once at the end.
    """
    #Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A'-prefixed ids are Assembly (lower house) bills.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(
        year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    #print bill_document_db[2]
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Record stores a Windows path; keep only the last two components.
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        doc_name = self._doctypes[rec['doctype']]
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        # Version-type documents become bill versions; others documents.
        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    # Chamber files are A/S per year; committee files span the biennium.
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"

        # 'C'-prefixed files carry committee votes with a different schema.
        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'

        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            # One Vote per (bill, chamber, motion); legislators accumulate.
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape all bills and resolutions for a session via the JSON API.

    Bill lists come from two "private API" endpoints; each bill's detail
    page then supplies sponsors and versions, and further JSON endpoints
    supply actions and roll-call votes (keyed by an internal bill id
    extracted from the detail page's markup).
    """
    # Regex used to strip HTML markup out of status strings.
    HTML_TAGS_RE = r'<.*?>'

    year_slug = session[5:]

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.urlopen(bills_url)
    bills = json.loads(bills_json)['data']

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.urlopen(resolutions_url)
    bills.extend(json.loads(resolutions_json)['data'])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.iteritems()}

        # Identify the bill type and chamber
        # (longest prefixes checked first so e.g. 'J.R.H.' wins over 'H.')
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info['BillNumber']))

        # Create the bill using its basic information
        bill = Bill(session=session,
                    bill_id=info['BillNumber'],
                    title=info['Title'],
                    chamber=bill_chamber,
                    type=bill_type)
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = \
            'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li')
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everyone after the "Additional Sponsors" marker is a
            # cosponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue
            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the site's "Less" expander pseudo-entry.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsor(sponsor_type, sponsor_name)

        # Capture bill text versions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a')
        for version in versions:
            bill.add_version(
                name=version.xpath('text()')[0],
                url=version.xpath('@href')[0].replace(' ', '%20'),
                mimetype='application/pdf')

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(year_slug),
                lxml.etree.tostring(doc)).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".\
                format(info['BillNumber']))
            self.save_bill(bill)
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.urlopen(actions_url)
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        # Tracks which chambers have passed the bill, to sanity-check the
        # governor's signature and duplicate-passage actions.
        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.iteritems()}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'governor'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                assert chambers_passed == set("HS")
                action_type = 'governor:signed'
            elif actor == 'lower' and \
                    action['FullStatus'] in (
                        "Passed",
                        "Read Third time and Passed",
                        "Read and Adopted in Concurrence",
                        "Read and Adopted",
                        "Adopted",
                        "Adopted in Concurrence"):
                action_type = 'bill:passed'
                assert "H" not in chambers_passed
                chambers_passed.add("H")
            elif actor == 'upper' and \
                    any(action['FullStatus'].startswith(x) for x in (
                        "Read 3rd time & passed",
                        "Read & adopted",
                        "Adopted")):
                action_type = 'bill:passed'
                assert "S" not in chambers_passed
                chambers_passed.add("S")
            else:
                action_type = 'other'

            bill.add_action(
                actor=actor,
                action=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strptime(
                    action['StatusDate'], '%m/%d/%Y'),
                type=action_type)

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        votes_json = self.urlopen(votes_url)
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = 'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                format(year_slug, roll_call_id)
            roll_call_json = self.urlopen(roll_call_url)
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_other = []
            for member in roll_call:
                # Member names arrive as "Name of District".
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_other.append(member_name)

            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif "Failed -- " in vote['FullStatus']:
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = \
                int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = \
                int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))
            if yea_count != len(roll_call_yea) or \
                    nay_count != len(roll_call_nay):
                raise AssertionError(
                    "Yea and/or nay counts incongruous:\n" +
                    "Yeas from vote text: {}\n".format(yea_count) +
                    "Yeas from number of members: {}\n".format(
                        len(roll_call_yea)) +
                    "Nays from vote text: {}\n".format(nay_count) +
                    "Nays from number of members: {}".format(
                        len(roll_call_nay)))

            vote_to_add = Vote(
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'),
                motion=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                passed=did_pass,
                yes_count=yea_count,
                no_count=nay_count,
                other_count=len(roll_call_other))
            vote_to_add.add_source(roll_call_url)

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_other:
                vote_to_add.other(member)

            bill.add_vote(vote_to_add)

        # Capture extra information
        # This is not in the OpenStates spec, but is available
        # Not yet implemented
        # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        self.save_bill(bill)
def scrape_votes(self, bill, url):
    """Scrape Oklahoma roll-call documents linked from *url* and attach
    a Vote to *bill* for each vote report found on the page.

    Each report starts with a header paragraph matching
    'OKLAHOMA HOUSE' / 'OKLAHOMA STATE SENATE'; the motion, RCS number,
    date, counts and member names are read from fixed offsets among the
    header's following sibling <p> elements.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))
    re_ns = "http://exslt.org/regular-expressions"
    # EXSLT regex test: one header <p> per roll-call report on the page.
    path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={'re': re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 13
        motion = header.xpath("string(following-sibling::p[%d])"
                              % motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        assert motion.strip(), "Motion text not found"
        # Trailing PASSED/FAILED, when present, decides `passed` up front;
        # otherwise it is derived from the counts below.
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            passed = None
        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)
        # The date is on the paragraph immediately after the RCS# line.
        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        # `seen_yes` gates the state machine: nothing is recorded until
        # the YEAS section header has been seen.
        seen_yes = False
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                # End-of-report separator.
                break
            match = re.match(
                r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)',
                line)
            if match:
                # Section header with its count; switches `vtype` so the
                # following name lines are filed under the right bucket.
                if match.group(1) == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif match.group(1) == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue  # skip these
                elif seen_yes:
                    vtype = 'other'
                if seen_yes and match.group(3).strip():
                    # Names on the same line as the section header mean the
                    # layout is off; drop the whole report.
                    self.logger.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # Name line for the current section.
                for name in line.split(' '):
                    if not name:
                        continue
                    if 'HOUSE' in name or 'SENATE ' in name:
                        continue
                    votes[vtype].append(name.strip())
        if bad_vote:
            continue
        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])
        vote = Vote(chamber, date, motion, passed,
                    counts['yes'], counts['no'], counts['other'],
                    rcs_num=rcs)
        # NOTE(review): validate() is called both before and after the
        # voter names are added — the first call may be redundant; confirm.
        vote.validate()
        vote.add_source(url)
        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            if ':' in name:
                raise Exception(name)
            vote.no(name)
        for name in votes['other']:
            vote.other(name)
        vote.validate()
        bill.add_vote(vote)
def scrape_vote(self, bill, name, url): if "VOTE/H" in url: vote_chamber = 'lower' cols = (1, 5, 9, 13) name_offset = 3 yes_offset = 0 no_offset = 1 else: vote_chamber = 'upper' cols = (1, 6) name_offset = 4 yes_offset = 1 no_offset = 2 # Connecticut's SSL is causing problems with Scrapelib, so use Requests page = requests.get(url, verify=False).text if 'BUDGET ADDRESS' in page: return page = lxml.html.fromstring(page) yes_count = page.xpath( "string(//span[contains(., 'Those voting Yea')])") yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1)) no_count = page.xpath( "string(//span[contains(., 'Those voting Nay')])") no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1)) other_count = page.xpath("string(//span[contains(., 'Those absent')])") other_count = int(re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1)) need_count = page.xpath("string(//span[contains(., 'Necessary for')])") need_count = int(re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1)) date = page.xpath("string(//span[contains(., 'Taken on')])") date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1) date = date.replace(' ', '') date = datetime.datetime.strptime(date + " " + bill['session'], "%m/%d %Y").date() vote = Vote(vote_chamber, date, name, yes_count > need_count, yes_count, no_count, other_count) vote.add_source(url) table = page.xpath("//table")[0] for row in table.xpath("tr"): for i in cols: name = row.xpath("string(td[%d])" % (i + name_offset)).strip() if not name or name == 'VACANT': continue if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)): vote.yes(name) elif "N" in row.xpath("string(td[%d])" % (i + no_offset)): vote.no(name) else: vote.other(name) bill.add_vote(vote)
def scrape(self, chamber, session):
    """Scrape North Dakota journal PDFs and save one Vote per roll call.

    Walks every journal PDF linked from the chamber's journal index page
    and runs a line-oriented state machine over the extracted text:
    a motion ("question being"), then ROLL CALL, then the per-category
    name lists, then a passed/lost line that triggers the save.
    """
    # NOTE(review): this maps chamber 'lower' -> 'senate' journal and
    # everything else -> 'house'; that looks inverted — confirm against
    # the legis.nd.gov URL scheme before relying on it.
    chamber_name = 'senate' if chamber == 'lower' else 'house'
    session_slug = {'62': '62-2011', '63': '63-2013'}[session]
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug, chamber_name)
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:
        # Per-PDF parser state.
        bill_id = None
        results = {}       # category label -> list of member names
        in_vote = False
        cur_date = None
        in_motion = False
        cur_vote = None    # current category label while inside a roll call
        in_vote = False    # NOTE(review): duplicate assignment kept as-is
        cur_motion = ""
        pdf_url = pdf.attrib['href']
        (path, response) = self.urlretrieve(pdf_url)
        data = convert_pdf(path, type='text')
        os.unlink(path)
        lines = data.splitlines()
        for line in lines:
            # First date seen in the PDF is used for every vote in it.
            date = re.findall(date_re, line)
            if date != [] and not cur_date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(
                    date, "%A, %B %d, %Y")
            if line.strip() == "":
                in_motion = False
                continue
            # A passed/lost line while collecting votes closes the roll
            # call and emits the Vote.
            if True in [x in line.lower() for x in ['passed', 'lost']] and in_vote:
                in_vote = False
                bills = re.findall(r"(?i)(H|S|J)(B|R|M) (\d+)", line)
                if bills == [] or cur_motion.strip() == "":
                    # No bill reference or no motion: discard this roll call.
                    bill_id = None
                    results = {}
                    in_vote = False
                    in_motion = False
                    cur_vote = None
                    in_vote = False
                    continue
                # NOTE(review): debug print left in (Python 2 syntax).
                print "CM: ", cur_motion
                cur_bill_id = "%s%s %s" % (bills[-1])
                keys = {
                    "YEAS": "yes",
                    "NAYS": "no",
                    "ABSENT AND NOT VOTING": "other"
                }
                res = {}
                for key in keys:
                    if key in results:
                        # py2 filter: drops empty-name entries.
                        res[keys[key]] = filter(lambda a: a != "", results[key])
                    else:
                        res[keys[key]] = []
                # results
                results = {}
                yes, no, other = len(res['yes']), len(res['no']), \
                    len(res['other'])
                chambers = {"H": "lower", "S": "upper", "J": "joint"}
                try:
                    bc = chambers[cur_bill_id[0]]
                except KeyError:
                    bc = 'other'
                vote = Vote(chamber, cur_date, cur_motion, (yes > no),
                            yes, no, other,
                            session=session, bill_id=cur_bill_id,
                            bill_chamber=bc)
                vote.add_source(pdf_url)
                vote.add_source(url)
                for key in res:
                    obj = getattr(vote, key)
                    for person in res[key]:
                        obj(person)
                self.save_vote(vote)
                # Reset state for the next roll call.
                bill_id = None
                results = {}
                in_vote = False
                in_motion = False
                cur_vote = None
                in_vote = False
                cur_motion = ""
                # print bills
                # print "VOTE TAKEN"
            if 'VOTES FOR' in line:
                in_motion = False
                in_vote = False
                continue
            # NOTE(review): 'ABSET' looks like a typo'd/garbled form of
            # 'ABSENT' in the PDF text — confirm against journal output.
            if 'ABSET' in line:
                if in_motion:
                    in_vote = True
                in_motion = False
            if ":" in line and in_vote:
                # "CATEGORY: name; name; ..." starts a new category list.
                cur_vote, who = line.split(":", 1)
                who = [x.strip() for x in who.split(';')]
                results[cur_vote] = who
                continue
            if in_vote:
                # Continuation line of the current category's name list.
                if cur_vote is None:
                    continue
                who = [x.strip() for x in line.split(";")]
                for person in who:
                    # print cur_vote
                    results[cur_vote].append(person)
                continue
            if "question being" in line:
                cur_motion = line.strip()
                in_motion = True
                continue
            if in_motion:
                # Motion text wraps across lines until a blank line.
                cur_motion += line.strip()
                continue
            if line.strip() == 'ROLL CALL':
                in_vote = True
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape one Puerto Rico bill page: title, sponsors, actions, and
    vote counts embedded in action text, then save the bill.

    Roll-call member lists are mostly images on the PR site, so only the
    counts embedded in the action string ("motion, YY-NN-AA-PP") are
    captured.
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.get(url).text
    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return
    doc = lxml.html.fromstring(html)
    # search for Titulo, accent over i messes up lxml, so use 'tulo'
    title = doc.xpath(
        u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()
    bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsor('primary', aname)
    co_authors = doc.xpath(
        u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        # co_authors[0] is the label cell; the names live in [1].
        for co_author in co_authors[1].split(','):
            bill.add_sponsor('cosponsor',
                             self.clean_name(co_author).strip())
    action_table = doc.xpath('//table')[-1]
    for row in action_table[1:]:
        tds = row.xpath('td')
        # ignore row missing date
        if len(tds) != 2:
            continue
        # NOTE(review): `date` is only assigned when the cell is non-empty,
        # so a dateless row reuses the previous row's date (and the first
        # row would leave it unbound) — confirm intended.
        if tds[0].text_content():
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()
        # Parse the text to see if it's a new version or an unrelated
        # document; if it has a hyphen assume it's a vote document.
        # Get url of action.
        action_url = tds[1].xpath('a/@href')
        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)
        # Some lower-house roll calls could be parsed, but finnicky.
        # Most roll lists are just images embedded within a document,
        # and offer no alt text to scrape.
        # Instead, just scrape the vote counts.
        vote_info = re.search(
            r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$',
            action)
        if vote_info and re.search(r'\d{1,2}', action):
            vote_name = vote_info.group(1)
            # Determine which chamber took the vote from the motion text.
            if u"Votación Final" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^\w+ por (.*?) en (.*)$', vote_name).groups()
                if "Senado" in vote_chamber:
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'
            elif "Cuerpo de Origen" in vote_name:
                vote_name = re.search(r'(?u)^Cuerpo de Origen (.*)$',
                                      vote_name).group(1)
                vote_chamber = chamber
            elif u"informe de Comisión de Conferencia" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                    vote_name).groups()
                if vote_chamber == "Senado":
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'
            elif u"Se reconsideró" in vote_name:
                # Reconsideration: reuse the chamber of the previous vote.
                if bill['votes']:
                    vote_chamber = bill['votes'][-1]['chamber']
                else:
                    vote_chamber = chamber
            else:
                raise AssertionError(
                    u"Unknown vote text found: {}".format(vote_name))
            vote_name = vote_name.title()
            yes = int(vote_info.group(2))
            no = int(vote_info.group(3))
            other = 0
            # Groups 4 and 5 are absent/abstain-style buckets; both count
            # as "other".
            if vote_info.group(4).strip():
                other += int(vote_info.group(4))
            if vote_info.group(5).strip():
                other += int(vote_info.group(5))
            vote = Vote(chamber=vote_chamber, date=date, motion=vote_name,
                        passed=(yes > no), yes_count=yes, no_count=no,
                        other_count=other)
            vote.add_source(url)
            bill.add_vote(vote)
    bill.add_source(url)
    self.save_bill(bill)
def scrape_bill_sheet(self, session, chamber):
    """Scrape the Colorado bill-index sheet for *chamber* and save each
    bill with its history, sponsors and parsed votes.

    Fixes a counting bug in the old code: ``int(result['EXC'] +
    result['ABS'])`` concatenated the two digit *strings* before
    converting (e.g. "1" + "2" -> 12 instead of 3).
    """
    sheet_url = self.get_bill_folder(session, chamber)
    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]
    # Column positions inside each index-sheet row.
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }
    with self.urlopen(sheet_url) as sheet_html:
        sheet_page = lxml.html.fromstring(sheet_html)
        bills = sheet_page.xpath('//table/tr')
        for bill in bills:
            bill_id = self.read_td(bill[index["id"]][0])
            if bill_id is None:
                # Every other entry is null for some reason
                continue
            bill_id = bill_id[:bill_id.find(".")]
            title_and_sponsor = bill[index["title_sponsor"]][0]
            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            # Sponsors are whatever is left after removing the title.
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")
            bill_history_href = CO_URL_BASE + \
                bill[index["history"]][0][0].attrib['href']
            # ^^^^^^^ We assume this is a full path to the target.
            # might want to consider some better rel-path support
            # XXX: Look at this ^
            history = self.parse_history(bill_history_href)
            b = Bill(session, bill_chamber, bill_id, bill_title)
            for action in history:
                self.add_action_to_bill(b, action)
            for sponsor in sponsors:
                b.add_sponsor("primary", sponsor)
            # Now that we have history, let's see if we can't grab some
            # votes
            bill_vote_href = self.get_vote_url(bill_id, session)
            votes = self.parse_votes(bill_vote_href)
            if votes['sanity-check'] != bill_id:
                self.warning("Scraped ID %s does not match bill ID %s" %
                             (votes['sanity-check'], bill_id))
            assert votes['sanity-check'] == bill_id
            for vote_record in votes['votes']:
                filed_votes = vote_record['votes']
                passage = vote_record['meta']
                result = vote_record['result']
                composite_time = "%s %s" % (passage['x-parent-date'],
                                            passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")
                # Committee name decides the acting chamber.
                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']
                if hasHouse and hasSenate:
                    actor = "legislature"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"
                # BUG FIX: convert EXC and ABS separately before adding;
                # the strings must not be concatenated first.
                v = Vote(actor, pydate, passage['MOTION'],
                         (result['FINAL_ACTION'] == "YES"),
                         int(result['YES']), int(result['NO']),
                         int(result['EXC']) + int(result['ABS']),
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])
                # XXX: Add more stuff to kwargs, we have a ton of data
                for who, cast in filed_votes.items():
                    if cast.lower() == "yes":
                        v.yes(who)
                    elif cast.lower() == "no":
                        v.no(who)
                    else:
                        v.other(who)
                v.add_source(bill_vote_href)
                b.add_vote(v)
            self.save_bill(b)
def parse_senate_vote(self, url): """ senate PDFs -> garbled text -> good text -> Vote """ vote = Vote('upper', '?', 'senate passage', False, 0, 0, 0) vote.add_source(url) fname, resp = self.urlretrieve(url) # this gives us the cleaned up text sv_text = convert_sv_text(convert_pdf(fname, 'text')) os.remove(fname) in_votes = False # use in_votes as a sort of state machine for line in sv_text: # not 'in_votes', get date or passage if not in_votes: dmatch = re.search('DATE:(\d{2}-\d{2}-\d{2})', line) if dmatch: date = dmatch.groups()[0] vote['date'] = datetime.strptime(date, '%m-%d-%y') if 'YES NO ABS EXC' in line: in_votes = True elif 'PASSED' in line: vote['passed'] = True # in_votes: totals & votes else: # totals if 'TOTALS' in line: # Lt. Governor voted if 'GOVERNOR' in line: name, spaces, line = re.match(' ([A-Z,.]+)(\s+)X(.*)', line).groups() if len(spaces) == 1: vote.yes(name) else: vote.no(name) _, yes, no, abs, exc = line.split() vote['yes_count'] = int(yes) vote['no_count'] = int(no) vote['other_count'] = int(abs) + int(exc) # no longer in votes in_votes = False continue # pull votes out matches = re.match(' ([A-Z,.]+)(\s+)X\s+([A-Z,.]+)(\s+)X', line).groups() name1, spaces1, name2, spaces2 = matches # vote can be determined by # of spaces if len(spaces1) == 1: vote.yes(name1) elif len(spaces1) == 2: vote.no(name1) else: vote.other(name1) if len(spaces2) == 1: vote.yes(name2) elif len(spaces2) == 2: vote.no(name2) else: vote.other(name2) return vote
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a Michigan bill from legislature.mi.gov: title, sponsors,
    subjects, actions (with roll-call votes pulled from journal pages),
    versions and documents; save the bill.

    Returns True on success, None when the bill page cannot be found in
    either year of the session.
    """
    # try and get bill for current year
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # if first page isn't found, try second year
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        html = self.get('http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            return None
    doc = lxml.html.fromstring(html)
    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()
    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]
    bill = Bill(session=session, chamber=chamber, bill_id=bill_id,
                title=title, type=bill_type)
    bill.add_source(url)
    # sponsors: the first listed is primary, the rest cosponsors.
    sp_type = 'primary'
    for sponsor in doc.xpath(
            '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
        sponsor = sponsor.replace(u'\xa0', ' ')
        bill.add_sponsor(sp_type, sponsor)
        sp_type = 'cosponsor'
    bill['subjects'] = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')
    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')
        # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = datetime.datetime.strptime(date, "%m/%d/%Y")
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        # NOTE(review): `type` shadows the builtin.
        type = categorize_action(action)
        bill.add_action(actor, action, date, type=type)
        # check if action mentions a vote
        rcmatch = re.search('Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate',
                                'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                # Counts start at 0 and are replaced by the parsed roll.
                vote = Vote(actor, date, action, False, 0, 0, 0)
                self.parse_roll_call(vote, vote_url, rc_num)
                # check the expected counts vs actual
                count = re.search('YEAS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote['yes_votes']):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(vote['yes_votes'])))
                count = re.search('NAYS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote['no_votes']):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(vote['no_votes'])))
                # Trust the parsed rolls over the action-text counts.
                vote['yes_count'] = len(vote['yes_votes'])
                vote['no_count'] = len(vote['no_votes'])
                vote['other_count'] = len(vote['other_votes'])
                vote['passed'] = vote['yes_count'] > vote['no_count']
                vote.add_source(vote_url)
                bill.add_vote(vote)
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        version = self.parse_doc_row(row)
        if version:
            # NOTE(review): `mimetype` is unbound if the URL ends with
            # neither .pdf nor .htm — confirm those are the only cases.
            if version[1].endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version[1].endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version(*version, mimetype=mimetype)
    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            bill.add_document(*document)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            bill.add_document(*document)
    self.save_bill(bill)
    return True
def _parse_votes(self, url, vote):
    '''Given a vote url and a vote object, extract the voters and the
    vote counts from the vote page and update the vote object.

    PDF urls are delegated to PDFCommitteeVote; HTML pages are parsed
    directly. Returns a Vote, or None when the document is missing or
    unparseable.
    '''
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return
        try:
            v = PDFCommitteeVote(url, resp.content)
            return v.asvote()
        except PDFCommitteeVoteParseError as e:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return
    # Any vote mark other than Y/N is filed as 'other'.
    keymap = {'Y': 'yes', 'N': 'no'}
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return
    y, n, e, a = map(int, vals)
    vote.update(yes_count=y, no_count=n, other_count=e + a)
    # Get the motion.
    try:
        motion = doc.xpath('//br')[-1].tail.strip()
    except:
        # Some of them mysteriously have no motion listed.
        motion = vote['action']
    if not motion:
        motion = vote['action']
    vote['motion'] = motion
    # Add placeholder for passed (see below)
    vote['passed'] = False
    # Rebind `vote` from the incoming dict to a real Vote object.
    vote = Vote(**vote)
    # Third table: "<mark>\xa0<name>" cells, one per member.
    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue
        v, name = filter(None, text.split(u'\xa0'))
        getattr(vote, keymap.get(v, 'other'))(name)
    action = vote['action']
    # Existing code to determine value of `passed`
    # NOTE(review): yes_votes/no_votes are the *lists* of voter names, so
    # the >= / > comparisons below compare lists lexicographically, not
    # counts — looks like counts were intended; confirm.
    yes_votes = vote['yes_votes']
    no_votes = vote['no_votes']
    passed = None
    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if action.count(i):
            passed = True
    for i in vote_failure_indicators:
        if action.count(i) and passed == True:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_votes >= yes_votes:
                passed = False
            else:
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if action.count(i) and passed == None:
            passed = False
    for i in vote_ambiguous_indicators:
        if action.count(i):
            passed = yes_votes > no_votes
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)
    vote['passed'] = passed
    return vote
def scrape_pdf_for_votes(self, session, chamber, date, motion, href):
    """Parse an Illinois roll-call PDF at *href* into a Vote.

    Reads the pass/fail word and the "<n> YEAS <n> NAYS <n> PRESENT"
    count line from the document, then collects the columnar member
    lines that follow and audits the parsed names against the counts.
    """
    warned = False
    # vote indicator, a few spaces, a name, newline or multiple spaces
    # NOTE(review): VOTE_RE is defined but not used in this function.
    VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
    COUNT_RE = re.compile(
        r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$')
    PASS_FAIL_WORDS = {
        'PASSED': True,
        'PREVAILED': True,
        'ADOPTED': True,
        'CONCURRED': True,
        'FAILED': False,
        'LOST': False,
    }
    pdflines = self.fetch_pdf_lines(href)
    yes_count = no_count = present_count = other_count = 0
    yes_votes = []
    no_votes = []
    present_votes = []
    other_vote_detail = defaultdict(list)
    passed = None
    counts_found = False
    vote_lines = []
    for line in pdflines:
        # consider pass/fail as a document property instead of a result
        # of the vote count; extract the vote count from the document
        # instead of just using counts of names
        if not line.strip():
            continue
        elif line.strip() in PASS_FAIL_WORDS:
            if passed is not None:
                raise Exception("Duplicate pass/fail matches in [%s]" % href)
            passed = PASS_FAIL_WORDS[line.strip()]
        elif COUNT_RE.match(line):
            # NOTE(review): not_voting_count may be None (optional group)
            # and is never used afterwards.
            yes_count, no_count, present_count, not_voting_count = \
                COUNT_RE.match(line).groups()
            yes_count = int(yes_count)
            no_count = int(no_count)
            present_count = int(present_count)
            counts_found = True
        elif counts_found:
            # After the count line, keep only lines that start with a
            # recognized vote mark followed by a name.
            for value in VOTE_VALUES:
                if re.search(r'^\s*({})\s+\w'.format(value), line):
                    vote_lines.append(line)
                    break
    votes = find_columns_and_parse(vote_lines)
    for name, vcode in votes.items():
        # Resolve the presiding-officer placeholders to real names.
        if name == 'Mr. Speaker':
            name = self.metadata['session_details'][session]['speaker']
        elif name == 'Mr. President':
            name = self.metadata['session_details'][session]['president']
        if vcode == 'Y':
            yes_votes.append(name)
        elif vcode == 'N':
            no_votes.append(name)
        else:
            # Everything that isn't Y/N counts as 'other'; 'P' (present)
            # is additionally tracked for the audit below.
            other_vote_detail[vcode].append(name)
            other_count += 1
            if vcode == 'P':
                present_votes.append(name)
    # fake the counts when the document had no count line
    if yes_count == 0 and no_count == 0 and present_count == 0:
        yes_count = len(yes_votes)
        no_count = len(no_votes)
    else:
        # audit parsed names against the document's own counts
        if yes_count != len(yes_votes):
            self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                         (yes_count, len(yes_votes)))
            warned = True
        if no_count != len(no_votes):
            self.warning("Mismatched no count [expect: %i] [have: %i]" %
                         (no_count, len(no_votes)))
            warned = True
        if present_count != len(present_votes):
            self.warning("Mismatched present count [expect: %i] [have: %i]" %
                         (present_count, len(present_votes)))
            warned = True
    if passed is None:
        if chamber == 'lower':
            # senate doesn't have these lines
            self.warning("No pass/fail word found; fall back to comparing "
                         "yes and no vote.")
            warned = True
        passed = yes_count > no_count
    vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                other_count, other_vote_detail=other_vote_detail)
    for name in yes_votes:
        vote.yes(name)
    for name in no_votes:
        vote.no(name)
    for other_type, names in other_vote_detail.iteritems():
        for name in names:
            vote.other(name)
    vote.add_source(href)
    if warned:
        self.warning("Warnings were issued. Best to check %s" % href)
    return vote
def scrape(self, chamber, session):
    """Scrape North Carolina votes for *chamber*/*session* from the
    NCGA zip of delimited text files (Members / Votes / MemberVotes)
    and save each validated Vote.
    """
    if session == '2009':
        # 2009 files have a different delimiter and naming scheme.
        vote_data_url = 'ftp://www.ncga.state.nc.us/Bill_Status/Vote Data 2009.zip'
        naming_scheme = '{session}{file_label}.txt'
        delimiter = ";"
    else:
        vote_data_url = 'ftp://www.ncga.state.nc.us/Bill_Status/Votes%s.zip' % session
        naming_scheme = '{file_label}_{session}.txt'
        delimiter = "\t"
    fname, resp = self.urlretrieve(vote_data_url)
    # fname = "/Users/brian/Downloads/Vote Data 2009.zip"
    zf = ZipFile(fname)
    chamber_code = 'H' if chamber == 'lower' else 'S'
    # Members_YYYY.txt: tab separated
    # 0: id (unique only in chamber)
    # 1: H or S
    # 2: member name
    # 3-5: county, district, party
    # 6: mmUserId
    member_file = zf.open(naming_scheme.format(file_label='Members',
                                               session=session))
    members = {}
    for line in member_file.readlines():
        data = line.split(delimiter)
        if data[1] == chamber_code:
            members[data[0]] = data[2]
    # Votes_YYYY.txt
    # 0: sequence number
    # 1: chamber (S/H)
    # 2: date
    # 3: prefix
    # 4: bill_id
    # 5: yes votes
    # 6: no votes
    # 7: excused absences
    # 8: excused votes
    # 9: didn't votes
    # 10: total yes+no
    # 11: sponsor
    # 12: reading info
    # 13: info
    # 20: PASSED/FAILED
    # 21: legislative day
    vote_file = zf.open(naming_scheme.format(file_label='Votes',
                                             session=session))
    bill_chambers = {'H': 'lower', 'S': 'upper'}
    votes = {}
    for line in vote_file.readlines():
        data = line.split(delimiter)
        if data[1] == chamber_code:
            date = datetime.datetime.strptime(data[2][:16],
                                              '%Y-%m-%d %H:%M')
            if data[3][0] not in bill_chambers:
                # skip votes that aren't on bills
                self.log('skipping vote %s' % data[0])
                continue
            # Keyed by vote sequence number for the member-vote join below.
            votes[data[0]] = Vote(chamber, date, data[13],
                                  'PASS' in data[20],
                                  int(data[5]), int(data[6]),
                                  int(data[7]) + int(data[8]) + int(data[9]),
                                  bill_chamber=bill_chambers[data[3][0]],
                                  bill_id=data[3] + data[4],
                                  session=session)
    member_vote_file = zf.open(naming_scheme.format(file_label='MemberVotes',
                                                    session=session))
    # 0: member id
    # 1: chamber (S/H)
    # 2: vote id
    # 3: vote chamber (always same as 1)
    # 4: vote (Y,N,E,X)
    # 5: pair ID (member)
    # 6: pair order
    # If a vote is paired then it should be counted as an 'other'
    for line in member_vote_file.readlines():
        data = line.split(delimiter)
        if data[1] == chamber_code:
            try:
                member_voting = members[data[0]]
            except KeyError:
                self.debug('Member %s not found.' % data[0])
                continue
            try:
                vote = votes[data[2]]
            except KeyError:
                self.debug('Vote %s not found.' % data[2])
                continue
            # -1 votes are Lt. Gov, not included in count, so we add them
            if data[4] == 'Y' and not data[5]:
                if data[0] == '-1':
                    vote['yes_count'] += 1
                vote.yes(member_voting)
            elif data[4] == 'N' and not data[5]:
                if data[0] == '-1':
                    vote['no_count'] += 1
                vote.no(member_voting)
            else:
                # for some reason other_count is high for paired votes
                if data[5]:
                    vote['other_count'] -= 1
                # is either E: excused, X: no vote, or paired (doesn't count)
                vote.other(member_voting)
    for vote in votes.itervalues():
        vote.validate()
        vote.add_source(vote_data_url)
        self.save_vote(vote)
    # remove file
    zf.close()
    os.remove(fname)
def scrape_votes(self, url, chamb):
    """Scrape a Tennessee roll-call page at *url* for chamber *chamb*
    and return the parsed Vote (or None when nothing parses).

    Two page shapes exist: a "vaguely structured" one made of many
    <font>/<span> elements, and a mostly unstructured <pre> dump.

    Fix over the old code: in the unstructured branch the
    ``annoying_vote`` match groups were used swapped — group(1) is the
    vote letter and group(2) the member name, but the old code assigned
    them the other way round, misfiling those votes under 'other' with
    the letter recorded as the voter's name.
    """
    with self.urlopen(url) as doc:
        soup = BeautifulSoup(doc)
        date = None
        motion = None
        yeas = None
        neas = None
        others = None
        passed = None
        chamber = chamb
        necessary = None
        vote = None
        fonts = soup.findAll('font')
        span = soup.findAll('span')
        if (len(fonts) + (len(span))) > 4:
            # data is vaguely structured
            if (len(fonts) < 4):
                fonts = span
            for line in fonts:
                # this could be sped up.
                line = str(line.contents[0])
                line = line.strip()
                if line.find("Taken on") > -1:
                    # text is of the form "Taken on <date> <reason>"
                    split = line.split(None, 3)
                    date = split[2]
                    if (len(split) > 3):
                        motion = split[3]
                elif line.find("Those voting Yea") > -1:
                    yeas = self.get_num_from_line(line)
                elif line.find("Those voting Nay") > -1:
                    neas = self.get_num_from_line(line)
                elif line.find("Those absent and not voting") > -1:
                    others = self.get_num_from_line(line)
                elif (line.find("Necessary for Adoption") > -1) or \
                        (line.find("Necessary for Passage") > -1):
                    necessary = self.get_num_from_line(line)
            passed = yeas >= necessary
            vote = Vote(chamber, date, motion, passed, yeas, neas, others)
            # figure out who voted for what
            table = soup.findAll('table')
            tds = table[len(table) - 1].findAll('td')  # the last table
            vote_value = None
            digits = re.compile('^[\d ]+$')
            for cell in tds:
                # either we are looking at fonts or spans
                text = cell.find('font')
                if text is None:
                    text = cell.find('span')
                if text is not None:
                    text = text.contents[0]
                    text = text.strip()
                else:
                    text = ''
                if len(text) > 0 and digits.search(text) is None:
                    if vote_value is None:
                        # A lone mark cell sets the pending vote value.
                        if text == 'Y' or text == 'N':
                            vote_value = text
                        elif text == 'X' or text == 'A':
                            vote_value = 'X'
                    else:
                        # The next name cell consumes the pending value.
                        if vote_value == 'Y':
                            vote.yes(text)
                        elif vote_value == 'N':
                            vote.no(text)
                        else:
                            vote.other(text)
                        vote_value = None
        else:
            # data is mostly unstructured; have to sift through a string
            data = soup.find('pre')
            lines = data.contents[len(data.contents) - 1]
            lines = lines.strip()
            exp = re.compile(r'\n+|\r+|\f+')
            lines = exp.split(lines)
            for i in range(len(lines)):
                line = lines[i].strip()
                if line.find("Taken on") > -1:
                    # text is of the form "Taken on <date> <reason>"
                    split = line.split(None, 3)
                    date = split[2]
                    if (len(split) > 3):
                        motion = split[3]
                elif line.find("Those voting Yea") > -1:
                    yeas = self.get_num_from_line(line)
                elif line.find("Those voting Nay") > -1:
                    neas = self.get_num_from_line(line)
                elif line.find("Those absent and not voting") > -1:
                    others = self.get_num_from_line(line)
                elif (line.find("Necessary for Adoption") > -1) or \
                        (line.find("Necessary for Passage") > -1):
                    if (line.find("Adoption") > -1):
                        motion = "Adoption"
                    else:
                        motion = "Passage"
                    necessary = self.get_num_from_line(line)
                elif (line.find("The following is the roll call vote:") > -1):
                    break  # the next lines contain actual votes
            # process the vote values
            passed = yeas >= necessary
            vote = Vote(chamber, date, motion, passed, yeas, neas, others)
            lines = lines[i + 1:]
            lines = string.join(lines, ' ')
            lines = lines.split(' ')
            absent_vote_value = re.compile('^(X|A)$')
            yea_vote_value = re.compile('^Y$')
            nea_vote_value = re.compile('^N$')
            # there aren't two spaces between vote and name so it doesn't
            # get parsed into separate words
            annoying_vote = re.compile('^(Y|X|A|N) ([\S ]+)$')
            digits = re.compile('^[\d ]+$')
            vote_value = None
            for word in lines:
                word = word.strip()
                if len(word) > 0 and digits.search(word) is None:
                    word = strip_digits(word)
                    if vote_value is not None:
                        # Pending vote letter from the previous word.
                        if vote_value == 'Y':
                            vote.yes(word)
                        elif vote_value == 'N':
                            vote.no(word)
                        else:
                            vote.other(word)
                        vote_value = None
                    elif absent_vote_value.match(word) is not None:
                        vote_value = 'X'
                    elif yea_vote_value.match(word) is not None:
                        vote_value = 'Y'
                    elif nea_vote_value.match(word) is not None:
                        vote_value = 'N'
                    elif annoying_vote.match(word) is not None:
                        split = annoying_vote.match(word)
                        # BUG FIX: group(1) is the vote letter and
                        # group(2) the name (previously swapped).
                        vote_value = split.group(1)
                        name = split.group(2)
                        if vote_value == 'Y':
                            vote.yes(name)
                        elif vote_value == 'N':
                            vote.no(name)
                        else:
                            vote.other(name)
                        vote_value = None
    return vote
def scrape_bill_sheet(self, session, chamber):
    """Scrape the bill sheet (the page full of bills and other small
    bits of data) for one chamber/session, building Bill objects with
    versions, actions, sponsors and votes, and saving each one.

    :param session: session identifier used for Bill construction.
    :param chamber: 'Senate' or 'House' (mapped to 'upper'/'lower').
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # Column positions of the fields we care about within each table row.
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    sheet_html = self.get(sheet_url).text
    sheet_page = lxml.html.fromstring(sheet_html)

    sheet_page.make_links_absolute(sheet_url)

    bills = sheet_page.xpath('//table/tr')

    for bill in bills:
        bill_id = self.read_td(bill[index["id"]][0])

        if bill_id == None:
            # Every other entry is null for some reason
            continue

        # Truncate at the first '.' to drop a trailing file extension;
        # budget bills are missing the .pdf, don't truncate those.
        dot_loc = bill_id.find('.')
        if dot_loc != -1:
            bill_id = bill_id[:dot_loc]

        title_and_sponsor = bill[index["title_sponsor"]][0]

        bill_title = title_and_sponsor.text
        bill_title_and_sponsor = title_and_sponsor.text_content()
        if bill_title is None:
            continue  # Odd ...

        # Sponsors are whatever is left in the cell once the title is
        # removed; multiple sponsors are separated by '--'.
        sponsors = bill_title_and_sponsor.replace(bill_title, "").\
            replace(" & ...", "").split("--")

        # Map bill-id prefixes to openstates bill types.
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        bill_type = None

        # NOTE(review): dict iteration order is arbitrary here, so an id
        # matching two prefixes (e.g. "SB" vs "SCR") keeps the last
        # match seen — presumably ids are unambiguous; verify.
        for cat in cats:
            if bill_id[:len(cat)] == cat:
                bill_type = cats[cat]

        b = Bill(session, bill_chamber, bill_id, bill_title,
                 type=bill_type)
        b.add_source(sheet_url)

        versions_url = \
            bill[index["version"]].xpath('font/a')[0].attrib["href"]
        versions_url = versions_url
        versions = self.parse_versions(versions_url)

        for version in versions:
            b.add_version(version['name'], version['link'],
                          mimetype=version['mimetype'])

        bill_history_href = bill[index["history"]][0][0].attrib['href']

        history = self.parse_history(bill_history_href)
        if history is None:
            self.logger.warning(
                "Bill history for %s is not correctly formatted" % bill_id)
            continue
        b.add_source(bill_history_href)

        chamber_map = dict(Senate='upper', House='lower')
        for action, date in history:
            action_actor = chamber_map.get(chamber, chamber)
            attrs = dict(actor=action_actor, action=action, date=date)
            # Merge in action categorization (type, committees, etc.).
            attrs.update(self.categorizer.categorize(action))
            b.add_action(**attrs)

        for sponsor in sponsors:
            if sponsor != None and sponsor != "(NONE)" and \
               sponsor != "":
                # A single entry may itself hold several names joined
                # with '&'; add each individually.
                if "&" in sponsor:
                    for sponsor in [x.strip() for x in sponsor.split("&")]:
                        b.add_sponsor("primary", sponsor)
                else:
                    b.add_sponsor("primary", sponsor)

        # Now that we have history, let's see if we can't grab some
        # votes

        bill_vote_href, = bill.xpath(".//a[contains(text(), 'Votes')]")
        bill_vote_href = bill_vote_href.attrib['href']
        #bill_vote_href = self.get_vote_url(bill_id, session)
        votes = self.parse_votes(bill_vote_href)

        # The vote page echoes back the bill id as a sanity check; a
        # frames-only error page means there are simply no votes.
        if (votes['sanity-check'] == 'This site only supports frames '
                'compatible browsers!'):
            votes['votes'] = []
        elif votes['sanity-check'] != bill_id:
            self.warning("XXX: READ ME! Sanity check failed!")
            self.warning(" -> Scraped ID: " + votes['sanity-check'])
            self.warning(" -> 'Real' ID: " + bill_id)
            assert votes['sanity-check'] == bill_id

        for vote in votes['votes']:
            filed_votes = vote['votes']
            passage = vote['meta']
            result = vote['result']

            composite_time = "%s %s" % (passage['x-parent-date'],
                                        passage['TIME'])
            # It's now like: 04/01/2011 02:10:14 PM
            pydate = dt.datetime.strptime(composite_time,
                                          "%m/%d/%Y %I:%M:%S %p")

            # Decide the acting chamber from the parent committee name.
            hasHouse = "House" in passage['x-parent-ctty']
            hasSenate = "Senate" in passage['x-parent-ctty']

            if hasHouse and hasSenate:
                actor = "joint"
            elif hasHouse:
                actor = "lower"
            else:
                actor = "upper"

            other = (int(result['EXC']) + int(result['ABS']))
            # OK, sometimes the Other count is wrong.
            # Recount "other" from the individual filed votes (anything
            # that is not an explicit yes/no) and prefer that number.
            local_other = 0
            for voter in filed_votes:
                l_vote = filed_votes[voter].lower().strip()
                if l_vote != "yes" and l_vote != "no":
                    local_other = local_other + 1

            if local_other != other:
                self.warning(
                    "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                self.warning(" -> Old: %s // New: %s" % (
                    other, local_other))
                other = local_other

            passed = (result['FINAL_ACTION'] == "PASS")
            if passage['MOTION'].strip() == "":
                continue

            # Unanimous-consent motions always pass.
            if "without objection" in passage['MOTION'].lower():
                passed = True

            v = Vote(actor, pydate, passage['MOTION'], passed,
                     int(result['YES']), int(result['NO']), other,
                     moved=passage['MOVED'],
                     seconded=passage['SECONDED'])

            v.add_source(vote['meta']['url'])
            # v.add_source( bill_vote_href )

            # XXX: Add more stuff to kwargs, we have a ton of data
            seen = set([])
            for voter in filed_votes:
                who = voter
                if who in seen:
                    raise Exception("Seeing the double-thing. - bug #702")
                seen.add(who)
                vote = filed_votes[who]
                if vote.lower() == "yes":
                    v.yes(who)
                elif vote.lower() == "no":
                    v.no(who)
                else:
                    v.other(who)
            b.add_vote(v)

        self.save_bill(b)
def parse_bill_votes(self, doc, bill): params = { 'chamber': None, 'date': None, 'motion': None, 'passed': None, 'yes_count': None, 'no_count': None, 'other_count': None, } elems = doc.cssselect('a') # MD has a habit of listing votes twice seen_votes = set() for elem in elems: href = elem.get('href') if (href and "votes" in href and href.endswith('htm') and href not in seen_votes): seen_votes.add(href) vote_url = BASE_URL + href with self.urlopen(vote_url) as vote_html: vote_doc = lxml.html.fromstring(vote_html) # motion box = vote_doc.xpath('//td[@colspan=3]/font[@size=-1]/text()') params['motion'] = box[-1] params['type'] = 'other' if 'senate' in href: params['chamber'] = 'upper' else: params['chamber'] = 'lower' for regex, vtype in vote_classifiers.iteritems(): if re.findall(regex, params['motion'], re.IGNORECASE): params['type'] = vtype # counts bs = vote_doc.xpath('//td[@width="20%"]/font/b/text()') yeas = int(bs[0].split()[0]) nays = int(bs[1].split()[0]) excused = int(bs[2].split()[0]) not_voting = int(bs[3].split()[0]) absent = int(bs[4].split()[0]) params['yes_count'] = yeas params['no_count'] = nays params['other_count'] = excused + not_voting + absent params['passed'] = yeas > nays # date # parse the following format: March 23, 2009 date_elem = vote_doc.xpath('//font[starts-with(text(), "Legislative Date")]')[0] params['date'] = datetime.datetime.strptime(date_elem.text[18:], '%B %d, %Y') vote = Vote(**params) status = None for row in vote_doc.cssselect('table')[3].cssselect('tr'): text = row.text_content() if text.startswith('Voting Yea'): status = 'yes' elif text.startswith('Voting Nay'): status = 'no' elif text.startswith('Not Voting') or text.startswith('Excused'): status = 'other' else: for cell in row.cssselect('a'): getattr(vote, status)(cell.text.strip()) vote.add_source(vote_url) bill.add_vote(vote)
def scrape_senate(self, session):
    """Scrape Senate journal PDFs for one session, extracting roll-call
    votes line by line and saving a Vote for each one found.

    State machine per journal: track the journal date, the current bill
    id (non-indented lines), the current question text, and — once a
    vote summary line is seen — accumulate individual votes until a
    terminator line ends the tally.
    """
    url = journals % (session, 'Senate')
    page = self.lxmlize(url)
    hrefs = page.xpath("//font//a")
    for href in hrefs:
        (path, response) = self.urlretrieve(href.attrib['href'])
        data = convert_pdf(path, type='text')

        # Per-document parser state.
        cur_bill_id = None
        cur_vote_count = None
        in_vote = False
        cur_question = None
        in_question = False
        known_date = None
        cur_vote = {}

        for line in data.split("\n"):
            # First date-looking line fixes the journal date.
            if not known_date:
                dt = date_re.findall(line)
                if dt != []:
                    dt, dow = dt[0]
                    known_date = datetime.datetime.strptime(
                        dt, "%A, %B %d, %Y")

            if in_question:
                line = line.strip()
                # A bare page/line number ends the question text.
                if re.match("\d+", line):
                    in_question = False
                    continue
                try:
                    # Drop the trailing line-number token.
                    line, _ = line.rsplit(" ", 1)
                    cur_question += line
                except ValueError:
                    in_question = False
                    continue
                # NOTE(review): this second append duplicates the one
                # inside the try block — looks like a double-append
                # bug; confirm against the original source.
                cur_question += line

            if not in_vote:
                # A summary line ("N yes, N no, ...") starts a vote.
                summ = vote_re.findall(line)
                if summ != []:
                    cur_vote = {}
                    cur_vote_count = summ[0]
                    in_vote = True
                    continue
                # Motion phrasings that begin a question block.
                if ("The question being" in line) or \
                   ("On motion of" in line) or \
                   ("the following" in line) or \
                   ("moved that the" in line):
                    cur_question, _ = line.strip().rsplit(" ", 1)
                    in_question = True
                if line.strip() == "":
                    continue
                # Non-indented lines carry the bill id being acted on.
                first = line[0]
                if first != " ":
                    if " " not in line:
                        # wtf
                        continue
                    bill_id, kruft = line.split(" ", 1)
                    if len(bill_id) < 3:
                        continue
                    if bill_id[0] != "H" and bill_id[0] != "S":
                        continue
                    if bill_id[1] not in ['B', 'J', 'R', 'M']:
                        continue
                    cur_bill_id = bill_id
            else:
                line = line.strip()
                try:
                    line, lineno = line.rsplit(" ", 1)
                except ValueError:
                    # No trailing line number: the tally has ended,
                    # so emit the accumulated vote.
                    in_vote = False
                    if cur_question is None:
                        continue
                    if cur_bill_id is None:
                        continue
                    yes, no, exc, ab = cur_vote_count
                    # Excused + absent are lumped into "other".
                    other = int(exc) + int(ab)
                    yes, no, other = int(yes), int(no), int(other)
                    bc = {'H': 'lower', 'S': 'upper'}[cur_bill_id[0]]
                    vote = Vote('upper', known_date, cur_question,
                                (yes > no), yes, no, other,
                                session=session,
                                bill_id=cur_bill_id,
                                bill_chamber=bc)
                    for person in cur_vote:
                        if person is None:
                            continue
                        howvote = cur_vote[person]
                        howvote = howvote.upper()
                        if howvote == 'Y':
                            vote.yes(person)
                        elif howvote == 'N':
                            vote.no(person)
                        else:
                            vote.other(person)
                    vote.add_source(href.attrib['href'])
                    self.save_vote(vote)
                    # Reset for the next vote in this document.
                    cur_vote, cur_question, cur_vote_count = (
                        None, None, None)
                    continue
                # Accumulate "Name ... Y/N/E" entries for this tally.
                votes = re.findall(votes_re, line)
                for person in votes:
                    name, li, vot = person
                    cur_vote[name] = vot

        # Remove the downloaded PDF now that we're done with it.
        os.unlink(path)
def parse_old_vote_page(self, vote_url): params = { 'chamber': None, 'date': None, 'motion': None, 'passed': None, 'yes_count': None, 'no_count': None, 'other_count': None, } with self.urlopen(vote_url) as vote_html: vote_doc = lxml.html.fromstring(vote_html) # motion box = vote_doc.xpath('//td[@colspan=3]/font[@size=-1]/text()') params['motion'] = box[-1] params['type'] = 'other' if 'senate' in vote_url: params['chamber'] = 'upper' else: params['chamber'] = 'lower' for regex, vtype in vote_classifiers.iteritems(): if re.findall(regex, params['motion'], re.IGNORECASE): params['type'] = vtype # counts bs = vote_doc.xpath('//td[@width="20%"]/font/b/text()') yeas = int(bs[0].split()[0]) nays = int(bs[1].split()[0]) excused = int(bs[2].split()[0]) not_voting = int(bs[3].split()[0]) absent = int(bs[4].split()[0]) params['yes_count'] = yeas params['no_count'] = nays params['other_count'] = excused + not_voting + absent params['passed'] = yeas > nays # date # parse the following format: March 23, 2009 date_elem = vote_doc.xpath( '//font[starts-with(text(), "Legislative Date")]')[0] params['date'] = datetime.datetime.strptime( date_elem.text[18:], '%B %d, %Y') vote = Vote(**params) status = None for row in vote_doc.xpath('//table')[3].xpath('tr'): text = row.text_content() if text.startswith('Voting Yea'): status = 'yes' elif text.startswith('Voting Nay'): status = 'no' elif text.startswith('Not Voting') or text.startswith( 'Excused'): status = 'other' else: for cell in row.xpath('a'): getattr(vote, status)(cell.text.strip()) return vote
def viva_voce_votes(root, session): prev_id = None for el in root.xpath(u'//div[starts-with(., "All Members are deemed")]'): text = ''.join(el.getprevious().getprevious().itertext()) text.replace('\n', ' ') m = re.search( r'(?P<bill_id>\w+\W+\d+)(,\W+as\W+amended,)?\W+was\W+' '(?P<type>adopted|passed' '(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+' 'by\W+a\W+viva\W+voce\W+vote', text) if m: motion = get_motion(m) # No identifier, generate our own record = str(uuid.uuid1()) bill_id = m.group('bill_id') bill_id = bill_id.replace(u'\xa0', ' ') bill_id = re.sub(r'CS(SB|HB)', r'\1', bill_id) if bill_id.startswith('H') or bill_id.startswith('CSHB'): bill_chamber = 'lower' elif bill_id.startswith('S') or bill_id.startswith('CSSB'): bill_chamber = 'upper' else: continue vote = Vote(None, None, motion, True, 0, 0, 0) vote['bill_id'] = bill_id vote['bill_chamber'] = bill_chamber vote['session'] = session[0:2] vote['method'] = 'viva voce' vote['record'] = record vote['type'] = get_type(motion) yield vote continue m = re.search( 'The\W+bill\W+was.+and\W+was\W+' '(?P<type>adopted|passed' '(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+' 'by\W+a\W+viva\W+voce\W+vote', text) if m: prev_text = ''.join(el.getprevious().getprevious().itertext()) m2 = re.match('(HB|SB|CSHB|CSSB|HR|SR)\W+\d+', prev_text) if m2: bill_id = m2.group() prev_id = bill_id else: # This is scary bill_id = prev_id if not bill_id: continue if bill_id.startswith('H') or bill_id.startswith('CSHB'): bill_chamber = 'lower' elif bill_id.startswith('S') or bill_id.startswith('CSSB'): bill_chamber = 'upper' else: continue bill_id = bill_id.replace(u'\xa0', ' ') motion = get_motion(m) record = str(uuid.uuid1()) vote = Vote(None, None, motion, True, 0, 0, 0) vote['bill_id'] = bill_id vote['bill_chamber'] = bill_chamber vote['session'] = session[0:2] vote['method'] = 'viva voce' vote['record'] = record vote['type'] = get_type(motion) yield vote continue
def get_senate_votes(self):
    """Extract Senate floor and committee votes from self.doc and
    attach them to self.bill.

    Both sections share a structure: a bold "VOTE: ..." header, then a
    blockquote whose <b> children announce a bucket with its count
    ("Ayes (N):") and whose <a> children are the legislators in the
    most recently announced bucket.
    """
    # --- Floor votes ---
    for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
        # Header text looks like "VOTE: FLOOR VOTE: - <date>".
        date = b.text.split('-')[1].strip()
        date = datetime.datetime.strptime(date, "%b %d, %Y").date()

        yes_votes, no_votes, other_votes = [], [], []
        yes_count, no_count, other_count = 0, 0, 0
        actual_vote = collections.defaultdict(list)

        vtype = None
        for tag in b.xpath("following-sibling::blockquote/*"):
            if tag.tag == 'b':
                text = tag.text
                if text.startswith('Ayes'):
                    vtype = 'yes'
                    yes_count = int(re.search(
                        r'\((\d+)\):', text).group(1))
                elif text.startswith('Nays'):
                    vtype = 'no'
                    no_count = int(re.search(
                        r'\((\d+)\):', text).group(1))
                elif (text.startswith('Excused') or
                      text.startswith('Abstain') or
                      text.startswith('Absent')
                      ):
                    # Several headings all count as "other"; += so the
                    # buckets accumulate into one total.
                    vtype = 'other'
                    other_count += int(re.search(
                        r'\((\d+)\):', text).group(1))
                else:
                    raise ValueError('bad vote type: %s' % tag.text)
            elif tag.tag == 'a':
                name = tag.text.strip()
                if vtype == 'yes':
                    yes_votes.append(name)
                elif vtype == 'no':
                    no_votes.append(name)
                elif vtype == 'other':
                    # Pair the stripped name with the raw anchor text.
                    # NOTE(review): tag.text equals the unstripped name
                    # here, so actual_vote is keyed by the name itself —
                    # looks suspicious; confirm intent.
                    other_votes.append((name, tag.text))

        passed = yes_count > (no_count + other_count)

        vote = Vote('upper', date, 'Floor Vote', passed, yes_count,
                    no_count, other_count)

        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for name, vote_val in other_votes:
            vote.other(name)
            actual_vote[vote_val].append(name)
        vote['actual_vote'] = actual_vote
        vote.add_source(self.url)
        self.bill.add_vote(vote)

    # --- Committee votes ---
    for b in self.doc.xpath(
            "//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"):
        # Header is tab-separated: label, committee name, date.
        _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
        date = date.strip()
        date = datetime.datetime.strptime(date, "%b %d, %Y").date()

        yes_votes, no_votes, other_votes = [], [], []
        yes_count, no_count, other_count = 0, 0, 0

        vtype = None
        for tag in b.xpath("following-sibling::blockquote/*"):
            if tag.tag == 'b':
                text = tag.text
                if text.startswith('Ayes'):
                    vtype = 'yes'
                    yes_count += int(re.search(
                        r'\((\d+)\):', text).group(1))
                elif text.startswith('Nays'):
                    vtype = 'no'
                    no_count += int(re.search(
                        r'\((\d+)\):', text).group(1))
                elif (text.startswith('Excused') or
                      text.startswith('Abstain') or
                      text.startswith('Absent')
                      ):
                    vtype = 'other'
                    other_count += int(re.search(
                        r'\((\d+)\):', text).group(1))
                else:
                    raise ValueError('bad vote type: %s' % tag.text)
            elif tag.tag == 'a':
                name = tag.text.strip()
                if vtype == 'yes':
                    yes_votes.append(name)
                elif vtype == 'no':
                    no_votes.append(name)
                elif vtype == 'other':
                    other_votes.append(name)

        passed = yes_count > (no_count + other_count)

        vote = Vote('upper', date, '%s Committee Vote' % committee,
                    passed, yes_count, no_count, other_count)

        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for name in other_votes:
            vote.other(name)

        vote.add_source(self.url)
        self.bill.add_vote(vote)
def scrape_votes(self, bill, link): page = self.urlopen(link) page = lxml.html.fromstring(page) raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content() raw_vote_data = re.split('\w+? by [\w ]+?\s+-', raw_vote_data.strip())[1:] for raw_vote in raw_vote_data: raw_vote = raw_vote.split(u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0') motion = raw_vote[0] vote_date = re.search('(\d+/\d+/\d+)', motion) if vote_date: vote_date = datetime.datetime.strptime(vote_date.group(), '%m/%d/%Y') passed = ('Passed' in motion or 'Recommended for passage' in motion or 'Adopted' in raw_vote[1] ) vote_regex = re.compile('\d+$') aye_regex = re.compile('^.+voting aye were: (.+) -') no_regex = re.compile('^.+voting no were: (.+) -') other_regex = re.compile('^.+present and not voting were: (.+) -') yes_count = 0 no_count = 0 other_count = 0 ayes = [] nos = [] others = [] for v in raw_vote[1:]: v = v.strip() if v.startswith('Ayes...') and vote_regex.search(v): yes_count = int(vote_regex.search(v).group()) elif v.startswith('Noes...') and vote_regex.search(v): no_count = int(vote_regex.search(v).group()) elif v.startswith('Present and not voting...') and vote_regex.search(v): other_count += int(vote_regex.search(v).group()) elif aye_regex.search(v): ayes = aye_regex.search(v).groups()[0].split(', ') elif no_regex.search(v): nos = no_regex.search(v).groups()[0].split(', ') elif other_regex.search(v): others += other_regex.search(v).groups()[0].split(', ') if 'ChamberVoting=H' in link: chamber = 'lower' else: chamber = 'upper' vote = Vote(chamber, vote_date, motion, passed, yes_count, no_count, other_count) vote.add_source(link) seen = set() for a in ayes: if a in seen: continue vote.yes(a) seen.add(a) for n in nos: if n in seen: continue vote.no(n) seen.add(n) for o in others: if o in seen: continue vote.other(o) seen.add(o) # vote.validate() bill.add_vote(vote) return bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one type for a session from the local
    database mirror, building and saving Bill objects with titles,
    sponsors, actions and votes.

    :param chamber: 'upper' or 'lower'.
    :param session: session year string, e.g. '20092010'.
    :param bill_type: openstates bill type for the scraped measures.
    :param type_abbr: CA measure-type abbreviation to filter on.
    :param committee_abbr_regex: compiled regex matching committee
        abbreviations in action text (default built once at import).

    Fixes vs. previous revision:
    - the committee-abbreviation failure now raises
      ``ValueError(msg % action.action)`` instead of dropping the
      prepared message and raising the bare action text;
    - an exact-duplicate 'Read third time. Passed' check was removed
      (it appended 'bill:passed' twice for the same action).
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_session, chamber, bill_id, '')

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version(bill_id, source_url, 'text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # Ensure a space after any close-paren glued to text.
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type_.append('fiscal committee')
            if version.local_program == 'Yes':
                type_.append('local program')
            if version.urgency == 'Yes':
                type_.append('urgency')
            if version.taxlevy == 'Yes':
                type_.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['summary'] = summary
        fsbill['type'] = type_
        fsbill['subjects'] = filter(None, [subject])
        fsbill['impact_clause'] = impact_clause

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        fsbill['alternate_titles'] = list(all_titles)

        # Sponsors come from the last version seen in the loop above.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(SPONSOR_TYPES[author.contribution],
                                   author.name,
                                   official_type=author.contribution)

        introduced = False

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            # Normalize the acting body to 'lower'/'upper'/'other'.
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'other'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            # Categorize the action text into openstates action types.
            if act_str.startswith('Introduced'):
                introduced = True
                type_.append('bill:introduced')

            if 'Read first time.' in act_str:
                if not introduced:
                    type_.append('bill:introduced')
                    introduced = True
                type_.append('bill:reading:1')

            if 'To Com' in act_str or 'referred to' in act_str.lower():
                type_.append('committee:referred')

            if 'Read third time. Passed' in act_str:
                type_.append('bill:passed')

            if 'Read third time, passed' in act_str:
                type_.append('bill:passed')

            if re.search(r'Read third time.+?Passed and', act_str):
                type_.append('bill:passed')

            if 'Approved by Governor' in act_str:
                type_.append('governor:signed')

            if 'Approved by the Governor' in act_str:
                type_.append('governor:signed')

            if 'Item veto' in act_str:
                type_.append('governor:vetoed:line-item')

            if 'Vetoed by Governor' in act_str:
                type_.append('governor:vetoed')

            if 'To Governor' in act_str:
                type_.append('governor:received')

            if 'Read second time' in act_str:
                type_.append('bill:reading:2')

            if not type_:
                type_ = ['other']

            # Add in the committee strings of the related committees, if any.
            kwargs = {}
            matched_abbrs = committee_abbr_regex.findall(action.action)
            if 'Com. on' in action.action and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                # BUGFIX: previously raised ValueError(action.action),
                # discarding the prepared message.
                raise ValueError(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)
                    else:
                        committees.append(name)

                committees = filter(None, committees)
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                # Expand abbreviations to full committee names in the
                # displayed action string.
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)

            # Anything not already normalized becomes 'other', and the
            # original actor string is preserved in actor_info.
            changed = False
            for string in ['upper', 'lower', 'joint']:
                if actor.startswith(string):
                    actor = string
                    changed = True
                    break
            if not changed:
                actor = 'other'
            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = '(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              type_=type_, **kwargs)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            # Location string decides chamber; the remainder (if any)
            # names the committee.
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion, result,
                          int(vote.ayes), int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold,
                          type_=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            for s in ('yes', 'no', 'other'):
                # Kill dupe votes.
                key = s + '_votes'
                fsvote[key] = list(set(fsvote[key]))

            # In a small percentage of bills, the integer vote counts
            # are inaccurate, so let's ignore them.
            for k in ('yes', 'no', 'other'):
                fsvote[k + '_count'] = len(fsvote[k + '_votes'])

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)