def scrape_senate_vote(self, bill, url, date):
    """Scrape a senate roll-call vote PDF at *url* and yield one Vote.

    Falls back to ``scrape_senate_vote_3col`` when the PDF uses the
    three-column "Yea: N  Nay: N  Absent: N" summary layout.

    :param bill: bill object (or identifier) the vote attaches to
    :param url: URL of the vote PDF
    :param date: datetime of the vote
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # setting 'fail' for now; corrected from the tallies at the end.
        result='fail',
        classification='passage',
        bill=bill
    )
    vote.add_source(url)

    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)

    # Three-column layout is handled by a separate parser.
    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Reversed so .pop() yields header/body pairs in document order.
    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    # BUG FIX: in Python 3 filter() returns a lazy iterator, which is
    # always truthy and has no .pop() — materialize it as a list.
    data = list(filter(None, data))
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {'yes': 0, 'no': 0, 'other': 0}
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        # Names are separated by ", and " or runs of whitespace/commas.
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            # Strip stray page numbers / dashes picked up from the PDF.
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # Sanity check: per-label tally must match the option tally.
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = ('pass'
                   if vote_count['yes'] > (vote_count['no'] + vote_count['other'])
                   else 'fail')
    yield vote
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Parse a committee vote table from an already-fetched lxml *page*
    and yield one Vote.

    Yes/no/other voter names are read from the <br>-separated cells of the
    first table row; counts are derived from the number of names found.
    """
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        # New website layout puts the vote cells two rows down.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        # Only a "yes" cell present; no/other are absent from the page.
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # Collect voter names from the text trailing each <br> in the cell.
        if obj is None:
            return {
                "type": None,
                "count": None,
                "votes": []
            }
        votes = []
        for vote in obj.xpath(".//br"):
            if vote.tail:
                vote = vote.tail.strip()
                if vote:
                    votes.append(vote)
        count = len(votes)
        return {
            "type": typ,
            "count": count,
            "votes": votes
        }

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }
    yes_count = vote_dict['yes']['count']
    # Missing cells parse as None; treat them as zero.
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0
    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        identifier=str(uniqid),
        # NOTE(review): ties count as 'fail' here — confirm intended.
        result='pass' if (yes_count > no_count) else 'fail',
        classification='passage',
        bill=bill
    )
    vote.extras = {'_vote_id': uniqid}
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            vote.vote(key, voter)
    yield vote
def test_vote_org_chamber():
    """A chamber= argument becomes a classification pseudo-id."""
    vote = Vote(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result='pass',
        classification='bill-passage',
        chamber='upper',
    )
    expected = {'classification': 'upper'}
    assert get_pseudo_id(vote.organization) == expected
def toy_vote():
    """Return a minimal Vote fixture carrying one annotated source."""
    vote = Vote(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result='pass',
        classification='bill-passage',
    )
    vote.add_source("http://uri.example.com/", note="foo")
    return vote
def scrape(self):
    """Scrape Boston City Council votes from paginated listing panels.

    NOTE(review): this block uses a legacy Vote signature (session=, type=,
    passed=, yes_count=…) unlike the pupa-style Vote() used elsewhere in
    this file — presumably a different framework version; verify.
    """
    for page in self.iterpages():
        for subject in page.xpath('//div[@class="ContainerPanel"]'):
            dates = subject.xpath(".//font[@color='#276598']/b/text()")
            motions = [x.strip() for x in subject.xpath(
                ".//div[@style='width:260px; float:left;']/text()")]
            votes = subject.xpath(".//div[@style='width:150px; float:right;']")
            docket = subject.xpath(".//div[@class='HeaderContent']/b/text()")
            # Keep only header strings mentioning a docket number.
            docket = list(filter(lambda x: "docket" in x.lower(), docket))
            docket = docket[0] if docket else None
            for date, motion, vote in zip(dates, motions, votes):
                when = dt.datetime.strptime(date, "%m/%d/%Y")
                motion = motion.strip()
                if motion == "":
                    self.warning("Skipping vote.")
                    continue
                v = Vote(session=self.session,
                         organization="Boston City Council",
                         type='other',
                         passed=False,
                         date=when.strftime("%Y-%m-%d"),
                         motion=motion,
                         yes_count=0,
                         no_count=0,)
                if docket:
                    v.set_bill(docket)
                yes, no, other = 0, 0, 0
                # zip(vit, vit, vit) walks the child divs in triples:
                # (name cell, vote-letter cell, spacer).
                vit = iter(vote.xpath("./div"))
                vote = zip(vit, vit, vit)
                for who, entry, _ in vote:
                    how = entry.text
                    who = who.text
                    if how == 'Y':
                        v.yes(who)
                        yes += 1
                    elif how == 'N':
                        v.no(who)
                        no += 1
                    else:
                        v.other(who)
                        other += 1
                # Back-fill the counts the Vote was created with zeros for.
                for count in v.vote_counts:
                    count['count'] = {
                        "yes": yes,
                        "no": no,
                        "other": other
                    }[count['vote_type']]
                v.add_source(DURL, note='root')
                yield v
def scrape_vote(self, bill, vote_id, session):
    """Fetch a Delaware roll call by id from the JSON API and yield a Vote.

    Not-voting, vacant, absent and conflict tallies are folded into a
    single 'other' count.
    """
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    form = {
        'rollCallId': vote_id,
        'sort': '',
        'group': '',
        'filter': '',
    }
    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if page:
        roll = page['Model']
        vote_chamber = self.chamber_map[roll['ChamberName']]
        # "7/1/16 01:00 AM"
        vote_date = dt.datetime.strptime(
            roll['TakenAtDateTime'],
            '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')
        # TODO: What does this code mean?
        vote_motion = roll['RollCallVoteType']
        vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
        other_count = (int(roll['NotVotingCount']) +
                       int(roll['VacantVoteCount']) +
                       int(roll['AbsentVoteCount']) +
                       int(roll['ConflictVoteCount'])
                       )
        vote = Vote(chamber=vote_chamber,
                    start_date=vote_date,
                    motion_text=vote_motion,
                    result=vote_passed,
                    classification='other',
                    bill=bill.identifier,
                    legislative_session=session
                    )
        vote.add_source(vote_url)
        vote.set_count('yes', roll['YesVoteCount'])
        vote.set_count('no', roll['NoVoteCount'])
        vote.set_count('other', other_count)
        for row in roll['AssemblyMemberVotes']:
            # AssemblyMemberId looks like it should work here,
            # but for some sessions it's bugged to only return session
            try:
                voter = self.legislators_by_short[str(row['ShortName'])]
                name = voter['DisplayName']
            except KeyError:
                self.warning('could not find legislator short name %s',
                             row['ShortName'])
                name = row['ShortName']
            if row['SelectVoteTypeCode'] == 'Y':
                vote.yes(name)
            elif row['SelectVoteTypeCode'] == 'N':
                vote.no(name)
            else:
                vote.vote('other', name)
        # bill.add_vote_event(vote)
        yield vote
def test_vote_org_dict():
    """An organization dict round-trips through the pseudo-id unchanged."""
    org = {'name': 'Random Committee', 'classification': 'committee'}
    vote = Vote(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result='pass',
        classification='bill-passage',
        organization=org,
    )
    assert get_pseudo_id(vote.organization) == org
def test_org_and_chamber_conflict():
    """Supplying both organization= and chamber= must raise ValueError."""
    with pytest.raises(ValueError):
        Vote(
            legislative_session="2009",
            motion_text="passage of the bill",
            start_date="2009-01-07",
            result='pass',
            classification='passage',
            organization='test',
            chamber='lower',
        )
def test_vote_org_obj():
    """Passing an Organization object stores that object's _id."""
    committee = Organization('something', classification='committee')
    vote = Vote(
        legislative_session="2009",
        motion_text="passage of the bill",
        start_date="2009-01-07",
        result='pass',
        classification='bill-passage',
        organization=committee,
    )
    assert vote.organization == committee._id
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Parse a committee vote table (text-node layout variant) and yield
    one Vote.

    Unlike the <br>-tail variant elsewhere in this file, this version
    reads voter names from the direct text nodes of each cell, and the
    yes/no/other cells are separated by spacer <td>s.
    """
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == "Votes:":
        # New website layout puts the vote cells two rows down.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        # Only a "yes" cell present; no/other are absent from the page.
        yes = yno[0]
        no, other = None, None
    else:
        # Every other td is a spacer, hence the discarded `_` slots.
        yes, _, no, _, other = rows.xpath(".//td")[:5]

    def proc_block(obj, typ):
        # Collect voter names from the cell's direct text nodes.
        if obj is None:
            return {"type": None, "count": None, "votes": []}
        votes = []
        for vote in obj.xpath("./text()"):
            if vote.strip():
                vote = vote.strip()
                if vote:
                    votes.append(vote)
        count = len(votes)
        return {"type": typ, "count": count, "votes": votes}

    vote_dict = {
        "yes": proc_block(yes, "yes"),
        "no": proc_block(no, "no"),
        "other": proc_block(other, "other"),
    }
    yes_count = vote_dict["yes"]["count"]
    # Missing cells parse as None; treat them as zero.
    no_count = vote_dict["no"]["count"] or 0
    other_count = vote_dict["other"]["count"] or 0
    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        identifier=str(uniqid),
        # NOTE(review): ties count as 'fail' here — confirm intended.
        result="pass" if (yes_count > no_count) else "fail",
        classification="passage",
        bill=bill,
    )
    vote.extras = {"_vote_id": uniqid}
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    for key in vote_dict:
        for voter in vote_dict[key]["votes"]:
            vote.vote(key, voter)
    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse a plain-text YEAS/NAYS/ABSENT roll call page and yield a Vote."""
    page = self.get(url).text
    bill.add_source(url)

    # Groups: 1=yea count, 2=yea names, 3=nay count, 4=nay names,
    # 6=absent count, 7=absent names (5 is the optional suffix).
    vote_re = re.compile(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    match = vote_re.search(page)

    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))
    passed = yes_count > no_count

    # Only recognized chambers are recorded; anything else is blank.
    vote_chamber = actor if actor in ("upper", "lower") else ""

    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    # Name blocks are whitespace-column separated; split on 2+ spaces.
    for name in re.split(r"\s{2,}", match.group(2).strip()):
        if name:
            vote.yes(name)
    for name in re.split(r"\s{2,}", match.group(4).strip()):
        if name:
            vote.no(name)
    for name in re.split(r"\s{2,}", match.group(7).strip()):
        if name:
            vote.vote("other", name)

    yield vote
def scrape_votes(self, bill):
    """Fetch Washington roll calls for *bill* from the WSL web service and
    yield one Vote per roll call (motion text tagged with sequence number).

    Absent and excused tallies are folded into a single 'other' count.
    """
    bill_num = bill.identifier.split()[1]
    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium)
    )
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")
        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()
        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count
        agency = xpath(rc, "string(wa:Agency)")
        chamber = {"House": "lower", "Senate": "upper"}[agency]
        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text="{} (#{})".format(motion, seq_no),
            result="pass" if yes_count > (no_count + other_count) else "fail",
            classification="other",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            # NOTE(review): "wa:VOte" capitalization looks odd — presumably
            # matches the service's actual element name; verify before changing.
            vtype = xpath(sv, "string(wa:VOte)")
            if vtype == "Yea":
                vote.yes(name)
            elif vtype == "Nay":
                vote.no(name)
            else:
                vote.vote("other", name)
        yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse a plain-text YEAS/NAYS/ABSENT roll call page and yield a Vote.

    Regex groups: 1=yea count, 2=yea names, 3=nay count, 4=nay names,
    6=absent count, 7=absent names (group 5 is the optional suffix).
    """
    page = self.get(url).text
    bill.add_source(url)
    vote_re = re.compile(
        r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
        r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
        r'(\d+)(.*)',
        re.MULTILINE | re.DOTALL)
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))
    if yes_count > no_count:
        passed = True
    else:
        passed = False

    # Only recognized chambers are recorded; anything else is blank.
    if actor == 'upper' or actor == 'lower':
        vote_chamber = actor
    else:
        vote_chamber = ''
    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=motion,
        result='pass' if passed else 'fail',
        identifier=str(uniqid),
        classification='passage',
        bill=bill
    )
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    # Name blocks are whitespace-column separated; split on 2+ spaces.
    yes_votes = re.split(r'\s{2,}', match.group(2).strip())
    no_votes = re.split(r'\s{2,}', match.group(4).strip())
    other_votes = re.split(r'\s{2,}', match.group(7).strip())
    for yes in yes_votes:
        if yes:
            vote.yes(yes)
    for no in no_votes:
        if no:
            vote.no(no)
    for other in other_votes:
        if other:
            vote.vote('other', other)
    yield vote
def scrape_votes(self, bill):
    """Fetch Washington roll calls for *bill* from the WSL web service and
    yield one Vote per roll call.

    Absent and excused tallies are folded into a single 'other' count.
    """
    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()
        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count
        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]
        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result='pass' if yes_count > (no_count + other_count) else 'fail',
            classification='other',
            bill=bill
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            # NOTE(review): "wa:VOte" capitalization looks odd — presumably
            # matches the service's actual element name; verify before changing.
            vtype = xpath(sv, "string(wa:VOte)")
            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)
        yield vote
def parse_vote(self, chamber, bill, row, action_text, action_date, url):
    """Parse a Puerto Rico vote row (Spanish labels) and yield one Vote.

    Labels: "A Favor"=yes, "En Contra"=no, "Abstenido"=abstain,
    "Ausente"=absent. A final vote ("Votación Final") is classified as
    passage; everything else as other.
    """
    yes = int(
        row.xpath(
            './/div[label[contains(text(), "A Favor")]]/span[contains(@class,"smalltxt")]/text()'
        )[0])
    no = int(
        row.xpath(
            './/div[label[contains(text(), "En Contra")]]/span[contains(@class,"smalltxt")]/text()'
        )[0])
    abstain = int(
        row.xpath(
            './/div[label[contains(text(), "Abstenido")]]/span[contains(@class,"smalltxt")]/text()'
        )[0])
    absent = int(
        row.xpath(
            './/div[label[contains(text(), "Ausente")]]/span[contains(@class,"smalltxt")]/text()'
        )[0])
    vote_chamber = self.parse_vote_chamber(chamber, action_text)
    classification = "passage" if u"Votación Final" in action_text else "other"
    vote = Vote(
        chamber=vote_chamber,
        start_date=action_date,
        motion_text=action_text,
        result="pass" if (yes > no) else "fail",
        bill=bill,
        classification=classification,
    )
    vote.add_source(url)
    vote.set_count("yes", yes)
    vote.set_count("no", no)
    vote.set_count("absent", absent)
    vote.set_count("abstain", abstain)
    # we don't want to add the attached vote PDF as a version,
    # so add it as a document
    # TODO: maybe this should be set as the source?
    self.parse_version(bill, row, is_document=True)
    yield vote
def scrape_house_vote(self, bill, url):
    """Parse a house roll-call vote PDF at *url* and yield one Vote.

    The PDF is scanned line by line for: the vote date, a summary line
    ("YEAS: n  NAYS: n  NOT VOTING: n ... ADOPTED/PASSED"), per-option
    section headers, and the voter names under each header. PAIRED votes
    are split into yes/no by their "(YEA)"/"(NAY)" suffix.

    NOTE(review): if the summary line never matches, `motion`/`yes_count`
    etc. are never bound and the Vote construction below would raise
    NameError when a date was found — confirm the PDFs always contain it.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, 'text')
    os.remove(filename)

    lines = text.splitlines()
    vote_type = None
    votes = collections.defaultdict(list)
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode('utf-8')

        # A trailing mm/dd/yyyy marks the vote date.
        match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        # Summary line: counts plus the ADOPTED/PASSED outcome.
        match = re.match(
            r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
        if match:
            # Motion text sits two lines above the summary.
            motion = (lines[idx - 2].strip()).decode('utf-8')
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [
                int(g) for g in match.groups()
            ]
            exc_match = re.search(r'EXCUSED: (\d+)', line)
            if exc_match:
                other_count += int(exc_match.group(1))
            if line.endswith('ADOPTED') or line.endswith('PASSED'):
                passed = True
            else:
                passed = False
            continue

        # Section headers switch which bucket subsequent names go into.
        match = re.match(
            r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
        if match:
            vote_type = {
                'YEAS': 'yes',
                'NAYS': 'no',
                'NOT VOTING': 'other',
                'EXCUSED': 'other',
                'PAIRED': 'paired'
            }[match.group(1)]
            continue

        if vote_type == 'paired':
            for part in line.split(' '):
                part = part.strip()
                if not part:
                    continue
                name, pair_type = re.match(
                    r'([^\(]+)\((YEA|NAY)\)', line).groups()
                name = name.strip()
                if pair_type == 'YEA':
                    votes['yes'].append(name)
                elif pair_type == 'NAY':
                    votes['no'].append(name)
        elif vote_type:
            for name in line.split(' '):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = Vote(
            chamber='lower',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result='pass' if passed else 'fail',
            classification='passage',
            bill=bill
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        vote.pupa_id = url
        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)
        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def scrape_votes(self, bill, url):
    """Parse Oklahoma journal pages for roll calls (RCS#) and yield Votes.

    Each roll call starts at an "OKLAHOMA HOUSE/STATE SENATE" header; the
    motion line offset differs per chamber. Duplicate RCS numbers on the
    same page are skipped.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))
    seen_rcs = set()
    re_ns = "http://exslt.org/regular-expressions"
    path = "//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={'re': re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            # No explicit outcome; derive it from the counts below.
            passed = None

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break
            regex = (r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL '
                     'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')
            match = re.match(regex, line)
            if match:
                if match.group(1) == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif match.group(1) == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue  # skip these
                elif seen_yes:
                    vtype = 'other'
                if seen_yes and match.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # NOTE(review): the name separator here was mangled in the
                # original formatting — assumed to be a multi-space column
                # split; confirm against the journal layout.
                for name in line.split('  '):
                    if not name:
                        continue
                    if 'HOUSE' in name or 'SENATE ' in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            result='pass' if passed else 'fail',
            bill=bill,
            classification='passage'
        )
        vote.set_count('yes', counts['yes'])
        vote.set_count('no', counts['no'])
        vote.set_count('other', counts['other'])
        vote.pupa_id = url + '#' + rcs
        vote.add_source(url)

        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            if ':' in name:
                raise Exception(name)
            vote.no(name)
        for name in votes['other']:
            vote.vote('other', name)
        yield vote
def scrape_vote(self, bill, name, url):
    """Parse a Louisiana House/Senate vote PDF and yield one Vote.

    *name* must look like "<Senate|House> Vote on ..., <motion>"; any other
    link text is ignored.
    """
    match = re.match("^(Senate|House) Vote on [^,]*,(.*)$", name)
    if not match:
        return

    chamber = {"Senate": "upper", "House": "lower"}[match.group(1)]
    motion = match.group(2).strip()

    # Map the motion prefix onto a vote classification.
    # (local renamed from `type`, which shadowed the builtin)
    if motion.startswith("FINAL PASSAGE"):
        classification = "passage"
    elif motion.startswith("AMENDMENT"):
        classification = "amendment"
    elif "ON 3RD READING" in motion:
        classification = "reading:3"
    else:
        classification = "other"

    # Download the PDF to a temp file and convert it to lxml HTML.
    (fd, temp_path) = tempfile.mkstemp()
    self.urlretrieve(url, temp_path)
    html = self.pdf_to_lxml(temp_path)
    os.close(fd)
    os.remove(temp_path)

    body = html.xpath("string(/html/body)")
    date_match = re.search(r"Date: (\d{1,2}/\d{1,2}/\d{4})", body)
    if date_match is None:
        self.warning("BAD VOTE: date error")
        return
    start_date = dt.datetime.strptime(date_match.group(1), "%m/%d/%Y")

    # Walk the text line by line, bucketing names under the current header.
    rolls = defaultdict(list)
    section = None
    for raw in body.replace(u"\xa0", "\n").split("\n"):
        entry = raw.replace(" ", "").strip()
        # Skip blank lines and "Total --"
        if not entry or "Total --" in entry:
            continue
        if entry in ("YEAS", "NAYS", "ABSENT"):
            section = {
                "YEAS": "yes",
                "NAYS": "no",
                "ABSENT": "other"
            }[entry]
        elif entry in ("Total", "--"):
            section = None
        elif section in ("yes", "no", "other"):
            rolls[section].append(entry)

    yes_count = len(rolls["yes"])
    no_count = len(rolls["no"])
    other_count = len(rolls["other"])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    passed = yes_count > (no_count + other_count)

    vote = Vote(
        chamber=chamber,
        start_date=start_date.strftime("%Y-%m-%d"),
        motion_text=motion,
        result="pass" if passed else "fail",
        classification=classification,
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    for option, voters in rolls.items():
        for voter in voters:
            vote.vote(option, voter)
    vote.add_source(url)
    yield vote
def scrape_votes(self, session):
    """Scrape New Hampshire roll calls from the pipe-delimited summary and
    history text files and yield one Vote per roll call.

    The summary file creates each Vote; the history file adds individual
    legislator votes, accumulating the 'other' tally as it goes.
    """
    votes = {}
    # Per-roll-call 'other' tallies, keyed by body + vote number.
    other_counts = defaultdict(int)
    last_line = []
    vote_url = 'http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt'
    lines = self.get(vote_url).content.decode('utf-8').splitlines()
    for line in lines:
        if len(line) < 2:
            continue
        if line.strip() == "":
            continue
        line = line.split('|')
        if len(line) < 14:
            # Records can wrap across physical lines; try stitching the
            # previous short line onto this one.
            if len(last_line + line[1:]) == 14:
                line = last_line
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        # Strip a UTF-8 BOM that survives the decode on the first record.
        session_yr = line[0].replace('\xef\xbb\xbf', '')
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
            time = pytz.timezone('America/New_York').localize(
                time).isoformat()
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(chamber=actor,
                        start_date=time,
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=self.bills_by_id[bill_id])
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.add_source(vote_url)
            vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
            votes[body + vote_num] = vote

    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt') \
            .content.decode('utf-8').splitlines():
        if len(line) < 2:
            continue
        # 2016|H|2|330795||Yea|
        # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = \
            line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                # Collapse internal whitespace in the legislator name.
                leg = " ".join(self.legislators[employee]['name'].split())
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue
            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue
            # code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body + v_num].yes(leg)
            elif vote == 'Nay':
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote('other', leg)
                # hack-ish, but will keep the vote count sync'd
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count('other',
                                              other_counts[body + v_num])
    for vote in votes.values():
        yield vote
def scrape_votes(self, session, zip_url):
    """Scrape New Hampshire roll calls from the pipe-delimited tables in
    the session zip archive and yield one Vote per roll call.

    The summary table creates each Vote; the history table adds individual
    legislator votes.

    Fixes two defects in the history pass:
    - the 'other' tally was re-initialized to 0 for every history line, so
      ``set_count('other', …)`` clobbered each roll call's count to 1; it
      is now accumulated per roll call (matching the non-zip scraper);
    - ``.other(leg)`` is not a Vote method in this API — record the voter
      with ``.vote('other', leg)`` as the sibling scraper does.
    """
    votes = {}
    # Per-roll-call 'other' tallies, keyed by body + vote number.
    other_counts = defaultdict(int)
    last_line = []

    for line in self.zf.open("tblrollcallsummary.txt"):
        if line.strip() == "":
            continue
        line = line.split("|")
        if len(line) < 14:
            # Records can wrap across physical lines; try stitching the
            # previous short line onto this one.
            if len(last_line + line[1:]) == 14:
                line = last_line
                self.warning("used bad vote line")
            else:
                last_line = line
                self.warning("bad vote line %s" % "|".join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or "[not available]"

        if session_yr == session and bill_id in self.bills_by_id:
            actor = "lower" if body == "H" else "upper"
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    for line in self.zf.open("tblrollcallhistory.txt"):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date = line.split(
            "|")

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]["name"]
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue
            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue
            # code = self.legislators[employee]['seat']
            if vote == "Yea":
                votes[body + v_num].yes(leg)
            elif vote == "Nay":
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote("other", leg)
                # accumulate per roll call (was: reset to 0 each line)
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count(
                    "other", other_counts[body + v_num])

    for vote in votes.values():
        yield vote
def scrape_chamber(self, chamber, session):
    """Scrape North Dakota journal PDFs for roll-call votes and yield Votes.

    Walks every journal PDF linked from the session's index page, running a
    small line-by-line state machine: a "ROLL CALL" header starts motion
    capture; "…VOTING"/"…VOTING." ends it and starts vote capture; a line
    containing passed/adopted/etc. terminates the vote and emits one Vote
    per bill number found in the closing text.
    """
    chamber_name = 'house' if chamber == 'lower' else 'senate'
    session_slug = {
        '62': '62-2011',
        '63': '63-2013',
        '64': '64-2015',
        '65': '65-2017',
    }[session]

    # Open the index page of the session's Registers, and open each
    url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
        session_slug, chamber_name)
    page = self.lxmlize(url)
    pdfs = page.xpath("//a[contains(@href, '.pdf')]")
    for pdf in pdfs:

        # Initialize information about the vote parsing
        results = {}          # header label -> list of voter names
        in_motion = False     # currently accumulating the motion text
        cur_vote = None       # label of the vote section being filled
        in_vote = False       # currently accumulating voter names
        cur_motion = ""
        bills = []

        # Determine which URLs the information was pulled from
        pdf_url = pdf.attrib['href']

        try:
            (path, response) = self.urlretrieve(pdf_url)
        except requests.exceptions.ConnectionError:
            continue

        # Convert the PDF to text
        data = convert_pdf(path, type='text').decode('utf-8')
        os.unlink(path)

        # Determine the date of the document
        date = re.findall(date_re, data)
        if date:
            date = date[0][0]
            cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
        else:
            # If no date is found anywhere, do not process the document
            self.warning("No date was found for the document; skipping.")
            continue

        # Check each line of the text for motion and vote information
        lines = data.splitlines()
        for line in lines:

            # Ignore lines with no information
            if re.search(chamber_re, line) or \
                    re.search(date_re, line) or \
                    re.search(page_re, line) or \
                    line.strip() == "":
                pass

            # Ensure that motion and vote capturing are not _both_ active
            elif in_motion and in_vote:
                raise AssertionError(
                    "Scraper should not be simultaneously processing " +
                    "motion name and votes, as it is for this motion: " +
                    cur_motion)

            # Start capturing motion text after a ROLL CALL header
            elif not in_motion and not in_vote:
                if line.strip() == "ROLL CALL":
                    in_motion = True

            elif in_motion and not in_vote:
                if cur_motion == "":
                    cur_motion = line.strip()
                else:
                    cur_motion = cur_motion + " " + line.strip()

                # ABSENT AND NOT VOTING marks the end of each motion name
                # In this case, prepare to capture votes
                if line.strip().endswith("VOTING") or \
                        line.strip().endswith("VOTING."):
                    in_motion = False
                    in_vote = True

            elif not in_motion and in_vote:
                # Ignore appointments and confirmations
                if "The Senate advises and consents to the appointment" \
                        in line:
                    in_vote = False
                    cur_vote = None
                    results = {}
                    cur_motion = ""
                    bills = []

                # If votes are being processed, record the voting members
                elif ":" in line:
                    cur_vote, who = (x.strip() for x in line.split(":", 1))
                    who = [
                        x.strip() for x in who.split(';') if x.strip() != ""
                    ]
                    results[cur_vote] = who
                    # A line not ending in ';' means the last name may
                    # continue onto the next physical line.
                    name_may_be_continued = False if line.endswith(";") \
                        else True

                # Extracts bill numbers in the closing text
                # used for when the closing text is multiple lines.
                elif cur_vote is not None and \
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line) and \
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed']):
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                elif cur_vote is not None and \
                        not any(x in line.lower() for x in
                                ['passed', 'adopted', 'sustained',
                                 'prevailed', 'lost', 'failed']):
                    # Continuation of the current vote section's name list.
                    who = [
                        x.strip() for x in line.split(";") if x.strip() != ""
                    ]
                    if name_may_be_continued:
                        results[cur_vote][-1] = results[cur_vote][-1] + \
                            " " + who.pop(0)
                    name_may_be_continued = False if line.endswith(";") \
                        else True
                    results[cur_vote].extend(who)

                # At the conclusion of a vote, save its data
                elif any(x in line.lower() for x in [
                        'passed', 'adopted', 'sustained', 'prevailed',
                        'lost', 'failed'
                ]):
                    in_vote = False
                    cur_vote = None

                    # Identify what is being voted on
                    # Throw a warning if impropper informaiton found
                    bills.extend(
                        re.findall(r"(?i)(H|S|J)(C?)(B|R|M) (\d+)", line))

                    if bills == [] or cur_motion.strip() == "":
                        results = {}
                        cur_motion = ""
                        self.warning("No motion or bill name found: " +
                                     "motion name: " + cur_motion + "; " +
                                     "decision text: " + line.strip())
                        continue

                    # If votes are found in the motion name, throw an error
                    if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                        raise AssertionError(
                            "Vote data found in motion name: " +
                            cur_motion)

                    # Use the collected results to determine who voted how
                    keys = {
                        "YEAS": "yes",
                        "NAYS": "no",
                        "ABSENT AND NOT VOTING": "other"
                    }
                    res = {}
                    for key in keys:
                        if key in results:
                            res[keys[key]] = results[key]
                        else:
                            res[keys[key]] = []

                    # Count the number of members voting each way
                    yes, no, other = \
                        len(res['yes']), \
                        len(res['no']), \
                        len(res['other'])
                    chambers = {
                        "H": "lower",
                        "S": "upper",
                        "J": "legislature"
                    }

                    # Almost all of the time, a vote only applies to one bill and this loop
                    # will only be run once.
                    # Some exceptions exist.
                    for bill in bills:

                        cur_bill_id = "%s%s%s %s" % bill

                        # Identify the source chamber for the bill
                        try:
                            bc = chambers[cur_bill_id[0]]
                        except KeyError:
                            bc = 'other'

                        # Determine whether or not the vote passed
                        if "over the governor's veto" in cur_motion.lower(
                        ):
                            VETO_SUPERMAJORITY = 2 / 3
                            passed = (yes / (yes + no) > VETO_SUPERMAJORITY)
                        else:
                            passed = (yes > no)

                        # Create a Vote object based on the scraped information
                        vote = Vote(
                            chamber=chamber,
                            start_date=cur_date.strftime('%Y-%m-%d'),
                            motion_text=cur_motion,
                            result='pass' if passed else 'fail',
                            legislative_session=session,
                            classification='passage',
                            bill=cur_bill_id,
                            bill_chamber=bc)

                        vote.add_source(pdf_url)
                        vote.add_source(url)
                        vote.set_count('yes', yes)
                        vote.set_count('no', no)
                        vote.set_count('other', other)

                        # For each category of voting members,
                        # add the individuals to the Vote object
                        for key in res:
                            for voter in res[key]:
                                vote.vote(key, voter)

                        # Check the vote counts in the motion text against
                        # the parsed results
                        for category_name in keys.keys():
                            # Need to search for the singular, not plural, in the text
                            # so it can find, for example, " 1 NAY "
                            vote_re = r"(\d+)\s{}".format(
                                category_name[:-1])
                            motion_count = int(
                                re.findall(vote_re, cur_motion)[0])

                            for item in vote.counts:
                                if item['option'] == keys[category_name]:
                                    vote_count = item['value']

                            if motion_count != vote_count:
                                self.warning(
                                    "Motion text vote counts ({}) ".format(
                                        motion_count) +
                                    "differed from roll call counts ({}) ".
                                    format(vote_count) +
                                    "for {0} on {1}".format(
                                        category_name, cur_bill_id))

                                for item in vote.counts:
                                    if item['option'] == keys[
                                            category_name]:
                                        vote_count = motion_count

                        yield vote

                    # With the vote successfully processed,
                    # wipe its data and continue to the next one
                    results = {}
                    cur_motion = ""
                    bills = []
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse a UT HTML vote page and yield a Vote for *bill*.

    Floor-vote pages are parsed inline; pages describing committee votes
    are delegated to ``scrape_committee_vote``.  Voice votes and a few
    known non-vote page shapes are skipped silently.

    :param bill: Bill object the vote belongs to
    :param actor: chamber string used for the vote's ``chamber``
    :param date: start date for the vote
    :param motion: motion text
    :param url: vote page URL (also recorded as source)
    :param uniqid: unique id, stored as the vote identifier
    """
    try:
        page = self.get(url).text
    except scrapelib.HTTPError:
        # Dead link: log and bail rather than abort the whole bill.
        self.warning("A vote page not found for bill {}".format(
            bill.identifier))
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # The page description is normally the first <b>; newer pages put it
    # in the first <center> instead.
    descr = page.xpath("//b")[0].text_content()
    if descr == '':
        # New page method
        descr = page.xpath("//center")[0].text

    if "on voice vote" in descr:
        # Voice votes have no roll call to record.
        return

    if "committee" in descr.lower():
        # Committee votes use a different page layout.
        yield from self.scrape_committee_vote(bill, actor, date, motion,
                                              page, url, uniqid)
        return

    # Determine pass/fail from the description text; unknown wording is
    # a hard error so new formats are noticed.
    passed = None
    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr:
        return
    elif descr.strip() == '-':
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    # Each heading ("Yeas - N", "Nays - N", ...) is paired positionally
    # with the table of names that follows it.
    headings = page.xpath("//b")[1:]
    votes = page.xpath("//table")
    sets = zip(headings, votes)
    vdict = {}
    for (typ, votes) in sets:
        txt = typ.text_content()
        arr = [x.strip() for x in txt.split("-", 1)]
        if len(arr) != 2:
            # Heading without a "label - count" shape; not a vote block.
            continue
        v_txt, count = arr
        v_txt = v_txt.strip()
        count = int(count)
        people = [
            x.text_content().strip()
            for x in votes.xpath(".//font[@face='Arial']")
        ]
        vdict[v_txt] = {"count": count, "people": people}

    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                bill=bill,
                classification='passage',
                identifier=str(uniqid))
    # Counts come from the headings, not from len(people).
    vote.set_count('yes', vdict['Yeas']['count'])
    vote.set_count('no', vdict['Nays']['count'])
    vote.set_count('other', vdict['Absent or not voting']['count'])
    vote.add_source(url)

    for person in vdict['Yeas']['people']:
        vote.yes(person)
    for person in vdict['Nays']['people']:
        vote.no(person)
    for person in vdict['Absent or not voting']['people']:
        vote.vote('other', person)

    yield vote
def get_bills(self):
    """Yield hard-coded test Bills (and their Votes) for scraper testing.

    Two fixture bills are defined as plain dicts and then converted to
    ``Bill``/``Vote`` objects.  Votes are yielded individually before the
    bill that owns them.
    """
    bills = [
        {
            "name": "HB500",
            "title": "Makes various changes to provisions governing employment practices",
            "session": "2011",
            "versions": ["http://example.com/HB500.pdf"],
            "actions": [
                {
                    "description": "Introduced",
                    "actor": "Committee on Pudding Pops",
                    "date": "2014-04-15",
                },
                {
                    "date": "2014-04-15",
                    "description": "Read first time. Referred to Committee on Commerce and Labor. To printer.",
                    "actor": "Test City Council"
                },
                {
                    "date": "2014-04-15",
                    "description": "From printer. To committee.",
                    "actor": "Test City Council"
                },
                {
                    "date": "2014-04-15",
                    "description": "From committee: Do pass.",
                    "actor": "Rules"
                },
                {
                    "description": "Signed into law",
                    "actor": "Fiscal Committee",
                    "date": "2014-04-19",
                },
            ],
            "sponsors_people": [],
            "sponsors_committee": [],
            "votes": [
                {
                    "motion": "Vote by the Committee on the Whole.",
                    "yes_count": 1,
                    "other_count": 1,
                    "no_count": 3,
                    "passed": True,
                    "type": "passage:bill",
                    "date": "2014-04-15",
                    "session": "2011",
                    "roll": {
                        "yes": [
                            "Eliana Meyer",
                        ],
                        "no": [
                            "Gunnar Luna",
                            "Regina Cruz",
                            "Makenzie Keller",
                        ],
                        "other": [
                            "Unknown Person",
                        ],
                    }
                },
            ]
        },
        {
            "name": "HB101",
            "title": "Joint county ditch proceedings-conduct by teleconference or video conference",
            "session": "2011",
            "versions": ["http://example.com/HB101.pdf"],
            "actions": [
                {
                    "description": "Introduced",
                    "actor": "council",
                    "date": "2014-04-15",
                },
                {
                    "description": "Referred to the Committee on Pudding Pops",
                    "actor": "council",
                    "date": "2014-04-16",
                },
                {
                    "description": "Reported favorably",
                    "actor": "council",
                    "date": "2014-04-16",
                },
                {
                    "description": "Referred to the Bills in the Third Read",
                    "actor": "council",
                    "date": "2014-04-17",
                },
                {
                    "description": "Vote by the Committee on the Whole. Do pass.",
                    "actor": "council",
                    "date": "2014-04-18",
                },
                {
                    "description": "Signed into law",
                    "actor": "council",
                    "date": "2014-04-19",
                },
            ],
            "sponsors_people": [
                "Shayla Fritz",
                "Gunnar Luna",
            ],
            "sponsors_committee": [
                "Standing Committee on Public Safety",
            ],
            "votes": [
                {
                    "motion": "Vote by the Committee on the Whole.",
                    "yes_count": 3,
                    "no_count": 1,
                    "passed": True,
                    "type": "passage:bill",
                    "date": "2014-04-18",
                    "session": "2011",
                    "roll": {
                        "yes": [
                            "Gunnar Luna",
                            "Regina Cruz",
                            "Makenzie Keller",
                        ],
                        "no": [
                            "Eliana Meyer",
                        ],
                        "other": [],
                    }
                },
            ]
        },
    ]

    for bill in bills:
        b = Bill(identifier=bill['name'],
                 title=bill['title'],
                 legislative_session=bill['session'])
        b.add_source("ftp://example.com/some/bill")

        for vote in bill['votes']:
            # NOTE(review): only yes/no data is transferred to the Vote;
            # the fixture's "other_count" and roll["other"] entries are
            # never used — confirm whether that is intentional.
            v = Vote(
                motion_text=vote['motion'],
                organization_id=make_psuedo_id(
                    name="Test City Council",
                    classification="legislature"),
                yes_count=vote['yes_count'],
                no_count=vote['no_count'],
                result='pass' if vote['passed'] else 'fail',
                classification=vote['type'],
                start_date=vote['date'],
                legislative_session=vote['session'],
            )
            v.add_source("http://example.com/votes/vote.xls")
            for yv in vote['roll']['yes']:
                v.yes(yv)
            for nv in vote['roll']['no']:
                v.no(nv)
            yield v

        for sponsor in bill['sponsors_people']:
            b.add_sponsorship(name=sponsor,
                              classification='primary',
                              entity_type='person',
                              primary=True)

        for sponsor in bill['sponsors_committee']:
            b.add_sponsorship(name=sponsor,
                              classification='primary',
                              entity_type='organization',
                              primary=True)

        for version in bill['versions']:
            b.add_version_link(note="Bill Version", url=version)

        for action in bill['actions']:
            # Replace the fixture's 'actor' key with the pseudo-id form
            # add_action expects.
            action['organization'] = make_psuedo_id(
                name=action.pop('actor'))
            b.add_action(**action)

        yield b
def scrape_votes(self, session, zip_url):
    """Scrape NH roll-call votes from the bulk-download zip (``self.zf``).

    Pass one reads ``tblrollcallsummary.txt`` (pipe-delimited, 14 fields)
    and builds one Vote per roll call, keyed by body+vote_number.  Pass
    two reads ``tblrollcallhistory.txt`` and attaches each legislator's
    vote to the matching Vote.

    :param session: session identifier to filter records by
    :param zip_url: URL of the zip file, recorded as the vote source
    :yields: Vote objects

    Fixes over the previous revision:
      * split summary records are now actually stitched back together
        (the old code tested ``last_line + line[1:]`` but then assigned
        only ``last_line``, dropping the continuation fields);
      * the 'other' count now accumulates per vote instead of being
        reset to 0 for every history line;
      * uses ``vote.vote('other', ...)`` consistent with the rest of
        this file instead of ``.other(...)``.
    """
    votes = {}
    last_line = []

    for line in self.zf.open('tblrollcallsummary.txt'):
        if line.strip() == "":
            continue
        line = line.split('|')
        if len(line) < 14:
            # A record was broken across physical lines; if gluing the
            # saved fragment onto this one yields a full 14-field
            # record, use the combined record.
            if len(last_line + line[1:]) == 14:
                line = last_line + line[1:]
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp,
                                        '%m/%d/%Y %I:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(chamber=actor,
                        start_date=time.strftime("%Y-%m-%d"),
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=self.bills_by_id[bill_id])
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            # Start 'other' at 0; pass two increments it as absent /
            # present / excused legislators are encountered.
            vote.set_count('other', 0)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    # Per-vote tally of non-yea/nay voters (keyed like `votes`).
    other_counts = collections.defaultdict(int)

    for line in self.zf.open('tblrollcallhistory.txt'):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date \
            = line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]['name']
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue

            # code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body + v_num].yes(leg)
            elif vote == 'Nay':
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote('other', leg)
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count(
                    'other', other_counts[body + v_num])

    for vid, vote in votes.items():
        yield vote
def scrape_votes(self, bill, url):
    """Scrape OK roll-call votes for *bill* from a journal-style page.

    Each vote section starts with a header paragraph matching
    'OKLAHOMA HOUSE'/'OKLAHOMA STATE SENATE'; counts and voter names are
    read from the following sibling paragraphs until a '*****' divider.

    :param bill: Bill the votes belong to
    :param url: page URL, also used (with RCS#) as the pupa_id
    :yields: Vote objects (malformed votes are skipped with a warning)
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

    seen_rcs = set()

    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False

        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber = "lower"
            motion_index = 8
        else:
            chamber = "upper"
            motion_index = 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index
        ).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return
        match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == "PASSED"
        else:
            # No explicit result in the motion; derived from counts below.
            passed = None

        # RCS# uniquely identifies the roll call; skip duplicates.
        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)

        # The date is on the paragraph right after the RCS# line.
        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        # State machine over following paragraphs: a count line
        # ("YEAS: 37 ...") switches the current section, anything else
        # (once YEAS has been seen) is a run of voter names.
        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                # End-of-vote divider.
                break
            regex = (
                r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
            )
            match = re.match(regex, line)
            if match:
                if match.group(1) == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif match.group(1) == "NAYS" and seen_yes:
                    vtype = "no"
                elif match.group(1) == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and match.group(3).strip():
                    # Trailing text after the count means the layout is
                    # unexpected; abandon this vote.
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                for name in line.split(" "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])

        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        # RCS# makes the pupa_id unique per roll call on this page.
        vote.pupa_id = url + "#" + rcs

        vote.add_source(url)

        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            if ":" in name:
                # A colon indicates a count line leaked into the names.
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)

        yield vote
def scrape_bill(self, bill_id):
    """Convert an OpenStates v1 API bill into pupa Bill/Vote objects.

    Fetches the legacy API record, pops every field off the ``old`` dict
    as it is converted, and asserts at the end that nothing was left
    unhandled.  State-specific quirks (CA classifications, NE/DC
    unicameral chambers, etc.) are normalized along the way.

    :param bill_id: legacy API bill id
    :yields: each converted Vote, then the converted Bill
    """
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')

    # These classifications are not convertible; skip the bill entirely.
    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    # Per-state renames of legacy classifications.
    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    # Unicameral (NE, DC) and joint/conference chambers all map to
    # 'legislature'.
    if chamber == 'upper' and self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        # Sponsor entity type is inferred from which legacy id is set.
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        # Normalize the free-form legacy actor to a pupa chamber value.
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (actor.lower().startswith('lower (')
                                          and self.state == 'ca'):
            actor = 'lower'
        elif actor.lower() in ('senate', 'upper`') or (
                actor.lower().startswith('upper (') and self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst',
                       'Became Law w', 'conference') or (
                actor.lower().startswith('legislature (') and
                self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor == 'upper' and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(
                act['action'], act['date'][:10], chamber=actor,
                classification=[action_types[c] for c in act['type']
                                if c != 'other'])
            # NOTE(review): this loop variable shadows the `re` module
            # inside the loop body — confirm nothing in scope relies on
            # `re` being the module here.
            for re in act.get('related_entities', []):
                if re['type'] == 'committee':
                    re['type'] = 'organization'
                elif re['type'] == 'legislator':
                    re['type'] = 'person'
                newact.add_related_entity(re['name'], re['type'])

    for comp in old.pop('companions', []):
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in old.pop('alternate_bill_ids', []) + old.pop(
            '+alternate_bill_ids', []):
        new.add_identifier(abid)

    # generic OpenStates stuff
    for id in old.pop('all_ids'):
        new.add_identifier(id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    # One-off legacy fields are preserved verbatim in extras (with the
    # leading '+' stripped).
    to_extras = ['+status', '+final_disposition', '+volume_chapter',
                 '+ld_number', '+referral', '+companion', '+description',
                 '+fiscal_note_probable:', '+preintroduction_required:',
                 '+drafter', '+category:', '+chapter', '+requester',
                 '+transmittal_date:', '+by_request_of',
                 '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num',
                 '+fiscal_note', '+impact_clause', '+fiscal_notes',
                 '+short_title', '+type_', '+conference_committee',
                 'conference_committee', '+companion_bill_ids']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        # Drop legacy fields that have no pupa equivalent.
        vote.pop('id')
        vote.pop('state')
        vote.pop('bill_id')
        vote.pop('bill_chamber', None)
        vote.pop('+state', None)
        vote.pop('+country', None)
        vote.pop('+level', None)
        vote.pop('+vacant', None)
        vote.pop('+not_voting', None)
        vote.pop('+amended', None)
        vote.pop('+excused', None)
        vote.pop('+NV', None)
        vote.pop('+AB', None)
        vote.pop('+P', None)
        vote.pop('+V', None)
        vote.pop('+E', None)
        vote.pop('+EXC', None)
        vote.pop('+EMER', None)
        vote.pop('+present', None)
        vote.pop('+absent', None)
        vote.pop('+seconded', None)
        vote.pop('+moved', None)
        vote.pop('+vote_type', None)
        vote.pop('+actual_vote', None)
        vote.pop('+skip_votes', None)
        vote.pop('vote_id')
        vote.pop('+bill_chamber', None)
        vote.pop('+session', None)
        vote.pop('+bill_id', None)
        vote.pop('+bill_session', None)
        vote.pop('committee', None)
        vote.pop('committee_id', None)

        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = Vote(legislative_session=vote.pop('session'),
                       motion_text=vote.pop('motion'),
                       result='pass' if vote.pop('passed') else 'fail',
                       chamber=chamber,
                       start_date=vote.pop('date'),
                       classification=vtype,
                       bill=new,
                       identifier=identifier)
        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])

        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)

        if not newvote.sources:
            # Fall back to the bill's sources when the vote has none.
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_',
                     '+threshold', '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        # Anything left means a legacy field was missed above.
        assert not vote, vote.keys()

        yield newvote

    assert not old, old.keys()

    yield new
def scrape_vote(self, bill, name, url):
    """Scrape a CT roll-call page and yield a Vote for *bill*.

    House ("VOTE/h") and Senate pages lay the member grid out in
    different column groups; the offsets below select the name cell and
    the Y/N marker cells within each group.

    :param bill: Bill the vote belongs to
    :param name: motion text for the vote
    :param url: roll-call page URL (also recorded as source)
    """
    if "VOTE/h" in url:
        vote_chamber = "lower"
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = "upper"
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    raw = self.get(url, verify=False).text
    if "BUDGET ADDRESS" in raw:
        # Not a roll call — nothing to record.
        return

    doc = lxml.html.fromstring(raw)

    def span_count(label):
        # First integer inside the <span> whose text contains *label*.
        text = doc.xpath("string(//span[contains(., '%s')])" % label)
        return int(re.match(r"[^\d]*(\d+)[^\d]*", text).group(1))

    yes_count = span_count('Those voting Yea')
    no_count = span_count('Those voting Nay')
    other_count = span_count('Those absent')
    need_count = span_count('Necessary for')

    # Date appears as "Taken on M/D"; the year comes from the session.
    taken = doc.xpath("string(//span[contains(., 'Taken on')])")
    taken = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", taken).group(1)
    taken = taken.replace(" ", "")
    date = datetime.datetime.strptime(
        taken + " " + bill.legislative_session, "%m/%d %Y").date()

    # not sure about classification.
    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=name,
        result="pass" if yes_count > need_count else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)

    roster = doc.xpath("//table")[0]
    for row in roster.xpath("tr"):
        for base in cols:
            member = row.xpath(
                "string(td[%d])" % (base + name_offset)).strip()
            if not member or member == "VACANT":
                continue
            member = string.capwords(member)
            yea_cell = row.xpath("string(td[%d])" % (base + yes_offset))
            nay_cell = row.xpath("string(td[%d])" % (base + no_offset))
            if "Y" in yea_cell:
                vote.yes(member)
            elif "N" in nay_cell:
                vote.no(member)
            else:
                vote.vote("other", member)

    yield vote
def scrape_vote(self, bill, name, url):
    """Scrape an LA roll-call PDF and yield a Vote for *bill*.

    The vote name encodes chamber and motion; the PDF body lists voters
    under YEAS / NAYS / ABSENT headings.  Names that don't match the
    expected pattern are ignored silently.

    :param bill: Bill the vote belongs to
    :param name: link text, e.g. "Senate Vote on ..., FINAL PASSAGE ..."
    :param url: PDF URL (also recorded as source)
    """
    match = re.match('^(Senate|House) Vote on [^,]*,(.*)$', name)
    if not match:
        return

    chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
    motion = match.group(2).strip()

    # Classify the vote from the motion wording.
    if motion.startswith('FINAL PASSAGE'):
        vote_class = 'passage'
    elif motion.startswith('AMENDMENT'):
        vote_class = 'amendment'
    elif 'ON 3RD READING' in motion:
        vote_class = 'reading:3'
    else:
        vote_class = 'other'

    # Download the PDF to a temp file and extract its text as HTML.
    (fd, temp_path) = tempfile.mkstemp()
    self.urlretrieve(url, temp_path)
    html = self.pdf_to_lxml(temp_path)
    os.close(fd)
    os.remove(temp_path)

    body = html.xpath('string(/html/body)')

    date_match = re.search('Date: (\d{1,2}/\d{1,2}/\d{4})', body)
    try:
        date = date_match.group(1)
    except AttributeError:
        self.warning("BAD VOTE: date error")
        return

    start_date = dt.datetime.strptime(date, '%m/%d/%Y')

    # Walk the text line by line; a section heading switches which
    # bucket subsequent names fall into.
    section = None
    sections = {'YEAS': 'yes', 'NAYS': 'no', 'ABSENT': 'other'}
    tally = defaultdict(list)
    for entry in body.replace(u'\xa0', '\n').split('\n'):
        entry = entry.replace(' ', '').strip()
        # Skip blank lines and "Total --"
        if not entry or 'Total --' in entry:
            continue
        if entry in sections:
            section = sections[entry]
        elif entry in ('Total', '--'):
            section = None
        elif section:
            tally[section].append(entry)

    yes_count = len(tally['yes'])
    no_count = len(tally['no'])
    other_count = len(tally['other'])

    # The PDFs oddly don't say whether a vote passed or failed.
    # Hopefully passage just requires yes_votes > not_yes_votes
    passed = yes_count > (no_count + other_count)

    vote = Vote(chamber=chamber,
                start_date=start_date.strftime('%Y-%m-%d'),
                motion_text=motion,
                result='pass' if passed else 'fail',
                classification=vote_class,
                bill=bill)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    for bucket, members in tally.items():
        for member in members:
            vote.vote(bucket, member)

    vote.add_source(url)

    yield vote
def scrape_vote(self, bill, name, url):
    """Scrape a CT roll-call page (requests variant) and yield a Vote.

    Same page layout as the other CT scrape_vote: House ("VOTE/h") and
    Senate pages use different column groups, selected via the offsets
    below.

    :param bill: Bill the vote belongs to
    :param name: motion text for the vote
    :param url: roll-call page URL (also recorded as source)
    """
    if "VOTE/h" in url:
        vote_chamber = 'lower'
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = 'upper'
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    page = requests.get(url, verify=False).text

    if 'BUDGET ADDRESS' in page:
        # Not a roll call — nothing to record.
        return

    page = lxml.html.fromstring(page)

    # Pull the headline tallies out of their labeled <span>s.
    yes_count = page.xpath(
        "string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1))

    no_count = page.xpath(
        "string(//span[contains(., 'Those voting Nay')])")
    no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1))

    other_count = page.xpath(
        "string(//span[contains(., 'Those absent')])")
    other_count = int(
        re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1))

    need_count = page.xpath(
        "string(//span[contains(., 'Necessary for')])")
    need_count = int(
        re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1))

    # Date appears as "Taken on M/D"; the year comes from the session.
    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
    date = date.replace(' ', '')
    date = datetime.datetime.strptime(date + " " +
                                      bill.legislative_session,
                                      "%m/%d %Y").date()

    # not sure about classification.
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                result='pass' if yes_count > need_count else 'fail',
                classification='passage',
                bill=bill
                )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            name = row.xpath("string(td[%d])" % (
                i + name_offset)).strip()
            if not name or name == 'VACANT':
                continue

            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(name)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(name)
            else:
                # Neither Y nor N marked: absent/not voting.
                vote.vote('other', name)

    yield vote
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape a PR bill page: sponsors, actions, and count-only votes.

    Roll lists on the site are mostly embedded images, so only the
    aggregate yes-no-other counts embedded in action text are captured.

    :param chamber: originating chamber of the bill
    :param session: legislative session identifier
    :param bill_id: bill identifier used in the page URL
    :param bill_type: classification for the Bill
    :yields: each Vote found in the action table, then the Bill
    :raises NoSuchBill: when the page has no title
    :raises AssertionError: on unrecognized vote text
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.get(url).text
    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return
    doc = lxml.html.fromstring(html)
    # search for Titulo, accent over i messes up lxml, so use 'tulo'
    title = doc.xpath(
        u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title[0], classification=bill_type)
    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsorship(aname, classification='primary',
                                 entity_type='person', primary=True)
    co_authors = doc.xpath(
        u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        for co_author in co_authors[1].split(','):
            bill.add_sponsorship(self.clean_name(co_author).strip(),
                                 classification='cosponsor',
                                 entity_type='person', primary=False)
    action_table = doc.xpath('//table')[-1]
    bill_vote_chamber = None
    for row in action_table[1:]:
        tds = row.xpath('td')
        # ignore row missing date
        if len(tds) != 2:
            continue
        if tds[0].text_content():
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()
        # parse the text to see if it's a new version or a unrelated document
        # if has a hyphen let's assume it's a vote document

        # get url of action
        action_url = tds[1].xpath('a/@href')
        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)

        # Some lower-house roll calls could be parsed, but finnicky
        # Most roll lists are just images embedded within a document,
        # and offer no alt text to scrape
        # Instead, just scrape the vote counts
        regex = r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$'
        vote_info = re.search(regex, action)
        if vote_info and re.search(r'\d{1,2}', action):
            vote_name = vote_info.group(1)

            # Determine the voting chamber from the wording of the
            # action text; unknown wording raises so new formats are
            # noticed.
            if u"Votación Final" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^\w+ por (.*?) en (.*)$', vote_name).groups()
                if "Senado" in vote_chamber:
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'
            elif "Cuerpo de Origen" in vote_name:
                vote_name = re.search(r'(?u)^Cuerpo de Origen (.*)$',
                                      vote_name).group(1)
                vote_chamber = chamber
            elif u"informe de Comisión de Conferencia" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                    vote_name).groups()
                if vote_chamber == "Senado":
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'
            # TODO replace bill['votes']
            elif u"Se reconsideró" in vote_name:
                # Reconsideration: reuse the chamber of the previous
                # vote on this bill when one was seen.
                if bill_vote_chamber:
                    vote_chamber = bill_vote_chamber
                else:
                    vote_chamber = chamber
            else:
                raise AssertionError(
                    u"Unknown vote text found: {}".format(vote_name))

            vote_name = vote_name.title()

            yes = int(vote_info.group(2))
            no = int(vote_info.group(3))
            # Groups 4 and 5 are additional non-yes/no tallies; either
            # may be blank.
            other = 0
            if vote_info.group(4).strip():
                other += int(vote_info.group(4))
            if vote_info.group(5).strip():
                other += int(vote_info.group(5))

            vote = Vote(
                chamber=vote_chamber,
                start_date=date.strftime('%Y-%m-%d'),
                motion_text=vote_name,
                result='pass' if (yes > no) else 'fail',
                bill=bill,
                classification='passage',
            )
            vote.set_count('yes', yes)
            vote.set_count('no', no)
            vote.set_count('other', other)
            vote.add_source(url)
            yield vote
            bill_vote_chamber = chamber

    bill.add_source(url)
    yield bill