def scrape_votes_old(self, bill, billname, session):
    """Scrape votes for *bill* from the archived Ohio legislature site.

    Yields one VoteEvent per journal-text link on the bill's page.
    """
    vote_url = ("http://archives.legislature.state.oh.us/bills.cfm?ID=" +
                session + "_" + billname)
    doc = lxml.html.fromstring(self.get(vote_url).text)

    def collect_names(div):
        # Non-empty cell text from a Yea/Nay <div>'s inner table.
        return [cell.xpath("string()")
                for cell in div.xpath("table/tr/td")
                if cell.xpath("string()")]

    for journal_link in doc.xpath("//a[contains(@href, 'JournalText')]"):
        vote_date = self._tz.localize(
            datetime.datetime.strptime(journal_link.text, "%m/%d/%Y")).date()
        vote_date = "{:%Y-%m-%d}".format(vote_date)

        details = journal_link.xpath("string(../../../td[2])")
        raw_chamber = details.split(" - ")[0]
        if raw_chamber == "House":
            chamber = "lower"
        elif raw_chamber == "Senate":
            chamber = "upper"
        else:
            raise ScrapeError("Bad chamber: %s" % raw_chamber)

        motion = details.split(" - ")[1].split("\n")[0].strip()

        # The roll-call row immediately follows the journal-link row.
        vote_row = journal_link.xpath("../../..")[0].getnext()
        yeas = collect_names(
            vote_row.xpath("td/font/div[contains(@id, 'Yea')]")[0])
        nays = collect_names(
            vote_row.xpath("td/font/div[contains(@id, 'Nay')]")[0])

        vote = VoteEvent(
            chamber=chamber,
            start_date=vote_date,
            motion_text=motion,
            result="pass" if len(yeas) > len(nays) else "fail",
            bill=bill,
            classification="passed",
        )
        for name in yeas:
            vote.yes(name)
        for name in nays:
            vote.no(name)
        vote.add_source(vote_url)
        yield vote
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one measure type from the mirrored DB.

    Yields a VoteEvent for each recorded roll call, then the Bill itself.

    Fix: `filter(None, committees)` returned a one-shot iterator in
    Python 3; `len(list(...))` in the sanity assert exhausted it, so the
    subsequent `zip(committees, matched_abbrs)` and the later iteration
    over `kwargs['committees']` saw an empty sequence. The committee list
    is now materialized as a real list.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        # NOTE(review): bill_session is computed but never used below —
        # kept for parity with the original; confirm before removing.
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url (the real bill page).
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest text (aka "summary") from the latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # Create a version name matching the state's format,
            # e.g. "02/06/17 - Enrolled".
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)
            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)
            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles.
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # Sponsorships come from the last processed version.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing.
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {
                    'Assembly': 'lower',
                    'Senate': 'upper'
                }[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {
                            'Assembly': 'lower',
                            'Senate': 'upper'
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # Materialize: a filter() iterator would be exhausted by
                # the assert below and leave zip()/later loops empty.
                committees = [c for c in committees if c]
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(committee,
                                          entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            # Normalize the motion text: strip session/chamber prefixes
            # and trailing bill-id noise.
            motion = motion.strip()
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes.
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy.
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
    self.session.expire_all()
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape a single SD bill page: versions, sponsors, actions, votes.

    Yields each roll-call VoteEvent found, then the Bill itself.
    """
    doc = self.lxmlize(url)

    # Classify the bill from its id prefix; first matching pattern wins.
    btype = ['bill']
    for pattern, classification in ((r'^(S|H)B ', 'bill'),
                                    (r'(S|H)C ', 'commemoration'),
                                    (r'(S|H)JR ', 'joint resolution'),
                                    (r'(S|H)CR ', 'concurrent resolution')):
        if re.match(pattern, bill_id):
            btype = [classification]
            break

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=btype)
    bill.add_source(url)

    regex_ns = "http://exslt.org/regular-expressions"
    for anchor in doc.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns}):
        bill.add_version_link(anchor.xpath('string()').strip(),
                              anchor.attrib['href'],
                              media_type='text/html',
                              on_duplicate='ignore')

    sponsor_anchors = doc.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
        '/label[contains(text(), "Sponsors:")]' +
        '/following-sibling::div[1]/p/a')
    for anchor in sponsor_anchors:
        href = anchor.attrib['href']
        if href.startswith('https://sdlegislature.gov/Legislators/'):
            sponsor_type = 'person'
        elif href.startswith(
                'https://sdlegislature.gov/Legislative_Session/Committees'):
            sponsor_type = 'organization'
        else:
            raise ScrapeError('Found unexpected sponsor, URL: ' + href)
        bill.add_sponsorship(anchor.text, classification='primary',
                             primary=True, entity_type=sponsor_type)

    actor = chamber
    past_header = False
    for row in doc.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        # Some tables have null rows that are just `<tr></tr>`.
        row_text = row.text_content()
        if row_text == '':
            self.debug('Skipping action table row that is completely empty')
            continue
        # Skip everything up to and including the header row.
        if 'Date' in row_text and 'Action' in row_text:
            past_header = True
            continue
        elif not past_header:
            continue

        action_text = row.xpath("string(td[2])").strip()
        classifications = []

        if action_text.startswith('First read'):
            classifications.extend(['introduction', 'reading-1'])
        if re.match(r'Signed by (?:the\s)*Governor', action_text,
                    re.IGNORECASE):
            classifications.append('executive-signature')
            actor = 'executive'

        pass_match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                              action_text)
        if pass_match:
            prefix = ('' if pass_match.group(1) in
                      ['Senate', 'House of Representatives']
                      else 'committee-')
            outcome = pass_match.group(3).lower()
            if outcome == 'passed':
                suffix = 'passage'
            elif outcome == 'failed':
                suffix = 'failure'
            classifications.append("%s%s" % (prefix, suffix))

        if 'referred to' in action_text.lower():
            classifications.append('referral-committee')
        if 'Motion to amend, Passed Amendment' in action_text:
            classifications.extend(['amendment-introduction',
                                    'amendment-passage'])
        if 'Veto override, Passed' in action_text:
            classifications.append('veto-override-passage')
        elif 'Veto override, Failed' in action_text:
            classifications.append('veto-override-failure')
        if 'Delivered to the Governor' in action_text:
            classifications.append('executive-receipt')

        read_match = re.match("First read in (Senate|House)", action_text)
        if read_match:
            actor = 'upper' if read_match.group(1) == 'Senate' else 'lower'

        raw_date = row.xpath("string(td[1])").strip()
        if not re.match(r'\d{2}/\d{2}/\d{4}', raw_date):
            self.warning("Bad date: %s" % raw_date)
            continue
        action_date = datetime.datetime.strptime(raw_date,
                                                 "%m/%d/%Y").date()

        for anchor in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, action_date,
                                        anchor.attrib['href'])

        bill.add_action(action_text, action_date, chamber=actor,
                        classification=classifications)

    for anchor in doc.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(anchor.text.strip())

    yield bill
def scrape_vote(self, bill, date, url):
    """Scrape one SD roll-call page and yield a VoteEvent for *bill*."""
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(', ')
    location = header_parts[1]
    for prefix, chamber_code in (('House', 'lower'),
                                 ('Senate', 'upper'),
                                 ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = chamber_code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote.
        return

    yes_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))
    passed = yes_count > no_count

    if motion.startswith('Do Pass'):
        vote_type = 'passage'
    else:
        vote_type = {'Concurred in amendments': 'amendment',
                     'Veto override': 'veto_override'}.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=vote_type,
                     bill=bill)
    # The vote page URL has a unique ID, but some votes are "consent
    # calendar" events relating to the passage of _multiple_ bills.
    # These can't be modeled yet in Pupa, so append a bill ID to the URL
    # that forms the `pupa_id`.
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))
    vote.add_source(url)

    for option, total in (('yes', yes_count),
                          ('no', no_count),
                          ('excused', excused_count),
                          ('absent', absent_count)):
        vote.set_count(option, total)

    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in ('Aye', 'Yea'):
            vote.yes(cell.getprevious().text.strip())
        elif label == 'Nay':
            vote.no(cell.getprevious().text.strip())
        elif label == 'Excused':
            vote.vote('excused', cell.getprevious().text.strip())
        elif label == 'Absent':
            vote.vote('absent', cell.getprevious().text.strip())

    yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one SD roll-call page and yield a VoteEvent for *bill*.

    Variant that keys the `pupa_id` on the bare vote-page URL.
    """
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(', ')
    location = header_parts[1]
    for prefix, chamber_code in (('House', 'lower'),
                                 ('Senate', 'upper'),
                                 ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = chamber_code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote.
        return

    yes_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))
    passed = yes_count > no_count

    if motion.startswith('Do Pass'):
        vote_type = 'passage'
    else:
        vote_type = {'Concurred in amendments': 'amendment',
                     'Veto override': 'veto_override'}.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=vote_type,
                     bill=bill)
    vote.pupa_id = url  # vote id is in URL
    vote.add_source(url)

    for option, total in (('yes', yes_count),
                          ('no', no_count),
                          ('excused', excused_count),
                          ('absent', absent_count)):
        vote.set_count(option, total)

    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in ('Aye', 'Yea'):
            vote.yes(cell.getprevious().text.strip())
        elif label == 'Nay':
            vote.no(cell.getprevious().text.strip())
        elif label == 'Excused':
            vote.vote('excused', cell.getprevious().text.strip())
        elif label == 'Absent':
            vote.vote('absent', cell.getprevious().text.strip())

    yield vote
def scrape_bill(self, session, history_url):
    """Scrape one TX bill from its history XML and yield the Bill.

    Pulls title/id from the XML, attaches pre-indexed version, analysis,
    fiscal-note, and witness-list links, classifies each action, and adds
    author/coauthor/sponsor/cosponsor sponsorships.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if (bill_title is None or
            "Bill does not exist" in history_xml):
        self.warning("Bill does not appear to exist")
        return
    # The 'bill' attribute has a leading token before the id proper —
    # drop it (presumably the session prefix; verify against the feed).
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # Classify from the id: xB bill, xR resolution, xCR/xJR concurrent/
    # joint resolution (two-char check must come via slices below).
    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification=bill_type)
    bill.add_source(history_url)

    for subject in root.iterfind('subjects/subject'):
        bill.add_subject(subject.text.strip())

    # self.versions/analyses/fiscal_notes/witnesses are pre-built
    # (bill_id, url) indexes; url[-5] selects the version-letter slug.
    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(note=self.NAME_SLUGS[version[1][-5]],
                              url=version[1],
                              media_type='text/html')

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(note="Analysis ({})".format(
            self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type='text/html')

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(note="Fiscal Note ({})".format(
            self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type='text/html')

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(note="Witness List ({})".format(
            self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type='text/html')

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        # First char of actionNumber encodes the acting body.
        action_number = action.find('actionNumber').text
        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[action_number[0]]

        desc = action.findtext('description').strip()

        # NOTE: this chain is order-sensitive — e.g. the exact match
        # "Reported favorably w/o amendment(s)" must precede the
        # "Reported favorably" prefix check. Do not reorder.
        if desc == 'Scheduled for public hearing on . . .':
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == 'Amended':
            atype = 'amendment-passage'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment-introduction'
        elif desc == 'Amendment amended':
            atype = 'amendment-amendment'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment-withdrawal'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'passage'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'introduction'
            else:
                atype = 'filing'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'executive-receipt'
        elif desc.startswith('Signed by the Governor'):
            atype = 'executive-signature'
        elif desc.startswith('Effective on'):
            atype = 'became-law'
        elif desc == 'Vetoed by the Governor':
            atype = 'executive-veto'
        elif desc == 'Read first time':
            atype = ['introduction', 'reading-1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = ['passage']
            if not introduced:
                introduced = True
                atype.append('introduction')
        elif desc == "Passed as amended":
            atype = 'passage'
        elif (desc.startswith('Referred to') or
                desc.startswith("Recommended to be sent to ")):
            atype = 'referral-committee'
        elif desc == "Reported favorably w/o amendment(s)":
            atype = 'committee-passage'
        elif desc == "Filed":
            atype = 'filing'
        elif desc == 'Read 3rd time':
            atype = 'reading-3'
        elif desc == 'Read 2nd time':
            atype = 'reading-2'
        elif desc.startswith('Reported favorably'):
            atype = 'committee-passage-favorable'
        else:
            atype = None

        act = bill.add_action(action.findtext('description'), act_date,
                              chamber=actor, classification=atype)

        # For referrals, derive the committee name from the action text.
        if atype and 'referral-committee' in atype:
            repls = ['Referred to', "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type='organization')

    # Sponsorship lists are pipe-delimited; empty segments are skipped.
    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsorship(author, classification='primary',
                                 entity_type='person', primary=True)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsorship(coauthor, classification='cosponsor',
                                 entity_type='person', primary=False)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsorship(sponsor, classification='primary',
                                 entity_type='person', primary=True)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsorship(cosponsor, classification='cosponsor',
                                 entity_type='person', primary=False)

    if root.findtext('companions'):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id, title, url):
    """Scrape a single SD bill page: versions, sponsors, actions, votes.

    Yields each roll-call VoteEvent found, then the Bill itself.

    Fixes in the "Motion to amend" branch: the amendment anchor lookup
    previously did an unguarded `[0]` (IndexError when the row has no
    Amendment.aspx link), and `mimetype` was left unbound (NameError)
    when the amendment URL contained neither "htm" nor "pdf". Both cases
    now warn and skip the version link instead of crashing the scrape.
    """
    page = self.lxmlize(url)

    if re.match(r"^(S|H)B ", bill_id):
        btype = ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        btype = ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        btype = ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        btype = ["concurrent resolution"]
    else:
        btype = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=btype,
    )
    bill.add_source(url)

    regex_ns = "http://exslt.org/regular-expressions"
    version_links = page.xpath(
        r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
        namespaces={"re": regex_ns},
    )
    for link in version_links:
        bill.add_version_link(
            link.xpath("string()").strip(),
            link.attrib["href"],
            media_type="text/html",
            on_duplicate="ignore",
        )

    sponsor_links = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
        '/label[contains(text(), "Sponsors:")]' +
        "/following-sibling::div[1]/p/a")
    for link in sponsor_links:
        if link.attrib["href"].startswith(
                "https://sdlegislature.gov/Legislators/"):
            sponsor_type = "person"
        elif link.attrib["href"].startswith(
                "https://sdlegislature.gov/Legislative_Session/Committees"
        ):
            sponsor_type = "organization"
        else:
            raise ScrapeError("Found unexpected sponsor, URL: " +
                              link.attrib["href"])
        bill.add_sponsorship(
            link.text,
            classification="primary",
            primary=True,
            entity_type=sponsor_type,
        )

    actor = chamber
    use_row = False
    for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
        # Some tables have null rows, that are just `<tr></tr>`
        # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
        if row.text_content() == "":
            self.debug(
                "Skipping action table row that is completely empty")
            continue
        # Skip rows until past the Date/Action header row.
        if "Date" in row.text_content() and "Action" in row.text_content():
            use_row = True
            continue
        elif not use_row:
            continue

        action = row.xpath("string(td[2])").strip()

        atypes = []
        if action.startswith("First read"):
            atypes.append("introduction")
            atypes.append("reading-1")
        if re.match(r"Signed by (?:the\s)*Governor", action,
                    re.IGNORECASE):
            atypes.append("executive-signature")
            actor = "executive"

        match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)",
                         action)
        if match:
            if match.group(1) in ["Senate", "House of Representatives"]:
                first = ""
            else:
                first = "committee-"
            if match.group(3).lower() == "passed":
                second = "passage"
            elif match.group(3).lower() == "failed":
                second = "failure"
            atypes.append("%s%s" % (first, second))

        if "referred to" in action.lower():
            atypes.append("referral-committee")

        if "Motion to amend, Passed Amendment" in action:
            atypes.append("amendment-introduction")
            atypes.append("amendment-passage")
            # Guard the anchor lookup: crashed with IndexError when the
            # row carried no Amendment.aspx link.
            amd_links = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')
            if amd_links:
                amd = amd_links[0]
                version_name = amd.xpath("string(.)")
                version_url = amd.xpath("@href")[0]
                if "htm" in version_url:
                    mimetype = "text/html"
                elif "pdf" in version_url:
                    mimetype = "application/pdf"
                else:
                    # Previously a NameError; warn and skip instead.
                    mimetype = None
                    self.warning("Unknown amendment mimetype for %s" %
                                 version_url)
                if mimetype:
                    bill.add_version_link(
                        version_name,
                        version_url,
                        media_type=mimetype,
                        on_duplicate="ignore",
                    )
            else:
                self.warning("No amendment link found in action: %s" %
                             action)

        if "Veto override, Passed" in action:
            atypes.append("veto-override-passage")
        elif "Veto override, Failed" in action:
            atypes.append("veto-override-failure")

        if "Delivered to the Governor" in action:
            atypes.append("executive-receipt")

        match = re.match("First read in (Senate|House)", action)
        if match:
            if match.group(1) == "Senate":
                actor = "upper"
            else:
                actor = "lower"

        date = row.xpath("string(td[1])").strip()
        match = re.match(r"\d{2}/\d{2}/\d{4}", date)
        if not match:
            self.warning("Bad date: %s" % date)
            continue
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
            yield from self.scrape_vote(bill, date, link.attrib["href"])

        bill.add_action(action, date, chamber=actor,
                        classification=atypes)

    for link in page.xpath("//a[contains(@href, 'Keyword')]"):
        bill.add_subject(link.text.strip())

    yield bill
def scrape_vote(self, bill, date, url):
    """Scrape one SD roll-call page and yield a VoteEvent for *bill*."""
    doc = lxml.html.fromstring(self.get(url).text)

    header = doc.xpath("string(//h3[contains(@id, 'hdVote')])")
    if "No Bill Action" in header:
        self.warning("bad vote header -- skipping")
        return

    header_parts = header.split(", ")
    location = header_parts[1]
    for prefix, chamber_code in (("House", "lower"),
                                 ("Senate", "upper"),
                                 ("Joint", "legislature")):
        if location.startswith(prefix):
            chamber = chamber_code
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ", ".join(header_parts[2:]).strip()
    if not motion:
        # If we can't detect a motion, skip this vote.
        return

    yes_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAyes')])"))
    no_count = int(
        doc.xpath("string(//span[contains(@id, 'tdNays')])"))
    excused_count = int(
        doc.xpath("string(//span[contains(@id, 'tdExcused')])"))
    absent_count = int(
        doc.xpath("string(//span[contains(@id, 'tdAbsent')])"))
    passed = yes_count > no_count

    if motion.startswith("Do Pass"):
        vote_type = "passage"
    else:
        vote_type = {"Concurred in amendments": "amendment",
                     "Veto override": "veto_override"}.get(motion, "other")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        classification=vote_type,
        bill=bill,
    )
    # The vote page URL has a unique ID, but some votes are "consent
    # calendar" events relating to the passage of _multiple_ bills.
    # These can't be modeled yet in Pupa, so append a bill ID to the URL
    # that forms the `pupa_id`.
    # https://github.com/opencivicdata/pupa/issues/308
    vote.pupa_id = "{}#{}".format(url, bill.identifier.replace(" ", ""))
    vote.add_source(url)

    for option, total in (("yes", yes_count),
                          ("no", no_count),
                          ("excused", excused_count),
                          ("absent", absent_count)):
        vote.set_count(option, total)

    for cell in doc.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
        label = cell.text.strip()
        if label in ("Aye", "Yea"):
            vote.yes(cell.getprevious().text.strip())
        elif label == "Nay":
            vote.no(cell.getprevious().text.strip())
        elif label == "Excused":
            vote.vote("excused", cell.getprevious().text.strip())
        elif label == "Absent":
            vote.vote("absent", cell.getprevious().text.strip())

    yield vote
def scrape_bill(self, session, history_url):
    """Scrape one TX bill from its history XML and yield the Bill.

    Pulls title/id from the XML, attaches pre-indexed version, analysis,
    fiscal-note, and witness-list links, classifies each action, and adds
    author/coauthor/sponsor/cosponsor sponsorships.
    """
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if bill_title is None or "Bill does not exist" in history_xml:
        self.warning("Bill does not appear to exist")
        return
    # The 'bill' attribute has a leading token before the id proper —
    # drop it (presumably the session prefix; verify against the feed).
    bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    # Classify from the id: xB bill, xR resolution, xCR/xJR concurrent/
    # joint resolution (two-char check done via slices below).
    if bill_id[1] == "B":
        bill_type = ["bill"]
    elif bill_id[1] == "R":
        bill_type = ["resolution"]
    elif bill_id[1:3] == "CR":
        bill_type = ["concurrent resolution"]
    elif bill_id[1:3] == "JR":
        bill_type = ["joint resolution"]
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill.add_source(history_url)

    for subject in root.iterfind("subjects/subject"):
        bill.add_subject(subject.text.strip())

    # self.versions/analyses/fiscal_notes/witnesses are pre-built
    # (bill_id, url) indexes; url[-5] selects the version-letter slug.
    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type="text/html",
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type="text/html",
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(
                self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type="text/html",
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(
                self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type="text/html",
        )

    for action in root.findall("actions/action"):
        act_date = datetime.datetime.strptime(action.findtext("date"),
                                              "%m/%d/%Y").date()

        # First char of actionNumber encodes the acting body.
        action_number = action.find("actionNumber").text
        actor = {
            "H": "lower",
            "S": "upper",
            "E": "executive"
        }[action_number[0]]

        desc = action.findtext("description").strip()

        # NOTE: this chain is order-sensitive — e.g. the exact match
        # "Reported favorably w/o amendment(s)" must precede the
        # "Reported favorably" prefix check. Do not reorder.
        if desc == "Scheduled for public hearing on . . .":
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == "Amended":
            atype = "amendment-passage"
        elif desc == "Amendment(s) offered":
            atype = "amendment-introduction"
        elif desc == "Amendment amended":
            atype = "amendment-amendment"
        elif desc == "Amendment withdrawn":
            atype = "amendment-withdrawal"
        elif desc == "Passed" or desc == "Adopted":
            atype = "passage"
        elif re.match(r"^Received (by|from) the", desc):
            if "Secretary of the Senate" not in desc:
                atype = "introduction"
            else:
                atype = "filing"
        elif desc.startswith("Sent to the Governor"):
            # But what if it gets lost in the mail?
            atype = "executive-receipt"
        elif desc.startswith("Signed by the Governor"):
            atype = "executive-signature"
        elif desc.startswith("Effective on"):
            atype = "became-law"
        elif desc == "Vetoed by the Governor":
            atype = "executive-veto"
        elif desc == "Read first time":
            atype = ["introduction", "reading-1"]
            introduced = True
        elif desc == "Read & adopted":
            atype = ["passage"]
            if not introduced:
                introduced = True
                atype.append("introduction")
        elif desc == "Passed as amended":
            atype = "passage"
        elif desc.startswith("Referred to") or desc.startswith(
                "Recommended to be sent to "):
            atype = "referral-committee"
        elif desc == "Reported favorably w/o amendment(s)":
            atype = "committee-passage"
        elif desc == "Filed":
            atype = "filing"
        elif desc == "Read 3rd time":
            atype = "reading-3"
        elif desc == "Read 2nd time":
            atype = "reading-2"
        elif desc.startswith("Reported favorably"):
            atype = "committee-passage-favorable"
        else:
            atype = None

        act = bill.add_action(
            action.findtext("description"),
            act_date,
            chamber=actor,
            classification=atype,
        )

        # For referrals, derive the committee name from the action text.
        if atype and "referral-committee" in atype:
            repls = ["Referred to", "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type="organization")

    # Sponsorship lists are pipe-delimited; empty segments are skipped.
    for author in root.findtext("authors").split(" | "):
        if author != "":
            bill.add_sponsorship(author,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
    for coauthor in root.findtext("coauthors").split(" | "):
        if coauthor != "":
            bill.add_sponsorship(
                coauthor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
    for sponsor in root.findtext("sponsors").split(" | "):
        if sponsor != "":
            bill.add_sponsorship(
                sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )
    for cosponsor in root.findtext("cosponsors").split(" | "):
        if cosponsor != "":
            bill.add_sponsorship(
                cosponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )

    if root.findtext("companions"):
        self._get_companion(bill)

    yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape all CA bills of one measure type for a session.

    Queries the local mirror of CA's legislative database (``CABill`` rows)
    for the given session/measure type, then yields pupa ``VoteEvent``
    objects followed by the ``Bill`` itself for each matching measure.

    :param chamber: 'upper' or 'lower'; used as a sanity check against the
        bill id prefix and as the fallback actor for actions.
    :param session: session year string used in the DB query.
    :param bill_type: classification to apply to the bill (e.g. 'bill').
    :param type_abbr: measure-type abbreviation in the DB (e.g. 'AB', 'SB').
    :param committee_abbr_regex: compiled regex extracting committee
        abbreviations from action text. NOTE(review): evaluated once at
        import time as a default argument — assumed intentional (the regex
        is immutable), but worth confirming.
    :raises KeyError: when a matched committee abbreviation has no name
        mapping.
    :raises ScrapeError: when a vote location cannot be mapped to a chamber.
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title="", chamber=chamber)

        # Senate bills must be scraped as 'upper', Assembly as 'lower';
        # a mismatch indicates bad upstream data, so skip the bill.
        if (bill_id.startswith("S") and chamber == "lower") or (
                bill_id.startswith("A") and chamber == "upper"):
            # Use the scraper's logger rather than bare print() so the
            # diagnostic ends up with the rest of the scrape output.
            self.warning("!!!! BAD ID/CHAMBER PAIR !!!! %s" % bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        # Initialize per-bill accumulators here so they can never leak a
        # value from a previous loop iteration (or be unbound) when a bill
        # has no usable versions.
        summary = ""
        tags = []
        impact_clause = None

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                # Ensure a space after any ')' glued to the next word.
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == "Yes":
                type_.append("appropriation")

            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # NOTE: 'version' is the last version seen in the loop above;
        # sponsors are taken from that (most recent) version.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue

            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:
                # re.sub only calls the replacer with a real Match object,
                # so no falsy-match branch is needed.
                def replacer(matchobj):
                    return {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[matchobj.group()]

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUG FIX: filter() returns a one-shot iterator in Python 3.
                # The old code exhausted it in the assert's list() call, so
                # the zip() below and the add_related_entity loop further
                # down iterated nothing. Materialize it once as a list.
                committees = list(filter(None, committees))
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                if not act_str.endswith("."):
                    act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            # Re-categorize after the committee-name substitutions above,
            # since they may change which patterns match.
            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == "(PASS)":
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(" ")[0].lower()
            if first_part in ["asm", "assembly"]:
                vote_chamber = "lower"
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith("sen"):
                vote_chamber = "upper"
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ""
            else:
                motion = ""

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = "passage"
            elif "Do Pass" in motion:
                vtype = "passage"
            else:
                vtype = "other"

            # Strip session/chamber/measure-number noise off the motion text.
            motion = motion.strip()
            motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                re.IGNORECASE).sub("", motion)
            motion = re.compile(r"^(Senate|Assembly) ",
                                re.IGNORECASE).sub("", motion)
            motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "",
                            motion)
            motion = re.sub(r" \(\w+\)$", "", motion)
            motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "", motion)
            motion = re.sub(
                r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                r"Urgency Clause$",
                "(Urgency Clause)",
                motion,
            )
            motion = re.sub(r"\s+", " ", motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result="pass" if result else "fail",
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {"threshold": vote.threshold}

            source_url = ("http://leginfo.legislature.ca.gov/faces"
                          "/billVotesClient.xhtml?bill_id={}").format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + "#" + str(vote_num)

            rc = {"yes": [], "no": [], "other": []}
            for record in vote.votes:
                if record.vote_code == "AYE":
                    rc["yes"].append(record.legislator_name)
                elif record.vote_code.startswith("NO"):
                    rc["no"].append(record.legislator_name)
                else:
                    rc["other"].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()