def scrape(self, session=None, chambers=None):
    """Scrape Ohio bills and resolutions for a legislative session.

    Yields Bill objects (and, via ``process_vote``, VoteEvent objects).
    Sessions before the 128th have no data; sessions 128-130 use the old
    site format (delegated to ``old_scrape``); 131+ use the undocumented
    "solarapi" JSON API.

    :param session: session identifier (string of an int); defaults to
        ``self.latest_session()`` when falsy.
    :param chambers: accepted for interface compatibility; unused here.
    :raises AssertionError: for sessions < 128, or if a veto/disapprove
        record is ever encountered (unhandled — see comments below).
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))

    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)

    else:
        # Map the API's chamber spellings onto OCD chamber names.
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True,
                        "passed": True,
                        "adopted": True,
                        "true": True,
                        "false": False,
                        "failed": False,
                        True: True,
                        False: False}
        # API action codes -> OCD action classifications.  A value of
        # None means "known code, no classification"; a list applies
        # multiple classifications.
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None
                       }

        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
        legislators = self.get_legislator_ids(first_page)
        # Bulk-fetch per-bill documents up front; attached to each bill
        # below via add_document.  NOTE: "synopsiss"/"analysiss" are the
        # API's actual (misspelled) endpoint names — do not "fix" them.
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            number_link, ga, title, primary_sponsor, status = row.xpath('td')

            bill_id = number_link.text_content()
            title = title.text_content().strip()
            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from API
            bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                            'general_assembly_{}/{}/{}/'.format(
                                session,
                                'bills' if 'B' in bill_id else 'resolutions',
                                bill_id.lower().replace(' ', '')
                            ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url+version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='primary',
                    entity_type='person',
                    primary=True
                )

            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='cosponsor',
                    entity_type='person',
                    primary=False,
                )

            # Actions: the action endpoint 404s for some bills; treat
            # that as "no actions" rather than failing the whole bill.
            try:
                action_doc = self.get(base_url+bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning("Unknown action {desc} with code {code}."
                                     " Add it to the action_dict"
                                     ".".format(desc=action_desc,
                                                code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(datetime.datetime.strptime(
                        action["datetime"],
                        "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc,
                                    date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment",
                              bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal",
                              bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis",
                              bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis",
                              bill, base_url)

            # votes
            vote_url = base_url+bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            # committee votes (separate endpoint; may be missing)
            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not "
                             "loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url+bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError("Whoa, a disapprove! We've never"
                                         " gotten one before."
                                         " Go write some code to deal "
                                         "with it: {}".format(disapprove_url))

            yield bill
def scrape_bill_list(self, url):
    """Scrape one page of Alabama (ALISON) bills.

    Yields a Bill for each row of the bill list at *url*, plus VoteEvent
    objects (via ``scrape_vote``) for roll-call votes found on each
    bill's status page.

    :param url: bill-list page URL (also recorded as a bill source).
    :raises AssertionError: on an unrecognized bill-type abbreviation.
    :raises NotImplementedError: on an unrecognized version label.
    """
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        # 'JR' must be tested before 'R': a joint resolution contains both.
        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning(
                "Bill {} has no webpage, and will be skipped".format(
                    bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # BUGFIX: `title` must be reset for every bill.  Previously it was
        # only assigned inside the xpath check, so a bill without the
        # short-title span raised NameError on the first iteration or
        # silently inherited the previous bill's title on later ones.
        title = ''
        if (bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
        if not title:
            title = "[No title given by state]"
        bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )

            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text)

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0].encode(
                    'ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath('td[2]/font/text()')

            if action.xpath('td[3]/font/u/text()'):
                (amendment, ) = action.xpath('td[3]/font/u/text()')
            else:
                amendment = None

            (action_text, ) = action.xpath('td[4]/font/text()')
            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''

            act = bill.add_action(
                action_text,
                TIMEZONE.localize(action_date),
                chamber=actor,
                classification=action_type,
            )
            if action_committee:
                act.add_related_entity(action_committee,
                                       entity_type='organization')

            try:
                vote_button = action.xpath('td[9]//text()')[0].strip()
            except IndexError:
                vote_button = ''
            if vote_button.startswith("Roll "):
                vote_id = vote_button.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=action_chamber,
                    bill_id=bill_id,
                    vote_id=vote_id,
                    vote_date=TIMEZONE.localize(action_date),
                    action_text=action_text)

            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                        self.session, amendment))

                amend_name = 'Amd/Sub {}'.format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

        yield bill
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supercedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters

    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape

    Yields Bill objects; for matters with recorded votes, also yields
    VoteEvent objects before the owning Bill.
    '''
    if matter_ids:
        matters = [
            self.matter(matter_id) for matter_id in matter_ids.split(',')
        ]
        matters = filter(
            None, matters)  # Skip matters that are not yet in Legistar
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))

    for matter in matters:
        # Skip this bill, until Metro cleans up duplicate in Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue

        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        # All three fields are required by the pupa schema; skip
        # incomplete records.
        if not all((date, title, identifier)):
            continue

        # Do not scrape private bills introduced before this timestamp.
        if self._is_restricted(matter) and (
                date < self.START_DATE_PRIVATE_SCRAPE):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # A leading 'S' marks a substitute file; keep the full string as
        # an alternate identifier and strip it for the primary one.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        # The Metro scraper scrapes private bills.
        # However, we do not want to capture significant data about private bills,
        # other than the value of the helper function `_is_restricted` and a last modified timestamp.
        # We yield private bills early, wipe data from previously imported once-public bills,
        # and include only data *required* by the pupa schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': self._is_restricted(matter)}

        # Add API source early.
        # Private bills should have this url for debugging.
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(legistar_api, note='api')

        if self._is_restricted(matter):
            # required fields
            bill.title = 'Restricted View'

            # wipe old data
            bill.extras['plain_text'] = ''
            bill.extras['rtf_text'] = ''
            bill.sponsorships = []
            bill.related_bills = []
            bill.versions = []
            bill.documents = []
            bill.actions = []

            yield bill
            continue

        legistar_web = matter['legistar_url']
        bill.add_source(legistar_web, note='web')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    # VoteValueName may be null for absent members.
                    try:
                        raw_option = vote['VoteValueName'].lower()
                    except AttributeError:
                        raw_option = None
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier)
                # and the 'MatterIntroDate' (i.e., to determine its
                # legislative session).
                # Sometimes, the related bill does not yet exist: in this
                # case, throw an error, and continue.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(
                    identifier=identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')
                # Currently, the relation type for bills can be one of a
                # few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files,
                # suggesting that they receive a relation of 'companion'.

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'].strip(),
                    media_type="application/pdf")

        bill.extras['local_classification'] = matter['MatterTypeName']

        matter_version_value = matter['MatterVersion']
        text = self.text(matter_id, matter_version_value)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # Strip NUL characters, which break downstream storage.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all California measures of one type from the state DB dump.

    Queries the CABill table for *session*/*type_abbr*, and yields a
    VoteEvent for each recorded vote followed by the Bill itself.

    :param chamber: 'upper' or 'lower'; the expected origin chamber.
    :param session: CA session-year string (e.g. '20172018').
    :param bill_type: OCD classification for these measures (e.g. 'bill').
    :param type_abbr: state measure-type abbreviation (e.g. 'AB').
    :param committee_abbr_regex: compiled regex matching committee
        abbreviations in action text.
    :raises KeyError: when a matched committee abbreviation has no name
        mapping.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        # Skip measures filed under the wrong chamber in the dump.
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]
        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # Sponsors come from the last version seen in the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUGFIX: materialize the filter into a list.  The
                # previous lazy `filter` object was exhausted by the
                # `len(list(...))` in the assert below, which left both
                # kwargs['committees'] and the zip below empty on
                # Python 3.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(committee,
                                          entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all California measures of one type from the state DB dump.

    Queries the CABill table for *session*/*type_abbr*, and yields a
    VoteEvent for each recorded vote followed by the Bill itself.

    :param chamber: 'upper' or 'lower'; the expected origin chamber.
    :param session: CA session-year string (e.g. '20172018').
    :param bill_type: OCD classification for these measures (e.g. 'bill').
    :param type_abbr: state measure-type abbreviation (e.g. 'AB').
    :param committee_abbr_regex: compiled regex matching committee
        abbreviations in action text.
    :raises KeyError: when a matched committee abbreviation has no name
        mapping.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        # Skip measures filed under the wrong chamber in the dump.
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]
        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # Sponsors come from the last version seen in the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # BUGFIX: materialize the filter into a list.  The
                # previous lazy `filter` object was exhausted by the
                # `len(list(...))` in the assert below, which left both
                # kwargs['committees'] and the zip below empty on
                # Python 3.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'),
                                       chamber=actor,
                                       classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
def scrape_bill_list(self, url):
    """Scrape one ALISON bill-list page, yielding a Bill per row.

    For each bill: builds the Bill with subject/sponsor from the list row,
    fetches the bill's detail page for title, versions, BIR actions and
    history actions, and yields VoteEvents (via scrape_vote) for any
    roll-call references found.  Bills whose detail page 404s are skipped
    with a warning.
    """
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        # Each of these cells is expected to hold exactly one value.
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning("Bill {} has no webpage, and will be skipped".
                         format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # BUGFIX: initialize title so that a bill whose page lacks the
        # short-title span doesn't raise NameError (first iteration) or
        # silently inherit the previous bill's title (later iterations).
        title = ''
        # "lblShotTitle" (sic) is the actual element id on the state site.
        if (bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
        if not title:
            title = "[No title given by state]"
        bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.
            format(self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass  # intentionally inert until the URL scheme is known

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )

            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text
                )

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0].
                    encode('ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath('td[2]/font/text()')

            if action.xpath('td[3]/font/u/text()'):
                (amendment, ) = action.xpath('td[3]/font/u/text()')
            else:
                amendment = None

            (action_text, ) = action.xpath('td[4]/font/text()')
            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''

            act = bill.add_action(
                action_text,
                TIMEZONE.localize(action_date),
                chamber=actor,
                classification=action_type,
            )
            if action_committee:
                act.add_related_entity(action_committee,
                                       entity_type='organization')

            try:
                vote_button = action.xpath('td[9]//text()')[0].strip()
            except IndexError:
                vote_button = ''

            if vote_button.startswith("Roll "):
                vote_id = vote_button.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=action_chamber,
                    bill_id=bill_id,
                    vote_id=vote_id,
                    vote_date=TIMEZONE.localize(action_date),
                    action_text=action_text
                )

            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.
                    format(self.session, amendment))

                amend_name = 'Amd/Sub {}'.format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

        yield bill
def scrape(self, session=None, chambers=None):
    """Scrape Ohio bills/resolutions for ``session``, yielding Bill and
    (via process_vote) VoteEvent objects.

    Sessions before the 128th GA have no data (hard error); sessions
    before the 131st are delegated to the legacy scraper; later sessions
    use the undocumented Solar API.
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))

    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)

    else:
        # Map API chamber labels (inconsistent casing) to OCD chambers.
        chamber_dict = {"Senate": "upper", "House": "lower",
                        "House of Representatives": "lower",
                        "house": "lower", "senate": "upper"}
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {"approved": True,
                        "passed": True,
                        "adopted": True,
                        "true": True,
                        "false": False,
                        "failed": False,
                        True: True,
                        False: False}
        # Map API action codes to OCD action classifications; None means
        # "known code, no classification".  Unknown codes are warned about
        # below so they can be added here.
        action_dict = {"ref_ctte_100": "referral-committee",
                       "intro_100": "introduction",
                       "intro_101": "introduction",
                       "pass_300": "passage",
                       "intro_110": "reading-1",
                       "refer_210": "referral-committee",
                       "crpt_301": None,
                       "crpt_317": None,
                       "concur_606": "passage",
                       "pass_301": "passage",
                       "refer_220": "referral-committee",
                       "intro_102": ["introduction", "passage"],
                       "intro_105": ["introduction", "passage"],
                       "intro_ref_ctte_100": "referral-committee",
                       "refer_209": None,
                       "intro_108": ["introduction", "passage"],
                       "intro_103": ["introduction", "passage"],
                       "msg_reso_503": "passage",
                       "intro_107": ["introduction", "passage"],
                       "imm_consid_360": "passage",
                       "refer_213": None,
                       "adopt_reso_100": "passage",
                       "adopt_reso_110": "passage",
                       "msg_507": "amendment-passage",
                       "confer_713": None,
                       "concur_603": None,
                       "confer_712": None,
                       "msg_506": "amendment-failure",
                       "receive_message_100": "passage",
                       "motion_920": None,
                       "concur_611": None,
                       "confer_735": None,
                       "third_429": None,
                       "final_501": None,
                       "concur_608": None,
                       }
        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
        legislators = self.get_legislator_ids(first_page)
        # Document collections are fetched once up front and attached to
        # individual bills later via add_document.
        all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

        for row in self.get_bill_rows(session):
            # First/last cells are layout spacers on the listing page.
            spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td')

            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace('No.', '')
            bill_id = bill_id.replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            title = title.text_content().strip()
            title = re.sub(r'^Title', '', title)

            chamber = 'lower' if 'H' in bill_id else 'upper'
            classification = 'bill' if 'B' in bill_id else 'resolution'

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=title,
                        classification=classification)
            bill.add_source(number_link.xpath('a/@href')[0])

            # get bill from API
            bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                            'general_assembly_{}/{}/{}/'.format(
                                session,
                                'bills' if 'B' in bill_id else 'resolutions',
                                bill_id.lower().replace(' ', '')
                            ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data['items'][0]['longtitle']
            bill.add_title(data['items'][0]['longtitle'], 'long title')

            # this stuff is version-specific
            for version in data['items']:
                version_name = version["version"]
                version_link = base_url+version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type='application/pdf')

            # we'll use latest bill_version for everything else
            bill_version = data['items'][0]
            bill.add_source(bill_api_url)

            # subjects
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='primary',
                    entity_type='person',
                    primary=True
                )
            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification='cosponsor',
                    entity_type='person',
                    primary=False,
                )

            # Actions: a missing action endpoint is tolerated (bill simply
            # gets no actions); unknown codes warn and classify as None.
            try:
                action_doc = self.get(base_url+bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning("Unknown action {desc} with code {code}."
                                     " Add it to the action_dict"
                                     ".".format(desc=action_desc,
                                                code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(datetime.datetime.strptime(
                        action["datetime"],
                        "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc,
                                    date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

            # votes
            vote_url = base_url+bill_version["votes"][0]["link"]
            vote_doc = self.get(vote_url)
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            # committee votes: a failing page skips the rest of this bill
            # (note: `continue` also skips the effective-date/veto checks below)
            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning("Vote page not "
                             "loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(votes, vote_url,
                                         base_url, bill, legislators,
                                         chamber_dict, vote_results)

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(data["items"][0]["effective_date"], "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action list JSON.
                # It looks like:
                # Effective 7/6/18
                # NOTE(review): %-m/%-d are platform-dependent strftime
                # extensions (no zero padding) — not available on Windows.
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(effective_action, effective_date,
                                chamber="executive",
                                classification=["became-law"])

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url+bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError("Whoa, a disapprove! We've never"
                                         " gotten one before."
                                         " Go write some code to deal "
                                         "with it: {}".format(disapprove_url))

            yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape all CA bills of one measure type for a session from the
    CA SQL mirror, yielding Bill and VoteEvent objects.

    Pre-2010 ("archive") sessions get their votes from the public vote
    page instead of the database.  NOTE: the default for
    ``committee_abbr_regex`` is evaluated once at definition time.
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))

    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        # known-bad record in the source database
        if bill_id.strip() == "SB77" and session == "20052006":
            continue

        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        if (bill_id.startswith("S") and chamber == "lower") or (
                bill_id.startswith("A") and chamber == "upper"):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest test (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"

            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == "Yes":
                type_.append("appropriation")

            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # `version` is the last version left over from the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        self.warning(msg % args)

                # BUGFIX: materialize the filter.  A bare filter() iterator
                # was exhausted by the len(list(...)) assert below, leaving
                # nothing for the zip() substitution loop or for the
                # add_related_entity loop reading kwargs["committees"].
                committees = list(filter(None, committees))
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                if not_archive_year:
                    assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break

            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        )
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc)
                    # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                # Strip session/chamber/bill-number noise out of the motion.
                motion = motion.strip()
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ",
                                "", motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$",
                                "", motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}

                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        # Archived sessions: scrape votes from the public vote page instead.
        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += (
                f"bill_id={session}{bill.session_num}{fsbill.identifier}")

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(
                        vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill
    self.session.expire_all()
def scrape(self, session=None, chambers=None):
    """Scrape Ohio bills and resolutions for a legislative session.

    Yields pupa ``Bill`` objects (and, via ``self.process_vote``,
    ``VoteEvent`` objects) built from the LIS search site plus its
    undocumented solarapi JSON API.

    :param session: general-assembly number as a string (e.g. "133");
        defaults to ``self.latest_session()``.
    :param chambers: accepted for interface compatibility; unused here.
    :raises AssertionError: for sessions before the 128th (no data), or
        if a veto/disapprove record is ever encountered (unhandled case).
    """
    # Bills endpoint can sometimes take a very long time to load
    self.timeout = 300
    if not session:
        session = self.latest_session()
        self.info("no session, using %s", session)
    if int(session) < 128:
        raise AssertionError("No data for period {}".format(session))
    elif int(session) < 131:
        # they changed their data format starting in 131st and added
        # an undocumented API
        yield from self.old_scrape(session)
    else:
        # Map the API's chamber spellings to OCD chamber names.
        chamber_dict = {
            "Senate": "upper",
            "House": "lower",
            "House of Representatives": "lower",
            "house": "lower",
            "senate": "upper",
        }
        # so presumably not everything passes, but we haven't
        # seen anything not pass yet, so we'll need to wait
        # till it fails and get the right language in here
        vote_results = {
            "approved": True,
            "passed": True,
            "adopted": True,
            "true": True,
            "false": False,
            "failed": False,
            True: True,
            False: False,
        }
        # API action codes -> OCD action classification (None = no
        # classification; a list applies several classifications).
        action_dict = {
            "ref_ctte_100": "referral-committee",
            "intro_100": "introduction",
            "intro_101": "introduction",
            "pass_300": "passage",
            "intro_110": "reading-1",
            "refer_210": "referral-committee",
            "crpt_301": None,
            "crpt_317": None,
            "concur_606": "passage",
            "pass_301": "passage",
            "refer_220": "referral-committee",
            "intro_102": ["introduction", "passage"],
            "intro_105": ["introduction", "passage"],
            "intro_ref_ctte_100": "referral-committee",
            "refer_209": None,
            "intro_108": ["introduction", "passage"],
            "intro_103": ["introduction", "passage"],
            "msg_reso_503": "passage",
            "intro_107": ["introduction", "passage"],
            "imm_consid_360": "passage",
            "refer_213": None,
            "adopt_reso_100": "passage",
            "adopt_reso_110": "passage",
            "msg_507": "amendment-passage",
            "confer_713": None,
            "concur_603": None,
            "confer_712": None,
            "msg_506": "amendment-failure",
            "receive_message_100": "passage",
            "motion_920": None,
            "concur_611": None,
            "confer_735": None,
            "third_429": None,
            "final_501": None,
            "concur_608": None,
        }
        base_url = "http://search-prod.lis.state.oh.us"
        first_page = base_url
        first_page += "/solarapi/v1/general_assembly_{session}/".format(
            session=session)
        legislators = self.get_legislator_ids(first_page)
        # Pre-fetch session-wide document indexes once; they are matched
        # to individual bills below by bill_id via self.add_document.
        # ("synopsiss"/"analysiss" are the API's own endpoint spellings.)
        all_amendments = self.get_other_data_source(
            first_page, base_url, "amendments")
        all_fiscals = self.get_other_data_source(first_page, base_url,
                                                 "fiscals")
        all_synopsis = self.get_other_data_source(first_page, base_url,
                                                  "synopsiss")
        all_analysis = self.get_other_data_source(first_page, base_url,
                                                  "analysiss")

        for row in self.get_bill_rows(session):
            # NOTE(review): `spacer` is intentionally bound twice — the
            # first and last <td> are layout spacers and are discarded.
            (
                spacer,
                number_link,
                _ga,
                title,
                primary_sponsor,
                status,
                spacer,
            ) = row.xpath("td")
            # S.R.No.1 -> SR1
            bill_id = number_link.text_content().replace("No.", "")
            bill_id = bill_id.replace(".", "").replace(" ", "")
            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            title = title.text_content().strip()
            # Strip a literal leading "Title" label from the cell text.
            title = re.sub(r"^Title", "", title)

            # Classify from the identifier: H* = House, *B* = bill.
            chamber = "lower" if "H" in bill_id else "upper"
            classification = "bill" if "B" in bill_id else "resolution"

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=classification,
            )
            bill.add_source(number_link.xpath("a/@href")[0])

            # get bill from API
            bill_api_url = (
                "http://search-prod.lis.state.oh.us/solarapi/v1/"
                "general_assembly_{}/{}/{}/".format(
                    session,
                    "bills" if "B" in bill_id else "resolutions",
                    bill_id.lower().replace(" ", ""),
                ))
            data = self.get(bill_api_url).json()

            # add title if no short title
            if not bill.title:
                bill.title = data["items"][0]["longtitle"]
            bill.add_title(data["items"][0]["longtitle"], "long title")

            # this stuff is version-specific
            for version in data["items"]:
                version_name = version["version"]
                version_link = base_url + version["pdfDownloadLink"]
                bill.add_version_link(version_name, version_link,
                                      media_type="application/pdf")

            # we'll use latest bill_version for everything else
            bill_version = data["items"][0]
            bill.add_source(bill_api_url)

            # subjects: each entry may carry a primary and an optional
            # secondary subject; either key may be absent.
            for subj in bill_version["subjectindexes"]:
                try:
                    bill.add_subject(subj["primary"])
                except KeyError:
                    pass
                try:
                    secondary_subj = subj["secondary"]
                except KeyError:
                    secondary_subj = ""
                if secondary_subj:
                    bill.add_subject(secondary_subj)

            # sponsors
            sponsors = bill_version["sponsors"]
            for sponsor in sponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
            cosponsors = bill_version["cosponsors"]
            for sponsor in cosponsors:
                sponsor_name = self.get_sponsor_name(sponsor)
                bill.add_sponsorship(
                    sponsor_name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

            # actions: best-effort — a 404/500 on the action feed just
            # leaves the bill without actions.
            try:
                action_doc = self.get(base_url +
                                      bill_version["action"][0]["link"])
            except scrapelib.HTTPError:
                pass
            else:
                actions = action_doc.json()
                # API lists newest-first; reverse to chronological order.
                for action in reversed(actions["items"]):
                    actor = chamber_dict[action["chamber"]]
                    action_desc = action["description"]
                    try:
                        action_type = action_dict[action["actioncode"]]
                    except KeyError:
                        self.warning(
                            "Unknown action {desc} with code {code}."
                            " Add it to the action_dict"
                            ".".format(desc=action_desc,
                                       code=action["actioncode"]))
                        action_type = None

                    date = self._tz.localize(
                        datetime.datetime.strptime(action["datetime"],
                                                   "%Y-%m-%dT%H:%M:%S"))
                    date = "{:%Y-%m-%d}".format(date)

                    bill.add_action(action_desc, date, chamber=actor,
                                    classification=action_type)

            # attach documents gathered earlier
            self.add_document(all_amendments, bill_id, "amendment", bill,
                              base_url)
            self.add_document(all_fiscals, bill_id, "fiscal", bill,
                              base_url)
            self.add_document(all_synopsis, bill_id, "synopsis", bill,
                              base_url)
            self.add_document(all_analysis, bill_id, "analysis", bill,
                              base_url)

            # votes
            vote_url = base_url + bill_version["votes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                # NOTE(review): this `continue` skips the ENTIRE bill
                # (it is never yielded), not just its votes — confirm
                # that dropping the bill is the intended behavior.
                self.warning(
                    "Vote page not loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            # committee votes, same handling (and same skip-the-bill
            # behavior on HTTP failure as above)
            vote_url = base_url
            vote_url += bill_version["cmtevotes"][0]["link"]
            try:
                vote_doc = self.get(vote_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Vote page not loading; skipping: {}".format(vote_url))
                continue
            votes = vote_doc.json()
            yield from self.process_vote(
                votes,
                vote_url,
                base_url,
                bill,
                legislators,
                chamber_dict,
                vote_results,
            )

            if data["items"][0]["effective_date"]:
                effective_date = datetime.datetime.strptime(
                    data["items"][0]["effective_date"], "%Y-%m-%d")
                effective_date = self._tz.localize(effective_date)
                # the OH website adds an action that isn't in the action
                # list JSON. It looks like:
                # Effective 7/6/18
                # NOTE(review): %-m/%-d are glibc strftime extensions
                # (no zero-padding) — not portable to Windows.
                effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                effective_action = "Effective {}".format(effective_date_oh)
                bill.add_action(
                    effective_action,
                    effective_date,
                    chamber="executive",
                    classification=["became-law"],
                )

            # we have never seen a veto or a disapprove, but they seem important.
            # so we'll check and throw an error if we find one
            # life is fragile. so are our scrapers.
            if "veto" in bill_version:
                veto_url = base_url + bill_version["veto"][0]["link"]
                veto_json = self.get(veto_url).json()
                if len(veto_json["items"]) > 0:
                    raise AssertionError("Whoa, a veto! We've never"
                                         " gotten one before."
                                         " Go write some code to deal"
                                         " with it: {}".format(veto_url))

            if "disapprove" in bill_version:
                disapprove_url = base_url + bill_version["disapprove"][0][
                    "link"]
                disapprove_json = self.get(disapprove_url).json()
                if len(disapprove_json["items"]) > 0:
                    raise AssertionError(
                        "Whoa, a disapprove! We've never"
                        " gotten one before."
                        " Go write some code to deal "
                        "with it: {}".format(disapprove_url))

            yield bill