def scrape_bill(self, row, chamber, session):
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)

    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)

    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')

    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')

    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way to get additional sponsors and cosponsors,
    # and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')

    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)

    yield bill
def handle_list_item(self, item):
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib['href'] + '/ByCategory'

    if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
        bill_type = 'bill'
    elif bill_id.startswith(('HR ', 'SR ')):
        bill_type = 'resolution'
    elif bill_id.startswith(('HJR ', 'SJR ')):
        bill_type = 'joint resolution'
    elif bill_id.startswith(('SCR ', 'HCR ')):
        bill_type = 'concurrent resolution'
    elif bill_id.startswith(('SM ', 'HM ')):
        bill_type = 'memorial'
    else:
        raise ValueError('Failed to identify bill type.')

    bill = Bill(bill_id, self.kwargs['session'], title,
                chamber='lower' if bill_id[0] == 'H' else 'upper',
                classification=bill_type)
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
    bill.subject = list(self.kwargs['subjects'][subj_bill_id])

    sponsor = re.sub(r'^(?:Rep|Sen)\.\s', '', sponsor)
    for sp in sponsor.split(', '):
        bill.add_sponsorship(sp, 'primary', 'person', True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id, session_id):
    bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \
                    'legislativeBody={}'.format(bill_id, session_id,
                                                self.chamber_map[chamber])
    response = self.get(bill_json_url)
    page = json.loads(response.content.decode('utf-8'))

    if not page:
        self.warning('null page for %s', bill_id)
        return

    bill_title = page['ShortTitle']
    bill_id = page['Number']
    internal_id = page['BillId']
    bill_type = self.get_bill_type(bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    self.scrape_actions(bill, page, chamber)
    self.scrape_versions_and_documents(bill, internal_id)
    self.scrape_sponsors(bill, internal_id)
    self.scrape_subjects(bill, internal_id)
    yield from self.scrape_votes(bill, page)

    bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
        internal_id, session_id)
    bill.add_source(bill_url)

    self.sort_bill_actions(bill)

    yield bill
def get_bill(self, bill_id, **kwargs):
    url = kwargs.pop('url')
    agenda_item = kwargs.pop('agenda_item')
    _type = self.get_type(bill_id)
    bill = Bill(bill_id, self.session, type=_type, **kwargs)
    bill.add_source(url, note='detail')
    return bill
def scrape(self):
    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            if legislation_summary['Type'].lower() in ('order',
                                                       'claim',
                                                       'communication',
                                                       'report',
                                                       'oath of office'):
                continue
            else:
                bill_type = legislation_summary['Type'].lower()

            bill_session = self.session(legislation_summary['Intro\xa0Date'])

            bill = Bill(identifier=legislation_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization=self.jurisdiction.name)
            bill.add_source(legislation_summary['url'])

            bill, votes = self.addDetails(bill, legislation_summary['url'])

            yield bill
            for vote in votes:
                yield vote
def scrape_bill(self, chamber, session, bill_id, session_id): """ Scrapes documents, actions, vote counts and votes for a given bill. """ bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \ 'legislativeBody={}'.format(bill_id, session_id, self.chamber_map[chamber]) response = self.get(bill_json_url) # print(response.content) page = json.loads(response.content.decode('utf-8')) bill_title = page['ShortTitle'] bill_id = page['Number'] internal_id = page['BillId'] bill_type = self.get_bill_type(bill_id) bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=bill_title, classification=bill_type, ) bill = self.scrape_actions(bill, page, chamber) bill = self.scrape_versions(bill, internal_id) bill = self.scrape_sponsors(bill, internal_id) bill = self.scrape_subjects(bill, internal_id) bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format( internal_id, session_id) bill.add_source(bill_url) bill = self.sort_bill_actions(bill) yield bill
def scrape_bill(self, session, bill_id, chamber):
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException as e:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text
    page = lxml.html.fromstring(html)

    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')

    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary', primary=True,
                             entity_type='person')

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
def scrape_bills(self, session):
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get('measures', page=500, session=session_key)

    legislators = index_legislators(self, session_key)

    for measure in measures_response:
        bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(' ', ''),
            legislative_session=session,
            chamber=chamber,
            title=measure['RelatingTo'],
            classification=self.bill_types[measure['MeasurePrefix'][1:]]
        )
        bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

        for sponsor in measure['MeasureSponsors']:
            legislator_code = sponsor['LegislatoreCode']  # typo in API
            if legislator_code:
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    logger.warn('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    legislator = legislator_code

                bill.add_sponsorship(
                    name=legislator,
                    classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                        sponsor['SponsorLevel']],
                    entity_type='person',
                    primary=True if sponsor['SponsorLevel'] == 'Chief' else False
                )

        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                session=session_key, bid=bid.replace(' ', ''))
        )

        for document in measure['MeasureDocuments']:
            # TODO: probably mixing documents & versions here - should revisit
            try:
                bill.add_version_link(document['VersionDescription'],
                                      document['DocumentUrl'],
                                      media_type='application/pdf')
            except ValueError:
                logger.warn('Duplicate link found for {}'.format(document['DocumentUrl']))

        for action in measure['MeasureHistoryActions']:
            classifiers = self.determine_action_classifiers(action['ActionText'])
            when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
            when = self.tz.localize(when)
            bill.add_action(action['ActionText'], when,
                            chamber=self.chamber_code[action['Chamber']],
                            classification=classifiers)

        yield bill
def get_bill(self, bill_id, **kwargs):
    if bill_id == '1':
        assert kwargs == {'extra': 'param'}
        raise self.ContinueScraping
    else:
        assert bill_id == '2'
        assert kwargs == {}
        b = Bill('1', self.session, 'title')
        b.add_source('http://example.com')
        return b
def toy_bill():
    b = Bill(
        identifier="HB 2017",
        legislative_session="2012A",
        title="A bill for an act to raise the cookie budget by 200%",
        from_organization="Foo Senate",
        classification="bill",
    )
    b.add_source("http://uri.example.com/", note="foo")
    return b
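A minimal usage sketch for the factory above, assuming it is registered as a pytest fixture (as in pupa's own test suite); the assertions rely only on the Bill API already used in these examples, and the list/dict normalization noted in the comments is pupa's usual behavior, not something shown in the original:

def test_toy_bill_basics(toy_bill):
    # Constructor fields round-trip directly.
    assert toy_bill.identifier == "HB 2017"
    assert toy_bill.legislative_session == "2012A"
    # pupa normalizes a string classification into a one-element list.
    assert toy_bill.classification == ["bill"]
    # add_source() appends a dict carrying the url and any extra kwargs.
    assert toy_bill.sources[0]["url"] == "http://uri.example.com/"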
def scrape_bill(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
    # TODO: grab summary (none present at time of writing)

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id, legislative_session=session, chamber=chamber, title=title,
        classification=_type)
    bill.add_source(url)

    # process sponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'

    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == 'primary',
            entity_type='person',
        )
        sponsor_type = 'cosponsor'

    # subjects
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill.subject = subject_list

    # documents
    yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

    yield bill
def get_bill(self, bill_id, **kwargs):
    url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id
    self.urls.add(detail=url)
    bill_id = kwargs.pop('number')
    bill = Bill(bill_id, self.session, kwargs['title'], 'butt', type=['bills'])
    bill.add_source(url, note='detail')

    xpath = '//table[contains(@class, "history")]/tr'
    for tr in self.urls.detail.xpath(xpath):
        # TODO: parse the history rows (unimplemented; the loop body
        # held only a debugger breakpoint)
        pass

    return bill
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session,
                chamber=chamber, title=title,
                classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (
                       bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
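The namespaced xpath() helper used in the example above lives elsewhere in the scraper's module. A plausible sketch, assuming the 'wa' prefix maps to Washington's legislative web-service namespace (treat the URI as an assumption rather than a confirmed constant):

# Sketch of the helper: binds the 'wa' prefix so queries like
# "//wa:Legislation" resolve against the web-service XML.
WA_NAMESPACES = {'wa': 'http://WSLWebServices.leg.wa.gov/'}

def xpath(elem, path):
    return elem.xpath(path, namespaces=WA_NAMESPACES)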
def scrape_bill(self, session, session_slug, chamber, url):
    page = lxml.html.fromstring(self.get(url).text)
    bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)

    # bill data gets filled in from another call
    bill_data_base = 'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/' \
                     'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
    bill_data_url = bill_data_base.format(
        session_slug, internal_id, time.time() * 1000)

    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

    short_title = self.get_header_field(bill_page, 'Summary:').text
    short_title = short_title.replace(u'\u00a0', ' ')

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber
    )

    long_title = self.get_header_field(bill_page, 'Title:').text
    if long_title is not None:
        bill.add_abstract(long_title, 'Summary')

    sponsor_div = self.get_header_field(bill_page, 'Primary Sponsor')
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, 'primary')

    cosponsor_div = self.get_header_field(bill_page, 'Co-Sponsor')
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, 'cosponsor')

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras['BDR'] = bdr

    bill.extras['NV_ID'] = internal_id

    bill.add_source(url)
    yield bill
def scrape_bill(self, bill_page_url):
    bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

    title = bill_page.xpath('//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()')
    if title:
        title = title[0]
    else:
        self.warning('Missing bill title {}'.format(bill_page_url))
        return False

    bill_no = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()')
    if bill_no:
        bill_no = bill_no[0]
    else:
        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()')
        if bill_no:
            bill_no = bill_no[0]
        else:
            self.error('Missing bill number {}'.format(bill_page_url))
            return False

    bill = Bill(
        bill_no,
        legislative_session=self.session,
        chamber='legislature',
        title=title,
        classification='bill'
    )
    bill.add_source(bill_page_url)

    self.parse_versions(bill, bill_page, bill_no)
    self.parse_acts(bill, bill_page)

    sponsors = bill_page.xpath('//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()')
    if sponsors:
        self.assign_sponsors(bill, sponsors[0], 'primary')

    cosponsors = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()')
    if cosponsors:
        self.assign_sponsors(bill, cosponsors[0], 'cosponsor')

    self.parse_date_actions(bill, bill_page)
    self.parse_actions(bill, bill_page)

    yield bill
def scrape_bill_info(self, session, chambers):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    data = self.get(info_url)
    page = open_csv(data)

    chamber_map = {'H': 'lower', 'S': 'upper'}

    for row in page:
        bill_id = row['bill_num']
        chamber = chamber_map[bill_id[0]]

        if chamber not in chambers:
            continue

        # assert that the bill data is from this session, CT is tricky
        assert row['sess_year'] == session

        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    title=row['bill_title'],
                    classification=bill_type,
                    chamber=chamber)
        bill.add_source(info_url)

        for introducer in self._introducers[bill_id]:
            bill.add_sponsorship(name=str(introducer),
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')

        try:
            for subject in self._subjects[bill_id]:
                bill.subject.append(subject)

            self.bills[bill_id] = [bill, chamber]

            yield from self.scrape_bill_page(bill)
        except SkipBill:
            self.warning('no such bill: ' + bill_id)
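open_csv above is a small helper from the scraper's shared utils module. A minimal sketch of what it plausibly does, assuming the scrapelib response exposes raw bytes on .content (the real helper's decoding may differ):

import csv
import io

def open_csv(response):
    # Decode the fetched CSV and hand back a DictReader keyed on the
    # header row; sketch only, not the actual utils implementation.
    text = response.content.decode('utf-8', errors='replace')
    return csv.DictReader(io.StringIO(text))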
def scrape_bill(self, chamber, session): url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt" page = self.get(url).text page = unicode_csv_reader(StringIO(page), delimiter='|') for row in page: bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]] if bill_chamber != chamber: continue bill_id = "%s%s %s" % (row[0], row[1], row[2]) type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2) bill_type = { 'B': 'bill', 'R': 'resolution', 'JR': 'joint resolution', 'CR': 'concurrent resolution', 'MR': 'memorial', 'CMR': 'concurrent memorial'}[type_spec] if row[-1] != self.slug: continue bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=row[3], classification=bill_type) bill.add_source(url) primary = row[11] if not primary: primary = row[12] if primary: bill.add_sponsorship(primary, classification='primary', entity_type='person', primary=True) # ftp://www.arkleg.state.ar.us/Bills/ # TODO: Keep on eye on this post 2017 to see if they apply R going forward. session_code = '2017R' if session == '2017' else session version_url = ("ftp://www.arkleg.state.ar.us/Bills/" "%s/Public/%s.pdf" % ( session_code, bill_id.replace(' ', ''))) bill.add_version_link(bill_id, version_url, media_type='application/pdf') yield from self.scrape_bill_page(bill) self.bills[bill_id] = bill
def parse_bill(self, chamber, session, special, link):
    bill_num = link.text.strip()
    type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

    if type_abbr == 'B':
        btype = ['bill']
    elif type_abbr == 'R':
        btype = ['resolution']

    bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

    url = utils.info_url(chamber, session, special, type_abbr, bill_num)
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    xpath = '/'.join([
        '//div[contains(@class, "BillInfo-ShortTitle")]',
        'div[@class="BillInfo-Section-Data"]',
    ])
    title = page.xpath(xpath).pop().text_content().strip()
    if not title:
        return

    bill = Bill(bill_id, legislative_session=session, title=title,
                chamber=chamber, classification=btype)
    bill.add_source(url)

    self.parse_bill_versions(bill, page)

    self.parse_history(bill, chamber,
                       utils.history_url(chamber, session, special,
                                         type_abbr, bill_num))

    # only fetch votes if votes were seen in history
    # if vote_count:
    yield from self.parse_votes(
        bill,
        utils.vote_url(chamber, session, special, type_abbr, bill_num),
    )

    # Dedupe sources while preserving order; removing items from the
    # list while iterating over it skips elements.
    unique_sources = []
    for source in bill.sources:
        if source not in unique_sources:
            unique_sources.append(source)
    bill.sources = unique_sources

    yield bill
def scrape(self):
    self.session = '2011'

    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            bill = Bill(name=legislation_summary['Record #'],
                        session=self.session,
                        title=title,
                        type=[legislation_summary['Type'].lower()],
                        organization=self.jurisdiction.name)
            bill.add_source(legislation_summary['URL'])

            legislation_details = self.expandLegislationSummary(legislation_summary)

            for related_bill in legislation_details.get('Related files', []):
                bill.add_related_bill(name=related_bill,
                                      session=self.session,
                                      relation='other-session',
                                      chamber=None)

            for i, sponsor in enumerate(legislation_details.get('Sponsors', [])):
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"

                bill.add_sponsor(sponsor, sponsorship_type, 'person', primary)

            for subject in legislation_details.get(u'Topics', []):
                bill.add_subject(subject)

            for attachment in legislation_details.get(u'Attachments', []):
                bill.add_version_link('PDF',
                                      attachment['url'],
                                      mimetype="application/pdf")

            yield bill
def scrape_bill(self, chamber, session): url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt" page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter='|') for row in page: bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]] if bill_chamber != chamber: continue bill_id = "%s%s %s" % (row[0], row[1], row[2]) type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2) bill_type = { 'B': 'bill', 'R': 'resolution', 'JR': 'joint resolution', 'CR': 'concurrent resolution', 'MR': 'memorial', 'CMR': 'concurrent memorial'}[type_spec] if row[-1] != self.slug: continue bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=row[3], classification=bill_type) bill.add_source(url) primary = row[11] if not primary: primary = row[12] if primary: bill.add_sponsorship(primary, classification='primary', entity_type='person', primary=True) version_url = ("ftp://www.arkleg.state.ar.us/Bills/" "%s/Public/Searchable/%s.pdf" % ( self.slug, bill_id.replace(' ', ''))) bill.add_version_link(bill_id, version_url, media_type='application/pdf') yield from self.scrape_bill_page(bill) self.bills[bill_id] = bill
def scrape_bill(self, session, chamber, bill_type, url):
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])

    versions = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(bill_id, session, meta['Measure Title'],
             chamber=chamber,
             classification=bill_type)
    if meta['Description']:
        b.add_abstract(meta['Description'], 'description')
    for subject in subs:
        b.add_subject(subject)
    if url:
        b.add_source(url)

    prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
    companion = meta['Companion'].strip()
    if companion:
        b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                           legislative_session=prior_session,
                           relation_type="companion")

    prior = bill_page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
    if 'carried over' in prior.lower():
        b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                           legislative_session=prior_session,
                           relation_type="companion")

    for sponsor in meta['Introducer(s)']:
        b.add_sponsorship(sponsor, 'primary', 'person', True)

    versions = self.parse_bill_versions_table(b, versions)

    yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                             session, url, chamber)
    yield b
def createBill(self, agenda_item):
    title = agenda_item['Title'].replace('\n', ' ')
    (title, primary_role, primary_sponsor,
     secondary_role, secondary_sponsor) = re.match(agenda_item_title_re, title).groups()

    bill = {
        'identifier': agenda_item['Item No.'],
        'title': title,
        'legislative_session': agenda_item['session'],
        # TODO: Add agenda_item type to OCD
        'classification': 'bill',
        'from_organization': {'name': self.jurisdiction.name},
    }

    b = Bill(**bill)
    b.add_source(agenda_item['url'], note='web')

    if primary_sponsor and secondary_sponsor:
        b.add_sponsorship(primary_sponsor, 'mover', 'person', True)
        b.add_sponsorship(secondary_sponsor, 'seconder', 'person', False)

    return b
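agenda_item_title_re is defined elsewhere in that scraper; for the five-way .groups() unpacking above to work it must expose exactly five capture groups (title, mover role, mover, seconder role, seconder). A hypothetical reconstruction, assuming agenda titles shaped like "Some item, moved by Councillor A, seconded by Councillor B" (the real pattern in the source repo may differ):

import re

# Hypothetical pattern, groups: (title, 'moved', mover, 'seconded', seconder).
agenda_item_title_re = re.compile(
    r'^(.*?)(?:,\s+(moved) by Councillor\s+([^,]+)'
    r'(?:,\s+(seconded) by Councillor\s+(.+))?)?$'
)

m = agenda_item_title_re.match(
    'Expanding bike lanes, moved by Councillor A, seconded by Councillor B')
print(m.groups())
# -> ('Expanding bike lanes', 'moved', 'A', 'seconded', 'B')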
def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, title=title,
                    chamber=chamber, classification=bill_type)

        yield from self.scrape_digest(bill, chamber)

        # versions
        for a in (tr.xpath('td[8]//a') +
                  tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))

        bill.add_source(url)
        yield bill
def scrape_bill(self, session, chamber, bill_url):
    try:
        page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
    except scrapelib.HTTPError as e:
        if e.response.status_code == 503:
            self.error('Skipping %s w/ 503', bill_url)
            return
        else:
            raise

    bill_number = page.xpath('//div[contains(@class,"field-name-field-bill-number")]'
                             '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])')
    bill_summary = bill_summary.strip()

    bill = Bill(
        bill_number,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
    )
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')
    bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_amendments(bill, page)

    yield bill
    yield from self.scrape_votes(bill, page)
def handle_page(self):
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for bill in bills:
        link = bill.getchildren()[0]
        bill_id = str(link.text_content())

        if not bill_id.startswith(('S', 'H')):
            continue

        # create a bill
        desc = bill.xpath('text()')[0].strip()
        chamber = {
            'H': 'lower',
            'S': 'upper',
        }[bill_id[0]]
        bill_type = {
            'B': 'bill',
            'J': 'joint resolution',
            'R': 'resolution'
        }[bill_id[1]]
        bill = Bill(bill_id, self.kwargs['session'], desc,
                    chamber=chamber, classification=bill_type)

        bill_url = link.get('href')
        sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
            self.kwargs['session_id'],
            bill_id.replace(' ', ''),
        )

        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)
        bill.subject = self.kwargs['subjects'][bill_id]
        bill.add_source(bill_url)
        yield bill

    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0],
                                          **self.kwargs)
def scrape_bill_2012(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd in dl
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)           # sponsors
    self.parse_bill_actions(doc, bill)            # actions
    self.parse_bill_documents(doc, bill)          # documents and versions
    yield from self.parse_bill_votes(doc, bill)   # votes

    # subjects
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    self.save_bill(bill)
def _recursively_process_bills(
        self, request_session, chamber, session, first_item=1):
    '''
    Once a search has been initiated, this function will save a Bill
    object for every Paper from the given chamber
    '''
    url = 'http://legislature.maine.gov/LawMakerWeb/searchresults.asp'
    r = request_session.get(url, params={'StartWith': first_item})
    r.raise_for_status()

    bills = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
    if bills:
        for bill in bills:
            bill_id_slug = bill.xpath('./@href')[0]
            bill_url = 'http://legislature.maine.gov/LawMakerWeb/{}'.format(bill_id_slug)
            bill_id = bill.text[:2] + " " + bill.text[2:]

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title="",
                chamber=chamber,
            )
            bill.add_source(bill_url)

            yield from self.scrape_bill(bill, chamber)
            yield bill

        # Make a recursive call to this function, for the next page
        PAGE_SIZE = 25
        yield from self._recursively_process_bills(
            request_session=request_session,
            chamber=chamber,
            session=session,
            first_item=first_item + PAGE_SIZE
        )
def scrape(self):
    for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)):
        leg_type = BILL_TYPES[leg_summary['Type']]

        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,
                    classification=leg_type,
                    from_organization={"name": "New York City Council"})
        bill.add_source(leg_summary['url'], note='web')

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

        history = list(history)

        if history:
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue

            action_class = ACTION_CLASSIFICATION[action_description]

            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            if responsible_org == 'Town Hall Meeting':
                continue
            else:
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral':
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = action_details[
                        'Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee, 'organization',
                        entity_id=_make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if result and votes:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=action_class,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url, note='web')

                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        text = self.text(leg_summary['url'])

        if text:
            bill.extras = {'local_classification': leg_summary['Type'],
                           'full_text': text}
        else:
            bill.extras = {'local_classification': leg_summary['Type']}

        yield bill
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//span[text()="Title"]')[0].getparent()

    if title:
        title = title[1].text.strip().strip('"')
    else:
        self.warning(f"skipping bill {url}, no information")
        return

    bill = Bill(
        bill_id,
        title=title,
        chamber=chamber,
        classification=bill_type,
        legislative_session=session,
    )
    bill.add_source(url)

    # Get sponsors
    spons_str = (doc.xpath('//span[contains(text(), "Sponsor(S)")]')
                 [0].getparent()[1].text)
    sponsors_match = re.match(r"(SENATOR|REPRESENTATIVE)", spons_str)
    if sponsors_match:
        sponsors = spons_str.split(",")
        sponsor = sponsors[0].strip()

        if sponsor:
            bill.add_sponsorship(
                sponsors[0].split()[1],
                entity_type="person",
                classification="primary",
                primary=True,
            )

        for sponsor in sponsors[1:]:
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsorship(
                    sponsor,
                    entity_type="person",
                    classification="cosponsor",
                    primary=False,
                )
    else:
        # Committee sponsorship
        spons_str = spons_str.strip()

        if re.match(r" BY REQUEST OF THE GOVERNOR$", spons_str):
            spons_str = re.sub(r" BY REQUEST OF THE GOVERNOR$", "", spons_str).title()
            spons_str = spons_str + " Committee (by request of the governor)"

        if spons_str:
            bill.add_sponsorship(
                spons_str,
                entity_type="person",
                classification="primary",
                primary=True,
            )

    # Get actions
    self._current_comm = None
    act_rows = doc.xpath("//div[@id='tab6_4']//tr")[1:]
    for row in act_rows:
        date, journal, action = row.xpath("td")

        action = action.text_content().strip()
        raw_chamber = action[0:3]
        journal_entry_number = journal.text_content()
        act_date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")

        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"

        # Votes
        if re.search(r"Y(\d+)", action):
            vote_href = journal.xpath(".//a/@href")
            if vote_href:
                vote_href = vote_href[0].replace(" ", "")
                yield from self.parse_vote(
                    bill,
                    journal_entry_number,
                    action,
                    act_chamber,
                    act_date,
                    vote_href,
                )

        action, atype = self.clean_action(action)

        match = re.search(r"^Prefile released (\d+/\d+/\d+)$", action)
        if match:
            action = "Prefile released"
            act_date = datetime.datetime.strptime(match.group(1), "%m/%d/%y")

        bill.add_action(
            action,
            chamber=act_chamber,
            date=act_date.strftime("%Y-%m-%d"),
            classification=atype,
        )

    # Get subjects
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill.add_subject(subj.strip())

    # Get versions - to do
    text_list_url = (
        f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab1_4"
    )
    bill.add_source(text_list_url)

    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "/Text/")]'):
        name = link.text_content()
        text_url = link.get("href")
        bill.add_version_link(name, text_url, media_type="text/html")

    # Get documents - to do
    doc_list_url = (
        f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab5_4"
    )
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib["href"]
        if h_name.strip():
            try:
                bill.add_document_link(h_name, h_href)
            except KeyError:
                self.warning("Duplicate found")
                return

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    # there will be a space in bill_id if we're doing a one-off bill scrape
    # convert HB 102 into H102
    if ' ' in bill_id:
        bill_id = bill_id[0] + bill_id.split(' ')[-1]

    # if chamber comes in as House/Senate convert to lower/upper
    if chamber == 'Senate':
        chamber = 'upper'
    elif chamber == 'House':
        chamber = 'lower'

    bill_detail_url = (
        'http://www.ncleg.net/gascripts/'
        'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all'
    ) % (session, bill_id)

    # parse the bill data page, finding the latest html text
    data = self.get(bill_detail_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(bill_detail_url)

    title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
    if 'Joint Resolution' in title_div_txt:
        bill_type = 'joint resolution'
        bill_id = bill_id[0] + 'JR ' + bill_id[1:]
    elif 'Resolution' in title_div_txt:
        bill_type = 'resolution'
        bill_id = bill_id[0] + 'R ' + bill_id[1:]
    elif 'Bill' in title_div_txt:
        bill_type = 'bill'
        bill_id = bill_id[0] + 'B ' + bill_id[1:]

    bill_title = doc.xpath(
        '/html/body/div/div/main/div[2]/div[contains(@class,"col-12")]/a'
    )[0]
    bill_title = bill_title.text_content().strip()

    bill = Bill(bill_id, legislative_session=session, title=bill_title,
                chamber=chamber, classification=bill_type)
    bill.add_source(bill_detail_url)

    # skip first PDF link (duplicate link to cur version)
    if chamber == 'lower':
        link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
    else:
        link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
    for vlink in doc.xpath(link_xpath)[1:]:
        # get the name from the PDF link...
        version_name = vlink.text.replace(u'\xa0', ' ')
        version_url = vlink.attrib['href']

        media_type = 'text/html'
        if version_url.lower().endswith(".pdf"):
            media_type = 'application/pdf'
        bill.add_version_link(version_name, version_url,
                              media_type=media_type,
                              on_duplicate='ignore')

    # sponsors
    spon_row = doc.xpath(
        '//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
    # first sponsors are primary, until we see (Primary)
    spon_type = 'primary'
    for leg in spon_row.text_content().split(';'):
        name = leg.replace(u'\xa0', ' ').strip()
        if name.startswith('(Primary)'):
            name = name.replace('(Primary)', '').strip()
            spon_type = 'cosponsor'
        if not name:
            continue
        bill.add_sponsorship(name,
                             classification=spon_type,
                             entity_type='person',
                             primary=(spon_type == 'primary'))

    # keywords
    kw_row = doc.xpath(
        '//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
    for subject in kw_row.text_content().split(', '):
        bill.add_subject(subject)

    # actions
    action_tr_xpath = ('//h6[contains(text(), "History")]'
                       '/ancestor::div[contains(@class, "gray-card")]'
                       '//div[contains(@class, "card-body")]'
                       '/div[@class="row"]')

    # skip two header rows
    for row in doc.xpath(action_tr_xpath):
        cols = row.xpath('div')
        act_date = cols[1].text
        actor = cols[3].text or ''
        # if text is blank, try diving in
        action = (cols[5].text or '').strip() or cols[5].text_content().strip()

        act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d')

        if actor == 'Senate':
            actor = 'upper'
        elif actor == 'House':
            actor = 'lower'
        else:
            actor = 'executive'

        for pattern, atype in self._action_classifiers.items():
            if action.startswith(pattern):
                break
        else:
            atype = None

        bill.add_action(action, act_date, chamber=actor, classification=atype)

    # TODO: Fix vote scraper
    # yield from self.scrape_votes(bill, doc)

    yield bill
def scrape_bill_list(self, url):
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning(
                "Bill {} has no webpage, and will be skipped".format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        if (bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
        if not title:
            title = "[No title given by state]"
        bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )

            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]

                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text)

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0].encode(
                    'ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath('td[2]/font/text()')

            possible_amendment = action.xpath('td[3]/font/u/text()')
            if len(possible_amendment) > 0 and not possible_amendment[0].strip() == '':
                (amendment, ) = possible_amendment
            else:
                amendment = None

            (action_text, ) = action.xpath('td[4]/font/text()')
            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''

            if action_date is not None:
                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )
                if action_committee:
                    act.add_related_entity(action_committee,
                                           entity_type='organization')

                try:
                    vote_button = action.xpath('td[9]//text()')[0].strip()
                except IndexError:
                    vote_button = ''

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text)

            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                        self.session, amendment))
                amend_name = 'Amd/Sub {}'.format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

        yield bill
def scrape(self, session=None):
    HTML_TAGS_RE = r'<.*?>'

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []

    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".format(info['BillNumber']))

        bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
        # put one space back in between type and number
        bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
            yield bill
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.items()}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif "Vetoed by the Governor" in action['FullStatus']:
                action_type = 'executive-veto'
            elif "Read first time" in action['FullStatus'] \
                    or "Read 1st time" in action['FullStatus']:
                action_type = 'introduction'
            elif "Reported favorably" in action['FullStatus']:
                action_type = 'committee-passage-favorable'
            elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                          for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(x.lower().startswith(' aspassed') or
                                          x.lower().startswith('aspassed')
                                          for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strftime(
                    datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                chamber=actor,
                classification=action_type
            )

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote['FullStatus'] or
                    "Veto of Governor overridden" in vote['FullStatus']):
                did_pass = True
            elif ("Failed -- " in vote['FullStatus'] or
                    'Veto of the Governor sustained' in vote['FullStatus']):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = VoteEvent(
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                start_date=datetime.datetime.strftime(
                    datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
                bill=info['BillNumber'],
                bill_chamber=bill_chamber
            )
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)

            yield vote_to_add

        # Capture extra information - not yet implemented:
        # Witnesses:
        #     http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        #     http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        #     http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        yield bill
def _scrape_bill(self, session, bill_data):
    details = self._parse_bill_details(bill_data)

    (
        senate_url,
        assembly_url,
        bill_chamber,
        bill_type,
        bill_id,
        title,
        (prefix, number, active_version),
    ) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data["summary"],
        classification=bill_type,
    )

    if bill_data["summary"]:
        bill.add_abstract(bill_data["summary"], note="")

    bill_active_version = bill_data["amendments"]["items"][active_version]

    # Parse sponsors.
    if bill_data["sponsor"] is not None:
        if bill_data["sponsor"]["rules"] is True:
            bill.add_sponsorship(
                "Rules Committee",
                entity_type="organization",
                classification="primary",
                primary=True,
            )
        elif not bill_data["sponsor"]["budget"]:
            primary_sponsor = bill_data["sponsor"]["member"]
            bill.add_sponsorship(
                primary_sponsor["shortName"],
                entity_type="person",
                classification="primary",
                primary=True,
            )

            # There *shouldn't* be cosponsors if there is no sponsor.
            cosponsors = bill_active_version["coSponsors"]["items"]
            for cosponsor in cosponsors:
                bill.add_sponsorship(
                    cosponsor["shortName"],
                    entity_type="person",
                    classification="cosponsor",
                    primary=False,
                )

    # List companion bill.
    same_as = bill_active_version.get("sameAs", {})
    # Check whether the "sameAs" property is populated with at least one bill.
    if same_as.get("items"):
        # Get companion bill ID.
        companion_bill_id = same_as["items"][0]["basePrintNo"]

        # Build companion bill session.
        start_year = same_as["items"][0]["session"]
        end_year = start_year + 1
        companion_bill_session = "-".join([str(start_year), str(end_year)])

        # Attach companion bill data.
        bill.add_related_bill(companion_bill_id,
                              companion_bill_session,
                              relation_type="companion")

    # Parse actions.
    chamber_map = {"senate": "upper", "assembly": "lower"}

    for action in bill_data["actions"]["items"]:
        chamber = chamber_map[action["chamber"].lower()]
        action_datetime = datetime.datetime.strptime(action["date"], "%Y-%m-%d")
        action_date = action_datetime.date()
        types, _ = NYBillScraper.categorizer.categorize(action["text"])

        bill.add_action(
            action["text"],
            action_date.strftime("%Y-%m-%d"),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources["bill"].format(
        session_year=session, bill_id=bill_id, summary="", detail="")
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    for vote_data in bill_data["votes"]["items"]:
        yield self._parse_senate_votes(vote_data, bill, api_url)
    yield from self.scrape_assembly_votes(session, bill, assembly_url, bill_id)

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data["amendments"]["items"]
    for key, amendment in amendments.items():
        version = amendment["printNo"]

        html_url = ("http://assembly.state.ny.us/leg/?sh=printbill&bn="
                    "{}&term={}&Text=Y".format(bill_id, self.term_start_year))
        bill.add_version_link(version, html_url, on_duplicate="ignore",
                              media_type="text/html")

        pdf_url = "http://legislation.nysenate.gov/pdf/bills/{}/{}".format(
            self.term_start_year, version)
        bill.add_version_link(
            version,
            pdf_url,
            on_duplicate="ignore",
            media_type="application/pdf",
        )

    yield bill
def scrape_bills(self, chamber, session, subjects):
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"
    blocks = "FOO"  # non-empty sentinel so the loop body runs at least once
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY
        blocks = self.parse_results_page(
            self.post(SEARCH_URL, data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)
        for block in blocks:
            bill = blocks[block]
            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []
            for prefix in BILL_NAME_TRANSLATIONS:
                if billid[:len(prefix)] == prefix:
                    billid = (BILL_NAME_TRANSLATIONS[prefix] +
                              billid[len(prefix) + 1:].split()[0])
            b = Bill(
                billid,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self.get_type_by_name(bill['bill_id']),
            )
            b.subject = subs
            self.process_actions(bill['actions'], b)
            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]
            for href in bill['bill_id_hrefs']:
                b.add_version_link(
                    href.text, href.attrib['href'],
                    media_type="application/pdf")
            for sponsor in sponsors:
                b.add_sponsorship(
                    sponsor, entity_type='person',
                    classification='primary', primary=True)
            b.add_source(SEARCH_URL)
            yield b
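# Hedged sketch of the bill-id normalization loop above. The real
# BILL_NAME_TRANSLATIONS mapping lives elsewhere in this scraper, so the
# entries below are illustrative assumptions, not the actual table.
BILL_NAME_TRANSLATIONS_EXAMPLE = {"HOUSE BILL": "HB", "SENATE BILL": "SB"}

def translate_bill_id(billid, translations=BILL_NAME_TRANSLATIONS_EXAMPLE):
    for prefix, abbrev in translations.items():
        if billid.startswith(prefix):
            # keep only the number that immediately follows the long-form name
            return abbrev + billid[len(prefix) + 1:].split()[0]
    return billid

# translate_bill_id("HOUSE BILL 123 SUB A") -> "HB123"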
def scrape_bill(self, session, history_url): history_xml = self.get(history_url).text root = etree.fromstring(history_xml) bill_title = root.findtext("caption") if bill_title is None or "Bill does not exist" in history_xml: self.warning("Bill does not appear to exist") return bill_id = " ".join(root.attrib["bill"].split(" ")[1:]) chamber = self.CHAMBERS[bill_id[0]] if bill_id[1] == "B": bill_type = ["bill"] elif bill_id[1] == "R": bill_type = ["resolution"] elif bill_id[1:3] == "CR": bill_type = ["concurrent resolution"] elif bill_id[1:3] == "JR": bill_type = ["joint resolution"] else: raise ScrapeError("Invalid bill_id: %s" % bill_id) bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=bill_title, classification=bill_type, ) bill.add_source(history_url) for subject in root.iterfind("subjects/subject"): bill.add_subject(subject.text.strip()) versions = [x for x in self.versions if x[0] == bill_id] for version in versions: bill.add_version_link( note=self.NAME_SLUGS[version[1][-5]], url=version[1], media_type="text/html", ) analyses = [x for x in self.analyses if x[0] == bill_id] for analysis in analyses: bill.add_document_link( note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]), url=analysis[1], media_type="text/html", ) fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id] for fiscal_note in fiscal_notes: bill.add_document_link( note="Fiscal Note ({})".format( self.NAME_SLUGS[fiscal_note[1][-5]]), url=fiscal_note[1], media_type="text/html", ) witnesses = [x for x in self.witnesses if x[0] == bill_id] for witness in witnesses: bill.add_document_link( note="Witness List ({})".format( self.NAME_SLUGS[witness[1][-5]]), url=witness[1], media_type="text/html", ) for action in root.findall("actions/action"): act_date = datetime.datetime.strptime(action.findtext("date"), "%m/%d/%Y").date() action_number = action.find("actionNumber").text actor = { "H": "lower", "S": "upper", "E": "executive" }[action_number[0]] desc = action.findtext("description").strip() if desc == "Scheduled for public hearing on . . .": self.warning("Skipping public hearing action with no date") continue introduced = False if desc == "Amended": atype = "amendment-passage" elif desc == "Amendment(s) offered": atype = "amendment-introduction" elif desc == "Amendment amended": atype = "amendment-amendment" elif desc == "Amendment withdrawn": atype = "amendment-withdrawal" elif desc == "Passed" or desc == "Adopted": atype = "passage" elif re.match(r"^Received (by|from) the", desc): if "Secretary of the Senate" not in desc: atype = "introduction" else: atype = "filing" elif desc.startswith("Sent to the Governor"): # But what if it gets lost in the mail? 
atype = "executive-receipt" elif desc.startswith("Signed by the Governor"): atype = "executive-signature" elif desc.startswith("Effective on"): atype = "became-law" elif desc == "Vetoed by the Governor": atype = "executive-veto" elif desc == "Read first time": atype = ["introduction", "reading-1"] introduced = True elif desc == "Read & adopted": atype = ["passage"] if not introduced: introduced = True atype.append("introduction") elif desc == "Passed as amended": atype = "passage" elif desc.startswith("Referred to") or desc.startswith( "Recommended to be sent to "): atype = "referral-committee" elif desc == "Reported favorably w/o amendment(s)": atype = "committee-passage" elif desc == "Filed": atype = "filing" elif desc == "Read 3rd time": atype = "reading-3" elif desc == "Read 2nd time": atype = "reading-2" elif desc.startswith("Reported favorably"): atype = "committee-passage-favorable" else: atype = None act = bill.add_action( action.findtext("description"), act_date, chamber=actor, classification=atype, ) if atype and "referral-committee" in atype: repls = ["Referred to", "Recommended to be sent to "] ctty = desc for r in repls: ctty = ctty.replace(r, "").strip() act.add_related_entity(name=ctty, entity_type="organization") for author in root.findtext("authors").split(" | "): if author != "": bill.add_sponsorship(author, classification="primary", entity_type="person", primary=True) for coauthor in root.findtext("coauthors").split(" | "): if coauthor != "": bill.add_sponsorship( coauthor, classification="cosponsor", entity_type="person", primary=False, ) for sponsor in root.findtext("sponsors").split(" | "): if sponsor != "": bill.add_sponsorship( sponsor, classification="primary", entity_type="person", primary=True, ) for cosponsor in root.findtext("cosponsors").split(" | "): if cosponsor != "": bill.add_sponsorship( cosponsor, classification="cosponsor", entity_type="person", primary=False, ) if root.findtext("companions"): self._get_companion(bill) yield bill
def scrape_bill(self, chamber, session, bill_id, bill_type): url = "%s?r=%s" % (self.base_url, bill_id) html = self.get(url).text if "error '80020009'" in html: self.warning("asp error on page, skipping %s", bill_id) return doc = lxml.html.fromstring(html) # search for Titulo, accent over i messes up lxml, so use 'tulo' title = doc.xpath( u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()') if not title: raise NoSuchBill() bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=title[0], classification=bill_type, ) author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0] for aname in author.split(","): aname = self.clean_name(aname).strip() if aname: bill.add_sponsorship(aname, classification="primary", entity_type="person", primary=True) co_authors = doc.xpath( u'//td/b[contains(text(),"Co-autor")]/../text()') if len(co_authors) != 0: for co_author in co_authors[1].split(","): bill.add_sponsorship( self.clean_name(co_author).strip(), classification="cosponsor", entity_type="person", primary=False, ) action_table = doc.xpath("//table")[-1] bill_vote_chamber = None for row in action_table[1:]: tds = row.xpath("td") # ignore row missing date if len(tds) != 2: continue if tds[0].text_content(): date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y") action = tds[1].text_content().strip() # parse the text to see if it's a new version or a unrelated document # if has a hyphen let's assume it's a vote document # get url of action action_url = tds[1].xpath("a/@href") atype, action = self.parse_action(chamber, bill, action, action_url, date) # Some lower-house roll calls could be parsed, but finnicky # Most roll lists are just images embedded within a document, # and offer no alt text to scrape # Instead, just scrape the vote counts regex = r"(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$" vote_info = re.search(regex, action) if vote_info and re.search(r"\d{1,2}", action): vote_name = vote_info.group(1) if u"Votación Final" in vote_name: (vote_chamber, vote_name) = re.search(r"(?u)^\w+ por (.*?) en (.*)$", vote_name).groups() if "Senado" in vote_chamber: vote_chamber = "upper" else: vote_chamber = "lower" elif "Cuerpo de Origen" in vote_name: vote_name = re.search(r"(?u)^Cuerpo de Origen (.*)$", vote_name).group(1) vote_chamber = chamber elif u"informe de Comisión de Conferencia" in vote_name: (vote_chamber, vote_name) = re.search( r"(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$", vote_name, ).groups() if vote_chamber == "Senado": vote_chamber = "upper" else: vote_chamber = "lower" # TODO replace bill['votes'] elif u"Se reconsideró" in vote_name: if bill_vote_chamber: vote_chamber = bill_vote_chamber else: vote_chamber = chamber else: raise AssertionError( u"Unknown vote text found: {}".format(vote_name)) vote_name = vote_name.title() yes = int(vote_info.group(2)) no = int(vote_info.group(3)) other = 0 if vote_info.group(4).strip(): other += int(vote_info.group(4)) if vote_info.group(5).strip(): other += int(vote_info.group(5)) vote = Vote( chamber=vote_chamber, start_date=date.strftime("%Y-%m-%d"), motion_text=vote_name, result="pass" if (yes > no) else "fail", bill=bill, classification="passage", ) vote.set_count("yes", yes) vote.set_count("no", no) vote.set_count("other", other) vote.add_source(url) yield vote bill_vote_chamber = chamber bill.add_source(url) yield bill
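# Standalone illustration of the roll-call regex used above; the sample
# action string is invented but follows the "motion, yes-no-other-other"
# layout the regex expects (the last two fields feed the "other" count).
import re

VOTE_RE = r"(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$"
m = re.search(VOTE_RE, u"Aprobado por el Senado en Votación Final, 25-02-01-03")
assert m.group(1) == u"Aprobado por el Senado en Votación Final"
assert [int(g) for g in m.groups()[1:]] == [25, 2, 1, 3]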
def scrape_bill(self, chamber, session, bill_id, title, url): page = self.lxmlize(url) if re.match(r"^(S|H)B ", bill_id): btype = ["bill"] elif re.match(r"(S|H)C ", bill_id): btype = ["commemoration"] elif re.match(r"(S|H)JR ", bill_id): btype = ["joint resolution"] elif re.match(r"(S|H)CR ", bill_id): btype = ["concurrent resolution"] else: btype = ["bill"] bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=title, classification=btype, ) bill.add_source(url) version_rows = page.xpath( "//div[@id=\"ctl00_ContentPlaceHolder1_ctl00_BillVersions\"]" + "/section/table/tbody/tr" ) assert len(version_rows) > 0 for row in version_rows: (date,) = row.xpath("./td[@data-title=\"Date\"]/text()") date = date.strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() (html_note,) = row.xpath("./td[@data-title=\"HTML\"]/a/text()") (html_link,) = row.xpath("./td[@data-title=\"HTML\"]/a/@href") (pdf_note,) = row.xpath("./td[@data-title=\"PDF\"]/a/text()") (pdf_link,) = row.xpath("./td[@data-title=\"PDF\"]/a/@href") assert html_note == pdf_note note = html_note bill.add_version_link( note, html_link, date=date, media_type="text/html", on_duplicate="ignore", ) bill.add_version_link( note, pdf_link, date=date, media_type="application/pdf", on_duplicate="ignore", ) sponsor_links = page.xpath( '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' + '/label[contains(text(), "Sponsors:")]' + "/following-sibling::div[1]/p/a" ) for link in sponsor_links: if link.attrib["href"].startswith("https://sdlegislature.gov/Legislators/"): sponsor_type = "person" elif link.attrib["href"].startswith( "https://sdlegislature.gov/Legislative_Session/Committees" ): sponsor_type = "organization" else: raise ScrapeError( "Found unexpected sponsor, URL: " + link.attrib["href"] ) bill.add_sponsorship( link.text, classification="primary", primary=True, entity_type=sponsor_type, ) actor = chamber use_row = False for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"): # Some tables have null rows, that are just `<tr></tr>` # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018 if row.text_content() == "": self.debug("Skipping action table row that is completely empty") continue if "Date" in row.text_content() and "Action" in row.text_content(): use_row = True continue elif not use_row: continue action = row.xpath("string(td[2])").strip() atypes = [] if action.startswith("First read"): atypes.append("introduction") atypes.append("reading-1") if re.match(r"Signed by (?:the\s)*Governor", action, re.IGNORECASE): atypes.append("executive-signature") actor = "executive" match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)", action) if match: if match.group(1) in ["Senate", "House of Representatives"]: first = "" else: first = "committee-" if match.group(3).lower() == "passed": second = "passage" elif match.group(3).lower() == "failed": second = "failure" atypes.append("%s%s" % (first, second)) if "referred to" in action.lower(): atypes.append("referral-committee") if "Motion to amend, Passed Amendment" in action: atypes.append("amendment-introduction") atypes.append("amendment-passage") if row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]'): amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0] version_name = amd.xpath("string(.)") version_url = amd.xpath("@href")[0] if "htm" in version_url: mimetype = "text/html" elif "pdf" in version_url: mimetype = "application/pdf" bill.add_version_link( version_name, version_url, media_type=mimetype, 
on_duplicate="ignore", ) if "Veto override, Passed" in action: atypes.append("veto-override-passage") elif "Veto override, Failed" in action: atypes.append("veto-override-failure") if "Delivered to the Governor" in action: atypes.append("executive-receipt") match = re.match("First read in (Senate|House)", action) if match: if match.group(1) == "Senate": actor = "upper" else: actor = "lower" date = row.xpath("string(td[1])").strip() match = re.match(r"\d{2}/\d{2}/\d{4}", date) if not match: self.warning("Bad date: %s" % date) continue date = datetime.datetime.strptime(date, "%m/%d/%Y").date() for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"): yield from self.scrape_vote(bill, date, link.attrib["href"]) if action: bill.add_action(action, date, chamber=actor, classification=atypes) for link in page.xpath("//a[contains(@href, 'Keyword')]"): bill.add_subject(link.text.strip()) yield bill
def _parse_house_bill(self, url, session): # using the print page makes the page simpler, and also *drastically* smaller # (8k rather than 100k) url = re.sub("billsummary", "billsummaryprn", url) url = '%s/%s' % (self._house_base_url, url) # the URL is an iframed version now, so swap in for the actual bill page url = url.replace('Bill.aspx', 'BillContent.aspx') url = url.replace('&code=R', '&code=R&style=new') # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new bill_page = self.get(url).text bill_page = lxml.html.fromstring(bill_page) bill_page.make_links_absolute(url) bill_id = bill_page.xpath('//*[@class="entry-title"]/div') if len(bill_id) == 0: self.info("WARNING: bill summary page is blank! (%s)" % url) self._bad_urls.append(url) return bill_id = bill_id[0].text_content() bill_id = clean_text(bill_id) bill_desc = bill_page.xpath( '//*[@class="BillDescription"]')[0].text_content() bill_desc = clean_text(bill_desc) table_rows = bill_page.xpath('//table/tr') # if there is a cosponsor all the rows are pushed down one for the extra row # for the cosponsor: cosponsorOffset = 0 if table_rows[2][0].text_content().strip() == 'Co-Sponsor:': cosponsorOffset = 1 lr_label_tag = table_rows[3 + cosponsorOffset] assert lr_label_tag[0].text_content().strip() == 'LR Number:' # bill_lr = lr_label_tag[1].text_content() lastActionOffset = 0 if table_rows[4 + cosponsorOffset][0].text_content().strip( ) == 'Governor Action:': lastActionOffset = 1 official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset] assert official_title_tag[0].text_content().strip() == 'Bill String:' official_title = official_title_tag[1].text_content() # could substitute the description for the name, # but keeping it separate for now. bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] bill_number = int(bill_id[3:].strip()) else: bill_number = int(bill_id[3:]) subs = [] bid = bill_id.replace(" ", "") if bid in self._subjects: subs = self._subjects[bid] self.info("With subjects for this bill") self.info(bid) if bill_desc == "": if bill_number <= 20: # blank bill titles early in session are approp. bills bill_desc = 'Appropriations Bill' else: self.error("Blank title. Skipping. 
{} / {} / {}".format( bill_id, bill_desc, official_title)) return bill = Bill( bill_id, chamber='lower', title=bill_desc, legislative_session=self._session_id, classification=bill_type, ) bill.subject = subs bill.add_title(official_title, note='official') bill.add_source(url) bill_sponsor = clean_text(table_rows[0][1].text_content()) # try: # bill_sponsor_link = table_rows[0][1][0].attrib['href'] # except IndexError: # return bill.add_sponsorship( bill_sponsor, entity_type='person', classification='primary', primary=True, ) # check for cosponsors sponsors_url, = bill_page.xpath( "//a[contains(@href, 'CoSponsors.aspx')]/@href") self._parse_cosponsors_from_bill(bill, sponsors_url) # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0] # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href']) # actions_link = re.sub("content", "print", actions_link) actions_link, = bill_page.xpath( "//a[contains(@href, 'BillActions.aspx')]/@href") yield from self._parse_house_actions(bill, actions_link) # get bill versions doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span') for doc_tag in reversed(doc_tags): doc = clean_text(doc_tag.text_content()) text_url = '%s%s' % (self._house_base_url, doc_tag[0].attrib['href']) bill.add_document_link(doc, text_url, media_type='text/html') # get bill versions version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span') for version_tag in reversed(version_tags): version = clean_text(version_tag.text_content()) for vurl in version_tag.xpath(".//a"): if vurl.text == 'PDF': mimetype = 'application/pdf' else: mimetype = 'text/html' bill.add_version_link(version, vurl.attrib['href'], media_type=mimetype, on_duplicate='ignore') # house bill versions # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow version_rows = bill_page.xpath( '//div[h2[contains(text(),"Bill Text")]]/' 'following-sibling::div[contains(@class,"DocRow") ' 'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]' ) for row in version_rows: # some rows are just broken links, not real versions if row.xpath('.//div[contains(@class,"textType")]/a/@href'): version = row.xpath( './/div[contains(@class,"textType")]/a/text()')[0].strip() path = row.xpath( './/div[contains(@class,"textType")]/a/@href')[0].strip() if '.pdf' in path: mimetype = 'application/pdf' else: mimetype = 'text/html' bill.add_version_link(version, path, media_type=mimetype, on_duplicate='ignore') # house bill summaries # everything between the row containing "Bill Summary" in an h2 # and the next div.DocHeaderRow summary_rows = bill_page.xpath( '//div[h2[contains(text(),"Bill Summary")]]/' 'following-sibling::div[contains(@class,"DocRow") ' 'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]' ) # if there are no amedments, we need a different xpath for summaries if not summary_rows: summary_rows = bill_page.xpath( '//div[h2[contains(text(),"Bill Summary")]]/' 'following-sibling::div[contains(@class,"DocRow")]') for row in reversed(summary_rows): version = row.xpath( './/div[contains(@class,"textType")]/a/text()')[0].strip() if version: path = row.xpath( './/div[contains(@class,"textType")]/a/@href')[0].strip() summary_name = 'Bill Summary ({})'.format(version) if '.pdf' in path: mimetype = 'application/pdf' else: mimetype = 'text/html' bill.add_document_link(summary_name, path, media_type=mimetype, on_duplicate='ignore') # house bill amendments amendment_rows = bill_page.xpath( 
'//div[h2[contains(text(),"Amendment")]]/' 'following-sibling::div[contains(@class,"DocRow")]') for row in reversed(amendment_rows): version = row.xpath( './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip( ) path = row.xpath( './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip( ) summary_name = 'Amendment {}'.format(version) defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]') if defeated_icon: summary_name = '{} (Defeated)'.format(summary_name) adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]') if adopted_icon: summary_name = '{} (Adopted)'.format(summary_name) distributed_icon = row.xpath( './/img[contains(@title,"Distributed")]') if distributed_icon: summary_name = '{} (Distributed)'.format(summary_name) if '.pdf' in path: mimetype = 'application/pdf' else: mimetype = 'text/html' bill.add_version_link(summary_name, path, media_type=mimetype, on_duplicate='ignore') yield bill
def scrape_chamber(self, chamber, session):
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + 'bill_status/').text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json['content']
    for bill_data in bills:
        bill_id = bill_data['BILLNO']
        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue
        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            btype = 'bill'  # fall back rather than reusing a stale value
        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']
        # main
        bill = Bill(
            bill_id,
            session,
            title,
            chamber=chamber,
            classification=btype,
        )
        bill.extras = {'status': bill_data['STATUS']}
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())
        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill.title):
            bill.add_title(bill_data['LONGTITLE'])
        for sponsor in bill_data['SPONSOR_NAMES']:
            stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                     else 'cosponsor')
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    primary=stype == 'primary',
                    classification=stype,
                )
        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = 'upper' if event['chamber'] == 'Senate' else 'lower'
            date = datetime.datetime.strptime(event['occurred_datetime'],
                                              "%Y-%m-%dT%H:%M:%S")
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']
            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'],
                              event['status']))
                atype = None
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(
                action, date.strftime('%Y-%m-%d'),
                chamber=actor, classification=atype)
        try:
            yield from self.scrape_html(bill, session)
        except scrapelib.HTTPError:
            # Bill objects aren't subscriptable, so log the plain bill_id
            # here (the old bill['bill_id'] raised TypeError).
            self.warning('unable to fetch HTML for bill {0}'.format(bill_id))
        yield bill
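# The sponsor-classification rule above, restated standalone: a lone name in
# SPONSOR_NAMES is the primary sponsor; when several names are listed, all
# are recorded as cosponsors.
def ks_sponsor_class(sponsor_names):
    return 'primary' if len(sponsor_names) == 1 else 'cosponsor'

assert ks_sponsor_class(['DOE']) == 'primary'
assert ks_sponsor_class(['DOE', 'ROE']) == 'cosponsor'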
def scrape_bill(self, chamber, session, session_id, bill_id, url): sidebar = lxml.html.fromstring(self.get(url).text) sidebar.make_links_absolute("https://www.legis.iowa.gov") try: hist_url = sidebar.xpath( '//a[contains(., "Bill History")]')[0].attrib['href'] except IndexError: # where is it? return page = lxml.html.fromstring(self.get(hist_url).text) page.make_links_absolute("https://www.legis.iowa.gov") title = page.xpath('string(//div[@id="content"]/div[@class=' '"divideVert"]/div[not(@class)])').strip() if title == '': self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url) return if title.lower().startswith("in"): title = page.xpath("string(//table[2]/tr[3])").strip() if 'HR' in bill_id or 'SR' in bill_id: bill_type = ['resolution'] elif 'HJR' in bill_id or 'SJR' in bill_id: bill_type = ['joint resolution'] elif 'HCR' in bill_id or 'SCR' in bill_id: bill_type = ['concurrent resolution'] else: bill_type = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.add_source(hist_url) # base url for text version (version_abbrev, session_id, bill_id) version_html_url_template = 'https://www.legis.iowa.gov/docs/'\ 'publications/LG{}/{}/attachments/{}.html' version_pdf_url_template = 'https://www.legis.iowa.gov/docs/'\ 'publications/LG{}/{}/{}.pdf' # get pieces of version_link vpieces = sidebar.xpath('//select[@id="billVersions"]/option') if vpieces: for version in vpieces: version_name = version.text version_abbrev = version.xpath('string(@value)') # Get HTML document of bill version. version_html_url = version_html_url_template.format( version_abbrev.upper(), session_id, bill_id.replace(' ', '')) bill.add_version_link(note=version_name, url=version_html_url, media_type='text/html') # Get PDF document of bill version. version_pdf_url = version_pdf_url_template.format( version_abbrev.upper(), session_id, bill_id.replace(' ', '')) bill.add_version_link(note=version_name, url=version_pdf_url, media_type='application/pdf') sponsors_str = page.xpath( "string(//div[@id='content']/div[@class='divideVert']/div[@class='divideVert'])" ).strip() if re.search('^By ', sponsors_str): sponsors = re.split(',| and ', sponsors_str.split('By ')[1]) # for some bills sponsors listed in different format else: sponsors = re.findall('[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)', sponsors_str) for sponsor in sponsors: sponsor = sponsor.replace(' and', '').strip(' .,') # a few sponsors get mangled by our regex sponsor = { 'Means': 'Ways & Means', 'Iowa': 'Economic Growth/Rebuild Iowa', 'Safety': 'Public Safety', 'Resources': 'Human Resources', 'Affairs': 'Veterans Affairs', 'Protection': 'Environmental Protection', 'Government': 'State Government', 'Boef': 'De Boef' }.get(sponsor, sponsor) if sponsor[0].islower(): # SSBs catch cruft in it ('charges', 'overpayments') # https://sunlight.atlassian.net/browse/DATA-286 continue bill.add_sponsorship(name=sponsor, classification='primary', entity_type='person', primary=True) for tr in page.xpath( "//table[contains(@class, 'billActionTable')]/tbody/tr"): date = tr.xpath("string(td[contains(text(), ', 20')])").strip() if date.startswith("***"): continue elif "No history is recorded at this time." in date: return if date == "": continue date = datetime.datetime.strptime(date, "%B %d, %Y").date() action = tr.xpath("string(td[2])").strip() action = re.sub(r'\s+', ' ', action) # Capture any amendment links. 
        version_urls = [link['url'] for version in bill.versions
                        for link in version['links']]
        if 'amendment' in action.lower():
            for anchor in tr.xpath('td[2]/a'):
                if '-' in anchor.text:
                    # These links aren't given hrefs for some reason
                    # (needs to be fixed upstream)
                    try:
                        url = anchor.attrib['href']
                    except KeyError:
                        continue
                    if url not in version_urls:
                        bill.add_version_link(note=anchor.text, url=url,
                                              media_type='text/html')
                        version_urls.append(url)
        if 'S.J.' in action or 'SCS' in action:
            actor = 'upper'
        elif 'H.J.' in action or 'HCS' in action:
            actor = 'lower'
        else:
            actor = "legislature"
        action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()
        if action.startswith('Introduced'):
            atype = ['introduction']
            if ', referred to' in action:
                atype.append('referral-committee')
        elif action.startswith('Read first time'):
            atype = 'reading-1'
        elif action.startswith('Referred to'):
            atype = 'referral-committee'
        elif action.startswith('Sent to Governor'):
            atype = 'executive-receipt'
        elif action.startswith('Reported Signed by Governor'):
            atype = 'executive-signature'
        elif action.startswith('Signed by Governor'):
            atype = 'executive-signature'
        elif action.startswith('Vetoed by Governor'):
            atype = 'executive-veto'
        elif action.startswith('Item veto'):
            atype = 'executive-veto-line-item'
        elif re.match(r'Passed (House|Senate)', action):
            atype = 'passage'
        elif re.match(r'Amendment (S|H)-\d+ filed', action):
            atype = ['amendment-introduction']
            if ', adopted' in action:
                atype.append('amendment-passage')
        elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted', action):
            atype = 'amendment-passage'
        # (S|H), not (S|N): Senate/House amendment prefixes
        elif re.match(r'Amendment (S|H)-\d+ lost', action):
            atype = 'amendment-failure'
        elif action.startswith('Resolution filed'):
            atype = 'introduction'
        elif action.startswith('Resolution adopted'):
            atype = 'passage'
        elif (action.startswith('Committee report') and
                action.endswith('passage.')):
            atype = 'committee-passage'
        elif action.startswith('Withdrawn'):
            atype = 'withdrawal'
        else:
            atype = None
        if action.strip() == "":
            continue
        if re.search(r'END OF \d+ ACTIONS', action):
            continue
        if '$history' not in action:
            bill.add_action(description=action, date=date,
                            chamber=actor, classification=atype)
    for subject in self._subjects[bill_id]:
        bill.add_subject(subject['Name'])
    yield bill
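# Worked example of the primary "By ..." sponsor format parsed above
# (the sample string is invented, following the Iowa layout):
import re

sponsors_str = "By Smith, Jones and Brown"
sponsors = re.split(',| and ', sponsors_str.split('By ')[1])
assert [s.replace(' and', '').strip(' .,') for s in sponsors] == \
    ['Smith', 'Jones', 'Brown']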
def scrape(self, session=None, chambers=None): # Bills endpoint can sometimes take a very long time to load self.timeout = 300 if not session: session = self.latest_session() self.info('no session, using %s', session) if int(session) < 128: raise AssertionError("No data for period {}".format(session)) elif int(session) < 131: # they changed their data format starting in 131st and added # an undocumented API yield from self.old_scrape(session) else: chamber_dict = { "Senate": "upper", "House": "lower", "House of Representatives": "lower", "house": "lower", "senate": "upper" } # so presumanbly not everything passes, but we haven't # seen anything not pass yet, so we'll need to wait # till it fails and get the right language in here vote_results = { "approved": True, "passed": True, "adopted": True, "true": True, "false": False, "failed": False, True: True, False: False } action_dict = { "ref_ctte_100": "referral-committee", "intro_100": "introduction", "pass_300": "passage", "intro_110": "reading-1", "refer_210": "referral-committee", "crpt_301": None, "crpt_317": None, "concur_606": "passage", "pass_301": "passage", "refer_220": "referral-committee", "intro_102": ["introduction", "passage"], "intro_105": ["introduction", "passage"], "intro_ref_ctte_100": "referral-committee", "refer_209": None, "intro_108": ["introduction", "passage"], "intro_103": ["introduction", "passage"], "msg_reso_503": "passage", "intro_107": ["introduction", "passage"], "imm_consid_360": "passage", "refer_213": None, "adopt_reso_100": "passage", "msg_507": "amendment-passage", "confer_713": None, "concur_603": None, "confer_712": None, "msg_506": "amendment-failure", "receive_message_100": "passage", "motion_920": None, "concur_611": None, "confer_735": None } base_url = "http://search-prod.lis.state.oh.us" first_page = base_url first_page += "/solarapi/v1/general_assembly_{session}/".format( session=session) legislators = self.get_legislator_ids(first_page) all_amendments = self.get_other_data_source( first_page, base_url, "amendments") all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals") all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss") all_analysis = self.get_other_data_source(first_page, base_url, "analysiss") for row in self.get_bill_rows(session): number_link, _ga, title, primary_sponsor, status = row.xpath( 'td') bill_id = number_link.text_content() title = title.text_content().strip() chamber = 'lower' if 'H' in bill_id else 'upper' classification = 'bill' if 'B' in bill_id else 'resolution' bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=classification) bill.add_source(number_link.xpath('a/@href')[0]) # get bill from API bill_api_url = ( 'http://search-prod.lis.state.oh.us/solarapi/v1/' 'general_assembly_{}/{}/{}/'.format( session, 'bills' if 'B' in bill_id else 'resolutions', bill_id.lower().replace(' ', ''))) data = self.get(bill_api_url).json() # add title if no short title if not bill.title: bill.title = data['items'][0]['longtitle'] bill.add_title(data['items'][0]['longtitle'], 'long title') # this stuff is version-specific for version in data['items']: version_name = version["version"] version_link = base_url + version["pdfDownloadLink"] bill.add_version_link(version_name, version_link, media_type='application/pdf') # we'll use latest bill_version for everything else bill_version = data['items'][0] bill.add_source(bill_api_url) # subjects for subj in bill_version["subjectindexes"]: try: 
bill.add_subject(subj["primary"]) except KeyError: pass try: secondary_subj = subj["secondary"] except KeyError: secondary_subj = "" if secondary_subj: bill.add_subject(secondary_subj) # sponsors sponsors = bill_version["sponsors"] for sponsor in sponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship(sponsor_name, classification='primary', entity_type='person', primary=True) cosponsors = bill_version["cosponsors"] for sponsor in cosponsors: sponsor_name = self.get_sponsor_name(sponsor) bill.add_sponsorship( sponsor_name, classification='cosponsor', entity_type='person', primary=False, ) try: action_doc = self.get(base_url + bill_version["action"][0]["link"]) except scrapelib.HTTPError: pass else: actions = action_doc.json() for action in reversed(actions["items"]): actor = chamber_dict[action["chamber"]] action_desc = action["description"] try: action_type = action_dict[action["actioncode"]] except KeyError: self.warning( "Unknown action {desc} with code {code}." " Add it to the action_dict" ".".format(desc=action_desc, code=action["actioncode"])) action_type = None date = self._tz.localize( datetime.datetime.strptime(action["datetime"], "%Y-%m-%dT%H:%M:%S")) date = "{:%Y-%m-%d}".format(date) bill.add_action(action_desc, date, chamber=actor, classification=action_type) # attach documents gathered earlier self.add_document(all_amendments, bill_id, "amendment", bill, base_url) self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url) self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url) self.add_document(all_analysis, bill_id, "analysis", bill, base_url) # votes vote_url = base_url + bill_version["votes"][0]["link"] vote_doc = self.get(vote_url) votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) vote_url = base_url vote_url += bill_version["cmtevotes"][0]["link"] try: vote_doc = self.get(vote_url) except scrapelib.HTTPError: self.warning("Vote page not " "loading; skipping: {}".format(vote_url)) continue votes = vote_doc.json() yield from self.process_vote(votes, vote_url, base_url, bill, legislators, chamber_dict, vote_results) # we have never seen a veto or a disapprove, but they seem important. # so we'll check and throw an error if we find one # life is fragile. so are our scrapers. if "veto" in bill_version: veto_url = base_url + bill_version["veto"][0]["link"] veto_json = self.get(veto_url).json() if len(veto_json["items"]) > 0: raise AssertionError("Whoa, a veto! We've never" " gotten one before." " Go write some code to deal" " with it: {}".format(veto_url)) if "disapprove" in bill_version: disapprove_url = base_url + bill_version["disapprove"][0][ "link"] disapprove_json = self.get(disapprove_url).json() if len(disapprove_json["items"]) > 0: raise AssertionError( "Whoa, a disapprove! We've never" " gotten one before." " Go write some code to deal " "with it: {}".format(disapprove_url)) yield bill
def scrape(self, session=None): if not session: session = self.latest_session() self.info('no session specified, using %s', session) # get member id matching for vote parsing member_ids = self.get_member_ids()[session] per_page = 10 # seems like it gives 10 no matter what. start_record = 0 headers = {"Content-Type": "application/json"} url = ("http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/" "GetPublicAdvancedSearch") bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData" params = { "request": { "sEcho": 2, "iColumns": 4, "sColumns": "", "iDisplayStart": 0, "iDisplayLength": per_page, "mDataProp_0": "ShortTitle", "mDataProp_1": "Title", "mDataProp_2": "LegislationCategories", "mDataProp_3": "Modified", "iSortCol_0": 0, "sSortDir_0": "asc", "iSortingCols": 0, "bSortable_0": "true", "bSortable_1": "true", "bSortable_2": "true", "bSortable_3": "true" }, "criteria": { "Keyword": "", "Category": "", "SubCategoryId": "", "RequestOf": "", "CouncilPeriod": str(session), "Introducer": "", "CoSponsor": "", "CommitteeReferral": "", "CommitteeReferralComments": "", "StartDate": "", "EndDate": "", "QueryLimit": 100, "FilterType": "", "Phases": "", "LegislationStatus": "0", "IncludeDocumentSearch": "false" } } param_json = json.dumps(params) response = self.post(url, headers=headers, data=param_json) # the response is a terrible string-of-nested-json-strings. Yuck. response = decode_json(response.json()["d"]) data = response["aaData"] global bill_versions while len(data) > 0: for bill in data: # sometimes they're in there more than once, so we'll keep track bill_versions = [] bill_id = bill["Title"] if bill_id.startswith("AG"): # actually an agenda, skip continue bill_params = {"legislationId": bill_id} bill_info = self.post(bill_url, headers=headers, data=json.dumps(bill_params)) bill_info = decode_json(bill_info.json()["d"])["data"] bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id legislation_info = bill_info["Legislation"][0] title = legislation_info["ShortTitle"] if bill_id.startswith("R") or bill_id.startswith("CER"): bill_type = "resolution" else: bill_type = "bill" bill = Bill(bill_id, legislative_session=session, title=title, classification=bill_type) # sponsors and cosponsors if "Introducer" in legislation_info: introducers = legislation_info["Introducer"] intro_date = self.date_format( legislation_info["IntroductionDate"]) bill.add_action("Introduced", intro_date, classification="introduction") else: # sometimes there are introducers, sometimes not. # Set Introducers to empty array to avoid downstream breakage, # but log bills without introducers self.logger.warning("No Introducer: {0}".format( bill.identifier)) introducers = [] try: # sometimes there are cosponsors, sometimes not. 
cosponsors = legislation_info["CoSponsor"] except KeyError: cosponsors = [] for i in introducers: name = i["Name"] # they messed up Phil Mendelson's name if name == "Phil Pmendelson": name = "Phil Mendelson" bill.add_sponsorship(name, classification='primary', entity_type='person', primary=True) for s in cosponsors: name = s["Name"] if name == "Phil Pmendelson": name = "Phil Mendelson" bill.add_sponsorship(name=name, classification="cosponsor", entity_type='person', primary=False) # if it's become law, add the law number as an alternate title if "LawNumber" in legislation_info: law_num = legislation_info["LawNumber"] if law_num: bill.add_title(law_num) # also sometimes it's got an act number if "ActNumber" in legislation_info: act_num = legislation_info["ActNumber"] if act_num: bill.add_title(act_num) # sometimes AdditionalInformation has a previous bill name if "AdditionalInformation" in legislation_info: add_info = legislation_info["AdditionalInformation"] if "previously" in add_info.lower(): prev_title = add_info.lower().replace( "previously", "").strip().replace(" ", "") bill.add_title(prev_title.upper()) elif add_info: bill.extras["additional_information"] = add_info if "WithDrawnDate" in legislation_info: withdrawn_date = self.date_format( legislation_info["WithDrawnDate"]) withdrawn_by = legislation_info["WithdrawnBy"][0][ "Name"].strip() if withdrawn_by == "the Mayor": bill.add_action("withdrawn", withdrawn_date, chamber="executive", classification="withdrawal") elif "committee" in withdrawn_by.lower(): a = bill.add_action("withdrawn", withdrawn_date, classification="withdrawal") a.add_related_entity(withdrawn_by, entity_type='organization') else: a = bill.add_action("withdrawn", withdrawn_date, classification="withdrawal") a.add_related_entity(withdrawn_by, entity_type='person') # deal with actions involving the mayor mayor = bill_info["MayorReview"] if mayor != []: mayor = mayor[0] if "TransmittedDate" in mayor: transmitted_date = self.date_format( mayor["TransmittedDate"]) bill.add_action("transmitted to mayor", transmitted_date, chamber="executive", classification="executive-receipt") if 'SignedDate' in mayor: signed_date = self.date_format(mayor["SignedDate"]) bill.add_action("signed", signed_date, chamber="executive", classification="executive-signature") # if returned but not signed, it was vetoed elif 'ReturnedDate' in mayor: veto_date = self.date_format(mayor["ReturnedDate"]) bill.add_action("vetoed", veto_date, chamber="executive", classification="executive-veto") # if it was returned and enacted but not signed, there was a veto override if 'EnactedDate' in mayor: override_date = self.date_format( mayor["EnactedDate"]) bill.add_action( "veto override", override_date, classification="veto-override-passage") if 'AttachmentPath' in mayor: # documents relating to the mayor's review self.add_documents(mayor["AttachmentPath"], bill) congress = bill_info["CongressReview"] if len(congress) > 0: congress = congress[0] if "TransmittedDate" in congress: transmitted_date = self.date_format( congress["TransmittedDate"]) bill.add_action("Transmitted to Congress for review", transmitted_date) # deal with committee actions if "DateRead" in legislation_info: date = legislation_info["DateRead"] elif "IntroductionDate" in legislation_info: date = legislation_info["IntroductionDate"] else: self.logger.warning( "we can't find anything that looks like an " "action date. 
Skipping") continue date = self.date_format(date) if "CommitteeReferral" in legislation_info: committees = [] for committee in legislation_info["CommitteeReferral"]: if committee["Name"].lower( ) == "retained by the council": committees = [] break else: committees.append(committee["Name"]) if committees != []: a = bill.add_action( "referred to committee", date, classification="referral-committee") for com in committees: a.add_related_entity(com, entity_type='organization') if "CommitteeReferralComments" in legislation_info: a = bill.add_action("comments from committee", date) for committee in legislation_info[ "CommitteeReferralComments"]: a.add_related_entity(committee["Name"], entity_type='organization') # deal with random docs floating around docs = bill_info["OtherDocuments"] for d in docs: if "AttachmentPath" in d: self.add_documents(d["AttachmentPath"], bill) else: self.logger.warning( "Document path missing from 'Other Documents'") if "MemoLink" in legislation_info: self.add_documents(legislation_info["MemoLink"], bill) if "AttachmentPath" in legislation_info: self.add_documents(legislation_info["AttachmentPath"], bill) # full council votes votes = bill_info["VotingSummary"] for vote in votes: v = self.process_vote(vote, bill, member_ids) if v: v.add_source(bill_source_url) yield v # deal with committee votes if "CommitteeMarkup" in bill_info: committee_info = bill_info["CommitteeMarkup"] if len(committee_info) > 0: for committee_action in committee_info: v = self.process_committee_vote( committee_action, bill) if v: v.add_source(bill_source_url) yield v if "AttachmentPath" in committee_info: self.add_documents(vote["AttachmentPath"], bill) bill.add_source(bill_source_url) yield bill # get next page start_record += per_page params["request"]["iDisplayStart"] = start_record param_json = json.dumps(params) response = self.post(url, headers=headers, data=param_json) response = decode_json(response.json()["d"]) data = response["aaData"]
def scrape_bill_type( self, chamber, session, bill_type, type_abbr, committee_abbr_regex=get_committee_name_regex(), ): bills = (self.session.query(CABill).filter_by( session_year=session).filter_by(measure_type=type_abbr)) for bill in bills: bill_session = session if bill.session_num != "0": bill_session += " Special Session %s" % bill.session_num bill_id = bill.short_bill_id fsbill = Bill(bill_id, session, title="", chamber=chamber) if (bill_id.startswith("S") and chamber == "lower") or (bill_id.startswith("A") and chamber == "upper"): print("!!!! BAD ID/CHAMBER PAIR !!!!", bill) continue # # Construct session for web query, going from '20092010' to '0910' # source_session = session[2:4] + session[6:8] # # Turn 'AB 10' into 'ab_10' # source_num = "%s_%s" % (bill.measure_type.lower(), # bill.measure_num) # Construct a fake source url source_url = ("http://leginfo.legislature.ca.gov/faces/" "billNavClient.xhtml?bill_id=%s") % bill.bill_id fsbill.add_source(source_url) fsbill.add_version_link(bill_id, source_url, media_type="text/html") title = "" type_ = ["bill"] subject = "" all_titles = set() # Get digest test (aka "summary") from latest version. if bill.versions: version = bill.versions[-1] nsmap = version.xml.nsmap xpath = "//caml:DigestText/xhtml:p" els = version.xml.xpath(xpath, namespaces=nsmap) chunks = [] for el in els: t = etree_text_content(el) t = re.sub(r"\s+", " ", t) t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t) chunks.append(t) summary = "\n\n".join(chunks) for version in bill.versions: if not version.bill_xml: continue version_date = self._tz.localize( version.bill_version_action_date) # create a version name to match the state's format # 02/06/17 - Enrolled version_date_human = version_date.strftime("%m/%d/%y") version_name = "{} - {}".format(version_date_human, version.bill_version_action) version_base = "https://leginfo.legislature.ca.gov/faces" version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format( version_base, version.bill_id, version.bill_version_id) fsbill.add_version_link( version_name, version_url_pdf, media_type="application/pdf", date=version_date.date(), ) # CA is inconsistent in that some bills have a short title # that is longer, more descriptive than title. 
if bill.measure_type in ("AB", "SB"): impact_clause = clean_title(version.title) title = clean_title(version.short_title) else: impact_clause = None if len(version.title) < len( version.short_title) and not version.title.lower( ).startswith("an act"): title = clean_title(version.short_title) else: title = clean_title(version.title) if title: all_titles.add(title) type_ = [bill_type] if version.appropriation == "Yes": type_.append("appropriation") tags = [] if version.fiscal_committee == "Yes": tags.append("fiscal committee") if version.local_program == "Yes": tags.append("local program") if version.urgency == "Yes": tags.append("urgency") if version.taxlevy == "Yes": tags.append("tax levy") if version.subject: subject = clean_title(version.subject) if not title: self.warning("Couldn't find title for %s, skipping" % bill_id) continue fsbill.title = title if summary: fsbill.add_abstract(summary, note="summary") fsbill.classification = type_ fsbill.subject = [subject] if subject else [] fsbill.extras["impact_clause"] = impact_clause fsbill.extras["tags"] = tags # We don't want the current title in alternate_titles all_titles.remove(title) for title in all_titles: fsbill.add_title(title) for author in version.authors: fsbill.add_sponsorship( author.name, classification=SPONSOR_TYPES[author.contribution], primary=author.primary_author_flg == "Y", entity_type="person", ) # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution} seen_actions = set() for action in bill.actions: if not action.action: # NULL action text seems to be an error on CA's part, # unless it has some meaning I'm missing continue actor = action.actor or chamber actor = actor.strip() match = re.match(r"(Assembly|Senate)($| \(Floor)", actor) if match: actor = { "Assembly": "lower", "Senate": "upper" }[match.group(1)] elif actor.startswith("Governor"): actor = "executive" else: def replacer(matchobj): if matchobj: return { "Assembly": "lower", "Senate": "upper" }[matchobj.group()] else: return matchobj.group() actor = re.sub(r"^(Assembly|Senate)", replacer, actor) type_ = [] act_str = action.action act_str = re.sub(r"\s+", " ", act_str) attrs = self.categorizer.categorize(act_str) # Add in the committee strings of the related committees, if any. kwargs = attrs matched_abbrs = committee_abbr_regex.findall(action.action) if re.search(r"Com[s]?. on", action.action) and not matched_abbrs: msg = "Failed to extract committee abbr from %r." self.logger.warning(msg % action.action) if matched_abbrs: committees = [] for abbr in matched_abbrs: try: name = self.committee_abbr_to_name(chamber, abbr) committees.append(name) except KeyError: msg = ("Mapping contains no committee name for " "abbreviation %r. Action text was %r.") args = (abbr, action.action) raise KeyError(msg % args) committees = filter(None, committees) kwargs["committees"] = committees code = re.search(r"C[SXZ]\d+", actor) if code is not None: code = code.group() kwargs["actor_info"] = {"committee_code": code} assert len(list(committees)) == len(matched_abbrs) for committee, abbr in zip(committees, matched_abbrs): act_str = act_str.replace("Coms. on ", "") act_str = act_str.replace("Com. on " + abbr, committee) act_str = act_str.replace(abbr, committee) if not act_str.endswith("."): act_str = act_str + "." # Determine which chamber the action originated from. 
changed = False for committee_chamber in ["upper", "lower", "legislature"]: if actor.startswith(committee_chamber): actor = committee_chamber changed = True break if not changed: actor = "legislature" if actor != action.actor: actor_info = kwargs.get("actor_info", {}) actor_info["details"] = action.actor kwargs["actor_info"] = actor_info # Add strings for related legislators, if any. rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+" legislators = re.findall(rgx, action.action, re.I) if legislators: kwargs["legislators"] = legislators date = action.action_date date = self._tz.localize(date) date = date.date() if (actor, act_str, date) in seen_actions: continue kwargs.update(self.categorizer.categorize(act_str)) action = fsbill.add_action( act_str, date.strftime("%Y-%m-%d"), chamber=actor, classification=kwargs["classification"], ) for committee in kwargs.get("committees", []): action.add_related_entity(committee, entity_type="organization") seen_actions.add((actor, act_str, date)) for vote_num, vote in enumerate(bill.votes): if vote.vote_result == "(PASS)": result = True else: result = False if not vote.location: continue full_loc = vote.location.description first_part = full_loc.split(" ")[0].lower() if first_part in ["asm", "assembly"]: vote_chamber = "lower" # vote_location = ' '.join(full_loc.split(' ')[1:]) elif first_part.startswith("sen"): vote_chamber = "upper" # vote_location = ' '.join(full_loc.split(' ')[1:]) else: raise ScrapeError("Bad location: %s" % full_loc) if vote.motion: motion = vote.motion.motion_text or "" else: motion = "" if "Third Reading" in motion or "3rd Reading" in motion: vtype = "passage" elif "Do Pass" in motion: vtype = "passage" else: vtype = "other" motion = motion.strip() # Why did it take until 2.7 to get a flags argument on re.sub? motion = re.compile(r"(\w+)( Extraordinary)? Session$", re.IGNORECASE).sub("", motion) motion = re.compile(r"^(Senate|Assembly) ", re.IGNORECASE).sub("", motion) motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "", motion) motion = re.sub(r" \(\w+\)$", "", motion) motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "", motion) motion = re.sub( r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? " r"Urgency Clause$", "(Urgency Clause)", motion, ) motion = re.sub(r"\s+", " ", motion) if not motion: self.warning("Got blank motion on vote for %s" % bill_id) continue # XXX this is responsible for all the CA 'committee' votes, not # sure if that's a feature or bug, so I'm leaving it as is... 
# vote_classification = chamber if (vote_location == 'Floor') else 'committee' # org = { # 'name': vote_location, # 'classification': vote_classification # } fsvote = VoteEvent( motion_text=motion, start_date=self._tz.localize(vote.vote_date_time), result="pass" if result else "fail", classification=vtype, # organization=org, chamber=vote_chamber, bill=fsbill, ) fsvote.extras = {"threshold": vote.threshold} source_url = ("http://leginfo.legislature.ca.gov/faces" "/billVotesClient.xhtml?bill_id={}").format( fsbill.identifier) fsvote.add_source(source_url) fsvote.pupa_id = source_url + "#" + str(vote_num) rc = {"yes": [], "no": [], "other": []} for record in vote.votes: if record.vote_code == "AYE": rc["yes"].append(record.legislator_name) elif record.vote_code.startswith("NO"): rc["no"].append(record.legislator_name) else: rc["other"].append(record.legislator_name) # Handle duplicate votes for key in rc.keys(): rc[key] = list(set(rc[key])) for key, voters in rc.items(): for voter in voters: fsvote.vote(key, voter) # Set counts by summed votes for accuracy fsvote.set_count(key, len(voters)) yield fsvote yield fsbill self.session.expire_all()
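# The roll-call handling above de-duplicates voters and then derives the
# counts from the de-duplicated lists rather than trusting upstream totals.
# The same pattern, standalone (names invented):
rc = {"yes": ["A", "B", "B"], "no": ["C"], "other": []}
rc = {key: list(set(voters)) for key, voters in rc.items()}
counts = {key: len(voters) for key, voters in rc.items()}
assert counts == {"yes": 2, "no": 1, "other": 0}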
def scrape_details(self, bill_detail_url, session, chamber, bill_id): """ Create the Bill and add the information obtained from the provided bill_detail_url. and then yield the bill object. :param bill_detail_url: :param session: :param chamber: :param bill_id: :return: """ page = self.get(bill_detail_url).text if 'INVALID BILL NUMBER' in page: self.warning('INVALID BILL %s' % bill_detail_url) return doc = lxml.html.fromstring(page) doc.make_links_absolute(bill_detail_url) bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0] bill_type = bill_div.xpath('span/text()')[0] if 'General Bill' in bill_type: bill_type = 'bill' elif 'Concurrent Resolution' in bill_type: bill_type = 'concurrent resolution' elif 'Joint Resolution' in bill_type: bill_type = 'joint resolution' elif 'Resolution' in bill_type: bill_type = 'resolution' else: raise ValueError('unknown bill type: %s' % bill_type) # this is fragile, but less fragile than it was b = bill_div.xpath('./b[text()="Summary:"]')[0] bill_summary = b.getnext().tail.strip() bill = Bill( bill_id, legislative_session= session, # session name metadata's `legislative_sessions` chamber=chamber, # 'upper' or 'lower' title=bill_summary, classification=bill_type) subjects = list(self._subjects[bill_id]) for subject in subjects: bill.add_subject(subject) # sponsors for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'): bill.add_sponsorship(name=sponsor, classification='primary', primary=True, entity_type='person') for sponsor in doc.xpath( '//a[contains(@href, "committee.php")]/text()'): sponsor = sponsor.replace(u'\xa0', ' ').strip() bill.add_sponsorship(name=sponsor, classification='primary', primary=True, entity_type='organization') # find versions version_url = doc.xpath('//a[text()="View full text"]/@href')[0] version_html = self.get(version_url).text version_doc = lxml.html.fromstring(version_html) version_doc.make_links_absolute(version_url) for version in version_doc.xpath('//a[contains(@href, "/prever/")]'): # duplicate versions with same date, use first appearance bill.add_version_link( note=version. text, # Description of the version from the state; # eg, 'As introduced', 'Amended', etc. url=version.get('href'), on_duplicate='ignore', media_type='text/html' # Still a MIME type ) # actions for row in bill_div.xpath('table/tr'): date_td, chamber_td, action_td = row.xpath('td') date = datetime.datetime.strptime(date_td.text, "%m/%d/%y") action_chamber = { 'Senate': 'upper', 'House': 'lower', None: 'legislature' }[chamber_td.text] action = action_td.text_content() action = action.split('(House Journal')[0] action = action.split('(Senate Journal')[0].strip() atype = action_type(action) bill.add_action( description=action, # Action description, from the state date=date.strftime('%Y-%m-%d'), # `YYYY-MM-DD` format chamber=action_chamber, # 'upper' or 'lower' classification=atype # Options explained in the next section ) # votes vurl = doc.xpath('//a[text()="View Vote History"]/@href') if vurl: vurl = vurl[0] yield from self.scrape_vote_history(bill, vurl) bill.add_source(bill_detail_url) yield bill
def scrape_matter(self, matter_link, sess):
    matter_types = {
        "Additions": "other", "Administrative Order": "order",
        "Annual Evaluation": "other", "Bid Advertisement": "other",
        "Bid Awards": "other", "Bid Contract": "contract",
        "Bid Protest": "other", "Bid Rejection": "other",
        "Birthday Scroll": "commemoration",
        "Certificate of Appreciation": "commemoration",
        "Change Order": "order", "Citizen's Presentation": "other",
        "Commendation": "commemoration", "Conflict Waiver": "other",
        "Congratulatory Certificate": "commemoration",
        "Deferrals": "other", "Discussion Item": "other",
        "Distinguished Visitor": "other",
        "Joint Meeting/Workshop": "other", "Mayoral Veto": "other",
        "Miscellaneous": "other", "Nomination": "nomination",
        "Oath of Office": "other", "Omnibus Reserve": "bill",
        "Ordinance": "ordinance", "Plaque": "commemoration",
        "Presentation": "other", "Proclamation": "proclamation",
        "Professional Service Agreement": "contract",
        "Public Hearing": "other", "Report": "other",
        "Request for Proposals": "other",
        "Request for Qualifications": "other",
        "Request to Advertise": "other", "Resolution": "resolution",
        "Resolution of Sympathy": "resolution",
        "Service Awards": "commemoration", "Special Item": "other",
        "Special Presentation": "other", "Supplement": "other",
        "Swearing-In": "other", "Time Sensitive Items": "other",
        "Withdrawals": "other", "Workshop Item": "other",
        "Zoning": "other", "Zoning Resolution": "resolution",
    }
    matter_doc = self.lxmlize(matter_link)
    info_dict = self.matter_table_to_dict(matter_doc)
    # we're going to use the year of the intro date as the session
    # until/unless we come up with something better
    intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
    session = sess["identifier"]
    try:
        file_type = info_dict["File Type"]
    except KeyError:
        category = 'other'
    else:
        category = matter_types[file_type]
    if 'File Name' in info_dict:
        title = info_dict["File Name"]
    elif "Title" in info_dict and info_dict["Title"].strip():
        title = info_dict["Title"].strip()
    else:
        self.warning("bill has no title")
        return
    if category == 'other':
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title)
    else:
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title,
                    classification=category)
    for spons in info_dict["Sponsors"]:
        if spons == "NONE":
            continue
        try:
            name, spons_type = spons.rsplit(",", 1)
        except ValueError:
            name = spons
            spons_type = "Sponsor"
        primary = True if "Prime Sponsor" in spons_type else False
        entity = "person"
        if "committee" in name:
            # committees are organizations, not people; `entity = committee`
            # here was an undefined name
            entity = "organization"
        bill.add_sponsorship(name, spons_type, entity, primary)
    if "Indexes" in info_dict:
        for subj in info_dict["Indexes"]:
            if subj.strip() and subj.strip() != "NONE":
                bill.add_subject(subj.strip())
    if "Title" in info_dict and info_dict["Title"].strip():
        note = "bill's long title"
        if "Note" in info_dict and info_dict["Note"].strip():
            note = info_dict["Note"]
        bill.add_abstract(abstract=info_dict["Title"], note=note)
    self.process_action_table(matter_doc, bill)
    bill.add_source(matter_link, note='web')
    yield bill
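# Worked example of the sponsor parsing above: "Sponsors" entries look like
# "Name, Prime Sponsor" (the sample value is invented):
spons = "Jane Doe, Prime Sponsor"
name, spons_type = spons.rsplit(",", 1)
assert (name, spons_type.strip(), "Prime Sponsor" in spons_type) == \
    ("Jane Doe", "Prime Sponsor", True)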
def parse_bill(self, chamber, session, bill_id, url): page = self.lxmlize(url) short_bill_id = re.sub(r'(H|S)([JC])R', r'\1\2', bill_id) version_link_node = self.get_node( page, '//a[contains(@href, "{bill_id}/bill.doc") or contains(@href,' '"{bill_id}/bill.pdf")]'.format(bill_id=short_bill_id)) if version_link_node is None: # Bill withdrawn self.logger.warning('Bill withdrawn.') return else: source_url = version_link_node.attrib['href'] if source_url.endswith('.doc'): mimetype = 'application/msword' elif source_url.endswith('.pdf'): mimetype = 'application/pdf' if self._is_post_2016: title_texts = self.get_nodes( page, '//div[@class="StandardText leftDivMargin"]/text()') title_texts = list( filter(None, [text.strip() for text in title_texts])) title_texts = [ s for s in title_texts if s != ',' and not s.startswith('(BR ') ] title = ' '.join(title_texts) actions = self.get_nodes( page, '//div[@class="StandardText leftDivMargin"]/' 'div[@class="StandardText"][last()]//text()[normalize-space()]' ) else: pars = version_link_node.xpath("following-sibling::p") if len(pars) == 2: title = pars[0].xpath("string()") action_p = pars[1] else: title = pars[0].getprevious().tail if not title: self.warning( 'walking backwards to get bill title, error prone!') title = pars[0].getprevious().getprevious() while not title.tail: title = title.getprevious() title = title.tail self.warning('got title the dangerous way: %s' % title) action_p = pars[0] title = re.sub(r'[\s\xa0]+', ' ', title).strip() actions = action_p.xpath("string()").split("\n") if 'CR' in bill_id: bill_type = 'concurrent resolution' elif 'JR' in bill_id: bill_type = 'joint resolution' elif 'R' in bill_id: bill_type = 'resolution' else: bill_type = 'bill' bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.subject = self._subjects[bill_id] bill.add_source(url) bill.add_version_link("Most Recent Version", source_url, media_type=mimetype) other_versions = page.xpath( '//a[contains(@href, "/recorddocuments/bill/") and' ' not(contains(@href, "/bill.pdf")) and' ' not(contains(@href, "/bill.doc")) and' ' not(contains(@href, "/LM.pdf"))]') for version_link in other_versions: source_url = version_link.attrib['href'] if source_url.endswith('.doc'): mimetype = 'application/msword' elif source_url.endswith('.pdf'): mimetype = 'application/pdf' version_title = version_link.xpath('text()')[0] bill.add_version_link(version_title, source_url, media_type=mimetype) # LM is "Locally Mandated fiscal impact" fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]') for fiscal_note in fiscal_notes: source_url = fiscal_note.attrib['href'] if source_url.endswith('.doc'): mimetype = 'application/msword' elif source_url.endswith('.pdf'): mimetype = 'application/pdf' bill.add_document_link("Fiscal Note", source_url, media_type=mimetype) for link in page.xpath("//a[contains(@href, 'legislator/')]"): bill.add_sponsorship(link.text.strip(), classification='primary', entity_type='person', primary=True) for line in actions: line_actions = line.strip().split(';') for index, action in enumerate(line_actions): action = action.strip() if not action: continue action_date_text = line.split('-')[0].strip() if self._is_post_2016: action_date_string = action_date_text.replace(',', '') else: action_date_string = '{} {}'.format( action_date_text, session[0:4]) # This patch is super hacky, but allows us to better # capture actions that screw up the formatting such as # veto document links. 
try: action_date = datetime.datetime.strptime( action_date_string, '%b %d %Y') cached_action_date = action_date used_cached_action_date = False except ValueError: action_date = cached_action_date used_cached_action_date = True # Separate out theif first action on the line. if index == 0 and not used_cached_action_date: action = '-'.join(action.split('-')[1:]).strip() if not action: continue if action.endswith('House') or action.endswith('(H)'): actor = 'lower' elif action.endswith('Senate') or action.endswith('(S)'): actor = 'upper' else: actor = chamber # For chamber passage, # the only way to determine chamber correctly is # how many total people voted on it if action.startswith('3rd reading'): votes = re.search(r'(\d+)\-(\d+)', action) if votes: yeas = int(votes.groups(1)[0]) nays = int(votes.groups(1)[1]) # 50 is the quorum for the house, # and more than the number of senators if yeas + nays > 50: actor = 'lower' elif (yeas + nays > 20) and (yeas + nays < 50): actor = 'upper' atype = [] if 'introduced in' in action: atype.append('introduction') if 'to ' in action: atype.append('referral-committee') elif 'signed by Governor' in action: atype.append('executive-signature') elif 'vetoed' in action: atype.append('executive-veto') # Get the accompanying veto message document. There # should only be one. veto_document_link = self.get_node( page, '//div[@class="StandardText leftDivMargin"]/' 'div[@class="StandardText"][last()]/a[contains(@href,' '"veto.pdf")]') if veto_document_link is not None: bill.add_document_link( "Veto Message", veto_document_link.attrib['href'], on_duplicate='ignore') elif re.match(r'^to [A-Z]', action): atype.append('referral-committee') elif action == 'adopted by voice vote': atype.append('passage') if '1st reading' in action: atype.append('reading-1') if '3rd reading' in action: atype.append('reading-3') if 'passed' in action: atype.append('passage') if '2nd reading' in action: atype.append('reading-2') if 'delivered to secretary of state' in action.lower(): atype.append('became-law') if 'veto overridden' in action.lower(): atype.append('veto-override-passage') if 'R' in bill_id and 'adopted by voice vote' in action: atype.append('passage') amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*' r'( and \([a-z\d\-]+\))? filed') if re.search(amendment_re, action): atype.append('amendment-introduction') if not atype: atype = None # Capitalize the first letter of the action for nicer # display. capitalize() won't work for this because it # lowercases all other letters. action = (action[0].upper() + action[1:]) action_date = timezone('America/Kentucky/Louisville').localize( action_date) action_date = action_date.strftime('%Y-%m-%d') if action: bill.add_action(action, action_date, chamber=actor, classification=atype) try: votes_link = page.xpath( "//a[contains(@href, 'vote_history.pdf')]")[0] bill.add_document_link("Vote History", votes_link.attrib['href']) except IndexError: # No votes self.logger.warning(u'No votes found for {}'.format(title)) pass # Ugly Hack Alert! # find actions before introduction date and subtract 1 from the year # if the date is after introduction intro_date = None for i, action in enumerate(bill.actions): if 'introduction' in action['classification']: intro_date = action['date'] break for action in bill.actions[:i]: if action['date'] > intro_date: action['date'] = action['date'].replace( year=action['date'].year - 1) self.debug('corrected year for %s', action['action']) yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    page = self.lxmlize(url)
    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()
    if '.B. ' in bill_id:
        bill_type = 'bill'
    elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
        bill_type = 'resolution'
    elif '.C.R. ' in bill_id:
        bill_type = 'concurrent resolution'
    elif '.J.R. ' in bill_id:
        bill_type = 'joint resolution'
    else:
        # fail loudly on an unrecognized bill id format
        raise ValueError('unknown bill type: %s' % bill_id)
    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(url)
    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        (title, name) = [x.strip() for x in info.xpath('.//text()')
                         if x.strip()]
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification='primary',
                             entity_type='person', primary=True)
    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(floor_sponsor, classification='cosponsor',
                             entity_type='person', primary=False)
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")
    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]')
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get('href')
        if not url:
            url = version.xpath('following-sibling::a[1]/@href')[0]
        bill.add_version_link(version.xpath('text()')[0].strip(), url,
                              media_type='application/pdf')
    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath('@href')[0]
        if '.fn.pdf' in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type='application/pdf')
        else:
            text = related.xpath('text()')[0]
            bill.add_document_link(text, href,
                                   media_type='application/pdf')
    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects
    status_table = page.xpath('//div[@id="billStatus"]//table')[0]
    yield from self.parse_status(bill, status_table, chamber)
    yield bill
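# `SUB_BLACKLIST` lives elsewhere in the module; it holds the substitute-bill
# prefixes the site prepends to a bill id. A sketch of the kind of values it
# would contain (the exact entries are assumptions):
SUB_BLACKLIST = [
    "1st Sub.", "2nd Sub.", "3rd Sub.", "4th Sub.", "5th Sub.",
]
# With the blacklist applied, '1st Sub. H.B. 101' collapses to 'H.B. 101',
# so every substitute attaches to the same Bill object.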
def scrape(self, window=28) : n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window)) for matter in self.matters(n_days_ago) : matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)) : continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name":"Board of Directors"}) legistar_web = matter['legistar_url'] legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id) : act = bill.add_action(**action) if action['description'] == 'Referred' : body_name = matter['MatterBodyName'] act.add_related_entity(body_name, 'organization', entity_id = _make_pseudo_id(name=body_name)) result, votes = vote if result : vote_event = VoteEvent(legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes : raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id) : bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id) : bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: # Get data (i.e., json) for the related bill. # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session). # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue. related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: continue else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill(identifier=identifier, legislative_session=related_bill_session, relation_type='companion') # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104 # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'. 
bill.add_version_link('Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id) : if attachment['MatterAttachmentName'] : bill.add_document_link(attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification' : matter['MatterTypeName']} text = self.text(matter_id) if text : if text['MatterTextPlain'] : bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf'] : bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '') yield bill
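# `self.VOTE_OPTIONS` maps Legistar's free-form vote values onto the fixed
# options pupa accepts; unknown values fall through unchanged via
# `.get(raw_option, raw_option)`. A plausible mapping (the keys are
# assumptions about what Legistar returns):
VOTE_OPTIONS = {
    'aye': 'yes',
    'nay': 'no',
    'abstain': 'abstain',
    'absent': 'absent',
    'recused': 'excused',
}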
def scrape_bill(self, session, history_url): history_xml = self.get(history_url).text root = etree.fromstring(history_xml) bill_title = root.findtext("caption") if (bill_title is None or "Bill does not exist" in history_xml): self.warning("Bill does not appear to exist") return bill_id = ' '.join(root.attrib['bill'].split(' ')[1:]) chamber = self.CHAMBERS[bill_id[0]] if bill_id[1] == 'B': bill_type = ['bill'] elif bill_id[1] == 'R': bill_type = ['resolution'] elif bill_id[1:3] == 'CR': bill_type = ['concurrent resolution'] elif bill_id[1:3] == 'JR': bill_type = ['joint resolution'] else: raise ScrapeError("Invalid bill_id: %s" % bill_id) bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=bill_title, classification=bill_type) bill.add_source(history_url) for subject in root.iterfind('subjects/subject'): bill.add_subject(subject.text.strip()) versions = [x for x in self.versions if x[0] == bill_id] for version in versions: bill.add_version_link(note=self.NAME_SLUGS[version[1][-5]], url=version[1], media_type='text/html') analyses = [x for x in self.analyses if x[0] == bill_id] for analysis in analyses: bill.add_document_link(note="Analysis ({})".format( self.NAME_SLUGS[analysis[1][-5]]), url=analysis[1], media_type='text/html') fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id] for fiscal_note in fiscal_notes: bill.add_document_link(note="Fiscal Note ({})".format( self.NAME_SLUGS[fiscal_note[1][-5]]), url=fiscal_note[1], media_type='text/html') witnesses = [x for x in self.witnesses if x[0] == bill_id] for witness in witnesses: bill.add_document_link(note="Witness List ({})".format( self.NAME_SLUGS[witness[1][-5]]), url=witness[1], media_type='text/html') for action in root.findall('actions/action'): act_date = datetime.datetime.strptime(action.findtext('date'), "%m/%d/%Y").date() action_number = action.find('actionNumber').text actor = { 'H': 'lower', 'S': 'upper', 'E': 'executive' }[action_number[0]] desc = action.findtext('description').strip() if desc == 'Scheduled for public hearing on . . .': self.warning("Skipping public hearing action with no date") continue introduced = False if desc == 'Amended': atype = 'amendment-passage' elif desc == 'Amendment(s) offered': atype = 'amendment-introduction' elif desc == 'Amendment amended': atype = 'amendment-amendment' elif desc == 'Amendment withdrawn': atype = 'amendment-withdrawal' elif desc == 'Passed' or desc == 'Adopted': atype = 'passage' elif re.match(r'^Received (by|from) the', desc): if 'Secretary of the Senate' not in desc: atype = 'introduction' else: atype = 'filing' elif desc.startswith('Sent to the Governor'): # But what if it gets lost in the mail? 
atype = 'executive-receipt' elif desc.startswith('Signed by the Governor'): atype = 'executive-signature' elif desc == 'Vetoed by the Governor': atype = 'executive-veto' elif desc == 'Read first time': atype = ['introduction', 'reading-1'] introduced = True elif desc == 'Read & adopted': atype = ['passage'] if not introduced: introduced = True atype.append('introduction') elif desc == "Passed as amended": atype = 'passage' elif (desc.startswith('Referred to') or desc.startswith("Recommended to be sent to ")): atype = 'referral-committee' elif desc == "Reported favorably w/o amendment(s)": atype = 'committee-passage' elif desc == "Filed": atype = 'filing' elif desc == 'Read 3rd time': atype = 'reading-3' elif desc == 'Read 2nd time': atype = 'reading-2' elif desc.startswith('Reported favorably'): atype = 'committee-passage-favorable' else: atype = None act = bill.add_action(action.findtext('description'), act_date, chamber=actor, classification=atype) if atype and 'referral-committee' in atype: repls = ['Referred to', "Recommended to be sent to "] ctty = desc for r in repls: ctty = ctty.replace(r, "").strip() act.add_related_entity(name=ctty, entity_type='organization') for author in root.findtext('authors').split(' | '): if author != "": bill.add_sponsorship(author, classification='primary', entity_type='person', primary=True) for coauthor in root.findtext('coauthors').split(' | '): if coauthor != "": bill.add_sponsorship(coauthor, classification='cosponsor', entity_type='person', primary=False) for sponsor in root.findtext('sponsors').split(' | '): if sponsor != "": bill.add_sponsorship(sponsor, classification='primary', entity_type='person', primary=True) for cosponsor in root.findtext('cosponsors').split(' | '): if cosponsor != "": bill.add_sponsorship(cosponsor, classification='cosponsor', entity_type='person', primary=False) if root.findtext('companions'): self._get_companion(bill) yield bill
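# `self.NAME_SLUGS` is indexed with `url[-5]`, the single character just
# before the '.htm' suffix that the site uses to encode the version. A
# sketch of the mapping; the letter meanings are assumptions:
NAME_SLUGS = {
    'I': 'Introduced',
    'E': 'Engrossed',
    'S': 'Senate Committee Report',
    'H': 'House Committee Report',
    'F': 'Enrolled',
}
# e.g. '.../HB00001I.htm'[-5] == 'I'  ->  'Introduced'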
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(' ', '')) bill_page = self.get(url, verify=False).text html = lxml.html.fromstring(bill_page) html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title, classification=bill_type) bill.add_source(url) for subject in self._subjects[bill_id.replace(' ', '')]: bill.add_subject(subject) if short_title and title.lower() != short_title.lower(): bill.add_title(short_title, 'short title') # documents doc_links = html.xpath('//div[contains(@class,"pf-content")]//a') for link in doc_links: name = link.text_content().strip() href = link.get('href') if 'Engrossment' in name or 'Bill Text' in name: bill.add_version_link(note=name, url=href, media_type="application/pdf") else: bill.add_document_link(note=name, url=href, media_type="application/pdf") def _split(string): return re.split(r"\w+[,|AND]\s+", string) # sponsors range from a committee to one legislator to a group of legs sponsor_lists = bill_tables[0].text_content().split('by') if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if 'COMMITTEE' in sponsors.upper(): bill.add_sponsorship(name=sponsors.strip(), entity_type="organization", primary=True, classification='primary') else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsorship(classification='primary', name=person, entity_type="person", primary=True) actor = chamber last_date = None for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + '/' + session[0:4], "%m/%d/%Y").strftime('%Y-%m-%d') if action.startswith('House'): actor = 'lower' elif action.startswith('Senate'): actor = 'upper' # votes if 'AYES' in action or 'NAYS' in action: yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url) # bill.add_vote_event(vote) # some td's text is seperated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u'\xa0', ' ').strip() atype = get_action(actor, action) bill.add_action(action, date, chamber=actor, classification=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if 'to House' in action: actor = 'lower' elif 'to Senate' in action: actor = 'upper' yield bill
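# `get_bill_type()` is defined elsewhere; it classifies an Idaho bill id by
# its alphabetic prefix. A minimal sketch with an assumed prefix table:
def get_bill_type(bill_id):
    prefix = bill_id.split(' ')[0]  # e.g. 'HCR 3' -> 'HCR'
    return {
        'H': 'bill',
        'S': 'bill',
        'HR': 'resolution',
        'SR': 'resolution',
        'HCR': 'concurrent resolution',
        'SCR': 'concurrent resolution',
        'HJR': 'joint resolution',
        'SJR': 'joint resolution',
        'HJM': 'joint memorial',
        'SJM': 'joint memorial',
    }.get(prefix, 'bill')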
def scrape_bill(self, chamber, session, session_id, bill_id, url): sidebar = lxml.html.fromstring(self.get(url).text) sidebar.make_links_absolute("https://www.legis.iowa.gov") hist_url = (f"https://www.legis.iowa.gov/legislation/billTracking/" f"billHistory?billName={bill_id}&ga={session_id}") req_session = requests.Session() req = requests.get(hist_url) if req.status_code == 500: self.warning("500 error on {}, skipping".format(hist_url)) return page = lxml.html.fromstring(req.text) page.make_links_absolute("https://www.legis.iowa.gov") title = page.xpath('string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[2])').strip() if title == "": # Sometimes the title is moved, see # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88 title = page.xpath('string(//div[@id="content"]/div[@class=' '"divideVert"]/div[4]/div[2])').strip() if title == "": self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url) return if title.lower().startswith("in"): title = page.xpath("string(//table[2]/tr[3])").strip() if "HR" in bill_id or "SR" in bill_id: bill_type = ["resolution"] elif "HJR" in bill_id or "SJR" in bill_id: bill_type = ["joint resolution"] elif "HCR" in bill_id or "SCR" in bill_id: bill_type = ["concurrent resolution"] else: bill_type = ["bill"] bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type, ) bill.add_source(hist_url) # base url for text version (version_abbrev, session_id, bill_id) version_html_url_template = ( "https://www.legis.iowa.gov/docs/" "publications/LG{}/{}/attachments/{}.html") version_pdf_url_template = ("https://www.legis.iowa.gov/docs/" "publications/LG{}/{}/{}.pdf") # get pieces of version_link vpieces = sidebar.xpath('//select[@id="billVersions"]/option') if vpieces: for version in vpieces: version_name = version.text version_abbrev = version.xpath("string(@value)") # Get HTML document of bill version. version_html_url = version_html_url_template.format( version_abbrev.upper(), session_id, bill_id.replace(" ", "")) bill.add_version_link(note=version_name, url=version_html_url, media_type="text/html") # Get PDF document of bill version. 
        version_pdf_url = version_pdf_url_template.format(
            version_abbrev.upper(), session_id, bill_id.replace(" ", ""))
        if "Marked Up" in version_name:
            version_pdf_url = sidebar.xpath(
                "//iframe[@id='bbContextDoc']/@src")[0]
        bill.add_version_link(note=version_name, url=version_pdf_url,
                              media_type="application/pdf")

sponsors_str = page.xpath('string(//div[@id="content"]/div[@class='
                          '"divideVert"]/div/div[4]/div[1])').strip()
if re.search("^By ", sponsors_str):
    sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
# for some bills sponsors listed in different format
else:
    sponsors = re.findall(r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)",
                          sponsors_str)
for sponsor in sponsors:
    sponsor = sponsor.replace(" and", "").strip(" .,")
    # a few sponsors get mangled by our regex
    sponsor = {
        "Means": "Ways & Means",
        "Iowa": "Economic Growth/Rebuild Iowa",
        "Safety": "Public Safety",
        "Resources": "Human Resources",
        "Affairs": "Veterans Affairs",
        "Protection": "Environmental Protection",
        "Government": "State Government",
        "Boef": "De Boef",
    }.get(sponsor, sponsor)
    if sponsor[0].islower():
        # SSBs catch cruft in it ('charges', 'overpayments')
        # https://sunlight.atlassian.net/browse/DATA-286
        continue
    bill.add_sponsorship(
        name=sponsor,
        classification="primary",
        entity_type="person",
        primary=True,
    )
for tr in page.xpath(
        "//table[contains(@class, 'billActionTable')][1]/tbody/tr"):
    date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
    if date.startswith("***"):
        continue
    elif "No history is recorded at this time." in date:
        return
    if date == "":
        continue
    date = datetime.datetime.strptime(date, "%B %d, %Y").date()
    action = tr.xpath("string(td[3])").strip()
    action = re.sub(r"\s+", " ", action)
    # Capture any amendment links.
    version_urls = [link["url"] for version in bill.versions
                    for link in version["links"]]
    if "amendment" in action.lower():
        for anchor in tr.xpath(".//a[1]"):
            if "-" in anchor.text:
                # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                amd_pattern = ("https://www.legis.iowa.gov/docs/"
                               "publications/AMDI/{}/{}.pdf")
                amd_id = anchor.text.replace("-", "").strip()
                amd_url = amd_pattern.format(session_id, amd_id)
                amd_name = "Amendment {}".format(anchor.text.strip())
                if amd_url not in version_urls:
                    bill.add_version_link(note=amd_name, url=amd_url,
                                          media_type="application/pdf")
                    version_urls.append(amd_url)
                else:
                    self.info("Already Added {}, skipping".format(amd_url))
    if "S.J." in action or "SCS" in action:
        actor = "upper"
    elif "H.J." in action or "HCS" in action:
        actor = "lower"
    else:
        actor = "legislature"
    action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()
    if action.startswith("Introduced"):
        atype = ["introduction"]
        if ", referred to" in action:
            atype.append("referral-committee")
    elif action.startswith("Read first time"):
        atype = "reading-1"
    elif action.startswith("Referred to"):
        atype = "referral-committee"
    elif action.startswith("Sent to Governor"):
        atype = "executive-receipt"
    elif action.startswith("Reported Signed by Governor"):
        atype = "executive-signature"
    elif action.startswith("Signed by Governor"):
        atype = "executive-signature"
    elif action.startswith("Vetoed by Governor"):
        atype = "executive-veto"
    elif action.startswith("Item veto"):
        atype = "executive-veto-line-item"
    elif re.match(r"Passed (House|Senate)", action):
        atype = "passage"
    elif re.match(r"Amendment (S|H)-\d+ filed", action):
        atype = ["amendment-introduction"]
        if ", adopted" in action:
            atype.append("amendment-passage")
    elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted", action):
        atype = "amendment-passage"
    elif re.match(r"Amendment (S|H)-\d+ lost", action):
        atype = "amendment-failure"
    elif action.startswith("Resolution filed"):
        atype = "introduction"
    elif action.startswith("Resolution adopted"):
        atype = "passage"
    elif (action.startswith("Committee report") and
          action.endswith("passage.")):
        atype = "committee-passage"
    elif action.startswith("Withdrawn"):
        atype = "withdrawal"
    else:
        atype = None
    if action.strip() == "":
        continue
    if re.search(r"END OF \d+ ACTIONS", action):
        continue
    if "$history" not in action:
        bill.add_action(description=action, date=date, chamber=actor,
                        classification=atype)

self.scrape_subjects(bill, bill_id, session, req_session)
yield bill
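# The fallback sponsor regex above is easiest to sanity-check in isolation.
# A quick demonstration against a made-up Iowa sponsor string:
import re

sponsors_str = "SMITH, JONES, J., and DE BOEF."
parts = re.findall(r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)", sponsors_str)
cleaned = [p.replace(" and", "").strip(" .,") for p in parts]
# cleaned -> ['SMITH', 'JONES, J', 'BOEF']
# Note that 'DE BOEF' loses its particle, which is exactly why the fixup
# dict above patches 'Boef' back to 'De Boef'.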
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supersedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''
    if matter_ids:
        matters = [self.matter(matter_id)
                   for matter_id in matter_ids.split(',')]
        # Skip matters that are not yet in Legistar
        matters = filter(None, matters)
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    for matter in matters:
        # Skip this bill, until Metro cleans up duplicate in Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue
        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']
        if not all((date, title, identifier)):
            continue
        # Do not scrape private bills introduced before this timestamp.
        if self._is_restricted(matter) and (
                date < self.START_DATE_PRIVATE_SCRAPE):
            continue
        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []
        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})
        # The Metro scraper scrapes private bills.
        # However, we do not want to capture significant data about
        # private bills, other than the value of the helper function
        # `_is_restricted` and a last modified timestamp.
        # We yield private bills early, wipe data from previously
        # imported once-public bills, and include only data *required*
        # by the pupa schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': self._is_restricted(matter)}
        # Add API source early.
        # Private bills should have this url for debugging.
legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_api, note='api') if self._is_restricted(matter): # required fields bill.title = 'Restricted View' # wipe old data bill.extras['plain_text'] = '' bill.extras['rtf_text'] = '' bill.sponsorships = [] bill.related_bills = [] bill.versions = [] bill.documents = [] bill.actions = [] yield bill continue legistar_web = matter['legistar_url'] bill.add_source(legistar_web, note='web') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: try: raw_option = vote['VoteValueName'].lower() except AttributeError: raw_option = None clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: # Get data (i.e., json) for the related bill. # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session). # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue. related_bill = self.endpoint( '/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: continue else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill( identifier=identifier, legislative_session=related_bill_session, relation_type='companion') # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104 # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'. bill.add_version_link( 'Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report' .format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName'] and self._show_attachment( attachment): bill.add_document_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'].strip(), media_type="application/pdf") bill.extras['local_classification'] = matter['MatterTypeName'] matter_version_value = matter['MatterVersion'] text = self.text(matter_id, matter_version_value) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
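# `_is_restricted()` decides whether a Metro matter is private. The real
# rule lives elsewhere; a hedged one-liner, assuming Legistar exposes a
# 'MatterRestrictViewViaWeb' flag (the field name is an assumption):
def _is_restricted(self, matter):
    return bool(matter.get('MatterRestrictViewViaWeb'))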
def scrape_bill(self, session, bill_id, chamber):
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(
        session_for_url, bill_id)
    try:
        response = requests.get(bill_url)
        self.info("GET (with `requests`) - {}".format(bill_url))
    except requests.exceptions.RequestException:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False
    html = response.text
    page = lxml.html.fromstring(html)
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False
    # The state website will periodically miss a few bills' titles for a
    # few days. These titles will be extant on the bill list page, but
    # missing on the bill detail page. The titles are eventually populated.
    try:
        bill_title = page.xpath(
            '//div[@id="contentContainer"]/div/div/h2/text()')[0]
    except IndexError:
        self.warning("Couldn't find title for {}; skipping".format(bill_id))
        return False
    # strip the bill id down to the chamber letter(s) and digits
    bill_id = re.sub(r'[^SHD\d]', '', bill_id)
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')
    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')
    bill.add_source(bill_url)
    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary',
                             primary=True, entity_type='person')
    self.scrape_cosponsors(bill, bill_url)
    version = page.xpath(
        "//div[contains(@class, 'modalBtnGroup')]/"
        "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url,
                              media_type='application/pdf')
    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
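# `replace_non_digits()` reduces a Massachusetts session name such as
# '189th' to the numeric segment the URL expects. A one-line sketch of the
# assumed implementation:
import re

def replace_non_digits(self, session):
    return re.sub(r'\D', '', session)  # '189th' -> '189'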
def scrape_bill(self, session, chamber, bill_id, title, url, strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub): html = self.get(url).text page = lxml.html.fromstring(html) page.make_links_absolute(url) bill_type = self.bill_types[bill_id.split()[0][1:]] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.add_source(url) xpath = ('//strong[contains(., "SUBJECT")]/../' 'following-sibling::td/a/text()') bill.subject = page.xpath(xpath) for version in self.scrape_versions(session, chamber, page, bill_id): bill.add_version_link(**version) # Resolution pages have different html. values = {} trs = page.xpath('//div[@id="bhistcontent"]/table/tr') for tr in trs: heading = tr.xpath('td/strong/text()') if heading: heading = heading[0] else: continue value = tr.text_content().replace(heading, '').strip() values[heading] = value # summary was always same as title # bill['summary'] = values['SUMMARY:'] # Add primary sponsor. primary = strip_sponsors('', values.get('LEAD SPONSOR:', '')) if primary: bill.add_sponsorship(name=primary, classification='primary', entity_type='person', primary=True) # Add cosponsors. if values.get('SPONSORS:'): sponsors = strip_sponsors('', values['SPONSORS:']) sponsors = re.split(', (?![A-Z]\.)', sponsors) for name in sponsors: name = name.strip(', \n\r') if name: # Fix name splitting bug where "Neale, D. Hall" match = re.search('(.+?), ([DM]\. Hall)', name) if match: for name in match.groups(): bill.add_sponsorship(name=name, classification='cosponsor', entity_type='person', primary=False) else: bill.add_sponsorship(name=name, classification='cosponsor', entity_type='person', primary=False) for link in page.xpath("//a[contains(@href, 'votes/house')]"): yield from self.scrape_house_vote(bill, link.attrib['href']) for tr in reversed( page.xpath("//table[@class='tabborder']/descendant::tr")[1:]): tds = tr.xpath('td') if len(tds) < 3: continue chamber_letter = tds[0].text_content() chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter] # Index of date info no longer varies on resolutions. date = tds[2].text_content().strip() date = datetime.datetime.strptime(date, "%m/%d/%y").date() action = tds[1].text_content().strip() if action.lower().startswith('passed senate'): for href in tds[1].xpath('a/@href'): yield from self.scrape_senate_vote(bill, href, date) attrs = dict(chamber=chamber, description=action, date=date.strftime("%Y-%m-%d")) temp = self.categorizer.categorize(action) related_entities = [] for key, values in temp.items(): if key != 'classification': for value in values: related_entities.append({"type": key, "name": value}) attrs.update(classification=temp['classification'], related_entities=related_entities) bill.add_action(**attrs) yield bill
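# Binding `strip_sponsors` as a default argument pre-compiles the
# parenthetical-stripping regex once, at function-definition time, rather
# than on every call. The same trick in isolation (the sample text is
# invented):
import re

strip_sponsors = re.compile(r'\s*\(.{,50}\)\s*').sub
print(strip_sponsors('', 'Mr. Speaker (By Request)'))  # -> 'Mr. Speaker'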
def _parse_senate_billpage(self, bill_url, year): bill_page = self.lxmlize(bill_url) # get all the info needed to record the bill # TODO probably still needs to be fixed bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content() bill_title = bill_page.xpath( '//*[@id="lblBillTitle"]')[0].text_content() bill_desc = bill_page.xpath( '//*[@id="lblBriefDesc"]')[0].text_content() # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content() bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] subs = [] bid = bill_id.replace(" ", "") if bid in self._subjects: subs = self._subjects[bid] self.info("With subjects for this bill") self.info(bid) if bid == 'XXXXXX': self.info("Skipping Junk Bill") return bill = Bill( bill_id, title=bill_desc, chamber='upper', legislative_session=self._session_id, classification=bill_type, ) bill.subject = subs bill.add_abstract(bill_desc, note='abstract') bill.add_source(bill_url) if bill_title: bill.add_title(bill_title) # Get the primary sponsor try: sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0] except IndexError: sponsor = bill_page.xpath('//span[@id="lSponsor"]')[0] bill_sponsor = sponsor.text_content() # bill_sponsor_link = sponsor.attrib.get('href') bill.add_sponsorship( bill_sponsor, entity_type='person', classification='primary', primary=True, ) # cosponsors show up on their own page, if they exist cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]') if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'): self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href']) # get the actions action_url = bill_page.xpath('//a[@id="hlAllActions"]') if len(action_url) > 0: action_url = action_url[0].attrib['href'] self._parse_senate_actions(bill, action_url) # stored on a separate page versions_url = bill_page.xpath('//a[@id="hlFullBillText"]') if len(versions_url) > 0 and versions_url[0].attrib.get('href'): self._parse_senate_bill_versions(bill, versions_url[0].attrib['href']) amendment_links = bill_page.xpath( '//a[contains(@href,"ShowAmendment.asp")]') for link in amendment_links: link_text = link.xpath('string(.)').strip() if 'adopted' in link_text.lower(): link_url = link.xpath('@href')[0] bill.add_version_link(link_text, link_url, media_type='application/pdf', on_duplicate='ignore') yield bill
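# `bill_types` maps the first three characters of a Senate bill id onto a
# classification. A sketch of the shape of that table (the entries are
# assumptions):
bill_types = {
    'SB ': 'bill',
    'SJR': 'joint resolution',
    'SCR': 'concurrent resolution',
    'SR ': 'resolution',
}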
def old_scrape(self, session=None):
    status_report_url = ("http://www.legislature.ohio.gov/legislation/"
                         "status-reports")
    # ssl verification off due to Ohio not correctly implementing SSL
    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)
    doc = self.get(status_report_url, verify=False).text
    doc = lxml.html.fromstring(doc)
    doc.make_links_absolute(status_report_url)
    xpath = "//div[contains(text(),'{}')]/following-sibling::table"
    status_table = doc.xpath(xpath.format(session))[0]
    status_links = status_table.xpath(".//a[contains(text(),'Excel')]/@href")
    for url in status_links:
        try:
            fname, resp = self.urlretrieve(url)
        except scrapelib.HTTPError as report:
            self.logger.warning("Missing report {}".format(report))
            continue
        sh = xlrd.open_workbook(fname).sheet_by_index(0)
        # once workbook is open, we can remove tempfile
        os.remove(fname)
        for rownum in range(1, sh.nrows):
            bill_id = sh.cell(rownum, 0).value
            bill_type = "resolution" if "R" in bill_id else "bill"
            chamber = "lower" if "H" in bill_id else "upper"
            bill_title = str(sh.cell(rownum, 3).value)
            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=bill_title,
                        classification=bill_type)
            bill.add_source(url)
            bill.add_sponsorship(str(sh.cell(rownum, 1).value),
                                 classification='primary',
                                 entity_type='person', primary=True)
            # add cosponsor
            if sh.cell(rownum, 2).value:
                bill.add_sponsorship(str(sh.cell(rownum, 2).value),
                                     classification='cosponsor',
                                     entity_type='person', primary=False)
            actor = ""
            # Actions start column after bill title
            for colnum in range(4, sh.ncols - 1):
                action = str(sh.cell(0, colnum).value)
                cell = sh.cell(rownum, colnum)
                date = cell.value
                if len(action) != 0:
                    if action.split()[0] == 'House':
                        actor = "lower"
                    elif action.split()[0] == 'Senate':
                        actor = "upper"
                    elif action.split()[-1] == 'Governor':
                        actor = "executive"
                    elif action.split()[0] == 'Gov.':
                        actor = "executive"
                    elif action.split()[-1] == 'Gov.':
                        actor = "executive"
                if action in ('House Intro. Date', 'Senate Intro. Date'):
                    atype = ['introduction']
                    action = action.replace('Intro. Date', 'Introduced')
                elif action == '3rd Consideration':
                    atype = ['reading-3', 'passage']
                elif action == 'Sent to Gov.':
                    atype = ['executive-receipt']
                elif action == 'Signed By Governor':
                    atype = ['executive-signature']
                else:
                    atype = None
                if type(date) == float:
                    date = str(xlrd.xldate_as_tuple(date, 0))
                    date = datetime.datetime.strptime(
                        date, "(%Y, %m, %d, %H, %M, %S)")
                    date = self._tz.localize(date)
                    date = "{:%Y-%m-%d}".format(date)
                    bill.add_action(action, date, chamber=actor,
                                    classification=atype)
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                break
            yield from self.scrape_votes_old(bill, underscore_bill, session)
            self.scrape_versions_old(bill, underscore_bill, session)
            yield bill
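# The Excel cells above store dates as floats, which is why they are routed
# through `xlrd.xldate_as_tuple`. The round-trip in isolation (the serial
# value is just an example):
import datetime
import xlrd

value = 43831.0  # an Excel serial date
date = datetime.datetime(*xlrd.xldate_as_tuple(value, 0))
print("{:%Y-%m-%d}".format(date))  # -> 2020-01-01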
def test_full_bill(): create_jurisdiction() person = Person.objects.create(id='person-id', name='Adam Smith') org = ScrapeOrganization(name='House', classification='lower') com = ScrapeOrganization(name='Arbitrary Committee', classification='committee', parent_id=org._id) oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act', classification='tax bill', from_organization=org._id) bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', classification='tax bill', from_organization=org._id) bill.subject = ['taxes', 'axes'] bill.add_identifier('SB 9') bill.add_title('Tack & Axe Tax Act') bill.add_action('introduced in house', '1900-04-01', chamber='lower') act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower') act.add_related_entity('arbitrary committee', 'organization', com._id) bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session") bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person', primary=False, entity_id=person.id) bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person', primary=True) bill.add_abstract('This is an act about axes and taxes and tacks.', note="official") bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf', media_type='application/pdf') bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html') bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html') bill.add_source('http://example.com/source') # import bill oi = OrganizationImporter('jid') oi.import_data([org.as_dict(), com.as_dict()]) pi = PersonImporter('jid') pi.json_to_db_id['person-id'] = 'person-id' # Since we have to create this person behind the back of the import # transaction, we'll fake the json-id to db-id, since they match in this # case. This is *really* getting at some implementation detail, but it's # the cleanest way to ensure we short-circut the json id lookup. 
BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()]) # get bill from db and assert it imported correctly b = Bill.objects.get(identifier='HB 1') assert b.from_organization.classification == 'lower' assert b.identifier == bill.identifier assert b.title == bill.title assert b.classification == bill.classification assert b.subject == ['taxes', 'axes'] assert b.abstracts.get().note == 'official' # other_title, other_identifier added assert b.other_titles.get().title == 'Tack & Axe Tax Act' assert b.other_identifiers.get().identifier == 'SB 9' # actions actions = list(b.actions.all()) assert len(actions) == 2 # ensure order was preserved (if this breaks it'll be intermittent) assert actions[0].organization == Organization.objects.get(classification='lower') assert actions[0].description == "introduced in house" assert actions[1].description == "sent to arbitrary committee" assert (actions[1].related_entities.get().organization == Organization.objects.get(classification='committee')) # related_bills were added rb = b.related_bills.get() assert rb.identifier == 'HB 99' # and bill got resolved assert rb.related_bill.identifier == 'HB 99' # sponsors added, linked & unlinked sponsorships = b.sponsorships.all() assert len(sponsorships) == 2 for ss in sponsorships: if ss.primary: assert ss.person is None assert ss.organization is None else: assert ss.person == person # versions & documents with their links versions = b.versions.all() assert len(versions) == 1 assert versions[0].links.count() == 1 documents = b.documents.all() assert len(documents) == 1 assert documents[0].links.count() == 2 # sources assert b.sources.count() == 1
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url): """ Extracts all the requested info for a given bill. Calls the parent's methods to enter the results into JSON files. """ chamber = 'lower' if chamber.lower() == 'house' else chamber chamber = 'upper' if chamber.lower() == 'senate' else chamber # Get html and parse doc = self.lxmlize(bill_detail_url) # Check if bill hasn't been transmitted to the other chamber yet transmit_check = self.get_node( doc, '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()' ) if (transmit_check is not None and 'has not been transmitted' in transmit_check.strip()): self.logger.debug('Bill has not been transmitted to other chamber ' '... skipping {0}'.format(bill_detail_url)) return # Get the basic parts of the bill bill_id = self.get_node(doc, '//h1/text()') self.logger.debug(bill_id) bill_title_text = self.get_node( doc, '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()' ) if bill_title_text is not None: bill_title = bill_title_text.strip() else: long_desc_url = self.get_node( doc, '//a[text()[contains(.,"Long Description")]]/@href') long_desc_page = self.lxmlize(long_desc_url) long_desc_text = self.get_node( long_desc_page, '//h1/' 'following-sibling::p/text()') if long_desc_text is not None: bill_title = long_desc_text.strip() else: bill_title = 'No title found.' self.logger.warning('No title found for {}.'.format(bill_id)) self.logger.debug(bill_title) bill_type = { 'F': 'bill', 'R': 'resolution', 'C': 'concurrent resolution' }[bill_id[1]] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=bill_title, classification=bill_type) # Add source bill.add_source(bill_detail_url) for subject in self._subject_mapping[bill_id]: bill.add_subject(subject) # Get companion bill. companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]' '/a[starts-with(@href, "?")]/text()') companion = self.make_bill_id( companion[0]) if len(companion) > 0 else None companion_chamber = self.chamber_from_bill(companion) if companion is not None: bill.add_companion(companion, chamber=companion_chamber) # Grab sponsors bill = self.extract_sponsors(bill, doc, chamber) # Add Actions performed on the bill. bill = self.extract_actions(bill, doc, chamber) # Get all versions of the bill. bill = self.extract_versions(bill, doc, chamber, version_list_url) yield bill
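# `make_bill_id()` and `chamber_from_bill()` normalize companion-bill
# references. Hedged sketches of both helpers; the formatting rules are
# assumptions:
import re

def make_bill_id(self, bill):
    if bill is None:
        return None
    # 'HF0123' or 'HF 123' -> 'HF 123'
    return re.sub(r'(\w+?)0*(\d+)', r'\1 \2', bill.replace(' ', ''))

def chamber_from_bill(self, bill):
    if bill is None:
        return None
    return 'lower' if bill.lower().startswith('hf') else 'upper'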
def scrape(self): session_name = self.latest_session() session = session_name[0:5] self._bill_prefix_map = { 'HB': { 'type': 'bill', 'url_segment': 'bills/house', }, 'HR': { 'type': 'resolution', 'url_segment': 'resolutions/house/simple', }, 'HCR': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/house/concurrent', }, 'HJR': { 'type': 'joint resolution', 'url_segment': 'resolutions/house/joint' }, 'HC': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/house/concurrent', }, 'HJ': { 'type': 'joint resolution', 'url_segment': 'resolutions/house/joint', }, 'SB': { 'type': 'bill', 'url_segment': 'bills/senate', }, 'SR': { 'type': 'resolution', 'url_segment': 'resolutions/senate/simple', }, 'SCR': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/senate/concurrent', }, 'SJR': { 'type': 'joint resolution', 'url_segment': 'resolutions/senate/joint', }, 'SC': { 'type': 'concurrent resolution', 'url_segment': 'resolutions/senate/concurrent', }, 'SJ': { 'type': 'joint resolution', 'url_segment': 'resolutions/senate/joint', }, } api_base_url = "https://api.iga.in.gov" proxy = {"url": "http://in-proxy.openstates.org"} # ah, indiana. it's really, really hard to find # pdfs in their web interface. Super easy with # the api, but a key needs to be passed # in the headers. To make these documents # viewable to the public and our scrapers, # sunlight's put up a proxy service at this link # using our api key for pdf document access. client = ApiClient(self) r = client.get("bills", session=session) all_pages = client.unpaginate(r) for b in all_pages: bill_id = b["billName"] for idx, char in enumerate(bill_id): try: int(char) except ValueError: continue disp_bill_id = bill_id[:idx]+" "+str(int(bill_id[idx:])) break bill_link = b["link"] api_source = api_base_url + bill_link try: bill_json = client.get("bill", session=session, bill_id=bill_id.lower()) except scrapelib.HTTPError: self.logger.warning('Bill could not be accessed. 
Skipping.')
    continue

title = bill_json["title"]
if title == "NoneNone":
    title = None
# sometimes title is blank
# if that's the case, we can check to see if
# the latest version has a short description
if not title:
    title = bill_json["latestVersion"]["shortDescription"]
# and if that doesn't work, use the bill_id but throw a warning
if not title:
    title = bill_id
    self.logger.warning("Bill is missing a title, using bill id instead.")

bill_prefix = self._get_bill_id_components(bill_id)[0]
original_chamber = ("lower" if bill_json["originChamber"].lower() == "house"
                    else "upper")
bill_type = self._bill_prefix_map[bill_prefix]['type']
bill = Bill(disp_bill_id,
            legislative_session=session,
            chamber=original_chamber,
            title=title,
            classification=bill_type)
bill.add_source(self._get_bill_url(session, bill_id))
bill.add_source(api_source)

# sponsors
for s in bill_json["authors"]:
    bill.add_sponsorship(classification="author",
                         name=self._get_name(s),
                         entity_type='person',
                         primary=True)
for s in bill_json["coauthors"]:
    bill.add_sponsorship(classification="coauthor",
                         name=self._get_name(s),
                         entity_type='person',
                         primary=False)
for s in bill_json["sponsors"]:
    bill.add_sponsorship(classification="sponsor",
                         name=self._get_name(s),
                         entity_type='person',
                         primary=True)
for s in bill_json["cosponsors"]:
    bill.add_sponsorship(classification="cosponsor",
                         name=self._get_name(s),
                         entity_type='person',
                         primary=False)

# actions
action_link = bill_json["actions"]["link"]
api_source = api_base_url + action_link
try:
    actions = client.get("bill_actions", session=session,
                         bill_id=bill_id.lower())
except scrapelib.HTTPError:
    self.logger.warning("Could not find bill actions page")
    actions = {"items": []}

for a in actions["items"]:
    action_desc = a["description"]
    if "governor" in action_desc.lower():
        action_chamber = "executive"
    elif a["chamber"]["name"].lower() == "house":
        action_chamber = "lower"
    else:
        action_chamber = "upper"

    date = a["date"]
    if not date:
        self.logger.warning("Action has no date, skipping")
        continue
    # convert time to pupa fuzzy time
    date = date.replace('T', ' ')
    # TODO: if we update pupa to accept datetimes we can drop this line
    date = date.split()[0]

    action_type = []
    d = action_desc.lower()
    committee = None
    reading = False
    if "first reading" in d:
        action_type.append("reading-1")
        reading = True
    if "second reading" in d or "reread second time" in d:
        action_type.append("reading-2")
        reading = True
    if "third reading" in d or "reread third time" in d:
        action_type.append("reading-3")
        if "passed" in d:
            action_type.append("passage")
        if "failed" in d:
            action_type.append("failure")
        reading = True
    if "adopted" in d and reading:
        action_type.append("passage")
    if ("referred" in d and "committee on" in d
            or "reassigned" in d and "committee on" in d):
        committee = d.split("committee on")[-1].strip()
        action_type.append("referral-committee")
    if "committee report" in d:
        if "pass" in d:
            action_type.append("committee-passage")
        if "fail" in d:
            action_type.append("committee-failure")
    if "amendment" in d and "without amendment" not in d:
        if "pass" in d or "prevail" in d or "adopted" in d:
            action_type.append("amendment-passage")
        if "fail" in d or "out of order" in d:
            action_type.append("amendment-failure")
        if "withdraw" in d:
            action_type.append("amendment-withdrawal")
    if "signed by the governor" in d:
        action_type.append("executive-signature")
    if len(action_type) == 0:
        # calling it other and moving on with a warning
        self.logger.warning(
            "Could not recognize an action in '{}'".format(action_desc))
        action_type = None

    a = bill.add_action(chamber=action_chamber,
                        description=action_desc,
                        date=date,
                        classification=action_type)
    if committee:
        a.add_related_entity(committee, entity_type='organization')

# subjects
subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
for subject in subjects:
    bill.add_subject(subject)

# versions and votes
for version in bill_json["versions"][::-1]:
    try:
        version_json = client.get("bill_version", session=session,
                                  bill_id=version["billName"],
                                  version_id=version["printVersionName"])
    except scrapelib.HTTPError:
        self.logger.warning("Bill version does not seem to exist.")
        continue
    yield from self.deal_with_version(version_json, bill, bill_id,
                                      original_chamber, session, proxy)

yield bill
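# `client.unpaginate()` walks the IGA API's paged responses and accumulates
# their items. A hedged sketch, assuming each page carries 'items' plus a
# 'nextLink' pointer (both field names and the `get_relative` helper are
# assumptions):
def unpaginate(self, response):
    items = response['items']
    while response.get('nextLink'):
        response = self.get_relative(response['nextLink'])
        items += response['items']
    return items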
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(" ", "")) bill_page = self.get(url).text html = lxml.html.fromstring(bill_page) html.make_links_absolute( "http://legislature.idaho.gov/legislation/%s/" % session ) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill( legislative_session=session, chamber=chamber, identifier=bill_id, title=title, classification=bill_type, ) bill.add_source(url) for subject in self._subjects[bill_id.replace(" ", "")]: bill.add_subject(subject) if short_title and title.lower() != short_title.lower(): bill.add_title(short_title, "short title") # documents doc_links = html.xpath('//div[contains(@class,"insert-page")]//a') for link in doc_links: name = link.text_content().strip() href = link.get("href") if "Engrossment" in name or "Bill Text" in name or "Amendment" in name: bill.add_version_link(note=name, url=href, media_type="application/pdf") else: bill.add_document_link( note=name, url=href, media_type="application/pdf" ) def _split(string): return re.split(r"\w+[,|AND]\s+", string) # sponsors range from a committee to one legislator to a group of legs sponsor_lists = bill_tables[0].text_content().split("by") if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if "COMMITTEE" in sponsors.upper(): bill.add_sponsorship( name=sponsors.strip(), entity_type="organization", primary=True, classification="primary", ) else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsorship( classification="primary", name=person, entity_type="person", primary=True, ) actor = chamber last_date = None # if a bill has passed a chamber or been 'received from' # then the next committee passage is in the opposite chamber has_moved_chambers = False for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime( date + "/" + session[0:4], "%m/%d/%Y" ).strftime("%Y-%m-%d") if action.startswith("House"): actor = "lower" elif action.startswith("Senate"): actor = "upper" # votes if "AYES" in action or "NAYS" in action: yield from self.parse_vote( actor, date, row[2], session, bill_id, chamber, url ) # bill.add_vote_event(vote) # some td's text is seperated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u"\xa0", " ").strip() atype = get_action(actor, action) if atype and "passage" in atype: has_moved_chambers = True if atype and "committee-passage" in atype and has_moved_chambers: actor = _OTHER_CHAMBERS[actor] bill.add_action(action, date, chamber=actor, classification=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if "to House" in action: actor = "lower" elif "to Senate" in action: actor = "upper" yield bill
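# `_OTHER_CHAMBERS` is the chamber-flip table consulted once a bill has
# crossed over:
_OTHER_CHAMBERS = {
    'upper': 'lower',
    'lower': 'upper',
}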