def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one bill detail page and yield the Bill (plus document items).

    Generator: delegates to ``scrape_documents`` (which may yield items of
    its own) and finally yields the assembled ``Bill``.

    :param chamber: 'upper' or 'lower' (passed through to the Bill)
    :param session: legislative session identifier
    :param bill_id: state-format bill identifier, e.g. contains 'B' or 'J'
    :param url: the bill's "stab=01" detail page URL
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    try:
        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)
    except IndexError:
        # The site sometimes serves an error page with valid HTTP status;
        # it lacks the title heading, so the xpath lookup raises IndexError.
        if 'Unable to retrieve the requested information. Please try again' in html:
            self.warning('Soft error page, skipping.')
            return
        else:
            raise
    # Classify from the letters embedded in the id ('B' bill, 'J' joint res).
    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)
    bill = Bill(
        bill_id, legislative_session=session, chamber=chamber, title=title,
        classification=_type)
    bill.add_source(url)
    # process sponsors: strip honorifics, then split on commas; the first
    # name listed is the primary sponsor, the rest are cosponsors.
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'
    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == 'primary',
            entity_type='person',
        )
        sponsor_type = 'cosponsor'
    # subjects: keep only the part before any " -see also-" cross-reference
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill.subject = subject_list
    # documents live on the "stab=02" tab of the same page
    yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions live on the "stab=03" tab
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))
    yield bill
def handle_list_item(self, item):
    """Build a Bill from one row of the FL bill list and yield it.

    Fixes: the id-normalization regex was a non-raw string (``'\\w'`` in a
    plain literal is an invalid escape sequence, a DeprecationWarning that
    becomes an error in newer Pythons), and sponsor names split on ', '
    were not stripped, leaving stray whitespace in sponsorship records.

    :param item: lxml anchor element whose text is the bill id
    """
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib['href'] + '/ByCategory'

    # Map the id prefix to an OCD bill classification.
    if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
        bill_type = 'bill'
    elif bill_id.startswith(('HR ', 'SR ')):
        bill_type = 'resolution'
    elif bill_id.startswith(('HJR ', 'SJR ')):
        bill_type = 'joint resolution'
    elif bill_id.startswith(('SCR ', 'HCR ')):
        bill_type = 'concurrent resolution'
    elif bill_id.startswith(('SM ', 'HM ')):
        bill_type = 'memorial'
    else:
        raise ValueError('Failed to identify bill type.')

    bill = Bill(bill_id, self.kwargs['session'], title,
                chamber='lower' if bill_id[0] == 'H' else 'upper',
                classification=bill_type)
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4 (fix: raw string for the regex)
    subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
    bill.subject = list(self.kwargs['subjects'][subj_bill_id])

    sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
    for sp in sponsor.split(', '):
        sp = sp.strip()  # fix: drop whitespace left around each name
        bill.add_sponsorship(sp, 'primary', 'person', True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)
    yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one WA bill from the legislature's XML API and yield it.

    Generator: yields vote events (via ``scrape_votes``) and finally the
    Bill. The ``chamber`` argument is overridden by the chamber parsed
    from the XML ``OriginalAgency`` element.

    :param chamber: caller's chamber guess (replaced from the XML)
    :param session: legislative session identifier (first 4 chars = year)
    :param bill_id: e.g. "HB 1001"; the numeric part keys the API call
    """
    bill_num = bill_id.split()[1]
    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]
    # Trust the XML's originating agency over the caller-supplied chamber.
    xml_chamber = xpath(page, 'string(wa:OriginalAgency)')
    chamber = self._chamber_map[xml_chamber]
    title = xpath(page, "string(wa:LongDescription)")
    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()
    # Appointments are not bills; skip them entirely.
    if bill_type == 'gubernatorial appointment':
        return
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])
    # The API url is not a human-usable page; link the public summary page.
    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)
    # versions/documents were pre-collected into dicts elsewhere; missing
    # keys simply mean none were found for this bill
    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))
    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass
    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    # de-duplicate pre-collected subjects
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one bill detail page and yield the Bill (plus document items).

    Fix: the title xpath previously raised an unguarded IndexError when the
    site served a "soft" error page (valid HTTP status, no bill markup);
    such pages are now detected and skipped with a warning instead of
    crashing the scrape.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param bill_id: state-format bill identifier, e.g. contains 'B' or 'J'
    :param url: the bill's "stab=01" detail page URL
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    try:
        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)
    except IndexError:
        # fix: soft error pages lack the title heading; skip, don't crash
        if 'Unable to retrieve the requested information. Please try again' in html:
            self.warning('Soft error page, skipping.')
            return
        raise
    # Classify from letters embedded in the id ('B' bill, 'J' joint res).
    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)
    bill = Bill(
        bill_id, legislative_session=session, chamber=chamber, title=title,
        classification=_type)
    bill.add_source(url)
    # process sponsors: strip honorifics, split on commas; first listed
    # name is primary, the rest are cosponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'
    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == 'primary',
            entity_type='person',
        )
        sponsor_type = 'cosponsor'
    # subjects: keep only the part before any " -see also-" cross-reference
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill.subject = subject_list
    # documents live on the "stab=02" tab; actions on "stab=03"
    yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))
    yield bill
def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape one NV bill: the overview page plus an AJAX-filled tab.

    :param session: legislative session identifier
    :param session_slug: session path fragment used in NELIS URLs
    :param chamber: 'upper' or 'lower'
    :param url: the bill's /Bill/<id>/Overview page
    """
    page = lxml.html.fromstring(self.get(url).text)
    bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)
    # bill data gets filled in from another call; the trailing "_" param
    # is a cache-buster timestamp in milliseconds
    bill_data_base = (
        "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
        "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}"
    )
    bill_data_url = bill_data_base.format(
        session_slug, internal_id, time.time() * 1000
    )
    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)
    short_title = self.get_header_field(bill_page, "Summary:").text
    # replace non-breaking spaces with plain spaces
    short_title = short_title.replace("\u00a0", " ")
    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber,
    )
    long_title = self.get_header_field(bill_page, "Title:").text
    if long_title is not None:
        bill.add_abstract(long_title, "Summary")
    # header fields are optional; only add sponsorships when present
    sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, "primary")
    cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, "cosponsor")
    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)
    # de-duplicate pre-collected subjects
    bill.subject = list(set(self.subject_mapping[bill_no]))
    # BDR = bill draft request number, parsed out of the summary text
    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr
    bill.extras["NV_ID"] = internal_id
    bill.add_source(url)
    yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one WA bill from the legislature's XML API and yield it.

    Generator: yields vote events (via ``scrape_votes``) and finally the
    Bill. Note: here ``chamber`` is taken from the caller as-is — there is
    no remapping from the XML's OriginalAgency in this variant.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier (first 4 chars = year)
    :param bill_id: e.g. "HB 1001"; the numeric part keys the API call
    """
    bill_num = bill_id.split()[1]
    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]
    title = xpath(page, "string(wa:LongDescription)")
    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()
    # Appointments are not bills; skip them entirely.
    if bill_type == 'gubernatorial appointment':
        return
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])
    # The API url is not a human-usable page; link the public summary page.
    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (
                       bill_num, session[0:4]))
    bill.add_source(fake_source)
    # versions/documents were pre-collected into dicts elsewhere; missing
    # keys simply mean none were found for this bill
    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))
    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass
    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    # de-duplicate pre-collected subjects
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape one NV bill: the overview page plus an AJAX-filled tab.

    :param session: legislative session identifier
    :param session_slug: session path fragment used in NELIS URLs
    :param chamber: 'upper' or 'lower'
    :param url: the bill's /Bill/<id>/Overview page
    """
    page = lxml.html.fromstring(self.get(url).text)
    bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)
    # bill data gets filled in from another call; the trailing "_" param
    # is a cache-buster timestamp in milliseconds
    bill_data_base = 'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/' \
        'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
    bill_data_url = bill_data_base.format(
        session_slug, internal_id, time.time() * 1000)
    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)
    short_title = self.get_header_field(bill_page, 'Summary:').text
    # replace non-breaking spaces with plain spaces
    short_title = short_title.replace(u'\u00a0', ' ')
    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber
    )
    long_title = self.get_header_field(bill_page, 'Title:').text
    if long_title is not None:
        bill.add_abstract(long_title, 'Summary')
    # header fields are optional; only add sponsorships when present
    sponsor_div = self.get_header_field(bill_page, 'Primary Sponsor')
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, 'primary')
    cosponsor_div = self.get_header_field(bill_page, 'Co-Sponsor')
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, 'cosponsor')
    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)
    # de-duplicate pre-collected subjects
    bill.subject = list(set(self.subject_mapping[bill_no]))
    # BDR = bill draft request number, parsed out of the summary text
    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras['BDR'] = bdr
    bill.extras['NV_ID'] = internal_id
    bill.add_source(url)
    yield bill
def handle_page(self):
    """Yield a Bill per row of the VA bill listing, then follow paging.

    For each listed bill the sponsor sub-page is scraped first (drained
    eagerly for its side effects on the bill), then the detail page is
    scraped (whose items are re-yielded), and finally the bill itself is
    yielded. A trailing "More..." link, if present, is followed
    recursively with the same kwargs.
    """
    for row in self.doc.xpath('//ul[@class="linkSect"]/li'):
        anchor = row.getchildren()[0]
        identifier = str(anchor.text_content())
        # Rows whose id does not start with S/H are not bills — skip.
        if not identifier.startswith(('S', 'H')):
            continue

        description = row.xpath('text()')[0].strip()
        origin = {'H': 'lower', 'S': 'upper'}[identifier[0]]
        classification = {
            'B': 'bill',
            'J': 'joint resolution',
            'R': 'resolution',
        }[identifier[1]]
        measure = Bill(identifier, self.kwargs['session'], description,
                       chamber=origin, classification=classification)

        detail_url = anchor.get('href')
        sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
            self.kwargs['session_id'],
            identifier.replace(' ', ''),
        )
        # Force the sponsor sub-scrape to completion before detail scraping.
        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url,
                                    obj=measure))
        yield from self.scrape_page_items(BillDetailPage, url=detail_url,
                                          obj=measure)

        measure.subject = self.kwargs['subjects'][identifier]
        measure.add_source(detail_url)
        yield measure

    more = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if more:
        yield from self.scrape_page_items(BillListPage, url=more[0],
                                          **self.kwargs)
def handle_page(self):
    """Yield a Bill per row of the bill listing, then follow pagination.

    Sponsor and detail sub-pages are scraped for each bill before it is
    yielded; a trailing "More..." link is followed recursively.
    """
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for bill in bills:
        link = bill.getchildren()[0]
        bill_id = str(link.text_content())
        # rows not starting with S/H are not bills
        if not bill_id.startswith(("S", "H")):
            continue
        # create a bill
        desc = bill.xpath("text()")[0].strip()
        chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        bill = Bill(
            bill_id,
            self.kwargs["session"],
            desc,
            chamber=chamber,
            classification=bill_type,
        )
        bill_url = link.get("href")
        sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
            self.kwargs["session_id"], bill_id.replace(" ", ""))
        # list() drains the sponsor generator for its side effects on bill
        list(
            self.scrape_page_items(BillSponsorPage, url=sponsor_url,
                                   obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url,
                                          obj=bill)
        bill.subject = self.kwargs["subjects"][bill_id]
        bill.add_source(bill_url)
        yield bill
    # recurse into the next listing page, if any
    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0],
                                          **self.kwargs)
def handle_list_item(self, item):
    """Build a Bill from one row of the FL bill list and yield it.

    Generator: delegates to the BillDetail sub-page (re-yielding its
    items), then yields the Bill.

    :param item: lxml anchor element whose text is the bill id
    """
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib["href"] + "/ByCategory"

    # map the id prefix to an OCD bill classification
    if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
        bill_type = "bill"
    elif bill_id.startswith(("HR ", "SR ")):
        bill_type = "resolution"
    elif bill_id.startswith(("HJR ", "SJR ")):
        bill_type = "joint resolution"
    elif bill_id.startswith(("SCR ", "HCR ")):
        bill_type = "concurrent resolution"
    elif bill_id.startswith(("SM ", "HM ")):
        bill_type = "memorial"
    else:
        raise ValueError("Failed to identify bill type.")

    bill = Bill(
        bill_id,
        self.kwargs["session"],
        title,
        chamber="lower" if bill_id[0] == "H" else "upper",
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
    bill.subject = list(self.kwargs["subjects"][subj_bill_id])

    # strip the leading honorific, then add each listed sponsor as primary
    sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
    for sp in sponsor.split(", "):
        sp = sp.strip()
        bill.add_sponsorship(sp, "primary", "person", True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

    yield bill
def handle_page(self):
    """Yield a Bill per row of the bill listing, then follow pagination.

    Sponsor and detail sub-pages are scraped for each bill before it is
    yielded; a trailing "More..." link is followed recursively.
    """
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for bill in bills:
        link = bill.getchildren()[0]
        bill_id = str(link.text_content())
        # rows not starting with S/H are not bills
        if not bill_id.startswith(('S', 'H')):
            continue
        # create a bill
        desc = bill.xpath('text()')[0].strip()
        chamber = {
            'H': 'lower',
            'S': 'upper',
        }[bill_id[0]]
        bill_type = {
            'B': 'bill',
            'J': 'joint resolution',
            'R': 'resolution'
        }[bill_id[1]]
        bill = Bill(bill_id, self.kwargs['session'], desc,
                    chamber=chamber, classification=bill_type)
        bill_url = link.get('href')
        sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
            self.kwargs['session_id'],
            bill_id.replace(' ', ''),
        )
        # list() drains the sponsor generator for its side effects on bill
        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url,
                                    obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url,
                                          obj=bill)
        bill.subject = self.kwargs['subjects'][bill_id]
        bill.add_source(bill_url)
        yield bill
    # recurse into the next listing page, if any
    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0],
                                          **self.kwargs)
def scrape_bill_2012(self, chamber, session, bill_id, url):
    """Scrape a 2012-format bill detail page.

    Fix: a bill id containing neither 'B' nor 'J' previously left
    ``_type`` unassigned and crashed later with an opaque
    UnboundLocalError; it now fails fast with a descriptive ValueError,
    matching the other scrapers in this file.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param bill_id: state-format bill identifier
    :param url: the bill detail page URL
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd n dl
    title = doc.xpath(
        '//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        # fix: fail loudly instead of UnboundLocalError below
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)     # sponsors
    self.parse_bill_actions(doc, bill)      # actions
    self.parse_bill_documents(doc, bill)    # documents and versions
    yield from self.parse_bill_votes(doc, bill)  # votes

    # subjects: keep only the part before any "-see also-" cross-reference
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    # NOTE(review): sibling scrapers ``yield bill`` here instead of
    # calling save_bill — confirm which persistence style this class uses
    self.save_bill(bill)
def scrape_bill_2012(self, chamber, session, bill_id, url):
    """Scrape a 2012-format bill detail page.

    Fix: an id containing neither 'B' nor 'J' previously left ``_type``
    unassigned, so constructing the Bill crashed with an opaque
    UnboundLocalError; it now raises a descriptive ValueError up front.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param bill_id: state-format bill identifier
    :param url: the bill detail page URL
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd n dl
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        # fix: fail loudly instead of UnboundLocalError below
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)     # sponsors
    self.parse_bill_actions(doc, bill)      # actions
    self.parse_bill_documents(doc, bill)    # documents and versions
    yield from self.parse_bill_votes(doc, bill)  # votes

    # subjects: keep only the part before any "-see also-" cross-reference
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    # NOTE(review): sibling scrapers ``yield bill`` here instead of
    # calling save_bill — confirm which persistence style this class uses
    self.save_bill(bill)
def scrape_bill_list(self, chamber, session, url):
    """Scrape a WI bill/resolution listing page and yield each Bill.

    Fix: a URL matching none of the known kinds previously left
    ``bill_type`` unassigned, deferring the failure to an opaque
    UnboundLocalError at the first Bill construction; it now raises a
    descriptive ValueError immediately.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param url: listing URL whose path indicates the bill kind
    """
    # order matters: 'joint_resolution' must be tested before 'resolution'
    if 'joint_resolution' in url:
        bill_type = 'joint resolution'
    elif 'resolution' in url:
        bill_type = 'resolution'
    elif 'bill' in url:
        bill_type = 'bill'
    else:
        raise ValueError('unrecognized bill list URL: ' + url)

    try:
        data = self.get(url).text
    except scrapelib.HTTPError:
        # listing pages occasionally 404; skip rather than abort the run
        self.warning('skipping URL %s' % url)
        return
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)
    bill_list = doc.xpath(
        '//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
    for b in bill_list:
        bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
        bill_id = bill_url.rsplit('/', 1)[-1]
        bill_id = bill_id.upper()
        title = b.xpath('./div[@class="span6"]/text()')[0].replace(
            ' - Relating to: ', '').strip()
        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=bill_type,
        )
        # de-duplicate pre-collected subjects
        bill.subject = list(set(self.subjects[bill_id]))
        yield from self.scrape_bill_history(bill, bill_url, chamber)
        yield bill
def scrape_bill_list(self, chamber, session, url):
    """Scrape a WI bill/resolution listing page and yield each Bill.

    Fix: an unrecognized URL previously left ``bill_type`` unassigned and
    failed later with UnboundLocalError; now it raises a clear ValueError
    up front.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param url: listing URL whose path indicates the bill kind
    """
    # order matters: 'joint_resolution' must be tested before 'resolution'
    if 'joint_resolution' in url:
        bill_type = 'joint resolution'
    elif 'resolution' in url:
        bill_type = 'resolution'
    elif 'bill' in url:
        bill_type = 'bill'
    else:
        raise ValueError('unrecognized bill list URL: ' + url)

    try:
        data = self.get(url).text
    except scrapelib.HTTPError:
        # listing pages occasionally 404; skip rather than abort the run
        self.warning('skipping URL %s' % url)
        return
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)
    bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
    for b in bill_list:
        bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
        bill_id = bill_url.rsplit('/', 1)[-1]
        bill_id = bill_id.upper()
        title = b.xpath(
            './div[@class="span6"]/text()'
        )[0].replace(' - Relating to: ', '').strip()
        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=bill_type,
        )
        # de-duplicate pre-collected subjects
        bill.subject = list(set(self.subjects[bill_id]))
        yield from self.scrape_bill_history(bill, bill_url, chamber)
        yield bill
def scrape_actions(self, session, href):
    """Scrape an ND bill-actions page into a Bill and yield it.

    Fix: the regex patterns ``"\\s+"`` and the "introduced by" matcher
    were plain (non-raw) string literals; ``\\s`` and ``\\.`` in plain
    literals are invalid escape sequences (DeprecationWarning, an error in
    newer Pythons). They are now raw strings, byte-identical patterns.

    :param session: legislative session identifier
    :param href: the bill-actions page URL
    """
    page = self.lxmlize(href)
    (bid, ) = page.xpath('//h1[@id="page-title"]/text()')
    bid = re.sub(r"^Bill Actions for ", "", bid)
    subjects = self.subjects.get(bid, [])
    # some pages say "Measure Number Breakdown", others "Bill..."
    table = page.xpath("//table[contains(@summary, 'Number Breakdown')]")
    table = table[0]
    ttrows = page.xpath("//div[@id='application']/p")
    descr = ttrows[-2]
    title = re.sub(r"\s+", " ", descr.text_content()).strip()
    ttrows = ttrows[:-1]
    # bill id is like "HB 1001": letter 0 = chamber, letters 1-2 = kind
    chamber = {
        "H": "lower",
        "S": "upper"
    }[bid[0]]
    type_ = bid[1:3]
    bill_type = "bill"
    if type_.startswith("B"):
        bill_type = "bill"
    if type_.startswith("R"):
        bill_type = "resolution"
    if type_ == "CR":
        bill_type = "concurrent resolution"
    bill = Bill(bid,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.subject = subjects
    bill.add_source(href)
    # sponsors: look for an "introduced by ..." paragraph
    for row in ttrows:
        if isinstance(row, lxml.html.HtmlComment):
            continue  # ignore HTML comments, no text_content()
        sponsors = row.text_content().strip()
        sinf = re.match(
            r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)",
            sponsors
        )
        if sinf:
            sponsors = sinf.groupdict()
            for sponsor in [
                x.strip() for x in sponsors['sponsors'].split(",")
            ]:
                bill.add_sponsorship(sponsor,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
    # actions table: a row may omit date/chamber, so carry both forward
    dt = None
    oldchamber = 'other'
    for row in table.xpath(".//tr"):
        if row.text_content().strip() == '':
            continue
        if "Meeting Description" in [
            x.strip() for x in row.xpath(".//th/text()")
        ]:
            continue
        row = row.xpath("./*")
        row = [x.text_content().strip() for x in row]
        if len(row) > 3:
            row = row[:3]
        date, chamber, action = row
        try:
            chamber = {
                "House": "lower",
                "Senate": "upper"
            }[chamber]
            oldchamber = chamber
        except KeyError:
            chamber = oldchamber
        if date != '':
            dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")
        classif = self.categorizer.categorize(action)
        bill.add_action(chamber=chamber,
                        description=action,
                        date=dt.strftime('%Y-%m-%d'),
                        classification=classif['classification'])
    # follow the "Versions" link (if exactly one) to attach versions
    version_url = page.xpath("//a[contains(text(), 'Versions')]")
    if len(version_url) == 1:
        href = version_url[0].attrib['href']
        bill = self.scrape_versions(bill, href)
    yield bill
def scrape_bill(self, session, chamber, bill_id, title, url,
                strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
    """Scrape a WV bill page: versions, sponsors, actions, and votes.

    Generator: yields vote events from house/senate vote sub-scrapes and
    finally the Bill itself.

    :param session: legislative session identifier
    :param chamber: 'upper' or 'lower'
    :param bill_id: e.g. "HB 2001"; letters after the first select type
    :param title: bill title (already known from the listing page)
    :param url: bill history page URL
    :param strip_sponsors: precompiled regex ``.sub`` bound at def time —
        removes short parenthesized annotations from sponsor strings
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)
    bill_type = self.bill_types[bill_id.split()[0][1:]]
    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(url)
    xpath = ('//strong[contains(., "SUBJECT")]/../'
             'following-sibling::td/a/text()')
    bill.subject = page.xpath(xpath)
    for version in self.scrape_versions(session, chamber, page, bill_id):
        bill.add_version_link(**version)
    self.scrape_amendments(page, bill)
    # Resolution pages have different html.
    # Collect the heading -> value pairs from the history table.
    values = {}
    trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
    for tr in trs:
        heading = tr.xpath('td/strong/text()')
        if heading:
            heading = heading[0]
        else:
            continue
        value = tr.text_content().replace(heading, '').strip()
        values[heading] = value
    # summary was always same as title
    # bill['summary'] = values['SUMMARY:']
    # Add primary sponsor.
    primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
    if primary:
        bill.add_sponsorship(name=primary,
                             classification='primary',
                             entity_type='person',
                             primary=True)
    # Add cosponsors.
    if values.get('SPONSORS:'):
        sponsors = strip_sponsors('', values['SPONSORS:'])
        # split on ", " unless followed by an initial like "D."
        sponsors = re.split(r', (?![A-Z]\.)', sponsors)
        for name in sponsors:
            name = name.strip(', \n\r')
            if name:
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search(r'(.+?), ([DM]\. Hall)', name)
                if match:
                    for name in match.groups():
                        bill.add_sponsorship(name=name,
                                             classification='cosponsor',
                                             entity_type='person',
                                             primary=False)
                else:
                    bill.add_sponsorship(name=name,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False)
    for link in page.xpath("//a[contains(@href, 'votes/house')]"):
        yield from self.scrape_house_vote(bill, link.attrib['href'])
    # actions table, newest first in the page, iterated in reverse
    for tr in reversed(
            page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
        tds = tr.xpath('td')
        if len(tds) < 3:
            continue
        chamber_letter = tds[0].text_content()
        chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]
        # Index of date info no longer varies on resolutions.
        date = tds[2].text_content().strip()
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()
        action = tds[1].text_content().strip()
        if action.lower().startswith('passed senate'):
            for href in tds[1].xpath('a/@href'):
                yield from self.scrape_senate_vote(bill, href, date)
        attrs = dict(chamber=chamber,
                     description=action,
                     date=date.strftime("%Y-%m-%d"))
        temp = self.categorizer.categorize(action)
        related_entities = []
        # NOTE(review): this loop rebinds ``values`` (the headings dict
        # above) — harmless by this point, but confusing; confirm intended
        for key, values in temp.items():
            if key != 'classification':
                for value in values:
                    related_entities.append({"type": key, "name": value})
        attrs.update(classification=temp['classification'],
                     related_entities=related_entities)
        bill.add_action(**attrs)
    yield bill
def scrape_actions(self, session, href):
    """Scrape an ND bill-actions page into a Bill and yield it.

    :param session: legislative session identifier
    :param href: the bill-actions page URL
    """
    page = self.lxmlize(href)
    (bid, ) = page.xpath('//h1[@id="page-title"]/text()')
    bid = re.sub(r"^Bill Actions for ", "", bid)
    subjects = self.subjects.get(bid, [])

    # some pages say "Measure Number Breakdown", others "Bill..."
    table = page.xpath("//table[contains(@summary, 'Number Breakdown')]")
    table = table[0]
    ttrows = page.xpath("//div[@id='application']/p")
    descr = ttrows[-2]

    title = re.sub(r"\s+", " ", descr.text_content()).strip()
    ttrows = ttrows[:-1]

    # bill id is like "HB 1001": letter 0 = chamber, letters 1-2 = kind
    chamber = {"H": "lower", "S": "upper"}[bid[0]]

    type_ = bid[1:3]
    bill_type = "bill"
    if type_.startswith("B"):
        bill_type = "bill"
    if type_.startswith("R"):
        bill_type = "resolution"
    if type_ == "CR":
        bill_type = "concurrent resolution"

    bill = Bill(
        bid,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = subjects
    bill.add_source(href)

    # sponsors: look for an "introduced by ..." paragraph
    for row in ttrows:
        if isinstance(row, lxml.html.HtmlComment):
            continue  # ignore HTML comments, no text_content()

        sponsors = row.text_content().strip()
        sinf = re.match(
            r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)",
            sponsors)
        if sinf:
            sponsors = sinf.groupdict()
            for sponsor in [
                x.strip() for x in sponsors["sponsors"].split(",")
            ]:
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

    # actions table: a row may omit date/chamber, so carry both forward
    dt = None
    oldchamber = "other"
    for row in table.xpath(".//tr"):
        if row.text_content().strip() == "":
            continue

        if "Meeting Description" in [
            x.strip() for x in row.xpath(".//th/text()")
        ]:
            continue

        row = row.xpath("./*")
        row = [x.text_content().strip() for x in row]

        if len(row) > 3:
            row = row[:3]

        date, chamber, action = row

        try:
            chamber = {"House": "lower", "Senate": "upper"}[chamber]
            oldchamber = chamber
        except KeyError:
            chamber = oldchamber

        if date != "":
            dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")

        classif = self.categorizer.categorize(action)

        bill.add_action(
            chamber=chamber,
            description=action,
            date=dt.strftime("%Y-%m-%d"),
            classification=classif["classification"],
        )

    # follow the "Versions" link (if exactly one) to attach versions
    version_url = page.xpath("//a[contains(text(), 'Versions')]")
    if len(version_url) == 1:
        href = version_url[0].attrib["href"]
        bill = self.scrape_versions(bill, href)

    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape a KY bill page into a Bill and yield it.

    Fixes:
    * The ``if version is None`` withdrawn-bill guard ran AFTER
      ``version.xpath(...)`` had already been called, so a missing
      "Bill Documents" field crashed with AttributeError before the guard
      could fire. The guard now runs first.
    * ``mimetype`` was left unbound for a document URL ending in neither
      ``.doc`` nor ``.pdf``; it now defaults to ``None``.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param bill_id: state-format bill identifier (e.g. "HB 1", "SCR 5")
    :param url: the bill page URL
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
    if "WITHDRAWN" in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, "Bill Documents")
    if version is None:
        # fix: guard must precede any use of `version` — bill withdrawn
        self.logger.warning("Bill withdrawn.")
        return

    source_url = version.xpath("a[1]/@href")[0]
    version_title = version.xpath("a[1]/text()")[0].strip()

    if source_url.endswith(".doc"):
        mimetype = "application/msword"
    elif source_url.endswith(".pdf"):
        mimetype = "application/pdf"
    else:
        mimetype = None  # fix: was unbound for other extensions

    title = self.parse_bill_field(page, "Title").text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    # order matters: "CR"/"JR" must be checked before plain "R"
    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        if source_url.endswith(".doc"):
            mimetype = "application/msword"
        elif source_url.endswith(".pdf"):
            mimetype = "application/pdf"
        bill.add_document_link("Fiscal Note", source_url,
                               media_type=mimetype)

    for link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    # BDR (bill draft request) number, stored as an extra when present
    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape every CA bill of one type from the mirrored capublic DB.

    Generator: for each bill row, yields its VoteEvents and then the Bill.
    Reads bills, versions, actions, authors and votes from the SQLAlchemy
    session (``self.session``) rather than the web.

    :param chamber: 'upper' or 'lower' (expected origin chamber)
    :param session: session year string, e.g. '20172018'
    :param bill_type: OCD classification for this batch, e.g. 'bill'
    :param type_abbr: capublic measure_type to query, e.g. 'AB'
    :param committee_abbr_regex: precompiled regex (bound at def time)
        used to find committee abbreviations in action text
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        # sanity check: id prefix must agree with the requested chamber
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]
        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)
        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        # NOTE(review): ``summary`` is only assigned inside this branch;
        # the ``if summary:`` below raises NameError when a bill has no
        # versions — confirm whether that case can occur in the data
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        # walk all versions, accumulating title/type/tags/subject;
        # later versions overwrite earlier values
        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime(
                '%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"

            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        # NOTE(review): ``impact_clause``/``tags`` are set inside the
        # version loop; a bill whose versions all lack bill_xml would
        # leave them unbound here — confirm the data guarantees otherwise
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # sponsors come from the LAST version iterated above
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            # normalize the actor to 'lower'/'upper'/'executive'
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)
            type_ = []
            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # NOTE(review): filter() returns a one-shot iterator on
                # py3; list(committees) in the assert below exhausts it,
                # so the zip afterwards sees nothing — confirm intended
                committees = filter(None, committees)
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()
            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
        self.session.expire_all()
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a single Utah bill page into a pupa ``Bill`` and yield it.

    Also yields any status/vote objects produced by ``self.parse_status``.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param bill_id: bill identifier as shown on the site (e.g. 'H.B. 12')
    :param url: bill detail page URL
    """
    page = self.lxmlize(url)

    # The page header is "<bill_id> <title>"; strip the id to get the title.
    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    # Classify from the punctuated id. Note 'H.C.R. ' does not start with
    # 'H.R. ', so the resolution branch cannot shadow the later ones.
    if '.B. ' in bill_id:
        bill_type = 'bill'
    elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
        bill_type = 'resolution'
    elif '.C.R. ' in bill_id:
        bill_type = 'concurrent resolution'
    elif '.J.R. ' in bill_id:
        bill_type = 'joint resolution'
    else:
        # BUGFIX: previously an unrecognized id fell through with bill_type
        # unbound, raising a confusing NameError at Bill() below. Fail
        # loudly instead (matches the sibling scrapers' behavior).
        raise ValueError('unknown bill type ' + bill_id)

    # Strip substitute markers out of the id before recording it.
    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(url)

    # Primary sponsor: the div holds exactly a label and a name.
    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        try:
            (title, name) = [
                x.strip() for x in info.xpath('.//text()') if x.strip()
            ]
        except ValueError:
            self.warning(
                "Could not find sponsor's name for {}".format(bill_id))
            continue
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification='primary',
                             entity_type='person', primary=True)

    # Floor sponsor (recorded as a cosponsor), if present.
    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(floor_sponsor, classification='cosponsor',
                             entity_type='person', primary=False)
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")

    # Bill text versions.
    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]')
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get('href')
        if not url:
            url = version.xpath('following-sibling::a[1]/@href')[0]
        bill.add_version_link(version.xpath('text()')[0].strip(), url,
                              media_type='application/pdf')

    # Related documents; fiscal notes get a canonical label.
    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath('@href')[0]
        if '.fn.pdf' in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type='application/pdf')
        else:
            text = related.xpath('text()')[0]
            bill.add_document_link(text, href,
                                   media_type='application/pdf')

    # Subjects are taken from the 'RelatedBill' links on the page.
    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    # Status table drives actions/votes parsing, when present.
    if page.xpath('//div[@id="billStatus"]//table'):
        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_bills(self, chamber, session, subjects):
    """Scrape Rhode Island bills for one chamber/session via the search form.

    Pages through the search results MAXQUERY ids at a time until a page
    comes back empty, yielding a pupa ``Bill`` for each result.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    :param subjects: mapping of raw bill_id -> list of subject strings
    """
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"

    blocks = "FOO"  # non-empty sentinel so the loop body runs at least once
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY
        blocks = self.parse_results_page(
            self.post(SEARCH_URL, data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)
        for block in blocks:
            bill = blocks[block]
            # BUGFIX: the subjects lookup used to be performed twice, with
            # the first try/except's result always discarded. One .get()
            # has the same net effect.
            subs = subjects.get(bill["bill_id"], [])
            title = bill["title"][len("ENTITLED, "):]
            billid = bill["bill_id"]
            # Normalize the id prefix (e.g. long-form chamber names) to
            # the canonical short form.
            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = (BILL_NAME_TRANSLATIONS[b] +
                              billid[len(b) + 1:].split()[0])
            b = Bill(
                billid,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self.get_type_by_name(bill["bill_id"]),
            )
            b.subject = subs
            # keep bill ID around
            self._bill_id_by_type[(chamber,
                                   re.findall(r"\d+", billid)[0])] = billid
            self.process_actions(bill["actions"], b)
            # Sponsor string looks like "BY Smith, Jones, ..." — drop the
            # leading "BY" and split on commas.
            sponsors = bill["sponsors"][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]
            for href in bill["bill_id_hrefs"]:
                b.add_version_link(href.text, href.attrib["href"],
                                   media_type="application/pdf")
            for sponsor in sponsors:
                b.add_sponsorship(
                    sponsor,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )
            b.add_source(SEARCH_URL)
            yield b
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill, yielding VoteEvents (from roll calls
    mentioned in actions) followed by the Bill itself.

    Tries the first year of the biennium's URL, then falls back to the
    second year; returns (yields nothing) if neither page exists.
    """
    # try and get bill for the first year of the session biennium
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))
        html = self.get(url).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute('http://legislature.mi.gov')

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors: first link is primary only when its tail says so and
    # there is more than one sponsor listed.
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        # sometimes district gets added as a link
        if name.isnumeric():
            continue
        if len(sponsors) > 1:
            classification = (
                'primary'
                if sponsor.tail and 'primary' in sponsor.tail
                else 'cosponsor'
            )
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)

    # check if action mentions a sub
        submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action,
                             re.IGNORECASE)
        if submatch and tds[2].xpath('a'):
            version_url = tds[2].xpath('a/@href')[0]
            version_name = tds[2].xpath('a/text()')[0].strip()
            version_name = 'Substitute {}'.format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # NOTE(review): if the URL ends with neither .pdf nor .htm,
            # mimetype is left unbound (or stale from a previous row) and
            # add_version_link may raise NameError — confirm all substitute
            # links use those extensions.
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version_url.lower().endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                results = self.parse_roll_call(vote_url, rc_num)
                vote = VoteEvent(
                    start_date=date,
                    chamber=actor,
                    bill=bill,
                    motion_text=action,
                    result='pass' if len(results['yes']) > len(results['no']) else 'fail',
                    classification='passage',
                )
                # check the expected counts vs actual
                count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(results['yes']):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count, len(results['yes'])))
                count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(results['no']):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count, len(results['no'])))
                vote.set_count('yes', len(results['yes']))
                vote.set_count('no', len(results['no']))
                vote.set_count('other', len(results['other']))
                for name in results['yes']:
                    vote.yes(name)
                for name in results['no']:
                    vote.no(name)
                for name in results['other']:
                    vote.vote('other', name)
                vote.add_source(vote_url)
                yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # NOTE(review): same unbound-mimetype risk as above for URLs
            # not ending in .pdf/.htm.
            if url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif url.endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(name, url, media_type=mimetype)

    # documents (House and Senate fiscal agency analyses)
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Oklahoma bill page; yields votes then the Bill.

    Skips (returns without yielding) on HTTP errors and on placeholder
    pages whose title is "Short Title Not Found.".
    """
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    # Classify from the id; order matters since 'JR'/'CR' also contain 'R'.
    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    # Authors ("Auth" ids are sponsors; "otherAuth" marks cosponsors).
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()

        # A ':' in the name means the markup changed — fail loudly.
        if ':' in name:
            raise Exception(name)
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsorship(name, classification='cosponsor',
                                 entity_type='person', primary=False)
        else:
            bill.add_sponsorship(name, classification='primary',
                                 entity_type='person', primary=True)

    # Actions table: skip the two header rows.
    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue
        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'
        attrs = self.categorizer.categorize(action)
        related_entities = []
        for item in attrs['committees']:
            related_entities.append({
                'type': 'committee',
                'name': item
            })
        for item in attrs['legislators']:
            related_entities.append({
                'type': 'legislator',
                'name': item
            })
        bill.add_action(description=action,
                        date=date.strftime('%Y-%m-%d'),
                        chamber=actor,
                        classification=attrs['classification'],
                        related_entities=related_entities)

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    version_urls = []
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib['href']
        if version_url in version_urls:
            self.warning('Skipping duplicate version URL.')
            continue
        else:
            version_urls.append(version_url)
        name = link.text.strip()

        # Committee reports / scheduled CCRs are documents, not versions.
        if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url,
                     re.IGNORECASE):
            bill.add_document_link(note=name, url=version_url,
                                   media_type='application/pdf')
            continue

        bill.add_version_link(note=name, url=version_url,
                              media_type='application/pdf')

    # Votes live on separate pages; 'HT_' links are skipped.
    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if 'HT_' not in link.attrib['href']:
            yield from self.scrape_votes(bill,
                                         self.urlescape(link.attrib['href']))

    # # If the bill has no actions and no versions, it's a bogus bill on
    # # their website, which appears to happen occasionally. Skip.
    has_no_title = (bill.title == "Short Title Not Found.")
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        yield bill
def parse_bill(self, chamber, session, bill_id, url):
    """Parse one Kentucky bill page; yields the Bill (or nothing if the
    bill was withdrawn).

    Handles both the post-2016 page layout and the older layout, and
    carries a cached action date across malformed action lines.
    """
    page = self.lxmlize(url)

    # KY urls use e.g. 'HJR' collapsed to 'HJ'+'R' form.
    short_bill_id = re.sub(r'(H|S)([JC])R', r'\1\2', bill_id)

    version_link_node = self.get_node(
        page,
        '//a[contains(@href, "{bill_id}/bill.doc") or contains(@href,'
        '"{bill_id}/bill.pdf")]'.format(bill_id=short_bill_id))

    if version_link_node is None:
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return
    else:
        source_url = version_link_node.attrib['href']
        # NOTE(review): mimetype stays unbound if the href ends with
        # neither .doc nor .pdf — confirm those are the only extensions.
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'

    if self._is_post_2016:
        # Newer layout: title pieces and actions live in the same div.
        title_texts = self.get_nodes(
            page,
            '//div[@class="StandardText leftDivMargin"]/text()')
        title_texts = list(
            filter(None, [text.strip() for text in title_texts]))
        title_texts = [
            s for s in title_texts
            if s != ',' and not s.startswith('(BR ')
        ]
        title = ' '.join(title_texts)

        actions = self.get_nodes(
            page,
            '//div[@class="StandardText leftDivMargin"]/'
            'div[@class="StandardText"][last()]//text()[normalize-space()]'
        )
    else:
        # Older layout: title and actions are <p> siblings of the
        # version link; fall back to walking the tree when malformed.
        pars = version_link_node.xpath("following-sibling::p")
        if len(pars) == 2:
            title = pars[0].xpath("string()")
            action_p = pars[1]
        else:
            title = pars[0].getprevious().tail
            if not title:
                self.warning(
                    'walking backwards to get bill title, error prone!')
                title = pars[0].getprevious().getprevious()
                while not title.tail:
                    title = title.getprevious()
                title = title.tail
                self.warning('got title the dangerous way: %s' % title)
            action_p = pars[0]

        title = re.sub(r'[\s\xa0]+', ' ', title).strip()
        actions = action_p.xpath("string()").split("\n")

    # Classify from the id; 'CR'/'JR' checked before the bare 'R'.
    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link("Most Recent Version", source_url,
                          media_type=mimetype)

    # Other bill versions (everything under recorddocuments/bill/ except
    # the main text and fiscal notes).
    other_versions = page.xpath(
        '//a[contains(@href, "/recorddocuments/bill/") and'
        ' not(contains(@href, "/bill.pdf")) and'
        ' not(contains(@href, "/bill.doc")) and'
        ' not(contains(@href, "/LM.pdf"))]')
    for version_link in other_versions:
        source_url = version_link.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        version_title = version_link.xpath('text()')[0]
        bill.add_version_link(version_title, source_url,
                              media_type=mimetype)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        bill.add_document_link("Fiscal Note", source_url,
                               media_type=mimetype)

    # Every legislator link on the page is treated as a primary sponsor.
    for link in page.xpath("//a[contains(@href, 'legislator/')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)

    for line in actions:
        line_actions = line.strip().split(';')

        for index, action in enumerate(line_actions):
            action = action.strip()
            if not action:
                continue

            # The date prefix of the whole line applies to every
            # semicolon-separated action on it.
            action_date_text = line.split('-')[0].strip()
            if self._is_post_2016:
                action_date_string = action_date_text.replace(',', '')
            else:
                action_date_string = '{} {}'.format(
                    action_date_text, session[0:4])

            # This patch is super hacky, but allows us to better
            # capture actions that screw up the formatting such as
            # veto document links.
            # NOTE(review): if the very first line fails to parse,
            # cached_action_date is still unbound and this raises
            # NameError — confirm the first line always carries a date.
            try:
                action_date = datetime.datetime.strptime(
                    action_date_string, '%b %d %Y')
                cached_action_date = action_date
                used_cached_action_date = False
            except ValueError:
                action_date = cached_action_date
                used_cached_action_date = True

            # Separate out the first action on the line (it still has
            # the date prefix attached).
            if index == 0 and not used_cached_action_date:
                action = '-'.join(action.split('-')[1:]).strip()
                if not action:
                    continue

            # Actor is inferred from the trailing chamber marker.
            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            atype = []
            if 'introduced in' in action:
                atype.append('introduction')
                if 'to ' in action:
                    atype.append('referral-committee')
            elif 'signed by Governor' in action:
                atype.append('executive-signature')
            elif 'vetoed' in action:
                atype.append('executive-veto')

                # Get the accompanying veto message document. There
                # should only be one.
                veto_document_link = self.get_node(
                    page,
                    '//div[@class="StandardText leftDivMargin"]/'
                    'div[@class="StandardText"][last()]/a[contains(@href,'
                    '"veto.pdf")]')

                if veto_document_link is not None:
                    bill.add_document_link(
                        "Veto Message",
                        veto_document_link.attrib['href'],
                        on_duplicate='ignore')
            elif re.match(r'^to [A-Z]', action):
                atype.append('referral-committee')
            elif action == 'adopted by voice vote':
                atype.append('passage')

            if '1st reading' in action:
                atype.append('reading-1')
            if '3rd reading' in action:
                atype.append('reading-3')
                if 'passed' in action:
                    atype.append('passage')
            if '2nd reading' in action:
                atype.append('reading-2')

            if 'R' in bill_id and 'adopted by voice vote' in action:
                atype.append('passage')

            amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*'
                            r'( and \([a-z\d\-]+\))? filed')
            if re.search(amendment_re, action):
                atype.append('amendment-introduction')

            if not atype:
                atype = None

            # Capitalize the first letter of the action for nicer
            # display. capitalize() won't work for this because it
            # lowercases all other letters.
            action = (action[0].upper() + action[1:])

            action_date = timezone('America/Kentucky/Louisville').localize(
                action_date)
            action_date = action_date.strftime('%Y-%m-%d')
            if action:
                bill.add_action(action, action_date, chamber=actor,
                                classification=atype)

    try:
        votes_link = page.xpath(
            "//a[contains(@href, 'vote_history.pdf')]")[0]
        bill.add_document_link("Vote History",
                               votes_link.attrib['href'])
    except IndexError:
        # No votes
        self.logger.warning(u'No votes found for {}'.format(title))
        pass

    # Ugly Hack Alert!
    # find actions before introduction date and subtract 1 from the year
    # if the date is after introduction
    # NOTE(review): if bill.actions is empty, `i` below is unbound; if no
    # introduction action exists, intro_date stays None and the `>`
    # comparison fails — confirm every bill has actions + introduction.
    intro_date = None
    for i, action in enumerate(bill.actions):
        if 'introduction' in action['classification']:
            intro_date = action['date']
            break
    for action in bill.actions[:i]:
        if action['date'] > intro_date:
            action['date'] = action['date'].replace(
                year=action['date'].year - 1)
            self.debug('corrected year for %s', action['action'])

    yield bill
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Scrape every CA bill of one measure type from the mirrored DB.

    Yields VoteEvents (from the DB for sessions after 2009, or scraped
    from the public vote page for archived sessions) and then the Bill.
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))

    archive_year = int(session[0:4])
    not_archive_year = archive_year >= 2009

    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num

        bill_id = bill.short_bill_id
        # Known-bad record in the source data.
        if bill_id.strip() == "SB77" and session == "20052006":
            continue

        fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
        if (bill_id.startswith("S") and
                chamber == "lower") or (bill_id.startswith("A") and
                                        chamber == "upper"):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")

        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        summary = ""

        # Get digest test (aka "summary") from latest version.
        if bill.versions and not_archive_year:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == "Yes":
                type_.append("appropriation")

            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # Sponsors come from the last version seen in the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        self.warning(msg % args)

                # NOTE(review): filter() is a one-shot iterator — the
                # list() in the assert below exhausts it, so the zip that
                # follows sees nothing; confirm whether this is intended.
                committees = filter(None, committees)
                kwargs["committees"] = committees

                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                if not_archive_year:
                    assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                if not act_str.endswith("."):
                    act_str = act_str + "."

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"

            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info

            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))

        source_url = (
            "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
        )
        source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

        # Votes for non archived years
        if archive_year > 2009:
            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    # raise ScrapeError("Bad location: %s" % full_loc)
                    # To uncomment
                    continue

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                motion = motion.strip()
                # Strip session/chamber/bill-id noise off the motion text.
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ",
                                "", motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                #     'name': vote_location,
                #     'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}

                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

        # Archived sessions: scrape votes from the public vote page.
        if len(bill.votes) > 0 and archive_year <= 2009:
            vote_page_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            vote_page_url += (
                f"bill_id={session}{bill.session_num}{fsbill.identifier}")

            # parse the bill data page, finding the latest html text
            data = self.get(vote_page_url).content
            doc = html.fromstring(data)
            doc.make_links_absolute(vote_page_url)

            num_of_votes = len(doc.xpath("//div[@class='status']"))
            for vote_section in range(1, num_of_votes + 1):
                lines = doc.xpath(
                    f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                )
                date, result, motion, vtype, location = "", "", "", "", ""
                votes = {}
                for line in lines:
                    line = line.text_content().split()
                    if line[0] == "Date":
                        date = line[1]
                        date = datetime.datetime.strptime(date, "%m/%d/%y")
                        date = self._tz.localize(date)
                    elif line[0] == "Result":
                        result = "pass" if "PASS" in line[1] else "fail"
                    elif line[0] == "Motion":
                        motion = " ".join(line[1:])
                    elif line[0] == "Location":
                        location = " ".join(line[1:])
                    elif len(line) > 1:
                        if line[0] == "Ayes" and line[1] != "Count":
                            votes["yes"] = line[1:]
                        elif line[0] == "Noes" and line[1] != "Count":
                            votes["no"] = line[1:]
                        elif line[0] == "NVR" and line[1] != "Count":
                            votes["not voting"] = line[1:]

                # Determine chamber based on location
                first_part = location.split(" ")[0].lower()
                vote_chamber = ""
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                if len(motion) > 0:
                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=date,
                        result=result,
                        classification=vtype,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.add_source(vote_page_url)
                    fsvote.pupa_id = vote_page_url + "#" + str(
                        vote_section)

                    for how_voted, voters in votes.items():
                        for voter in voters:
                            voter = voter.replace(",", "")
                            fsvote.vote(how_voted, voter)
                    yield fsvote

        yield fsbill
    self.session.expire_all()
def _parse_senate_billpage(self, bill_url, year):
    """Scrape one Senate bill detail page and yield a populated Bill.

    Pulls id/title/description from labeled spans, attaches subjects,
    sponsor, cosponsors, actions, versions, and adopted amendments.
    NOTE(review): the ``year`` parameter is unused here — the session comes
    from ``self._session_id``; confirm which one is intended.
    """
    bill_page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # Classify via the first three characters of the bill id (e.g. 'SB ').
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")

    self.info(bid)

    # Placeholder/junk rows in the source data use this sentinel id.
    if bid == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)

    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self._parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

    # Only amendments whose link text says 'adopted' become versions.
    amendment_links = bill_page.xpath('//a[contains(@href,"ShowAmendment.asp")]')
    for link in amendment_links:
        link_text = link.xpath('string(.)').strip()
        if 'adopted' in link_text.lower():
            link_url = link.xpath('@href')[0]
            bill.add_version_link(link_text, link_url,
                                  media_type='application/pdf',
                                  on_duplicate='ignore')

    yield bill
def _parse_senate_billpage(self, bill_url, year): bill_page = self.lxmlize(bill_url) # get all the info needed to record the bill # TODO probably still needs to be fixed bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content() bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content() bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content() # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content() bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] subs = [] bid = bill_id.replace(" ", "") if bid in self._subjects: subs = self._subjects[bid] self.info("With subjects for this bill") self.info(bid) bill = Bill( bill_id, title=bill_desc, legislative_session=year, classification=bill_type, ) bill.subject = subs bill.add_abstract(bill_desc, note='abstract') bill.add_source(bill_url) if bill_title: bill.add_title(bill_title) # Get the primary sponsor sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0] bill_sponsor = sponsor.text_content() # bill_sponsor_link = sponsor.attrib.get('href') bill.add_sponsorship( bill_sponsor, entity_type='person', classification='primary', primary=True, ) # cosponsors show up on their own page, if they exist cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]') if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'): self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href']) # get the actions action_url = bill_page.xpath('//a[@id="hlAllActions"]') if len(action_url) > 0: action_url = action_url[0].attrib['href'] self._parse_senate_actions(bill, action_url) # stored on a separate page versions_url = bill_page.xpath('//a[@id="hlFullBillText"]') if len(versions_url) > 0 and versions_url[0].attrib.get('href'): self._parse_senate_bill_versions(bill, versions_url[0].attrib['href']) yield bill
def scrape_bills(self, chamber, session, subjects): idex = bill_start_numbers(session)[chamber] FROM = "ctl00$rilinContent$txtBillFrom" TO = "ctl00$rilinContent$txtBillTo" YEAR = "ctl00$rilinContent$cbYear" blocks = "FOO" # Ugh. while len(blocks) > 0: default_headers = get_default_headers(SEARCH_URL) default_headers[FROM] = idex default_headers[TO] = idex + MAXQUERY default_headers[YEAR] = session idex += MAXQUERY blocks = self.parse_results_page(self.post(SEARCH_URL, data=default_headers).text) blocks = blocks[1:-1] blocks = self.digest_results_page(blocks) for block in blocks: bill = blocks[block] subs = [] try: subs = subjects[bill['bill_id']] except KeyError: pass title = bill['title'][len("ENTITLED, "):] billid = bill['bill_id'] try: subs = subjects[bill['bill_id']] except KeyError: subs = [] for b in BILL_NAME_TRANSLATIONS: if billid[:len(b)] == b: billid = BILL_NAME_TRANSLATIONS[b] + billid[len(b) + 1:].split()[0] b = Bill( billid, title=title, chamber=chamber, legislative_session=session, classification=self.get_type_by_name(bill['bill_id']), ) b.subject = subs # keep bill ID around self._bill_id_by_type[(chamber, re.findall(r'\d+', billid)[0])] = billid self.process_actions(bill['actions'], b) sponsors = bill['sponsors'][len("BY"):].strip() sponsors = sponsors.split(",") sponsors = [s.strip() for s in sponsors] for href in bill['bill_id_hrefs']: b.add_version_link( href.text, href.attrib['href'], media_type="application/pdf") for sponsor in sponsors: b.add_sponsorship( sponsor, entity_type='person', classification='primary', primary=True) b.add_source(SEARCH_URL) yield b
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape a single bill page and yield the resulting Bill.

    Skips pages that fail to load and bills whose last action is a
    withdrawal. Attaches the first bill document as a version, plus
    actions, subjects, fiscal notes, sponsors, and the BDR number.
    """
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    last_action = self.parse_bill_field(
        page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, 'Bill Documents')
    # FIX: the original tested ``version is None`` only *after* calling
    # version.xpath('a[1]/@href')[0], so the withdrawn-bill guard could
    # never fire (an IndexError would have been raised first). Guard on
    # the presence of the first document link instead.
    version_hrefs = version.xpath('a[1]/@href') if version is not None else []
    if not version_hrefs:
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return
    source_url = version_hrefs[0]
    version_title = version.xpath('a[1]/text()')[0].strip()

    if source_url.endswith('.doc'):
        mimetype = 'application/msword'
    elif source_url.endswith('.pdf'):
        mimetype = 'application/pdf'
    else:
        # FIX: mimetype was left unbound (NameError) for any other
        # extension; fall back to None so the link is still recorded.
        mimetype = None

    title = self.parse_bill_field(page, 'Title').text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        else:
            mimetype = None  # same unbound-name fix as above

        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)

    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def scrape_bill_list(self, url):
    """Scrape an ALISON bill-list page; yield Bills and their vote events.

    For each row: builds the Bill, fetches its detail page, and attaches
    title, versions, Budget Isolation Resolution actions/votes, and the
    regular action history (delegating roll calls to ``scrape_vote``).
    """
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning("Bill {} has no webpage, and will be skipped".
                         format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # FIX: ``title`` was only assigned inside the xpath branch, so a
        # page without the short-title span raised NameError on the first
        # iteration and reused the *previous* bill's title afterwards.
        title = ''
        if (bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
        if not title:
            title = "[No title given by state]"
        bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.
            format(self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))

            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue

            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())

            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )

            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''

            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]
                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text
                )

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0].
                    encode('ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

            (action_chamber, ) = action.xpath('td[2]/font/text()')
            if action.xpath('td[3]/font/u/text()'):
                (amendment, ) = action.xpath('td[3]/font/u/text()')
            else:
                amendment = None
            (action_text, ) = action.xpath('td[4]/font/text()')

            action_type = _categorize_action(action_text)

            # check for occasional extra last row
            if not action_chamber.strip():
                continue

            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''

            act = bill.add_action(
                action_text,
                TIMEZONE.localize(action_date),
                chamber=actor,
                classification=action_type,
            )
            if action_committee:
                act.add_related_entity(action_committee,
                                       entity_type='organization')

            try:
                vote_button = action.xpath('td[9]//text()')[0].strip()
            except IndexError:
                vote_button = ''
            if vote_button.startswith("Roll "):
                vote_id = vote_button.split(" ")[-1]
                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=action_chamber,
                    bill_id=bill_id,
                    vote_id=vote_id,
                    vote_date=TIMEZONE.localize(action_date),
                    action_text=action_text
                )

            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.
                    format(self.session, amendment))
                amend_name = 'Amd/Sub {}'.format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

        yield bill
def _parse_senate_billpage(self, bill_url, year): bill_page = self.lxmlize(bill_url) # get all the info needed to record the bill # TODO probably still needs to be fixed bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content() bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content() bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content() # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content() bill_type = "bill" triplet = bill_id[:3] if triplet in bill_types: bill_type = bill_types[triplet] subs = [] bid = bill_id.replace(" ", "") if bid in self._subjects: subs = self._subjects[bid] self.info("With subjects for this bill") self.info(bid) bill = Bill( bill_id, title=bill_desc, chamber='upper', legislative_session=year, classification=bill_type, ) bill.subject = subs bill.add_abstract(bill_desc, note='abstract') bill.add_source(bill_url) if bill_title: bill.add_title(bill_title) # Get the primary sponsor sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0] bill_sponsor = sponsor.text_content() # bill_sponsor_link = sponsor.attrib.get('href') bill.add_sponsorship( bill_sponsor, entity_type='person', classification='primary', primary=True, ) # cosponsors show up on their own page, if they exist cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]') if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'): self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href']) # get the actions action_url = bill_page.xpath('//a[@id="hlAllActions"]') if len(action_url) > 0: action_url = action_url[0].attrib['href'] self._parse_senate_actions(bill, action_url) # stored on a separate page versions_url = bill_page.xpath('//a[@id="hlFullBillText"]') if len(versions_url) > 0 and versions_url[0].attrib.get('href'): self._parse_senate_bill_versions(bill, versions_url[0].attrib['href']) yield bill
def scrape_bill(self, session, chamber, bill_id, title, url,
                strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
    """Scrape one bill page; yield the Bill and any house/senate votes.

    ``strip_sponsors`` is a pre-bound ``re.sub`` that removes short
    parenthesized annotations from sponsor strings (bound at def time,
    so the pattern is compiled once).
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    bill_type = self.bill_types[bill_id.split()[0][1:]]

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(url)

    xpath = ('//strong[contains(., "SUBJECT")]/../'
             'following-sibling::td/a/text()')
    bill.subject = page.xpath(xpath)

    for version in self.scrape_versions(session, chamber, page, bill_id):
        bill.add_version_link(**version)

    # Resolution pages have different html.
    # Collect the heading -> value pairs from the bill-history table.
    values = {}
    trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
    for tr in trs:
        heading = tr.xpath('td/strong/text()')
        if heading:
            heading = heading[0]
        else:
            continue
        value = tr.text_content().replace(heading, '').strip()
        values[heading] = value

    # summary was always same as title
    # bill['summary'] = values['SUMMARY:']

    # Add primary sponsor.
    primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
    if primary:
        bill.add_sponsorship(
            name=primary,
            classification='primary',
            entity_type='person',
            primary=True
        )

    # Add cosponsors.
    if values.get('SPONSORS:'):
        sponsors = strip_sponsors('', values['SPONSORS:'])
        # Split on commas not followed by a single-initial abbreviation.
        sponsors = re.split(', (?![A-Z]\.)', sponsors)
        for name in sponsors:
            name = name.strip(', \n\r')
            if name:
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search('(.+?), ([DM]\. Hall)', name)
                if match:
                    for name in match.groups():
                        bill.add_sponsorship(
                            name=name,
                            classification='cosponsor',
                            entity_type='person',
                            primary=False
                        )
                else:
                    bill.add_sponsorship(
                        name=name,
                        classification='cosponsor',
                        entity_type='person',
                        primary=False
                    )

    for link in page.xpath("//a[contains(@href, 'votes/house')]"):
        yield from self.scrape_house_vote(bill, link.attrib['href'])

    # Walk the action table bottom-up so actions come out oldest-first.
    for tr in reversed(page.xpath(
            "//table[@class='tabborder']/descendant::tr")[1:]):
        tds = tr.xpath('td')
        if len(tds) < 3:
            continue

        chamber_letter = tds[0].text_content()
        chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

        # Index of date info no longer varies on resolutions.
        date = tds[2].text_content().strip()
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()

        action = tds[1].text_content().strip()
        if action.lower().startswith('passed senate'):
            for href in tds[1].xpath('a/@href'):
                yield from self.scrape_senate_vote(bill, href, date)

        attrs = dict(chamber=chamber,
                     description=action,
                     date=date.strftime("%Y-%m-%d"))
        # NOTE: this inner ``values`` rebinds the history dict above, which
        # is no longer needed at this point.
        temp = self.categorizer.categorize(action)
        related_entities = []
        for key, values in temp.items():
            if key != 'classification':
                for value in values:
                    related_entities.append({
                        "type": key,
                        "name": value
                    })
        attrs.update(classification=temp['classification'],
                     related_entities=related_entities)
        bill.add_action(**attrs)

    yield bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one measure type from the mirrored DB.

    Yields a Bill per matching CABill row plus a VoteEvent per recorded
    vote. ``committee_abbr_regex`` is bound once at def time.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        # NOTE(review): bill_session is computed but never used below;
        # the Bill is created with the plain ``session``.
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        # (``summary`` is only read below when a version set ``title``,
        # so it stays unread when bill.versions is empty.)
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # ``version`` here is the last item of the versions loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # FIX: ``filter`` returns a one-shot iterator in Python 3;
                # the assert below consumed it, so the zip() loop never ran
                # and abbreviations were never expanded. Materialize it.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$', '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
def test_full_bill():
    """End-to-end import: scrape-side Bill -> BillImporter -> Django models.

    Builds a fully-populated scrape Bill (plus a prior-session bill for
    related-bill resolution), imports it, then asserts every field made
    it into the database intact.
    """
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee',
                             classification='committee', parent_id=org._id)

    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04',
                          chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor',
                         entity_type='person', primary=False,
                         entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor',
                         entity_type='person', primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official")
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.json_to_db_id['person-id'] = 'person-id'
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circut the json id lookup.
    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(),
                                             bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            # the lead sponsor ('Jane Smith') has no matching entity
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def test_full_bill():
    """End-to-end bill import test.

    Same shape as the other full-bill test, but the sponsor Person is
    imported through PersonImporter (rather than created directly in the
    DB), and the abstract carries a date.

    NOTE(review): a function with this exact name appears earlier in this
    file; if both live in the same module the later definition shadows
    the earlier one -- confirm they belong to different modules/revisions.
    """
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    # bill from a prior session; used to check related-bill resolution below
    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    # one sponsorship resolvable via the scraped person's _id, one unlinked
    bill.add_sponsorship('Adam Smith', classification='extra sponsor',
                         entity_type='person', primary=False, entity_id=sp._id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor',
                         entity_type='person', primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official", date='1969-10-20')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'
    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            # the 'Jane Smith' sponsorship had no entity_id, so it stays unlinked
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a Maryland bill detail page.

    Reads title, sponsors and subjects from the first tab (stab=01),
    documents from the second tab (stab=02) and actions from the third
    (stab=03). Yields votes parsed from the documents page, then the
    Bill itself. Returns early (yielding nothing) on the site's soft
    error page.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    try:
        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)
    except IndexError:
        if "Unable to retrieve the requested information. Please try again" in html:
            self.warning("Soft error page, skipping.")
            return
        else:
            raise

    # classify from the id: any 'B' -> bill, any 'J' -> joint resolution
    if "B" in bill_id:
        _type = ["bill"]
    elif "J" in bill_id:
        _type = ["joint resolution"]
    else:
        raise ValueError("unknown bill type " + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=_type,
    )
    bill.add_source(url)

    # process sponsors
    sponsors = _get_td(doc, "All Sponsors:").text_content()
    sponsors = sponsors.replace("Delegates ", "")
    sponsors = sponsors.replace("Delegate ", "")
    sponsors = sponsors.replace("Senator ", "")
    sponsors = sponsors.replace("Senators ", "")

    # first name listed is the primary sponsor; everyone after is a cosponsor
    sponsor_type = "primary"
    for sponsor in re.split(", (?:and )?", sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == "primary",
            entity_type="person",
        )
        sponsor_type = "cosponsor"

    # subjects
    subject_list = []
    for heading in ("Broad Subject(s):", "Narrow Subject(s):"):
        subjects = _get_td(doc, heading).xpath("a/text()")
        # strip the trailing " -see also-" cross-reference from each subject
        subject_list += [s.split(" -see also-")[0] for s in subjects if s]
    bill.subject = subject_list

    # documents tab is the same URL with stab=02
    html = self.get(url.replace("stab=01", "stab=02")).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # documents
    self.scrape_documents(bill, doc)
    # actions
    self.scrape_actions(bill, url.replace("stab=01", "stab=03"))
    yield from self.parse_bill_votes_new(doc, bill)
    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape a single Kentucky bill page into a Bill object.

    Skips bills whose last action indicates withdrawal, attaches the
    first listed bill document as a version, then collects actions,
    subjects, fiscal notes, sponsors and the BDR number before yielding
    the Bill.
    """
    page = self.lxmlize(url)

    last_action = self.parse_bill_field(page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, 'Bill Documents')
    # BUG FIX: guard *before* dereferencing. The original tested
    # `version is None` only after calling version.xpath(...)[0], which
    # would already have raised on a withdrawn bill with no documents.
    if version is None or not version.xpath('a[1]/@href'):
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return

    source_url = version.xpath('a[1]/@href')[0]
    version_title = version.xpath('a[1]/text()')[0].strip()

    if source_url.endswith('.doc'):
        mimetype = 'application/msword'
    elif source_url.endswith('.pdf'):
        mimetype = 'application/pdf'
    else:
        # BUG FIX: mimetype was previously left unbound for any other
        # extension, raising NameError; use a generic fallback instead.
        mimetype = 'application/octet-stream'

    title = self.parse_bill_field(page, 'Title').text_content()

    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath(
            "//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)

    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape an Oklahoma bill page: sponsors, actions, versions, votes.

    Skips (yields nothing) on HTTP errors, blank titles, and the site's
    placeholder "Short Title Not Found." pages.
    """
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    if not title:
        self.warning('blank bill on %s - skipping', url)
        return

    # classify from the letters in the bill id
    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    # sponsors: links whose id contains 'Auth'; 'otherAuth' marks cosponsors
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()

        # a ':' in the name signals unexpected content -- fail loudly
        if ':' in name:
            raise Exception(name)
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsorship(name, classification='cosponsor',
                                 entity_type='person', primary=False)
        else:
            bill.add_sponsorship(name, classification='primary',
                                 entity_type='person', primary=True)

    # actions table; first two rows are headers
    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        # NOTE(review): values other than 'H'/'S' pass through unchanged --
        # confirm that is intended
        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        attrs = self.categorizer.categorize(action)
        related_entities = []
        for item in attrs['committees']:
            related_entities.append({
                'type': 'committee',
                'name': item
            })
        for item in attrs['legislators']:
            related_entities.append({
                'type': 'legislator',
                'name': item
            })
        bill.add_action(description=action,
                        date=date.strftime('%Y-%m-%d'),
                        chamber=actor,
                        classification=attrs['classification'],
                        related_entities=related_entities)

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    version_urls = []
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib['href']
        if version_url in version_urls:
            self.warning('Skipping duplicate version URL.')
            continue
        else:
            version_urls.append(version_url)
        name = link.text.strip()

        # committee reports are documents, not bill text versions
        if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url,
                     re.IGNORECASE):
            bill.add_document_link(note=name, url=version_url,
                                   media_type='application/pdf')
            continue

        bill.add_version_link(note=name, url=version_url,
                              media_type='application/pdf')

    # vote pages; HT_ links are skipped
    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if 'HT_' not in link.attrib['href']:
            yield from self.scrape_votes(
                bill, self.urlescape(link.attrib['href']))

    # # If the bill has no actions and no versions, it's a bogus bill on
    # # their website, which appears to happen occasionally. Skip.
    has_no_title = (bill.title == "Short Title Not Found.")
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a Michigan bill: sponsors, subjects, actions, roll-call
    votes, versions and analysis documents.

    Tries the first year of the session biennium in the URL, then the
    second; skips the bill if neither page exists.
    """
    # try and get bill for the first year of the session biennium
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))
        html = self.get(url).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute('http://legislature.mi.gov')

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            # the trailing text after the link marks the primary sponsor
            classification = (
                'primary'
                if sponsor.tail and 'primary' in sponsor.tail
                else 'cosponsor'
            )
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor, classification=classification)

        # check if action mentions a sub
        submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE)
        if submatch and tds[2].xpath('a'):
            version_url = tds[2].xpath('a/@href')[0]
            version_name = tds[2].xpath('a/text()')[0].strip()
            version_name = 'Substitute {}'.format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # NOTE(review): mimetype stays unbound for other extensions --
            # confirm all substitute links end in .pdf/.htm
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version_url.lower().endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(version_name, version_url, media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                results = self.parse_roll_call(vote_url, rc_num)

                if results is not None:
                    vote_passed = len(results['yes']) > len(results['no'])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result='pass' if vote_passed else 'fail',
                        classification='passage',
                    )

                    # check the expected counts vs actual
                    count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['yes']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['yes'])))
                    count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['no']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['no'])))

                    vote.set_count('yes', len(results['yes']))
                    vote.set_count('no', len(results['no']))
                    vote.set_count('other', len(results['other']))
                    for name in results['yes']:
                        vote.yes(name)
                    for name in results['no']:
                        vote.no(name)
                    for name in results['other']:
                        vote.vote('other', name)
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # NOTE(review): mimetype stays unbound for other extensions --
            # confirm all version links end in .pdf/.htm
            if url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif url.endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(name, url, media_type=mimetype)

    # documents (House and Senate fiscal agency analyses)
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a Utah bill page: title, type, sponsors, text versions,
    related documents, subjects and status actions.

    Yields any votes produced by parse_status, then the Bill itself.
    """
    page = self.lxmlize(url)

    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    if '.B. ' in bill_id:
        bill_type = 'bill'
    elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
        bill_type = 'resolution'
    elif '.C.R. ' in bill_id:
        bill_type = 'concurrent resolution'
    elif '.J.R. ' in bill_id:
        bill_type = 'joint resolution'
    else:
        # BUG FIX: previously bill_type was left unbound here, producing
        # a NameError at Bill(); fail with a meaningful message instead.
        raise ValueError('unknown bill type: ' + bill_id)

    # strip substitute markers out of the bill id
    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        (title, name) = [x.strip() for x in info.xpath('.//text()')
                         if x.strip()]
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification='primary',
                             entity_type='person', primary=True)

    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(floor_sponsor, classification='cosponsor',
                             entity_type='person', primary=False)
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]'
    )
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get('href')
        if not url:
            url = version.xpath('following-sibling::a[1]/@href')[0]
        bill.add_version_link(
            version.xpath('text()')[0].strip(),
            url,
            media_type='application/pdf'
        )

    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath('@href')[0]
        if '.fn.pdf' in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type='application/pdf')
        else:
            text = related.xpath('text()')[0]
            bill.add_document_link(text, href, media_type='application/pdf')

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    status_table = page.xpath('//div[@id="billStatus"]//table')[0]
    yield from self.parse_status(bill, status_table, chamber)

    yield bill
def _parse_senate_billpage(self, bill_url, year):
    """Parse a Missouri Senate bill page.

    Builds the Bill from the page's labeled spans, then follows the
    cosponsor, actions, and full-text links to fill in the rest, plus
    any adopted amendments. Yields the Bill.
    """
    bill_page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath(
        '//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath(
        '//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # classify from the id's leading three characters (e.g. 'SJR')
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    # placeholder rows on the site use this junk id
    if bid == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)

    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self._parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get('href'):
        self._parse_senate_bill_versions(
            bill, versions_url[0].attrib['href'])

    # only adopted amendments are attached as versions
    amendment_links = bill_page.xpath(
        '//a[contains(@href,"ShowAmendment.asp")]')
    for link in amendment_links:
        link_text = link.xpath('string(.)').strip()
        if 'adopted' in link_text.lower():
            link_url = link.xpath('@href')[0]
            bill.add_version_link(link_text, link_url,
                                  media_type='application/pdf',
                                  on_duplicate='ignore')

    yield bill
def scrape_bills(self):
    """
    Does the following

    1) Scrapes bill data from unitedstates project and saves the data to
       the path specified in the UnitedStates module
    2) Iterates over bill data and converts each one to an OCD-compliant
       bill model.
    3) Yields the OCD-compliant bill model instance

    @return: yield Bill instance
    """
    # run scraper first to pull in all the bill data
    self.run_unitedstates_bill_scraper()

    # iterate over all the files and build and yield Bill objects
    for filename in find_files(settings.SCRAPED_DATA_DIR,
                               r'.*[a-z]*\/[a-z]*[0-9]*\/data\.json'):
        try:
            with open(filename) as json_file:
                json_data = json.load(json_file)

                # Initialize Object
                bill = Bill(
                    self.TYPE_MAP[json_data['bill_type']]['canonical'] +
                    ' ' + json_data['number'],
                    json_data['congress'],
                    json_data['official_title'],
                    chamber=self.TYPE_MAP[json_data['bill_type']]['chamber']
                )

                # Basics
                bill.type = [json_data['bill_type']]
                bill.subject = json_data['subjects']
                bill.add_summary(json_data['summary']['as'],
                                 json_data['summary']['text'],
                                 json_data['summary']['date'])

                # Common Fields
                bill.sources = [{'url': json_data['url'], 'note': 'all'}]

                # Other/Related Bills
                bill.other_titles = [{'note': t['type'], 'title': t['title']}
                                     for t in json_data['titles']]
                # change value of relationship_type to 'type' field from
                # json_data when permitted by schema
                bill.related_bills = [{'session': b['session'],
                                       'name': b['name'],
                                       'relationship_type': 'companion'}
                                      for b in json_data['related_bills']]

                # add primary sponsor
                bill.add_sponsorship_by_identifier(
                    json_data['sponsor']['name'], 'person', 'person', True,
                    scheme='thomas_id',
                    identifier=json_data['sponsor']['thomas_id'],
                    chamber=self.TYPE_MAP[json_data['bill_type']]['chamber'])

                # add cosponsors
                for cs in json_data['cosponsors']:
                    bill.add_sponsorship_by_identifier(
                        cs['name'], 'person', 'person', False,
                        scheme='thomas_id', identifier=cs['thomas_id'],
                        chamber=self.TYPE_MAP[json_data['bill_type']]['chamber'])

                # add introduced_at and actions
                bill.actions.append(
                    {'date': json_data['introduced_at'],
                     'type': 'introduced',
                     'description': 'date of introduction',
                     'actor': self.TYPE_MAP[json_data['bill_type']]['chamber'],
                     'related_entities': []})
                for action in json_data['actions']:
                    bill.actions.append(
                        {'date': action['acted_at'],
                         'type': [action['type']],
                         'description': action['text'],
                         'actor': self.TYPE_MAP[json_data['bill_type']]['chamber'],
                         'related_entities': []})

                # add bill versions
                for version_path in find_files(
                        os.path.join(settings.SCRAPED_DATA_DIR, 'data',
                                     bill.session, 'bills',
                                     json_data['bill_type'],
                                     json_data['bill_type'] + json_data['number'],
                                     'text-versions'),
                        r'*\.json'):
                    try:
                        with open(version_path) as version_file:
                            version_json_data = json.load(version_file)
                            # BUG FIX: dict.iteritems() does not exist on
                            # Python 3 (this file already uses Python 3
                            # syntax such as `yield from`); use items().
                            for k, v in version_json_data['urls'].items():
                                bill.versions.append(
                                    {'date': version_json_data['issued_on'],
                                     'type': version_json_data['version_code'],
                                     'name': self.VERSION_MAP[
                                         version_json_data['version_code']],
                                     'links': [{'mimetype': k, 'url': v}]})
                    except IOError:
                        print("Unable to open or parse file with path " +
                              version_path)
                        continue
                yield bill
        except IOError:
            print("Unable to open or parse file with path " + filename)
            continue
def _parse_house_bill(self, url, session):
    """Parse a Missouri House bill summary page into a Bill.

    Uses the print-view page, extracts id/description/sponsor from the
    summary table (whose row offsets shift when cosponsor or governor
    action rows are present), then follows cosponsor and action links
    and collects documents, versions, summaries and amendments.
    """
    # using the print page makes the page simpler, and also *drastically* smaller
    # (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in for the actual bill page
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')

    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)

    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(bill_id) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)

    bill_desc = bill_page.xpath(
        '//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)

    table_rows = bill_page.xpath('//table/tr')
    # if there is a cosponsor all the rows are pushed down one for the extra row
    # for the cosponsor:
    cosponsorOffset = 0
    if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
        cosponsorOffset = 1

    lr_label_tag = table_rows[3 + cosponsorOffset]
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'
    # bill_lr = lr_label_tag[1].text_content()

    lastActionOffset = 0
    if table_rows[4 + cosponsorOffset][0].text_content().strip(
    ) == 'Governor Action:':
        lastActionOffset = 1

    official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.

    # classify from the id's leading three characters (e.g. 'HJR')
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            return

    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # check for cosponsors
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # get bill versions
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (self._house_base_url, doc_tag[0].attrib['href'])
        bill.add_document_link(doc, text_url, media_type='text/html')

    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == 'PDF':
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill versions
    # everything between the row containing "Bill Text"" and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill summaries
    # everything between the row containing "Bill Summary"" and the next div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    # if there are no amedments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = 'Amendment {}'.format(version)

        # annotate the amendment name with its outcome icon(s)
        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = '{} (Defeated)'.format(summary_name)

        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = '{} (Adopted)'.format(summary_name)

        distributed_icon = row.xpath(
            './/img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = '{} (Distributed)'.format(summary_name)

        if '.pdf' in path:
            mimetype = 'application/pdf'
        else:
            mimetype = 'text/html'
        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a Michigan bill: sponsors, subjects, actions, roll-call
    votes, versions and analysis documents.

    Tries the first year of the session biennium in the URL, then the
    second; skips the bill if neither page exists. Yields VoteEvents as
    they are found, then the Bill.
    """
    # try and get bill for the first year of the session biennium
    url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
        session[:4],
        bill_id.replace(" ", "-"),
    )
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ("Page Not Found" in html
            or "The bill you are looking for is not available yet" in html):
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[-4:],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        if ("Page Not Found" in html
                or "The bill you are looking for is not available yet" in html):
            self.warning(
                "Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute("http://legislature.mi.gov")

    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(" ")[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u"\xa0", " ")
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            # the trailing text after the link marks the primary sponsor
            classification = ("primary"
                              if sponsor.tail and "primary" in sponsor.tail
                              else "cosponsor")
        else:
            classification = "primary"
        bill.add_sponsorship(
            name=name.strip(),
            chamber=chamber,
            entity_type="person",
            primary=classification == "primary",
            classification=classification,
        )

    bill.subject = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath("td")  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(
            datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = "upper" if "SJ" in journal else "lower"
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)

        # check if action mentions a sub
        submatch = re.search(r"WITH SUBSTITUTE\s+([\w\-\d]+)", action,
                             re.IGNORECASE)
        if submatch and tds[2].xpath("a"):
            version_url = tds[2].xpath("a/@href")[0]
            version_name = tds[2].xpath("a/text()")[0].strip()
            version_name = "Substitute {}".format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # NOTE(review): mimetype stays unbound for other extensions --
            # confirm all substitute links end in .pdf/.htm
            if version_url.lower().endswith(".pdf"):
                mimetype = "application/pdf"
            elif version_url.lower().endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if journal_link:
                objectname = journal_link[0].rsplit("=", 1)[-1]
                chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                    session,
                    chamber_name,
                    objectname,
                )
                results = self.parse_roll_call(vote_url, rc_num, session)

                if results is not None:
                    vote_passed = len(results["yes"]) > len(results["no"])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result="pass" if vote_passed else "fail",
                        classification="passage",
                    )

                    # check the expected counts vs actual
                    count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["yes"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d" %
                            (bill_id, action, count, len(results["yes"])))
                    count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["no"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d" %
                            (bill_id, action, count, len(results["no"])))

                    vote.set_count("yes", len(results["yes"]))
                    vote.set_count("no", len(results["no"]))
                    vote.set_count("other", len(results["other"]))

                    possible_vote_results = ["yes", "no", "other"]
                    for pvr in possible_vote_results:
                        for name in results[pvr]:
                            if session == "2017-2018":
                                names = name.split("\t")
                                for n in names:
                                    # BUG FIX: record each split voter
                                    # name `n`; the original recorded the
                                    # unsplit `name` once per fragment.
                                    vote.vote(pvr, n.strip())
                            else:
                                # Prevents voter names like "House Bill
                                # No. 4451, entitled" and other sentences
                                if len(name.split()) < 5:
                                    vote.vote(pvr, name.strip())
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # NOTE(review): mimetype stays unbound for other extensions --
            # confirm all version links end in .pdf/.htm
            if url.endswith(".pdf"):
                mimetype = "application/pdf"
            elif url.endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(name, url, media_type=mimetype)

    # documents (House and Senate fiscal agency analyses)
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def _parse_house_bill(self, url, session):
    """Scrape one Missouri House bill summary page and yield a Bill.

    Pulls the bill id, description, official title, sponsor, cosponsors,
    actions, and several flavors of documents (versions, summaries,
    amendments) from the house.mo.gov print-style bill page.

    :param url: relative bill-summary URL fragment (joined onto
        ``self._house_base_url`` below).
    :param session: unused here; the Bill is created with
        ``self._session_id`` instead.  NOTE(review): confirm whether the
        parameter is intentionally ignored.
    Yields action events from ``_parse_house_actions`` and finally the Bill.
    """
    # using the print page makes the page simpler, and also *drastically* smaller
    # (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)
    # the URL is an iframed version now, so swap in for the actual bill page
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')
    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new
    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)
    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    # A missing entry-title div means the summary page is empty; record the
    # bad URL and bail out without yielding anything.
    if len(bill_id) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)
    bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)
    table_rows = bill_page.xpath('//table/tr')
    # if there is a cosponsor all the rows are pushed down one for the extra row
    # for the cosponsor:
    cosponsorOffset = 0
    if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
        cosponsorOffset = 1
    # Sanity-check the expected row layout; an AssertionError here means the
    # site markup changed and the offsets below are no longer valid.
    lr_label_tag = table_rows[3 + cosponsorOffset]
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'
    # bill_lr = lr_label_tag[1].text_content()
    lastActionOffset = 0
    if table_rows[4 + cosponsorOffset][0].text_content().strip() == 'Governor Action:':
        lastActionOffset = 1
    official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()
    # could substitute the description for the name,
    # but keeping it separate for now.
    bill_type = "bill"
    triplet = bill_id[:3]
    # The first three characters (e.g. "HB ", "HJR") select the bill type
    # from the module-level bill_types mapping when present.
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])
    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)
    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title
            ))
            return
    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)
    # First data row, second cell holds the primary sponsor's name.
    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )
    # check for cosponsors
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)
    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)
    # get bill versions
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (
            self._house_base_url,
            doc_tag[0].attrib['href']
        )
        bill.add_document_link(doc, text_url, media_type='text/html')
    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            # Anchors literally labeled "PDF" point at PDFs; everything
            # else is treated as HTML.
            if vurl.text == 'PDF':
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype,
                                  on_duplicate='ignore')
    # house bill versions
    # everything between the row containing "Bill Text"" and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')
    # house bill summaries
    # everything between the row containing "Bill Summary"" and the next div.DocHeaderRow
    # NOTE(review): this count() uses following-sibling where the versions
    # block above uses preceding-sibling — confirm against live markup.
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    # if there are no amedments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')
    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]')
    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = 'Amendment {}'.format(version)
        # Status icons on the row annotate the amendment's disposition.
        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = '{} (Defeated)'.format(summary_name)
        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = '{} (Adopted)'.format(summary_name)
        distributed_icon = row.xpath('.//img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = '{} (Distributed)'.format(summary_name)
        if '.pdf' in path:
            mimetype = 'application/pdf'
        else:
            mimetype = 'text/html'
        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')
    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a single Oklahoma bill page.

    Builds a Bill with its title, sponsors, actions, versions/documents,
    and amendments, yielding any roll-call votes found along the way and
    finally the Bill itself.  Skips pages that 404, have a blank title,
    or carry the state's "Short Title Not Found." placeholder.
    """
    try:
        doc = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as exc:
        self.warning("error (%s) fetching %s, skipping" % (exc, url))
        return

    title = doc.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])"
    ).strip()
    if not title:
        self.warning("blank bill on %s - skipping", url)
        return

    # Classify from the id; check order matters since "JR"/"CR" also
    # contain the bare "R" of plain resolutions.
    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    # Author links: ids containing "otherAuth" are cosponsors, the rest
    # are primary authors.  A colon in a name signals a parsing problem.
    for anchor in doc.xpath("//a[contains(@id, 'Auth')]"):
        sponsor = anchor.xpath("string()").strip()
        if ":" in sponsor:
            raise Exception(sponsor)
        is_cosponsor = "otherAuth" in anchor.attrib["id"]
        bill.add_sponsorship(
            sponsor,
            classification="cosponsor" if is_cosponsor else "primary",
            entity_type="person",
            primary=not is_cosponsor,
        )

    # Action history table: first two rows are headers.
    act_table = doc.xpath("//table[contains(@id, 'Actions')]")[0]
    for row in act_table.xpath("tr")[2:]:
        action = row.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue
        when = datetime.datetime.strptime(
            row.xpath("string(td[3])").strip(), "%m/%d/%Y"
        ).date()
        actor = row.xpath("string(td[4])").strip()
        actor = {"H": "lower", "S": "upper"}.get(actor, actor)
        attrs = self.categorizer.categorize(action)
        related_entities = [
            {"type": "committee", "name": committee}
            for committee in attrs["committees"]
        ] + [
            {"type": "legislator", "name": legislator}
            for legislator in attrs["legislators"]
        ]
        bill.add_action(
            description=action,
            date=when.strftime("%Y-%m-%d"),
            chamber=actor,
            classification=attrs["classification"],
            related_entities=related_entities,
        )

    version_table = doc.xpath("//table[contains(@id, 'Versions')]")[0]
    seen_version_urls = []  # dedupe links that appear more than once
    for anchor in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        href = anchor.attrib["href"]
        if href in seen_version_urls:
            self.warning("Skipping duplicate version URL.")
            continue
        seen_version_urls.append(href)
        note = anchor.text.strip()
        # Committee reports / scheduled CCRs are documents, not versions.
        if re.search("COMMITTEE REPORTS|SCHEDULED CCR", href, re.IGNORECASE):
            bill.add_document_link(
                note=note, url=href, media_type="application/pdf"
            )
        else:
            bill.add_version_link(
                note=note, url=href, media_type="application/pdf"
            )

    self.scrape_amendments(bill, doc)

    # Vote pages; "HT_" links are house-tally pages handled elsewhere.
    for anchor in doc.xpath(".//a[contains(@href, '_VOTES')]"):
        if "HT_" not in anchor.attrib["href"]:
            yield from self.scrape_votes(
                bill, self.urlescape(anchor.attrib["href"])
            )

    # The placeholder title marks a bogus/empty page on the state site;
    # such bills are not saved.
    if bill.title == "Short Title Not Found.":
        return
    yield bill
def scrape_assem_bills(self, chamber, insert, session, year):
    """Scrape Nevada Assembly bills for one session.

    Fixes over the previous revision: removes the dead ``count`` counter
    (incremented per link but never read).

    :param chamber: chamber passed through to each Bill (caller supplies
        'lower' for the Assembly).
    :param insert: URL path segment identifying the session directory on
        leg.state.nv.us.
    :param session: legislative_session value for each Bill.
    :param year: passed through to ``scrape_votes``.
    Yields votes (via ``scrape_votes``) and each scraped Bill.
    """
    # DoctypeID -> classification for Assembly document listings.
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution', 9: 'petition'}
    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
            'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        for link in links:
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            page = self.get(page_path).text
            # Normalize non-breaking spaces before parsing.
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            root.make_links_absolute("http://www.leg.state.nv.us/")
            bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')
            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)
            bill.subject = list(set(self.subject_mapping[bill_id]))

            # "Bill Text" section: each anchor is one version (PDF).
            billtext = root.xpath(
                "//b[text()='Bill Text']")[0].getparent().getnext()
            text_urls = billtext.xpath("./a")
            for text_url in text_urls:
                version_name = text_url.text.strip()
                version_url = text_url.attrib['href']
                bill.add_version_link(note=version_name, url=version_url,
                                      media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(classification='primary',
                                     name=leg, entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(classification='cosponsor',
                                     name=leg, entity_type='person',
                                     primary=False)

            # Minutes links live in table 4; row index tracks each anchor
            # (data rows start at tr[2]).
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                # Assumes the date cell yields at least three tokens
                # (e.g. month, day, year) — TODO confirm against live pages.
                minutes_date = (minutes_date[0] + minutes_date[1] +
                                minutes_date[2] + " Minutes")
                bill.add_document_link(note=minutes_date, url=minutes_url)
                minutes_count += 1

            self.scrape_actions(root, bill, "lower")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill
def scrape_bill_list(self, url):
    """Scrape an Alabama (ALISON) bill-list page and every bill on it.

    Fix over the previous revision: ``title`` is now reset to '' on each
    iteration.  Previously it was only assigned inside the short-title
    xpath branch, so a bill whose short-title span was missing either
    raised NameError (first iteration) or silently inherited the
    *previous* bill's title.

    Yields votes (via ``scrape_vote``) and each scraped Bill.
    """
    bill_list = self._get_bill_list(url)
    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        # Classify by id; 'B' is checked first since no resolution id
        # contains it.
        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)
        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning(
                "Bill {} has no webpage, and will be skipped".format(
                    bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        # BUGFIX: reset per bill so a missing short-title span can never
        # leak the previous bill's title (or raise NameError).
        title = ''
        if (bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
        if not title:
            title = "[No title given by state]"
        bill.title = title

        # Versions are synthesized from a fixed URL pattern per status.
        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))
            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
                continue
            bir_date = datetime.datetime.strptime(
                bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
            bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
            bir_chamber = self.CHAMBERS[bir_type[0]]
            bir_text = "{0}: {1}".format(
                bir_type, bir.xpath('td[3]/font/text()')[0].strip())
            bill.add_action(
                bir_text,
                TIMEZONE.localize(bir_date),
                chamber=bir_chamber,
                classification='other',
            )
            try:
                (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
            except ValueError:
                bir_vote_id = ''
            bir_vote_id = bir_vote_id.strip()
            if bir_vote_id.startswith("Roll "):
                bir_vote_id = bir_vote_id.split(" ")[-1]
                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=bir_type[0],
                    bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                    vote_id=bir_vote_id,
                    vote_date=TIMEZONE.localize(bir_date),
                    action_text=bir_text)

        actions = bill_doc.xpath(
            '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
        action_date = None
        for action in actions:
            # If actions occur on the same day, only one date will exist
            if (action.xpath('td[1]/font/text()')[0].encode(
                    'ascii', 'ignore').strip()):
                action_date = datetime.datetime.strptime(
                    action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)
            (action_chamber, ) = action.xpath('td[2]/font/text()')
            if action.xpath('td[3]/font/u/text()'):
                (amendment, ) = action.xpath('td[3]/font/u/text()')
            else:
                amendment = None
            (action_text, ) = action.xpath('td[4]/font/text()')
            action_type = _categorize_action(action_text)
            # check for occasional extra last row
            if not action_chamber.strip():
                continue
            # The committee cell is just an abbreviation, so get its name
            actor = self.CHAMBERS[action_chamber]
            try:
                action_committee = re.search(
                    r'.*? referred to the .*? committee on (.*?)$',
                    action_text).group(1).strip()
            except AttributeError:
                action_committee = ''
            act = bill.add_action(
                action_text,
                TIMEZONE.localize(action_date),
                chamber=actor,
                classification=action_type,
            )
            if action_committee:
                act.add_related_entity(action_committee,
                                       entity_type='organization')
            try:
                vote_button = action.xpath('td[9]//text()')[0].strip()
            except IndexError:
                vote_button = ''
            if vote_button.startswith("Roll "):
                vote_id = vote_button.split(" ")[-1]
                yield from self.scrape_vote(
                    bill=bill,
                    vote_chamber=action_chamber,
                    bill_id=bill_id,
                    vote_id=vote_id,
                    vote_date=TIMEZONE.localize(action_date),
                    action_text=action_text)
            if amendment:
                amend_url = (
                    'http://alisondb.legislature.state.al.us/ALISON/'
                    'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                        self.session, amendment))
                amend_name = 'Amd/Sub {}'.format(amendment)
                bill.add_version_link(
                    amend_name,
                    amend_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )
        yield bill
def scrape_senate_bills(self, chamber, insert, session, year):
    """Scrape Nevada Senate bills for one session.

    Fix over the previous revision: removes the dead ``count`` counter
    (incremented per link but never read).

    :param chamber: chamber passed through to each Bill (caller supplies
        'upper' for the Senate).
    :param insert: URL path segment identifying the session directory on
        leg.state.nv.us.
    :param session: legislative_session value for each Bill.
    :param year: passed through to ``scrape_votes``.
    Yields votes (via ``scrape_votes``) and each scraped Bill.
    """
    # DoctypeID -> classification for Senate document listings.
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
            'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        for link in links:
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            page = self.get(page_path).text
            # Normalize non-breaking spaces before parsing.
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')
            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type
                        )
            bill.subject = list(set(self.subject_mapping[bill_id]))

            # The content table labeled "Bill Text" carries the version link.
            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version_link(note="Bill Text", url=text_url,
                                          media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(name=leg, classification='primary',
                                     entity_type='person', primary=True)
            for leg in secondary:
                bill.add_sponsorship(name=leg, classification='cosponsor',
                                     entity_type='person', primary=False)

            # Agenda/minutes links live in table 4; row index tracks each
            # anchor (data rows start at tr[2]).
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                # Assumes the date cell yields at least three tokens
                # (e.g. month, day, year) — TODO confirm against live pages.
                minutes_date = (minutes_date[0] + minutes_date[1] +
                                minutes_date[2] + " Agenda")
                # bill.add_document(minutes_date, minutes_url)
                bill.add_document_link(note=minutes_date, url=minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "upper")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill