def scrape_assem_bills(self, chamber, insert, session, year): doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution', 6: 'joint resolution'} for docnum, bill_type in doc_type.iteritems(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) with self.urlopen(page_path) as page: page = page.decode("utf8").replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)') title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)') if insert.find('Special') != -1: session = insert bill = Bill(session, chamber, bill_id, title, type=bill_type) bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)") text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version("Bill Text", text_url) primary, secondary = self.scrape_sponsors(page) if primary[0] == 'By:': primary.pop(0) if primary[0] == 'ElectionsProceduresEthicsand': primary[0] = 'Elections Procedures Ethics and' full_name = '' for part_name in primary: full_name = full_name + part_name + " " bill.add_sponsor('primary', full_name) else: for leg in primary: bill.add_sponsor('primary', leg) for leg in secondary: bill.add_sponsor('cosponsor', leg) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "lower") self.scrape_votes(page, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath("//br")[8].tail
        if not title:
            return
        title = title.strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill', 'Senate Bill',
                             'Engrossed Bill', 'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        bill['subjects'] = self.subjects[bill_id]

        self.save_bill(bill)
def scrape(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billindex/"
           "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
    page = lxml.html.fromstring(self.urlopen(url))

    for tr in page.xpath("//tr[@valign='middle']")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        self.scrape_digest(bill)

        # versions
        for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                  tr.xpath('td[10]//a')):
            bill.add_version(a.text, a.get('href'))

        # documents
        fnote = tr.xpath('td[7]//a')
        if fnote:
            bill.add_document('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[12]//a')
        if summary:
            bill.add_document('Summary', summary[0].get('href'))

        bill.add_source(url)
        self.save_bill(bill)
def process_bill(self, data):
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    if chamber == 'legislature':
        chamber = 'upper'

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        legislators = []
        committees = []
        for rel in action['related_entities']:
            if rel['entity_type'] == 'organization':
                committees.append(rel['name'])
            elif rel['entity_type'] == 'person':
                legislators.append(rel['name'])
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']),
                              **doc.get('extras', {}))

    for title in data['other_titles']:
        bill.add_title(title['title'])

    for related in data['related_bills']:
        bill.add_companion(related['identifier'],
                           related['legislative_session'],
                           chamber)

    self.save_bill(bill)
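# Note: the process_bill() variants in this section rely on two helpers that
# are not shown here: parse_psuedo_id() and parse_date(). The sketch below is
# only an illustration of their assumed shape -- pseudo-ids as a '~' followed
# by a JSON object, dates as ISO-8601 strings -- and is not the project's
# actual implementation.
import json
import datetime


def parse_psuedo_id(psuedo_id):
    # Assumed format: '~{"classification": "lower"}' -> {'classification': 'lower'}
    return json.loads(psuedo_id.lstrip('~'))


def parse_date(value):
    # Assumed format: 'YYYY-MM-DD...'; empty values pass through as None.
    if not value:
        return None
    return datetime.datetime.strptime(value[:10], '%Y-%m-%d')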
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        other_chamber = 'lower'
        bill_id = 'SB 1'
    else:
        other_chamber = 'upper'
        bill_id = 'HB 1'

    b1 = Bill(session, chamber, bill_id, 'A super bill')
    b1.add_source('http://example.com/')
    b1.add_version('As Introduced', 'http://example.com/SB1.html')
    b1.add_document('Google', 'http://google.com')
    b1.add_sponsor('primary', 'Bob Smith')
    b1.add_sponsor('secondary', 'Johnson, Sally')

    d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
    v1.yes('Smith')
    v1.yes('Johnson')

    d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
    v2.no('Bob Smith')
    v2.other('S. Johnson')

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, 'introduced', d1)
    b1.add_action(chamber, 'read first time', d2)
    b1.add_action(other_chamber, 'introduced', d2)

    self.save_bill(b1)
def scrape_assem_bills(self, chamber, insert, session, year): doc_type = { 1: 'bill', 3: 'resolution', 5: 'concurrent resolution', 6: 'joint resolution' } for docnum, bill_type in doc_type.iteritems(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % ( insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % ( insert, link) page = self.urlopen(page_path) page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath( 'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)' ) title = root.xpath( 'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)') bill = Bill(session, chamber, bill_id, title, type=bill_type) bill['subjects'] = self.subject_mapping[bill_id] bill_text = root.xpath( "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)" ) text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version("Bill Text", text_url, mimetype='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsor('primary', leg) for leg in secondary: bill.add_sponsor('cosponsor', leg) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[ 1] + minutes_date[2] + " Minutes" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "lower") self.scrape_votes(page, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, short_title, url): if bill_id in ['SCR 0003', 'SB 0251', 'SB 0292']: return with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) # check for Bill Withdrawn header h1text = page.xpath('//h1/text()') if h1text and h1text[0] == 'Bill Withdrawn': return title = page.xpath("//br")[8].tail if not title: title = short_title title = title.strip() abbrev = bill_id.split()[0] if abbrev.endswith('B'): bill_type = ['bill'] elif abbrev.endswith('JR'): bill_type = ['joint resolution'] elif abbrev.endswith('CR'): bill_type = ['concurrent resolution'] elif abbrev.endswith('R'): bill_type = ['resolution'] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) action_link = page.xpath("//a[contains(@href, 'getActions')]")[0] self.scrape_actions(bill, action_link.attrib['href']) version_path = "//a[contains(., '%s')]" for version_type in ('Introduced Bill', 'House Bill', 'Senate Bill', 'Engrossed Bill', 'Enrolled Act'): path = version_path % version_type links = page.xpath(path) if links: bill.add_version(version_type, links[0].attrib['href']) for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"): self.scrape_senate_vote(bill, vote_link.attrib['href']) for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"): self.scrape_house_vote(bill, vote_link.attrib['href']) for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"): num = doc_link.text.strip().split("(")[0] bill.add_document("Fiscal Impact Statement #%s" % num, doc_link.attrib['href']) bill['subjects'] = self.subjects[bill_id] self.save_bill(bill)
def scrape_bill(self, term, bill_url):
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)

        chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text

        if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0:
            chamber2 = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')[0].text

            if '*' in chamber1:
                bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
                secondary_bill_id = chamber2.replace(' ', '')
            else:
                bill_id = chamber2.replace(' ', '')[1:len(chamber2)]
                secondary_bill_id = chamber1.replace(' ', '')

            primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        else:
            primary_chamber = 'lower' if 'H' in chamber1 else 'upper'
            bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
            secondary_bill_id = None

        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(term, primary_chamber, bill_id, title,
                    secondary_bill_id=secondary_bill_id)
        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        bill.add_sponsor('primary', sponsor)

        # Co-sponsors unavailable for scraping (loaded into page via AJAX)

        # Full summary doc
        summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary.get('href'))

        # Actions
        tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
        actions_table = tables[0]
        action_rows = actions_table.xpath("tr[position()>1]")
        for ar in action_rows:
            action_taken = ar.xpath("td")[0].text
            action_date = datetime.datetime.strptime(
                ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
            # NEED TO ADD SECONDARY ACTIONS
            bill.add_action(primary_chamber, action_taken, action_date)

        votes_link = page.xpath("//span[@id='lblBillVotes']/a")
        if len(votes_link) > 0:
            votes_link = votes_link[0].get('href')
            bill = self.scrape_votes(
                bill, sponsor,
                'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

        self.save_bill(bill)
def scrape_bill_page(self, chamber, session, bill_url, bill_type):
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(page,
                                "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

    def sbp(x):
        return self.scrape_bare_page(
            page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib["href"])

    authors = [x.text for x in sbp("Authors")]

    try:
        digests = sbp("Digests")
    except IndexError:
        digests = []

    try:
        versions = sbp("Text")
    except IndexError:
        versions = []

    title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
    actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']/"
                         "/table[@style='font-size:small']/tr")
    bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

    bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(bill_url)

    authors.remove(author)
    bill.add_sponsor("primary", author)
    for author in authors:
        bill.add_sponsor("cosponsor", author)

    for digest in digests:
        bill.add_document(digest.text, digest.attrib["href"],
                          mimetype="application/pdf")

    for version in versions:
        bill.add_version(version.text, version.attrib["href"],
                         mimetype="application/pdf")

    flags = {
        "prefiled": ["bill:filed"],
        "referred to the committee": ["committee:referred"],
    }

    for action in actions:
        # Each row: date, chamber abbreviation, a page reference we don't
        # use, and the action text. Avoid reusing the outer `chamber` and
        # `page` names here.
        date, actor, _, text = [x.text for x in action.xpath(".//td")]
        # Session is April --> June. Prefiles look like they're in January
        # at earliest, so tack the session year onto the date.
        date += "/%s" % (session)
        date = dt.datetime.strptime(date, "%m/%d/%Y")
        actor = {"S": "upper", "H": "lower", "J": "joint"}[actor]

        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]
        if not cat:
            cat = ["other"]

        bill.add_action(actor, text, date, type=cat)

    self.save_bill(bill)
def scrape_bill(self, chamber, term, bill_id, url, title, subject=None): self.logger.info('GET ' + url) resp = self.get(url) html = resp.text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) bill = Bill(term, chamber, bill_id, title) bill.add_source(url) if subject is not None: bill['subjects'] = [subject] # Sponsors sponsor_map = { 'author': 'primary', 'co-author': 'cosponsor', 'sponsor': 'cosponsor', 'co-sponsor': 'cosponsor', } for div in doc.xpath('//div[contains(@class, "bill-author-info")]'): name = div.xpath('string(b)').strip() sp_type = sponsor_map[div.xpath('string(p)').strip().lower()] bill.add_sponsor(sp_type, name) # Actions for li in doc.xpath('//div[@id="bill-actions"]//li')[::-1]: if li.text_content() == 'None currently available.': continue chamber_str = li.xpath('string(strong)').strip() action_chamber = dict(H='lower', S='upper')[chamber_str] action_date = li.xpath('string(span[@class="document-date"])') action_date = datetime.datetime.strptime(action_date.strip(), '%m/%d/%Y') action_text = li.xpath('string(span[2])').strip() if not action_text.strip(): continue kwargs = dict(date=action_date, actor=action_chamber, action=action_text) kwargs.update(**self.categorizer.categorize(action_text)) bill.add_action(**kwargs) # Documents (including votes) for doc_type, doc_meta in BillDocuments(self, doc): if doc_type == 'version': bill.add_version(doc_meta.title or doc_meta.text, url=doc_meta.url, mimetype='application/pdf') elif doc_type == 'document': bill.add_document(doc_meta.title or doc_meta.text, url=doc_meta.url, mimetype='application/pdf') elif doc_type == 'rollcall': self.add_rollcall(chamber, bill, doc_meta) self.save_bill(bill)
def scrape_current(self, chamber, term):
    chamber_name = "Senate" if chamber == "upper" else "House"
    # perhaps we should save this data so we can make one request for both chambers?
    with self.urlopen(ksapi.url + "bill_status/") as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]

        for bill_data in bills:
            # filter out bills that never touched this chamber
            bill_is_in_chamber = False
            for history in bill_data["HISTORY"]:
                if history["chamber"] == chamber_name:
                    bill_is_in_chamber = True
            if not bill_is_in_chamber:
                continue

            # main
            bill = Bill(term, chamber, bill_data["BILLNO"],
                        bill_data["SHORTTITLE"])
            bill.add_source(ksapi.url + "bill_status/" +
                            bill_data["BILLNO"].lower())
            if bill_data["LONGTITLE"]:
                bill.add_title(bill_data["LONGTITLE"])

            bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
            bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

            for sponsor in bill_data["SPONSOR_NAMES"]:
                stype = ("primary" if len(bill_data["SPONSOR_NAMES"]) == 1
                         else "cosponsor")
                bill.add_sponsor(stype, sponsor)

            for event in bill_data["HISTORY"]:
                # the actor is the committee(s)/conferee(s) named on the
                # event, falling back to the chamber being scraped
                if "committee_names" in event and "conferee_names" in event:
                    actor = " and ".join(event["committee_names"] +
                                         event["conferee_names"])
                elif "committee_names" in event:
                    actor = " and ".join(event["committee_names"])
                elif "conferee_names" in event:
                    actor = " and ".join(event["conferee_names"])
                else:
                    actor = chamber

                date = datetime.datetime.strptime(event["occurred_datetime"],
                                                  "%Y-%m-%dT%H:%M:%S")
                bill.add_action(actor, event["status"], date)

                if event["action_code"] in ksapi.voted:
                    votes = votes_re.match(event["status"])
                    if votes:
                        vote = Vote(chamber, date, votes.group(1),
                                    event["action_code"] in ksapi.passed,
                                    int(votes.group(2)),
                                    int(votes.group(3)), 0)
                        vote.add_source(ksapi.ksleg + "bill_status/" +
                                        bill_data["BILLNO"].lower())
                        bill.add_vote(vote)

            self.save_bill(bill)
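# Note: scrape_current() above assumes a module-level votes_re whose groups
# are used as (1) motion text, (2) yea count, (3) nay count. The real pattern
# is defined elsewhere in the module; the stand-in below is hypothetical and
# only illustrates that shape.
import re

votes_re = re.compile(r'(.+?) - Yea: (\d+) Nay: (\d+)', re.IGNORECASE)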
def scrape_bill(self, chamber, session, bill_id, url): try: page = self.urlopen(url) except scrapelib.HTTPError: self.warning("couldn't open %s, skipping bill" % url) return page = lxml.html.fromstring(page) page.make_links_absolute(url) header = page.xpath('//h3/br')[0].tail.replace(' ', ' ') title, primary_sponsor = header.split(' -- ') if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'): bill_type = ['bill'] elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'): bill_type = ['resolution'] elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'): bill_type = ['concurrent resolution'] elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'): bill_type = ['joint resolution'] for flag in SUB_BLACKLIST: if flag in bill_id: bill_id = bill_id.replace(flag, " ") bill_id = re.sub("\s+", " ", bill_id).strip() bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_sponsor('primary', primary_sponsor) bill.add_source(url) for link in page.xpath( '//a[contains(@href, "bills/") and text() = "HTML"]'): name = link.getprevious().tail.strip() bill.add_version(name, link.attrib['href'], mimetype="text/html") next = link.getnext() if next.text == "PDF": bill.add_version(name, next.attrib['href'], mimetype="application/pdf") for link in page.xpath( "//a[contains(@href, 'fnotes') and text() = 'HTML']"): bill.add_document("Fiscal Note", link.attrib['href']) subjects = [] for link in page.xpath("//a[contains(@href, 'RelatedBill')]"): subjects.append(link.text.strip()) bill['subjects'] = subjects status_link = page.xpath('//a[contains(@href, "billsta")]')[0] self.parse_status(bill, status_link.attrib['href']) self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session, year): doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution', 6: 'joint resolution',9:'petition'} for docnum, bill_type in doc_type.iteritems(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.get(page_path).text page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) root.make_links_absolute("http://www.leg.state.nv.us/") bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)') title = self.get_node( root, '//div[@id="content"]/table/tr[preceding-sibling::tr/td/' 'b[contains(text(), "By:")]]/td/em/text()') bill = Bill(session, chamber, bill_id, title, type=bill_type) bill['subjects'] = list(set(self.subject_mapping[bill_id])) billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext() text_urls = billtext.xpath("./a") for text_url in text_urls: version_name = text_url.text.strip() version_url = text_url.attrib['href'] bill.add_version(version_name, version_url, mimetype='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsor('primary', leg) for leg in secondary: bill.add_sponsor('cosponsor', leg) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "lower") self.scrape_votes(page, page_path, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def scrape_bill(self, chamber, term, bill_id, url, title, subject=None): self.logger.info('GET ' + url) resp = self.get(url) html = resp.text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) bill = Bill(term, chamber, bill_id, title) bill.add_source(url) if subject is not None: bill['subjects'] = [subject] # Sponsors sponsor_map = { 'author': 'primary', 'co-author': 'cosponsor', 'sponsor': 'cosponsor', 'co-sponsor': 'cosponsor', } for div in doc.xpath('//div[contains(@class, "bill-author-info")]'): name = div.xpath('string(b)').strip() sp_type = sponsor_map[div.xpath('string(p)').strip().lower()] bill.add_sponsor(sp_type, name) # Actions for li in doc.xpath('//div[@id="bill-actions"]//li')[::-1]: if li.text_content() == 'None currently available.': continue chamber_str = li.xpath('string(strong)').strip() action_chamber = dict(H='lower', S='upper')[chamber_str] action_date = li.xpath('string(span[@class="document-date"])') # Some resolution actions have no dates. if not action_date.strip(): continue action_date = datetime.datetime.strptime(action_date.strip(), '%m/%d/%Y') action_text = li.xpath('string(span[2])').strip() if not action_text.strip(): continue kwargs = dict(date=action_date, actor=action_chamber, action=action_text) kwargs.update(**self.categorizer.categorize(action_text)) bill.add_action(**kwargs) # Documents (including votes) for doc_type, doc_meta in BillDocuments(self, doc): if doc_type == 'version': bill.add_version( doc_meta.title or doc_meta.text, url=doc_meta.url, mimetype='application/pdf') elif doc_type == 'document': bill.add_document(doc_meta.title or doc_meta.text, url=doc_meta.url, mimetype='application/pdf') elif doc_type == 'rollcall': self.add_rollcall(chamber, bill, doc_meta) self.save_bill(bill)
def scrape_senate_bills(self, chamber, insert, session, year): doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution', 8: 'joint resolution'} for docnum, bill_type in doc_type.iteritems(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % ( insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.urlopen(page_path) page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath( 'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)') title = root.xpath( 'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)') bill = Bill(session, chamber, bill_id, title, type=bill_type) bill['subjects'] = self.subject_mapping[bill_id] bill_text = root.xpath( "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)") text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version("Bill Text", text_url, mimetype='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsor('primary', leg) for leg in secondary: bill.add_sponsor('cosponsor', leg) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + \ minutes_date[1] + minutes_date[2] + " Agenda" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "upper") self.scrape_votes(page, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def scrape_senate_bills(self, chamber, insert, session, year): doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution', 8: 'joint resolution'} for docnum, bill_type in doc_type.iteritems(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.get(page_path).text page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)') title = self.get_node( root, '//div[@id="content"]/table/tr[preceding-sibling::tr/td/' 'b[contains(text(), "By:")]]/td/em/text()') bill = Bill(session, chamber, bill_id, title, type=bill_type) bill['subjects'] = list(set(self.subject_mapping[bill_id])) for table in root.xpath('//div[@id="content"]/table'): if 'Bill Text' in table.text_content(): bill_text = table.xpath("string(tr/td[2]/a/@href)") text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version("Bill Text", text_url, mimetype='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsor('primary', leg) for leg in secondary: bill.add_sponsor('cosponsor', leg) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "upper") self.scrape_votes(page, page_path, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    with self.urlopen(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)

        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        sponsor_links = bill_info.findAll(href=re.compile(
            'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            action_date = dt.datetime.strptime(action_date, '%b %d')
            # Fix: strptime defaults the year to 1900, so take it from
            # the session instead.
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session, year): doc_type = {1: "bill", 3: "resolution", 5: "concurrent resolution", 6: "joint resolution"} for docnum, bill_type in doc_type.iteritems(): parentpage_url = "http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s" % ( insert, docnum, ) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = "http://www.leg.state.nv.us/Session/%s/Reports/%s" % (insert, link) with self.urlopen(page_path) as page: page = page.decode("utf8").replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)') title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)') bill = Bill(session, chamber, bill_id, title, type=bill_type) bill["subjects"] = self.subject_mapping[bill_id] bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)") text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version("Bill Text", text_url) primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsor("primary", leg) for leg in secondary: bill.add_sponsor("cosponsor", leg) minutes_count = 2 for mr in root.xpath("//table[4]/tr/td[3]/a"): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "lower") self.scrape_votes(page, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, short_title, url): if bill_id == "SCR 0003": return with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) title = page.xpath("//br")[8].tail if not title: title = short_title title = title.strip() abbrev = bill_id.split()[0] if abbrev.endswith("B"): bill_type = ["bill"] elif abbrev.endswith("JR"): bill_type = ["joint resolution"] elif abbrev.endswith("CR"): bill_type = ["concurrent resolution"] elif abbrev.endswith("R"): bill_type = ["resolution"] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) action_link = page.xpath("//a[contains(@href, 'getActions')]")[0] self.scrape_actions(bill, action_link.attrib["href"]) version_path = "//a[contains(., '%s')]" for version_type in ("Introduced Bill", "House Bill", "Senate Bill", "Engrossed Bill", "Enrolled Act"): path = version_path % version_type links = page.xpath(path) if links: bill.add_version(version_type, links[0].attrib["href"]) for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"): self.scrape_senate_vote(bill, vote_link.attrib["href"]) for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"): num = doc_link.text.strip().split("(")[0] bill.add_document("Fiscal Impact Statement #%s" % num, doc_link.attrib["href"]) bill["subjects"] = self.subjects[bill_id] self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'

    while True:
        bill_page = self.scrape_bill(session, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        # (check before handing the response to BeautifulSoup)
        if bill_page is None:
            return
        bill_page = BeautifulSoup(bill_page)

        title = ''.join(self.flatten(
            bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill(session, chamber, bill_id, title)

        # sponsors
        first = 0
        for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
            first = 1

        # versions
        for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # documents
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        self.parse_actions(
            the_bill,
            bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
        self.save_bill(the_bill)
        bill_no = bill_no + 1
def process_bill(self, data):
    chamber = parse_psuedo_id(data['from_organization'])['classification']

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']))
        # TODO: related entities

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']))

    # other_titles entries are dicts; add the title string itself.
    for title in data['other_titles']:
        bill.add_title(title['title'])

    # TODO: related bills
    # for related in data['related_bills']:

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url): try: page = self.urlopen(url) except scrapelib.HTTPError: self.warning("couldn't open %s, skipping bill" % url) return page = lxml.html.fromstring(page) page.make_links_absolute(url) header = page.xpath("//h3/br")[0].tail.replace(" ", " ") title, primary_sponsor = header.split(" -- ") if bill_id.startswith("H.B.") or bill_id.startswith("S.B."): bill_type = ["bill"] elif bill_id.startswith("H.R.") or bill_id.startswith("S.R."): bill_type = ["resolution"] elif bill_id.startswith("H.C.R.") or bill_id.startswith("S.C.R."): bill_type = ["concurrent resolution"] elif bill_id.startswith("H.J.R.") or bill_id.startswith("S.J.R."): bill_type = ["joint resolution"] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_sponsor("primary", primary_sponsor) bill.add_source(url) for link in page.xpath('//a[contains(@href, "bills/") and text() = "HTML"]'): name = link.getprevious().tail.strip() bill.add_version(name, link.attrib["href"]) for link in page.xpath("//a[contains(@href, 'fnotes') and text() = 'HTML']"): bill.add_document("Fiscal Note", link.attrib["href"]) subjects = [] for link in page.xpath("//a[contains(@href, 'RelatedBill')]"): subjects.append(link.text.strip()) bill["subjects"] = subjects status_link = page.xpath('//a[contains(@href, "billsta")]')[0] self.parse_status(bill, status_link.attrib["href"]) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_number, ga_num): bill_url = self.urls['info'] % (bill_number, ga_num) with self.urlopen(bill_url) as page: page = lxml.html.fromstring(page) title = page.xpath("//span[@id='lblAbstract']")[0].text bill = Bill(session, chamber, bill_number, title) bill.add_source(bill_url) # Primary Sponsor sponsor = page.xpath("//span[@id='lblBillSponsor']" )[0].text_content().split("by")[-1] sponsor = sponsor.replace('*', '').strip() bill.add_sponsor('primary', sponsor) # Co-sponsors unavailable for scraping (loaded into page via AJAX) # Full summary doc summary = page.xpath("//span[@id='lblBillSponsor']/a")[0] bill.add_document('Full summary', summary.get('href')) # Actions tables = page.xpath( "//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']" ) actions_table = tables[0] action_rows = actions_table.xpath("tr[position()>1]") for ar in action_rows: action_taken = ar.xpath("td")[0].text action_date = datetime.datetime.strptime( ar.xpath("td")[1].text.strip(), '%m/%d/%Y') bill.add_action(chamber, action_taken, action_date) votes_link = page.xpath("//span[@id='lblBillVotes']/a") if (len(votes_link) > 0): votes_link = votes_link[0].get('href') bill = self.scrape_votes( bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link, )) self.save_bill(bill)
def scrape(self, chamber, session): chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber] url = ("http://legisweb.state.wy.us/%s/billreference/" "BillReference.aspx?type=%s" % (session, chamber_abbrev)) page = self.lxmlize(url) for tr in page.xpath( "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]: bill_id = tr.xpath("string(td[1])").strip() title = tr.xpath("string(td[2])").strip() if bill_id[0:2] in ['SJ', 'HJ']: bill_type = 'joint resolution' else: bill_type = 'bill' bill = Bill(session, chamber, bill_id, title, type=bill_type) self.scrape_digest(bill) # versions for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') + tr.xpath('td[12]//a')): # skip references to other bills if a.text.startswith('See'): continue bill.add_version(a.text, a.get('href'), mimetype='application/pdf') # documents fnote = tr.xpath('td[9]//a') if fnote: bill.add_document('Fiscal Note', fnote[0].get('href')) summary = tr.xpath('td[14]//a') if summary: bill.add_document('Summary', summary[0].get('href')) bill.add_source(url) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        header = page.xpath('//h3/br')[0].tail.replace(' ', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
            bill_type = ['resolution']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):
            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'])

        for link in page.xpath(
                "//a[contains(@href, 'fnotes') and text() = 'HTML']"):
            bill.add_document("Fiscal Note", link.attrib['href'])

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill['subjects'] = subjects

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        self.save_bill(bill)
def scrape(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath("//table[@id='ctl00_cphContent_gvBills']//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        self.scrape_digest(bill)

        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version(a.text, a.get('href'),
                             mimetype='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document('Summary', summary[0].get('href'))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_number, ga_num): bill_url = self.urls['info'] % (bill_number, ga_num) with self.urlopen(bill_url) as page: page = lxml.html.fromstring(page) title = page.xpath("//span[@id='lblAbstract']")[0].text bill = Bill(session, chamber, bill_number, title) bill.add_source(bill_url) # Primary Sponsor sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1] sponsor = sponsor.replace('*','').strip() bill.add_sponsor('primary',sponsor) # Co-sponsors unavailable for scraping (loaded into page via AJAX) # Full summary doc summary = page.xpath("//span[@id='lblBillSponsor']/a")[0] bill.add_document('Full summary', summary.get('href')) # Actions tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']") actions_table = tables[0] action_rows = actions_table.xpath("tr[position()>1]") for ar in action_rows: action_taken = ar.xpath("td")[0].text action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y') bill.add_action(chamber, action_taken, action_date) votes_link = page.xpath("//span[@id='lblBillVotes']/a") if(len(votes_link) > 0): votes_link = votes_link[0].get('href') bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,)) self.save_bill(bill)
def scrape_bills(self, chamber_to_scrape, session):
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

    with self.urlopen(url) as bill_dir_page:
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
        for mr in root.xpath('//lastaction/msrgroup'):
            bill_id = mr.xpath('string(measure)').replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(actionlink)').replace("..", "")
            main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)

            with self.urlopen(bill_details_url) as details_page:
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                details_root = lxml.etree.fromstring(details_page,
                                                     lxml.etree.HTMLParser())
                title = details_root.xpath('string(//shorttitle)')
                longtitle = details_root.xpath('string(//longtitle)')
                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type, longtitle=longtitle)

                # sponsors
                main_sponsor = details_root.xpath('string(//p_name)').split()
                if main_sponsor:
                    main_sponsor = main_sponsor[0]
                    main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                    main_sponsor_url = ('http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml'
                                        % (session, main_sponsor_link))
                    type = "primary"
                    bill.add_sponsor(type, main_sponsor,
                                     main_sponsor_url=main_sponsor_url)
                for author in details_root.xpath('//authors/additional'):
                    leg = author.xpath('string(co_name)').replace(" ", "_")
                    if leg:
                        leg_url = ('http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml'
                                   % (session, leg))
                        type = "cosponsor"
                        bill.add_sponsor(type, leg, leg_url=leg_url)

                # versions
                curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version("Current version", curr_version_url)

                intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version("As Introduced", intro_version_url)

                comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                if comm_version.find("documents") != -1:
                    comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                    bill.add_version("Committee Substitute", comm_version_url)

                passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                if passed_version.find("documents") != -1:
                    passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                    title = "As Passed the " + chamber
                    bill.add_version(title, passed_version_url)

                asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                if asg_version.find("documents") != -1:
                    asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                    bill.add_version("Approved by the Governor", asg_version_url)

                # avoid duplicate votes
                seen_votes = set()

                # actions
                for action in details_root.xpath('//history/action'):
                    action_num = action.xpath('string(act_number)').strip()
                    action_num = int(action_num)
                    act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                    action_desc = action.xpath('string(act_desc)')
                    date, action_desc = action_desc.split(" ", 1)
                    date = date + "/" + session[0:4]
                    date = datetime.strptime(date, "%m/%d/%Y")

                    if action_desc.startswith("(H)"):
                        actor = "lower"
                        action = action_desc[4:]
                    elif action_desc.startswith("(S)"):
                        actor = "upper"
                        action = action_desc[4:]
                    else:
                        actor = "executive"
                        action = action_desc

                    if action.find("Veto") != -1:
                        version_path = details_root.xpath("string(//veto_other)")
                        version_path = version_path.replace("../../../../", "")
                        version_url = "http://billstatus.ls.state.ms.us/" + version_path
                        bill.add_document("Veto", version_url)

                    atype = 'other'
                    for prefix, prefix_type in self._action_types:
                        if action.startswith(prefix):
                            atype = prefix_type
                            break

                    bill.add_action(actor, action, date, type=atype,
                                    action_num=action_num)

                    if act_vote:
                        vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                        if vote_url not in seen_votes:
                            seen_votes.add(vote_url)
                            vote = self.scrape_votes(vote_url, action, date, actor)
                            vote.add_source(vote_url)
                            bill.add_vote(vote)

                bill.add_source(bill_details_url)
                self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, short_title, url): try: page = self.urlopen(url) except scrapelib.HTTPError: self.logger.warning("500 error at: %r" % url) return page = lxml.html.fromstring(page) page.make_links_absolute(url) # check for Bill Withdrawn header h1text = page.xpath("//h1/text()") if h1text and h1text[0] == "Bill Withdrawn": return title = page.xpath("//br")[8].tail if not title: title = short_title title = title.strip() abbrev = bill_id.split()[0] if abbrev.endswith("B"): bill_type = ["bill"] elif abbrev.endswith("JR"): bill_type = ["joint resolution"] elif abbrev.endswith("CR"): bill_type = ["concurrent resolution"] elif abbrev.endswith("R"): bill_type = ["resolution"] bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) action_link = page.xpath("//a[contains(@href, 'getActions')]")[0] self.scrape_actions(bill, action_link.attrib["href"]) version_path = "//a[contains(., '%s')]" for version_type in ("Introduced Bill", "House Bill", "Senate Bill", "Engrossed Bill", "Enrolled Act"): path = version_path % version_type links = page.xpath(path) if links: _url = links[0].attrib["href"] # Set the mimetype. if "pdf" in _url: mimetype = "application/pdf" else: mimetype = "text/html" bill.add_version(version_type, _url, mimetype=mimetype) for vote_link in page.xpath("//a[contains(@href, 'Srollcal')]"): self.scrape_senate_vote(bill, vote_link.attrib["href"]) for vote_link in page.xpath("//a[contains(@href, 'Hrollcal')]"): self.scrape_house_vote(bill, vote_link.attrib["href"]) for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"): num = doc_link.text.strip().split("(")[0] bill.add_document("Fiscal Impact Statement #%s" % num, doc_link.attrib["href"]) bill["subjects"] = self.subjects[bill_id] # Also retrieve the "latest printing" bill if it hasn't # been found yet. latest_printing = '//a[contains(@href, "bills")]/@href' for url in set(page.xpath(latest_printing)): # Set the mimetype. if "pdf" in url: mimetype = "application/pdf" else: mimetype = "text/html" try: bill.add_version("Latest printing", url, mimetype=mimetype) except ValueError: # The url was a duplicate. pass if not bill["sponsors"]: # Indiana has so-called 'vehicle bills', which are empty # placeholders that may later get injected with content # concerning such innocuous topics as redistricting # (2011 SB 0192) and marijuana studies (2011 SB 0192). url = bill["sources"][0]["url"] page = self.urlopen(url) if "Vehicle Bill" in page: msg = "Skipping vehicle bill: {bill_id}." self.logger.info(msg.format(**bill)) return # And some bills are withdrawn before first reading, which # case they don't really exist, and the main version link # will 404. withdrawn = "Withdrawn prior to first reading" if bill["actions"]: if bill["actions"][-1]["action"] == withdrawn: msg = "Skipping bill withdrawn before first " "reading: {bill_id}." self.logger.info(msg.format(**bill)) return self.save_bill(bill)
def process_bill(self, data): chamber = parse_psuedo_id(data['from_organization'])['classification'] if chamber == 'legislature': chamber = 'upper' bill = Bill(data['legislative_session'], chamber, data['identifier'], data['title'], subjects=data['subject'], type=data['classification']) if data['abstracts']: bill['summary'] = data['abstracts'][0]['abstract'] bill.update(**data['extras']) for action in data['actions']: actor = parse_psuedo_id( action['organization_id'])['classification'] legislators = [] committees = [] for rel in action['related_entities']: if rel['entity_type'] == 'organization': committees.append(rel['name']) elif rel['entity_type'] == 'person': legislators.append(rel['name']) bill.add_action(actor, action['description'], parse_date(action['date']), type=_action_categories(action['classification']), committees=committees, legislators=legislators, **action.get('extras', {})) for source in data['sources']: bill.add_source(source['url']) for sponsor in data['sponsorships']: bill.add_sponsor( sponsor['classification'], sponsor['name'], ) for version in data['versions']: for link in version['links']: bill.add_version(version['note'], link['url'], mimetype=link['media_type'], date=parse_date(version['date']), **version.get('extras', {})) for doc in data['documents']: for link in doc['links']: bill.add_document(doc['note'], link['url'], mimetype=link['media_type'], date=parse_date(doc['date']), **doc.get('extras', {})) for title in data['other_titles']: bill.add_title(title['title']) for related in data['related_bills']: bill.add_companion(related['identifier'], related['legislative_session'], chamber) bill['alternate_bill_ids'] = [ oi['identifier'] for oi in data['other_identifiers'] ] self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) try: short_bill_id = re.sub(r'S([JC])R', r'S\1', bill_id) version_link = page.xpath( "//a[contains(@href, '%s/bill.doc')]" % short_bill_id)[0] except IndexError: # Bill withdrawn return pars = version_link.xpath("following-sibling::p") if len(pars) == 2: title = pars[0].xpath("string()") action_p = pars[1] else: title = pars[0].getprevious().tail action_p = pars[0] title = re.sub(ur'[\s\xa0]+', ' ', title).strip() if 'CR' in bill_id: bill_type = 'concurrent resolution' elif 'JR' in bill_id: bill_type = 'joint resolution' elif 'R' in bill_id: bill_type = 'resolution' else: bill_type = 'bill' bill = Bill(session, chamber, bill_id, title, type=bill_type) bill['subjects'] = self._subjects[bill_id] bill.add_source(url) bill.add_version("Most Recent Version", version_link.attrib['href']) for link in page.xpath("//a[contains(@href, 'legislator/')]"): bill.add_sponsor('primary', link.text.strip()) for line in action_p.xpath("string()").split("\n"): action = line.strip() if (not action or action == 'last action' or 'Prefiled' in action): continue action_date = "%s %s" % (action.split('-')[0], session[0:4]) action_date = datetime.datetime.strptime( action_date, '%b %d %Y') action = '-'.join(action.split('-')[1:]) if action.endswith('House') or action.endswith('(H)'): actor = 'lower' elif action.endswith('Senate') or action.endswith('(S)'): actor = 'upper' else: actor = chamber atype = [] if action.startswith('introduced in'): atype.append('bill:introduced') if '; to ' in action: atype.append('committee:referred') elif action.startswith('signed by Governor'): atype.append('governor:signed') elif re.match(r'^to [A-Z]', action): atype.append('committee:referred') elif action == 'adopted by voice vote': atype.append('bill:passed') if '1st reading' in action: atype.append('bill:reading:1') if '3rd reading' in action: atype.append('bill:reading:3') if '2nd reading' in action: atype.append('bill:reading:2') if 'R' in bill_id and 'adopted by voice vote' in action: atype.append('bill:passed') amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*' r'( and \([a-z\d\-]+\))? filed') if re.search(amendment_re, action): atype.append('amendment:introduced') if not atype: atype = ['other'] bill.add_action(actor, action, action_date, type=atype) try: votes_link = page.xpath( "//a[contains(@href, 'vote_history.pdf')]")[0] bill.add_document("Vote History", votes_link.attrib['href']) except IndexError: # No votes pass self.save_bill(bill)
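# Kentucky actions come through as e.g. "Feb 02-introduced in House"; the
# loop above splits on the first "-", borrows the year from the session
# name, and keeps the remainder as the action text. A worked example of
# that slicing (the sample line and session are hypothetical):

import datetime

line = "Feb 02-introduced in House"
session = "2011RS"

date_part = "%s %s" % (line.split('-')[0], session[0:4])    # "Feb 02 2011"
when = datetime.datetime.strptime(date_part, '%b %d %Y')
action = '-'.join(line.split('-')[1:])                      # "introduced in House"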
def scrape_bill(self, link, chamber, session): legislation_types = { 'House Bill': 'HB', 'House Concurrent Resolution': 'HCR', 'House Joint Resolution': 'HJR', 'House Resolution': 'HR', 'Senate Bill': 'SB', 'Senate Concurrent Resolution': 'SCR', 'Senate Joint Resolution': 'SJR', 'Senate Resolution': 'SR', } base_url = "http://legis.delaware.gov" text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open" try: page = self.lxmlize(link, True) except requests.exceptions.HTTPError: self.logger.warning('404. Apparently the bill hasn\'t been posted') return nominee = self.get_node(page, './/div[@id="page_header"]/text()') if nominee is not None and nominee.strip().lower() == "nominee information": self.logger.info("Nominee, skipping") return bill_id = self.get_node( page, './/div[@align="center" or @style="text-align:center"]') try: bill_id = bill_id.text_content().strip() except IndexError: self.logger.warning("Can't find bill number, skipping") return #some bill_ids include relevant amendments #in the form "SB 10 w/SA1", so we fix it here bill_id = bill_id.split("w/")[0] bill_id = bill_id.split("(")[0] leg_type = None for long_name, short_name in legislation_types.items(): if long_name in bill_id: leg_type = short_name bill_num = bill_id.replace(long_name, "").strip() break if leg_type: bill_id = leg_type + " " + bill_num elif "for" in bill_id: bill_id = bill_id.split("for")[1] else: self.logger.warning("Unknown bill type for {}".format(bill_id)) return bill_id = bill_id.replace('  ', " ") bill_id = bill_id.strip() #each row is in its own table #there are no classes/ids or anything, so we're going to loop #through the individual tables and look for keywords #in the first td to tell us what we're looking at tables = self.get_nodes(page, './/div[@id="page_content"]/table') bill_title = None primary_sponsors = [] addl_sponsors = [] cosponsors = [] bill_url = None bill_documents = {} action_list = [] vote_documents = {} sub_link = None bill_text_avail = False if tables is None or not tables: self.logger.warning('First xpath didn\'t work.') tables = self.get_nodes(page, './/table[@style="width:837.0px"]/tr') for table in tables: tds = table.xpath('.//td') if len(tds) == 0: #some kind of empty table for formatting reasons continue title_text = tds[0].text_content().strip().lower() if title_text.startswith('primary sponsor'): pri_sponsor_text = tds[1].text_content() primary_sponsors = self.separate_names(pri_sponsor_text) #sometimes additional sponsors are in a 3rd td #other times the 3rd td contains a blank image addl_sponsors = [] add_spons_text = tds[2].text_content().strip() if add_spons_text: add_spons_text = add_spons_text.replace( "Additional Sponsor(s):", "") if "on behalf of all representatives" not in add_spons_text.lower(): addl_sponsors = self.separate_names(add_spons_text) elif title_text.startswith('co-sponsor'): cosponsor_text = tds[1].text_content() if "none..."
in cosponsor_text.lower(): cosponsors = [] continue cosponsors = self.separate_names(cosponsor_text) elif title_text.startswith('long title'): bill_title = tds[1].text_content().strip() elif title_text.startswith('amendment'): amendments = tds[1].xpath('.//a') for a in amendments: amm = a.text amm_text = "Amendment {}".format(amm.strip()) amm_slg = "+".join(amm.split()) amm_link = text_base_url.format(session=session, bill_id=amm_slg) bill_documents[amm_text] = amm_link amm_page = self.lxmlize(a.attrib["href"]) for tr in amm_page.xpath('//tr'): tds = tr.xpath("./td") if len(tds) > 1: if "voting" in tds[0].text_content().lower(): self.find_vote(tds, vote_documents, "Amendment: ") elif title_text.startswith('engrossed version'): if tds[1].text_content().strip(): engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open" engrossment_link = engrossment_base.format( session=session, bill_id="+".join(bill_id.split())) if engrossment_link not in bill_documents.values(): bill_documents["Engrossed Version"] = engrossment_link elif title_text.startswith('substituted'): content = tds[1].text_content().strip() if ("Substitute" in content and "Original" not in content): sub_link = tds[1].xpath(".//a/@href")[0] elif ("full text" in title_text and ("(" not in title_text or "html" in title_text)): if tds[1].text_content().strip(): #it is totally unclear which version of the bill is referred to here #so I'm just calling it "bill text" bill_url = text_base_url.format(session=session, bill_id=bill_id.replace(" ", "+")) if bill_url not in bill_documents.values(): bill_documents["Bill Text"] = bill_url elif title_text.startswith('fiscal notes'): pass #skipping fiscal notes for now, they are really ugly #but leaving in as a placeholder so we can remember to #do this someday, if we feel like it elif title_text.startswith('committee reports'): pass #the committee reports let a legislator #comment on a bill. They can comment as #"favorable", "unfavorable" or "on its merits" #but these are NOT votes (per conversation with the #secretary of the DE senate 3/16/15). The bill is #considered if the majority sign it, which will #appear in the bill's action history as being #reported out of committee elif title_text.startswith('voting'): self.find_vote(tds, vote_documents) elif title_text.startswith('actions history'): action_list = tds[1].text_content().split("\n") sub_versions = [] use_sub = False if sub_link: bill = self.scrape_bill(sub_link, chamber, session) if bill: sub_versions = [v["url"] for v in bill["versions"]] bill.add_title(bill_id) use_sub = True if not use_sub: bill = Bill(session, chamber, bill_id, bill_title) for s in primary_sponsors: bill.add_sponsor("primary", s) for s in addl_sponsors: #it is not totally clear whether "additional sponsors" #are co or primary but primary is my best guess #based on the bill text, bc they're on the first #line with the primary sponsor bill.add_sponsor("primary", s) for s in cosponsors: bill.add_sponsor("cosponsor", s) for name, doc_link in bill_documents.items(): if "Engrossment" in name or "Bill Text" in name: if doc_link not in sub_versions: bill.add_version(name, doc_link, mimetype="text/html") else: pass bill.add_document(name, doc_link, mimetype="text/html") for a in action_list: if a.strip(): date, action = a.split('-', 1) try: date = datetime.strptime(date.strip(), '%b %d, %Y') except ValueError: date = datetime.strptime(date.strip(), '%B %d, %Y') # XXX: ugh.
action = action.strip() actor = actions.get_actor(action, bill['chamber']) attrs = dict(actor=actor, action=action, date=date) attrs.update(**self.categorizer.categorize(action)) attrs["action"] = " ".join(attrs["action"].split()) bill.add_action(**attrs) for name, doc in vote_documents.items(): vote_chamber = "lower" if "house" in name.lower() else "upper" try: self.head(doc) except requests.exceptions.HTTPError: self.logger.warning("could not access vote document") continue vote_page = self.lxmlize(doc) vote_info = vote_page.xpath(".//div[@id='page_content']/p")[-1] yes_votes = [] no_votes = [] other_votes = [] date = None passed = None voice_vote = False yes_count = no_count = other_count = 0 lines = vote_info.text_content().split("\n") for line in lines: if line.strip().startswith("Date"): date_str = " ".join(line.split()[1:4]) date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p") passage_status = line.strip().split()[-1] #we've never seen a vote with anything but "passed" #so raise an error otherwise so we can investigate passed_statuses = ["Passed"] failed_statuses = ["Defeated", "Rescinded"] if passage_status not in passed_statuses + failed_statuses: raise AssertionError( "Unknown passage state {}".format(passage_status)) passed = passage_status in passed_statuses if line.strip().startswith("Vote Type"): if "voice" in line.lower(): voice_vote = True else: voice_vote = False yes_count = int(re.findall("Yes: (\d+)", line)[0]) no_count = int(re.findall("No: (\d+)", line)[0]) other_count = int(re.findall("Not Voting: (\d+)", line)[0]) other_count += int(re.findall("Absent: (\d+)", line)[0]) vote_tds = vote_page.xpath(".//table//td") person_seen = False for td in vote_tds: if person_seen: person_vote = td.text_content().strip() if person_vote == "Y": yes_votes.append(person) elif person_vote == "N": no_votes.append(person) elif person_vote in ["NV", "A", "X", "C"]: other_votes.append(person) else: raise AssertionError( "Unknown vote '{}'".format(person_vote)) person_seen = False else: person = td.text_content().strip() if person: person_seen = True if voice_vote: vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0) else: vote = Vote(vote_chamber, date, "passage", passed, yes_count, no_count, other_count, yes_votes=[], no_votes=[], other_votes=[]) vote["yes_votes"] = yes_votes vote["no_votes"] = no_votes vote["other_votes"] = other_votes if (passed and vote["yes_count"] <= vote["no_count"] and not voice_vote): raise AssertionError("Vote passed with more N than Y votes?") if not passed and vote["yes_count"] > vote["no_count"]: self.logger.warning("Vote did not pass but had a majority; probably worth checking") if "Amendment" in name: vote["type"] = "amendment" else: vote["type"] = "passage" vote.add_source(doc) bill.add_vote(vote) bill.add_source(link) return bill
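# The Delaware vote parser above only checks that the pass/fail status is
# consistent with the header counts. A slightly stronger consistency check
# (a sketch, not scraper code) would also compare the per-legislator name
# lists against the counts printed in the vote header:

def check_vote_counts(vote):
    """Return (key, expected, actual) triples for any count mismatch."""
    mismatches = []
    for key in ("yes", "no", "other"):
        expected = vote["%s_count" % key]
        actual = len(vote["%s_votes" % key])
        if expected != actual:
            mismatches.append((key, expected, actual))
    return mismatches  # an empty list means the tallies line up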
def scrape_bill(self, url, kw, re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'), re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'), re_digits=re.compile(r'\d{,5}'), actions_get_actor=actions.get_actor): bill = Bill(**kw) bill.add_source(url) #--------------------------------------------------------------------- # A few helpers. _url_2_lxml = self._url_2_lxml _cleanup_sponsors = self._cleanup_sponsors # Shortcut function partial to get text at a particular xpath: doc = _url_2_lxml(url) _get_text = partial(get_text, doc, 0) # Get session number--needed for fetching related documents (see below). xpath = '//font[contains(., "General Assembly") and @face="Arial"]' session_num = doc.xpath(xpath)[0].text_content() session_num = re_digits.match(session_num).group() #--------------------------------------------------------------------- # Sponsors chamber = bill['chamber'] sponsor_types = { 'Additional Sponsor(s):': 'cosponsor', 'CoSponsors:': 'cosponsor', 'Primary Sponsor:': 'primary'} xpath = '//font[contains(., "Sponsor") and @color="#008080"]' headings = doc.xpath(xpath + '/text()') sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()') for h, s in zip(headings, sponsors): names = _cleanup_sponsors(s, chamber) type_ = sponsor_types[h.strip()] if names: for name, _chamber in names: bill.add_sponsor(type_, name, chamber=_chamber) #--------------------------------------------------------------------- # Versions tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/vwLegislation', '{moniker}/$file/{filename}{format_}?open']) documents = self.scrape_documents(source=url, docname="introduced", filename="Legis", tmp=tmp, session_num=session_num) for d in documents: bill.add_version(**d) # If bill is a substitution, add the original as a version. names = doc.xpath('//*[contains(text(), "Substituted ' 'Legislation for Bill:")]/text()') urls = doc.xpath('//*[contains(text(), "Substituted ' 'Legislation for Bill:")]' '/following-sibling::a/@href') for name, url in zip(names, urls): name = re_substitution.match(name).group(1) bill.add_version(name, url, description='original bill') #--------------------------------------------------------------------- # Actions actions = doc.xpath('//font[contains(., "Actions History")]' '/../following-sibling::table/descendant::td[2]') actions = actions[0].text_content() actions = filter(None, actions.splitlines()) for a in reversed(actions): date, action = a.split(' - ', 1) try: date = datetime.strptime(date, '%b %d, %Y') except ValueError: date = datetime.strptime(date, '%B %d, %Y') # XXX: ugh. actor = actions_get_actor(action, bill['chamber']) attrs = dict(actor=actor, action=action, date=date) attrs.update(**self.categorizer.categorize(action)) bill.add_action(**attrs) #--------------------------------------------------------------------- # Votes vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()') # Sometimes vote strings are contained in weird, separate elements. Probably # hand edited. if not all(re.search('\d', string) for string in vote_strings): # Use the parent's text_content instead. 
vote_strings = [] for el in doc.xpath('//*[contains(text(), "vote:")]/..'): vote_strings.append(el.text_content()) vote_urls = doc.xpath('//*[contains(text(), "vote:")]' '/following-sibling::a/@href') for string, url in zip(vote_strings, vote_urls): vote_data = parse_votestring(string) vote = self.scrape_vote(url, **vote_data) if vote: bill.add_vote(vote) #--------------------------------------------------------------------- # Amendments xpath = ("//font[contains(., 'Amendments')]/" "../../../td[2]/font/a") tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/' 'vwLegislation/{id_}/$file/{filename}{format_}?open') for source, id_ in zip(doc.xpath(xpath + '/@href'), doc.xpath(xpath + '/text()')): short_id = re_amendment.match(id_).group(1) documents = self.scrape_documents( source=source, docname='amendment (%s)' % short_id, filename='Legis', tmp=tmp, session_num=session_num, id_=id_) for d in documents: bill.add_document(**d) #--------------------------------------------------------------------- # Add any related "Engrossments". # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for # an explanation of the engrossment process in DE. source = doc.xpath('//img[@alt="Engrossment"]/../@href') if source: tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/EngrossmentsforLookup', '{moniker}/$file/{filename}{format_}?open']) documents = self.scrape_documents( source=source[0], docname="Engrossment", filename="Engross", tmp=tmp, session_num=session_num, id_=bill['bill_id']) for d in documents: bill.add_version(**d) # -------------------------------------------------------------------- # Add any fiscal notes. source = doc.xpath("//img[@alt='Fiscal Note']/../@href") if source: tmp = '/'.join([ 'http://www.legis.delaware.gov', 'LIS/lis{session_num}.nsf/FiscalforLookup', '{docnum}/$file/{filename}{format_}?open']) documents = self.scrape_documents( source=source[0], docname="Fiscal Note", filename="Fiscal", tmp=tmp, session_num=session_num) for d in documents: bill.add_document(**d) #--------------------------------------------------------------------- # Extra fields # Helper to get the first td sibling of certain nodes. tmp = '//font[contains(., "%s")]/../../../td[2]' first_sibling_text = lambda heading: _get_text(tmp % heading) extra_fields = { # A long description of the legislation. "summary": "Synopsis", # Codification details for enacted legislation. "volume_chapter": "Volume Chapter", # Presumably the date of approval/veto. "date_governor_acted": "Date Governor Acted", "fiscal_notes": "Fiscal Notes", } for key, name in extra_fields.iteritems(): try: bill[key] = first_sibling_text(name) except IndexError: # xpath lookup failed. pass self.save_bill(bill)
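# The `tmp` templates above are plain str.format templates filled in by
# scrape_documents with a session number, a bill moniker, a filename stem,
# and a file extension. A worked example with made-up values:

tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
       'vwLegislation/{moniker}/$file/{filename}{format_}?open')

url = tmp.format(session_num='146', moniker='SB+27',
                 filename='Legis', format_='.html')
# -> http://www.legis.delaware.gov/LIS/lis146.nsf/vwLegislation/SB+27/$file/Legis.html?open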
def scrape_bill(self, chamber, session, bill_id, bill_type): url = '%s?r=%s' % (self.base_url, bill_id) with self.urlopen(url) as html: doc = lxml.html.fromstring(html) # search for Titulo, accent over i messes up lxml, so use 'tulo' title = doc.xpath( u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()' ) if not title: raise NoSuchBill() bill = Bill(session, chamber, bill_id, title[0], type=bill_type) author = doc.xpath( u'//td/b[contains(text(),"Autor")]/../text()')[0] for aname in author.split(','): bill.add_sponsor('primary', aname.strip()) co_authors = doc.xpath( u'//td/b[contains(text(),"Co-autor")]/../text()') if len(co_authors) != 0: for co_author in co_authors[1].split(','): bill.add_sponsor('cosponsor', co_author.strip()) action_table = doc.xpath('//table')[-1] for row in action_table[1:]: tds = row.xpath('td') # ignore row missing date if len(tds) != 2: continue date = datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y") action = tds[1].text_content().strip() #parse the text to see if it's a new version or an unrelated document #if it has a dash, we just assume (shrug) it's a vote document #get url of action action_url = tds[1].xpath('a/@href') #check that it has a url and is not just text if action_url: action_url = action_url[0] #check if it's a version of the bill or another type of document. #NOTE: not sure whether new versions of the bill are only denoted by 'Entirillado', or if that's even the correct name, but from what I can gather it looks like it. if re.match('Entirillado', action): bill.add_version(action, action_url) else: bill.add_document(action, action_url) for pattern, atype in _classifiers: if re.match(pattern, action): break else: atype = 'other' bill.add_action(chamber, action, date, type=atype) if atype == 'bill:passed' and action_url: vote_chamber = None for pattern, vote_chamber in _voteChambers: if re.match(pattern, action): break else: self.warning("couldn't find voteChamber pattern") if vote_chamber == 'lower' and len(action_url) > 0: vote = self.scrape_votes(action_url, action, date, vote_chamber) if vote[0] is not None: vote[0].add_source(action_url) bill.add_vote(vote[0]) else: self.warning('Problem reading vote: %s,%s' % (vote[1], bill_id)) bill.add_source(url) self.save_bill(bill)
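# _classifiers is consumed above as an ordered list of (regex, action type)
# pairs tried with re.match, falling back to 'other'. Illustrative entries
# only -- the real table lives elsewhere in the scraper and these Spanish
# patterns are guesses, not confirmed site wording:

_classifiers_example = [
    (u'Radicado', 'bill:introduced'),
    (u'Aprobado por C(a|\xe1)mara', 'bill:passed'),
    (u'Aprobado por Senado', 'bill:passed'),
    (u'Vetado por el Gobernador', 'governor:vetoed'),
]

# for pattern, atype in _classifiers_example:
#     if re.match(pattern, action): break
# else: atype = 'other'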
def scrape_for_bill_type(self, chamber, session, url): self.refresh_session() html = self.urlopen(url) doc = lxml.html.fromstring(html) # bills are all their own table with cellspacing=4 (skip first) bill_tables = doc.xpath('//table[@cellspacing="4"]') for bt in bill_tables[1:]: # each table has 3 rows: detail row, description, blank details, desc, _ = bt.xpath('tr') # first <tr> has img, button, sponsor, topic, current house # current status, committee, committee2, last action tds = details.xpath('td') if len(tds) == 9: # middle _, _ is chamber, last action _, button, sponsor, subject, _, _, com1, com2, _ = tds elif len(tds) == 8: # middle _ is last action _, button, sponsor, subject, _, com1, com2, _ = tds else: self.warning('invalid row: (tds=%s) %s', len(tds), details.text_content()) continue # contains script tag that has a document.write that writes the # bill_id, we have to pull that out (gross, but only way) script_text = button.text_content() # skip SBIR/HBIR if 'SBIR' in script_text or 'HBIR' in script_text: continue """ script text looks like: document.write("<input type=button id=BTN71139 name=BTN71139 style='font-weight:normal' value='SB1'"); document.write(" onClick=\"javascript:instrumentSelected(this,'71139','SB1','ON','ON','ON','"); document.write(status + "','OFF','SB1-int.pdf,,','SB1-int.pdf,,')\">"); """ oid, bill_id, fnotes = re.findall( r"instrumentSelected\(this,'(\d+)','(\w+)','ON','ON','(ON|OFF)'", script_text)[0] second_piece = re.findall( r"status \+ \"','(ON|OFF)','([^,]*),([^,]*),([^,]*)\'", script_text) if second_piece: amend, intver, engver, enrver = second_piece[0] else: intver = engver = enrver = None sponsor = sponsor.text_content() subject = subject.text_content() com1 = com1.text_content() com2 = com2.text_content() desc = desc.text_content() if 'B' in bill_id: bill_type = 'bill' elif 'JR' in bill_id: bill_type = 'joint resolution' elif 'R' in bill_id: bill_type = 'resolution' else: bill_type = 'bill' # title is missing on a few bills; skip those rows title = desc.strip() if not title: continue # create bill bill = Bill(session, chamber, bill_id, title, type=bill_type) if subject: bill['subjects'] = [subject] if fnotes == 'ON': bill.add_document( 'fiscal notes', 'http://alisondb.legislature.state.al.us/acas/ACTIONFiscalNotesFrameMac.asp?OID=%s&LABEL=%s' % (oid, bill_id)) self.get_sponsors(bill, oid) self.get_actions(bill, oid) # craft bill URLs if intver: bill.add_version('introduced', self.base_doc_url + intver, mimetype='application/pdf') if engver: bill.add_version('engrossed', self.base_doc_url + engver, mimetype='application/pdf') if enrver: bill.add_version('enrolled', self.base_doc_url + enrver, mimetype='application/pdf') self.save_bill(bill)
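# The instrumentSelected regex above can be exercised directly against the
# relevant fragment of the sample script text quoted in the function:

import re

script_text = "instrumentSelected(this,'71139','SB1','ON','ON','ON','"

oid, bill_id, fnotes = re.findall(
    r"instrumentSelected\(this,'(\d+)','(\w+)','ON','ON','(ON|OFF)'",
    script_text)[0]
# oid == '71139', bill_id == 'SB1', fnotes == 'ON'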
def scrape_senate_bills(self, chamber, insert, session, year): doc_type = { 2: 'bill', 4: 'resolution', 7: 'concurrent resolution', 8: 'joint resolution' } for docnum, bill_type in doc_type.iteritems(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % ( insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % ( insert, link) with self.urlopen(page_path) as page: page = page.decode("utf8").replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath( 'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)' ) title = root.xpath( 'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)' ) bill = Bill(session, chamber, bill_id, title, type=bill_type) bill_text = root.xpath( "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)" ) text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version("Bill Text", text_url) primary, secondary = self.scrape_sponsors(page) if primary and primary[0] == 'By:': primary.pop(0) if primary[0] == 'ElectionsProceduresEthicsand': primary[0] = 'Elections Procedures Ethics and' full_name = '' for part_name in primary: full_name = full_name + part_name + " " bill.add_sponsor('primary', full_name) else: for leg in primary: bill.add_sponsor('primary', leg) for leg in secondary: bill.add_sponsor('cosponsor', leg) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[ 1] + minutes_date[2] + " Minutes" bill.add_document(minutes_date, minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "upper") self.scrape_votes(page, bill, insert, year) bill.add_source(page_path) self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(' ', '')) with self.urlopen(url) as bill_page: html = lxml.html.fromstring(bill_page) html.make_links_absolute( 'http://legislature.idaho.gov/legislation/%s/' % session) bill_tables = html.xpath('./body/table/tr/td[2]')[0].xpath('.//table') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) bill['subjects'] = self._subjects[bill_id.replace(' ', '')] if short_title and bill['title'].lower() != short_title.lower(): bill.add_title(short_title) # documents doc_links = html.xpath('//span/a') for link in doc_links: name = link.text_content().strip() href = link.get('href') if 'Engrossment' in name or 'Bill Text' in name: bill.add_version(name, href) else: bill.add_document(name, href) # sponsors range from a committee to one legislator to a group of legislators sponsor_lists = bill_tables[0].text_content().split('by') if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: for person in sponsors.split(','): bill.add_sponsor('primary', person) actor = chamber last_date = None for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + '/' + session[0:4], "%m/%d/%Y") if action.startswith('House'): actor = 'lower' elif action.startswith('Senate'): actor = 'upper' # votes if 'AYES' in action or 'NAYS' in action: vote = self.parse_vote(actor, date, row[2]) vote.add_source(url) bill.add_vote(vote) # some td's text is separated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u'\xa0', ' ').strip() atype = get_action(actor, action) bill.add_action(actor, action, date, type=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if 'to House' in action: actor = 'lower' elif 'to Senate' in action: actor = 'upper' self.save_bill(bill)
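# get_bill_type above is assumed to key off the letters in an Idaho bill id
# ("H 123", "S 101", "HJM 4", "SCR 102", ...). A rough sketch of that idea;
# the real helper and its exact categories live elsewhere in the scraper:

def get_bill_type_sketch(bill_id):
    letters = bill_id.split()[0]  # e.g. "HJM" from "HJM 4"
    if 'JM' in letters:
        return 'joint memorial'
    if 'JR' in letters:
        return 'joint resolution'
    if 'CR' in letters:
        return 'concurrent resolution'
    if 'R' in letters:
        return 'resolution'
    return 'bill'  # plain "H"/"S" ids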
def scrape_bills(self, chamber_to_scrape, session): url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session bill_dir_page = self.get(url) root = lxml.etree.fromstring(bill_dir_page.content) for mr in root.xpath('//LASTACTION/MSRGROUP'): bill_id = mr.xpath('string(MEASURE)').replace(" ", "") if bill_id[0] == "S": chamber = "upper" else: chamber = "lower" bill_type = {'B':'bill', 'C': 'concurrent resolution', 'R': 'resolution', 'N': 'nomination'}[bill_id[1]] # just skip past bills that are of the wrong chamber if chamber != chamber_to_scrape: continue link = mr.xpath('string(ACTIONLINK)').replace("..", "") main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "") main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link) details_page = self.get(bill_details_url) page = details_page.content.replace(chr(11), "") # Some pages have the (invalid) byte 11 sitting around. Just drop # them out. Might as well. details_root = lxml.etree.fromstring(page) title = details_root.xpath('string(//SHORTTITLE)') longtitle = details_root.xpath('string(//LONGTITLE)') bill = Bill(session, chamber, bill_id, title, type=bill_type, summary=longtitle) #sponsors main_sponsor = details_root.xpath('string(//P_NAME)').split() if main_sponsor: main_sponsor = main_sponsor[0] main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_") main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link) type = "primary" bill.add_sponsor(type, main_sponsor, main_sponsor_url = main_sponsor_url) for author in details_root.xpath('//AUTHORS/ADDITIONAL'): leg = author.xpath('string(CO_NAME)').replace(" ", "_") if leg: leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg) type = "cosponsor" bill.add_sponsor(type, leg, leg_url=leg_url) #Versions curr_version = details_root.xpath('string(//CURRENT_OTHER)').replace("../../../../", "") if curr_version != "": curr_version_url = "http://billstatus.ls.state.ms.us/" \ + curr_version bill.add_version("Current version", curr_version_url, on_duplicate='use_new', mimetype='text/html') intro_version = details_root.xpath('string(//INTRO_OTHER)').replace("../../../../", "") if intro_version != "": intro_version_url = "http://billstatus.ls.state.ms.us/"\ + intro_version bill.add_version("As Introduced", intro_version_url, on_duplicate='use_new', mimetype='text/html') comm_version = details_root.xpath('string(//CMTESUB_OTHER)').replace("../../../../", "") if comm_version.find("documents") != -1: comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version bill.add_version("Committee Substitute", comm_version_url, on_duplicate='use_new', mimetype='text/html') passed_version = details_root.xpath('string(//PASSED_OTHER)').replace("../../../../", "") if passed_version.find("documents") != -1: passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version title = "As Passed the " + chamber bill.add_version(title, passed_version_url, on_duplicate='use_new', mimetype='text/html') asg_version = details_root.xpath('string(//ASG_OTHER)').replace("../../../../", "") if asg_version.find("documents") != -1: asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version bill.add_version("Approved by the Governor", asg_version_url, on_duplicate='use_new', mimetype='text/html') # avoid duplicate votes seen_votes = set() #Actions for action in 
details_root.xpath('//HISTORY/ACTION'): action_num = action.xpath('string(ACT_NUMBER)').strip() action_num = int(action_num) act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "") action_desc = action.xpath('string(ACT_DESC)') date, action_desc = action_desc.split(" ", 1) date = date + "/" + session[0:4] date = datetime.strptime(date, "%m/%d/%Y") if action_desc.startswith("(H)"): actor = "lower" action = action_desc[4:] elif action_desc.startswith("(S)"): actor = "upper" action = action_desc[4:] else: actor = "executive" action = action_desc if action.find("Veto") != -1: version_path = details_root.xpath("string(//VETO_OTHER)") version_path = version_path.replace("../../../../", "") version_url = "http://billstatus.ls.state.ms.us/" + version_path bill.add_document("Veto", version_url) atype = 'other' for prefix, prefix_type in self._action_types: if action.startswith(prefix): atype = prefix_type break bill.add_action(actor, action, date, type=atype, action_num=action_num) # use committee names as scraped subjects subjects = details_root.xpath('//H_NAME/text()') subjects += details_root.xpath('//S_NAME/text()') bill['subjects'] = subjects if act_vote: vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote if vote_url not in seen_votes: seen_votes.add(vote_url) vote = self.scrape_votes(vote_url, action, date, actor) vote.add_source(vote_url) bill.add_vote(vote) bill.add_source(bill_details_url) self.save_bill(bill)
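# self._action_types is consumed above as ordered (prefix, action type)
# pairs, first match wins, with 'other' as the fallback. Illustrative
# entries only -- guesses at Mississippi's wording, not the scraper's
# actual table:

_action_types_example = (
    ('Died In Committee', 'committee:failed'),
    ('Referred To', 'committee:referred'),
    ('Passed', 'bill:passed'),
    ('Approved by Governor', 'governor:signed'),
    ('Vetoed', 'governor:vetoed'),
)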
def scrape_bill(self, session, history_url): history_xml = self.get(history_url).content root = etree.fromstring(history_xml) bill_title = root.findtext("caption") if (bill_title is None or "Bill does not exist" in history_xml): self.warning("Bill does not appear to exist") return bill_id = ' '.join(root.attrib['bill'].split(' ')[1:]) chamber = self.CHAMBERS[bill_id[0]] if bill_id[1] == 'B': bill_type = ['bill'] elif bill_id[1] == 'R': bill_type = ['resolution'] elif bill_id[1:3] == 'CR': bill_type = ['concurrent resolution'] elif bill_id[1:3] == 'JR': bill_type = ['joint resolution'] else: raise ScrapeError("Invalid bill_id: %s" % bill_id) bill = Bill(session, chamber, bill_id, bill_title, type=bill_type) bill.add_source(history_url) bill['subjects'] = [] for subject in root.iterfind('subjects/subject'): bill['subjects'].append(subject.text.strip()) versions = [x for x in self.versions if x[0] == bill_id] for version in versions: bill.add_version(name=self.NAME_SLUGS[version[1][-5]], url=version[1], mimetype='text/html') analyses = [x for x in self.analyses if x[0] == bill_id] for analysis in analyses: bill.add_document(name="Analysis ({})".format( self.NAME_SLUGS[analysis[1][-5]]), url=analysis[1], mimetype='text/html') fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id] for fiscal_note in fiscal_notes: bill.add_document(name="Fiscal Note ({})".format( self.NAME_SLUGS[fiscal_note[1][-5]]), url=fiscal_note[1], mimetype='text/html') witnesses = [x for x in self.witnesses if x[0] == bill_id] for witness in witnesses: bill.add_document(name="Witness List ({})".format( self.NAME_SLUGS[witness[1][-5]]), url=witness[1], mimetype='text/html') for action in root.findall('actions/action'): act_date = datetime.datetime.strptime(action.findtext('date'), "%m/%d/%Y").date() extra = {} extra['action_number'] = action.find('actionNumber').text comment = action.find('comment') if comment is not None and comment.text: extra['comment'] = comment.text.strip() actor = { 'H': 'lower', 'S': 'upper', 'E': 'executive' }[extra['action_number'][0]] desc = action.findtext('description').strip() if desc == 'Scheduled for public hearing on . . .': self.warning("Skipping public hearing action with no date") continue introduced = False if desc == 'Amended': atype = 'amendment:passed' elif desc == 'Amendment(s) offered': atype = 'amendment:introduced' elif desc == 'Amendment amended': atype = 'amendment:amended' elif desc == 'Amendment withdrawn': atype = 'amendment:withdrawn' elif desc == 'Passed' or desc == 'Adopted': atype = 'bill:passed' elif re.match(r'^Received (by|from) the', desc): if 'Secretary of the Senate' not in desc: atype = 'bill:introduced' else: atype = 'bill:filed' elif desc.startswith('Sent to the Governor'): # But what if it gets lost in the mail? 
atype = 'governor:received' elif desc.startswith('Signed by the Governor'): atype = 'governor:signed' elif desc == 'Vetoed by the Governor': atype = 'governor:vetoed' elif desc == 'Read first time': atype = ['bill:introduced', 'bill:reading:1'] introduced = True elif desc == 'Read & adopted': atype = ['bill:passed'] if not introduced: introduced = True atype.append('bill:introduced') elif desc == "Passed as amended": atype = 'bill:passed' elif (desc.startswith('Referred to') or desc.startswith("Recommended to be sent to ")): atype = 'committee:referred' elif desc == "Reported favorably w/o amendment(s)": atype = 'committee:passed' elif desc == "Filed": atype = 'bill:filed' elif desc == 'Read 3rd time': atype = 'bill:reading:3' elif desc == 'Read 2nd time': atype = 'bill:reading:2' elif desc.startswith('Reported favorably'): atype = 'committee:passed:favorable' else: atype = 'other' if 'committee:referred' in atype: repls = ['Referred to', "Recommended to be sent to "] ctty = desc for r in repls: ctty = ctty.replace(r, "").strip() extra['committees'] = ctty bill.add_action(actor, action.findtext('description'), act_date, type=atype, **extra) for author in root.findtext('authors').split(' | '): if author != "": bill.add_sponsor('primary', author, official_type='author') for coauthor in root.findtext('coauthors').split(' | '): if coauthor != "": bill.add_sponsor('cosponsor', coauthor, official_type='coauthor') for sponsor in root.findtext('sponsors').split(' | '): if sponsor != "": bill.add_sponsor('primary', sponsor, official_type='sponsor') for cosponsor in root.findtext('cosponsors').split(' | '): if cosponsor != "": bill.add_sponsor('cosponsor', cosponsor, official_type='cosponsor') self.save_bill(bill)
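# self.NAME_SLUGS is indexed above with version[1][-5], i.e. the single
# character sitting just before ".htm" in a Texas document URL such as
# ".../HB00001I.htm". An illustrative mapping (assumed codes and a made-up
# URL, not the scraper's actual table):

NAME_SLUGS_EXAMPLE = {
    'I': 'Introduced',
    'E': 'Engrossed',
    'H': 'House Committee Report',
    'S': 'Senate Committee Report',
    'F': 'Enrolled',
}

url = 'HB00001I.htm'
# url[-5] == 'I'  ->  NAME_SLUGS_EXAMPLE['I'] == 'Introduced'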
def scrape_bills(self, chamber_to_scrape, session): url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session with self.urlopen(url) as bill_dir_page: root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser()) for mr in root.xpath('//lastaction/msrgroup'): bill_id = mr.xpath('string(measure)').replace(" ", "") if bill_id[0] == "S": chamber = "upper" else: chamber = "lower" bill_type = {'B':'bill', 'C': 'concurrent resolution', 'R': 'resolution', 'N': 'nomination'}[bill_id[1]] # just skip past bills that are of the wrong chamber if chamber != chamber_to_scrape: continue link = mr.xpath('string(actionlink)').replace("..", "") main_doc = mr.xpath('string(measurelink)').replace("../../../", "") main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link) with self.urlopen(bill_details_url) as details_page: details_page = details_page.decode('latin1').encode('utf8', 'ignore') details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser()) title = details_root.xpath('string(//shorttitle)') longtitle = details_root.xpath('string(//longtitle)') bill = Bill(session, chamber, bill_id, title, type=bill_type, longtitle=longtitle) #sponsors main_sponsor = details_root.xpath('string(//p_name)').split() if main_sponsor: main_sponsor = main_sponsor[0] main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_") main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link) type = "primary" bill.add_sponsor(type, main_sponsor, main_sponsor_url = main_sponsor_url) for author in details_root.xpath('//authors/additional'): leg = author.xpath('string(co_name)').replace(" ", "_") if leg: leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg) type = "cosponsor" bill.add_sponsor(type, leg, leg_url=leg_url) #Versions curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "") curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version bill.add_version("Current version", curr_version_url) intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "") intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version bill.add_version("As Introduced", intro_version_url) comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "") if comm_version.find("documents") != -1: comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version bill.add_version("Committee Substitute", comm_version_url) passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "") if passed_version.find("documents") != -1: passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version title = "As Passed the " + chamber bill.add_version(title, passed_version_url) asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "") if asg_version.find("documents") != -1: asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version bill.add_version("Approved by the Governor", asg_version_url) # avoid duplicate votes seen_votes = set() #Actions for action in details_root.xpath('//history/action'): action_num = action.xpath('string(act_number)').strip() action_num = int(action_num) act_vote = action.xpath('string(act_vote)').replace("../../../..", "") action_desc = action.xpath('string(act_desc)') date, action_desc = action_desc.split(" ", 1) date = date + "/" + 
session[0:4] date = datetime.strptime(date, "%m/%d/%Y") if action_desc.startswith("(H)"): actor = "lower" action = action_desc[4:] elif action_desc.startswith("(S)"): actor = "upper" action = action_desc[4:] else: actor = "executive" action = action_desc if action.find("Veto") != -1: version_path = details_root.xpath("string(//veto_other)") version_path = version_path.replace("../../../../", "") version_url = "http://billstatus.ls.state.ms.us/" + version_path bill.add_document("Veto", version_url) atype = 'other' for prefix, prefix_type in self._action_types: if action.startswith(prefix): atype = prefix_type break bill.add_action(actor, action, date, type=atype, action_num=action_num) # use committee names as scraped subjects subjects = details_root.xpath('//h_name/text()') subjects += details_root.xpath('//s_name/text()') bill['subjects'] = subjects if act_vote: vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote if vote_url not in seen_votes: seen_votes.add(vote_url) vote = self.scrape_votes(vote_url, action, date, actor) vote.add_source(vote_url) bill.add_vote(vote) bill.add_source(bill_details_url) self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) page.make_links_absolute(url) try: version_link = page.xpath( "//a[contains(@href, '%s/bill.doc')]" % bill_id)[0] except IndexError: # Bill withdrawn return title = version_link.xpath("string(following-sibling::p[1])") title = re.sub(ur'[\s\xa0]+', ' ', title).strip() bill = Bill(session, chamber, bill_id, title) bill.add_source(url) bill.add_version("Most Recent Version", version_link.attrib['href']) for link in page.xpath("//a[contains(@href, 'legislator/')]"): bill.add_sponsor('primary', link.text.strip()) action_p = version_link.xpath("following-sibling::p[2]")[0] for line in action_p.xpath("string()").split("\n"): action = line.strip() if (not action or action == 'last action' or 'Prefiled' in action): continue action_date = "%s %s" % (action.split('-')[0], session[0:4]) action_date = datetime.datetime.strptime( action_date, '%b %d %Y') action = '-'.join(action.split('-')[1:]) if action.endswith('House') or action.endswith('(H)'): actor = 'lower' elif action.endswith('Senate') or action.endswith('(S)'): actor = 'upper' else: actor = chamber atype = [] if action.startswith('introduced in'): atype.append('bill:introduced') elif action.startswith('signed by Governor'): atype.append('governor:signed') elif re.match(r'^to [A-Z]', action): atype.append('committee:referred') if '1st reading' in action: atype.append('bill:reading:1') if '3rd reading' in action: atype.append('bill:reading:3') if '2nd reading' in action: atype.append('bill:reading:2') amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*' r'( and \([a-z\d\-]+\))? filed') if re.search(amendment_re, action): atype.append('amendment:introduced') if not atype: atype = ['other'] bill.add_action(actor, action, action_date, type=atype) try: votes_link = page.xpath( "//a[contains(@href, 'vote_history.pdf')]")[0] bill.add_document("Vote History", votes_link.attrib['href']) except IndexError: # No votes pass self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id): # try and get bill for current year url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % ( session[:4], bill_id.replace(' ', '-')) html = self.get(url).text # if first page isn't found, try second year if ('Page Not Found' in html or 'The bill you are looking for is not available yet' in html): html = self.get('http://legislature.mi.gov/doc.aspx?%s-%s' % (session[-4:], bill_id.replace(' ','-'))).text if ('Page Not Found' in html or 'The bill you are looking for is not available yet' in html): return None doc = lxml.html.fromstring(html) title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content() # get B/R/JR/CR part and look up bill type bill_type = bill_types[bill_id.split(' ')[0][1:]] bill = Bill(session=session, chamber=chamber, bill_id=bill_id, title=title, type=bill_type) bill.add_source(url) # sponsors sp_type = 'primary' for sponsor in doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a/text()'): sponsor = sponsor.replace(u'\xa0', ' ') bill.add_sponsor(sp_type, sponsor) sp_type = 'cosponsor' bill['subjects'] = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()') # actions (skip header) for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]: tds = row.xpath('td') # date, journal link, action date = tds[0].text_content() journal = tds[1].text_content() action = tds[2].text_content() date = datetime.datetime.strptime(date, "%m/%d/%Y") # instead of trusting upper/lower case, use journal for actor actor = 'upper' if 'SJ' in journal else 'lower' type = categorize_action(action) bill.add_action(actor, action, date, type=type) # check if action mentions a vote rcmatch = re.search('Roll Call # (\d+)', action, re.IGNORECASE) if rcmatch: rc_num = rcmatch.groups()[0] # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011 journal_link = tds[1].xpath('a/@href') if journal_link: objectname = journal_link[0].rsplit('=', 1)[-1] chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor] vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % ( session, chamber_name, objectname) vote = Vote(actor, date, action, False, 0, 0, 0) self.parse_roll_call(vote, vote_url, rc_num) # check the expected counts vs actual count = re.search('YEAS (\d+)', action, re.IGNORECASE) count = int(count.groups()[0]) if count else 0 if count != len(vote['yes_votes']): self.warning('vote count mismatch for %s %s, %d != %d' % (bill_id, action, count, len(vote['yes_votes']))) count = re.search('NAYS (\d+)', action, re.IGNORECASE) count = int(count.groups()[0]) if count else 0 if count != len(vote['no_votes']): self.warning('vote count mismatch for %s %s, %d != %d' % (bill_id, action, count, len(vote['no_votes']))) vote['yes_count'] = len(vote['yes_votes']) vote['no_count'] = len(vote['no_votes']) vote['other_count'] = len(vote['other_votes']) vote['passed'] = vote['yes_count'] > vote['no_count'] vote.add_source(vote_url) bill.add_vote(vote) else: self.warning("missing journal link for %s %s" % (bill_id, journal)) # versions for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'): version = self.parse_doc_row(row) if version: if version[1].endswith('.pdf'): mimetype = 'application/pdf' elif version[1].endswith('.htm'): mimetype = 'text/html' else: mimetype = None bill.add_version(*version, mimetype=mimetype) # documents for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'): document = self.parse_doc_row(row) if document: bill.add_document(*document) for row in
doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'): document = self.parse_doc_row(row) if document: bill.add_document(*document) self.save_bill(bill) return True
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(' ', '')) bill_page = self.get(url).text html = lxml.html.fromstring(bill_page) html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill(session, chamber, bill_id, title, type=bill_type) bill.add_source(url) bill['subjects'] = self._subjects[bill_id.replace(' ', '')] if short_title and bill['title'].lower() != short_title.lower(): bill.add_title(short_title) # documents doc_links = html.xpath('//div[contains(@class,"pf-content")]//a') for link in doc_links: name = link.text_content().strip() href = link.get('href') if 'Engrossment' in name or 'Bill Text' in name: bill.add_version(name, href, mimetype='application/pdf') else: bill.add_document(name, href) def _split(string): return re.split(r"(?:,|\bAND\b)\s+", string) # sponsors range from a committee to one legislator to a group of legislators sponsor_lists = bill_tables[0].text_content().split('by') if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if 'COMMITTEE' in sponsors.upper(): bill.add_sponsor('primary', sponsors.strip()) else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsor('primary', person) actor = chamber last_date = None for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + '/' + session[0:4], "%m/%d/%Y") if action.startswith('House'): actor = 'lower' elif action.startswith('Senate'): actor = 'upper' # votes if 'AYES' in action or 'NAYS' in action: vote = self.parse_vote(actor, date, row[2]) vote.add_source(url) bill.add_vote(vote) # some td's text is separated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u'\xa0', ' ').strip() atype = get_action(actor, action) bill.add_action(actor, action, date, type=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if 'to House' in action: actor = 'lower' elif 'to Senate' in action: actor = 'upper' self.save_bill(bill)
def scrape_bills(self, session, year_abr): #Main Bill information main_bill_csv = self.access_to_csv('MainBill') # keep a dictionary of bills (mapping bill_id to Bill obj) bill_dict = {} for rec in main_bill_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) title = rec["Synopsis"] if bill_type[0] == 'A': chamber = "lower" else: chamber = "upper" # some bills have a blank title.. just skip it if not title: continue bill = Bill(str(session), chamber, bill_id, title, type=self._bill_types[bill_type[1:]]) if rec['IdenticalBillNumber'].strip(): bill.add_companion(rec['IdenticalBillNumber'].split()[0]) # TODO: last session info is in there too bill_dict[bill_id] = bill #Sponsors bill_sponsors_csv = self.access_to_csv('BillSpon') for rec in bill_sponsors_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) if bill_id not in bill_dict: self.warning('unknown bill %s in sponsor database' % bill_id) continue bill = bill_dict[bill_id] name = rec["Sponsor"] sponsor_type = rec["Type"] if sponsor_type == 'P': sponsor_type = "primary" else: sponsor_type = "cosponsor" bill.add_sponsor(sponsor_type, name) #Documents bill_document_csv = self.access_to_csv('BillWP') for rec in bill_document_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) if bill_id not in bill_dict: self.warning('unknown bill %s in document database' % bill_id) continue bill = bill_dict[bill_id] document = rec["Document"] document = document.split('\\') document = document[-2] + "/" + document[-1] year = str(year_abr) + str((year_abr + 1)) #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document) htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr, document.replace('.DOC', '.HTM')) # name document based _doctype try: doc_name = self._doctypes[rec['DocType']] except KeyError: raise Exception('unknown doctype %s on %s' % (rec['DocType'], bill_id)) if rec['Comment']: doc_name += ' ' + rec['Comment'] if rec['DocType'] in self._version_types: # Clean HTMX links. if htm_url.endswith('HTMX'): htm_url = re.sub('X$', '', htm_url) if htm_url.endswith('HTM'): mimetype = 'text/html' elif htm_url.endswith('wpd'): mimetype = 'application/vnd.wordperfect' bill.add_version(doc_name, htm_url, mimetype=mimetype) else: bill.add_document(doc_name, htm_url) # Votes next_year = int(year_abr)+1 vote_info_list = ['A%s' % year_abr, 'A%s' % next_year, 'S%s' % year_abr, 'S%s' % next_year, 'CA%s-%s' % (year_abr, next_year), 'CS%s-%s' % (year_abr, next_year), ] for filename in vote_info_list: s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename try: s_vote_zip, resp = self.urlretrieve(s_vote_url) except scrapelib.FTPError: self.warning('could not find %s' % s_vote_url) continue zipedfile = zipfile.ZipFile(s_vote_zip) for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]: try: vote_file = zipedfile.open(vfile, 'U') except KeyError: # # Right, so, 2011 we have an "End" file with more # vote data than was in the original dump. 
# self.warning("No such file: %s" % (vfile)) continue vdict_file = csv.DictReader(vote_file) votes = {} if filename.startswith('A') or filename.startswith('CA'): chamber = "lower" else: chamber = "upper" if filename.startswith('C'): vote_file_type = 'committee' else: vote_file_type = 'chamber' for rec in vdict_file: if vote_file_type == 'chamber': bill_id = rec["Bill"].strip() leg = rec["Full_Name"] date = rec["Session_Date"] action = rec["Action"] leg_vote = rec["Legislator_Vote"] else: bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number']) leg = rec['Name'] # drop time portion date = rec['Agenda_Date'].split()[0] # make motion readable action = self._com_vote_motions[rec['BillAction']] # first char (Y/N) use [0:1] to ignore '' leg_vote = rec['LegislatorVote'][0:1] date = datetime.strptime(date, "%m/%d/%Y") vote_id = '_'.join((bill_id, chamber, action)) vote_id = vote_id.replace(" ", "_") if vote_id not in votes: votes[vote_id] = Vote(chamber, date, action, None, None, None, None, bill_id=bill_id) if vote_file_type == 'committee': votes[vote_id]['committee'] = self._committees[ rec['Committee_House']] if leg_vote == "Y": votes[vote_id].yes(leg) elif leg_vote == "N": votes[vote_id].no(leg) else: votes[vote_id].other(leg) # remove temp file os.remove(s_vote_zip) #Counts yes/no/other votes and saves overall vote for vote in votes.itervalues(): vote_yes_count = len(vote["yes_votes"]) vote_no_count = len(vote["no_votes"]) vote_other_count = len(vote["other_votes"]) vote["yes_count"] = vote_yes_count vote["no_count"] = vote_no_count vote["other_count"] = vote_other_count # Veto override. if vote['motion'] == 'OVERRIDE': # Per the NJ leg's glossary, a veto override requires # 2/3ds of each chamber. 27 in the senate, 54 in the house. # http://www.njleg.state.nj.us/legislativepub/glossary.asp vote['passed'] = False if vote['chamber'] == 'lower': if vote_yes_count >= 54: vote['passed'] = True elif vote['chamber'] == 'upper': if vote_yes_count >= 27: vote['passed'] = True # Regular vote. 
elif vote_yes_count > vote_no_count: vote["passed"] = True else: vote["passed"] = False vote_bill_id = vote["bill_id"] bill = bill_dict[vote_bill_id] bill.add_vote(vote) #Actions bill_action_csv = self.access_to_csv('BillHist') actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'} for rec in bill_action_csv: bill_type = rec["BillType"].strip() bill_number = int(rec["BillNumber"]) bill_id = bill_type + str(bill_number) if bill_id not in bill_dict: self.warning('unknown bill %s in action database' % bill_id) continue bill = bill_dict[bill_id] action = rec["Action"] date = rec["DateAction"] date = datetime.strptime(date, "%m/%d/%y %H:%M:%S") actor = actor_map[rec["House"]] comment = rec["Comment"] action, atype = self.categorize_action(action, bill_id) if comment: action += (' ' + comment) bill.add_action(actor, action, date, type=atype) # Subjects subject_csv = self.access_to_csv('BillSubj') for rec in subject_csv: bill_id = rec['BillType'].strip() + str(int(rec['BillNumber'])) if bill_id not in bill_dict: self.warning('unknown bill %s in subject database' % bill_id) continue bill = bill_dict.get(bill_id) if bill: bill.setdefault('subjects', []).append(rec['SubjectKey']) else: self.warning('invalid bill id in BillSubj: %s' % bill_id) phony_bill_count = 0 # save all bills at the end for bill in bill_dict.itervalues(): # add sources if not bill['actions'] and not bill['versions']: self.warning('probable phony bill detected %s', bill['bill_id']) phony_bill_count += 1 else: bill.add_source('http://www.njleg.state.nj.us/downloads.asp') self.save_bill(bill) if phony_bill_count: self.warning('%s total phony bills detected', phony_bill_count)
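# Worked example of the veto-override rule encoded above: an override needs
# a two-thirds vote of each chamber, so the yes-count thresholds are fixed
# numbers (54 of 80 Assembly members, 27 of 40 Senators) rather than a
# simple majority.

OVERRIDE_THRESHOLDS = {'lower': 54, 'upper': 27}

def override_passed(chamber, yes_count):
    return yes_count >= OVERRIDE_THRESHOLDS[chamber]

# override_passed('upper', 27) -> True
# override_passed('lower', 53) -> False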
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files """

    # Main bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip them
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name the document based on _doctypes
        try:
            doc_name = self._doctypes[rec['doctype']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['doctype'], bill_id))
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"

        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'

        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N), use [0:1] to tolerate ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        # count yes/no/other votes and save the overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
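# scrape_bill_pages indexes self._bill_types with everything after the chamber
# letter of the DBF bill type (e.g. 'A' -> '', 'SJR' -> 'JR'). The mapping is not
# shown in this excerpt; its assumed shape is roughly:

_bill_types = {
    '': 'bill',
    'R': 'resolution',
    'JR': 'joint resolution',
    'CR': 'concurrent resolution',
}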
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//b[text()="TITLE:"]')
    if title:
        title = title[0].tail.strip().strip('"')
    else:
        self.warning("skipping bill %s, no information" % url)
        return

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)

    # Get sponsors
    spons_str = doc.xpath(
        '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
    sponsors_match = re.match(
        r'(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
        spons_str)
    if sponsors_match:
        sponsors = sponsors_match.group(2).split(',')
        sponsor = sponsors[0].strip()
        if sponsor:
            bill.add_sponsor('primary', sponsors[0])
        for sponsor in sponsors[1:]:
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsor('cosponsor', sponsor)
    else:
        # Committee sponsorship
        spons_str = spons_str.strip()

        if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
            spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                               '', spons_str).title()
            spons_str = (spons_str +
                         " Committee (by request of the governor)")

        if spons_str:
            bill.add_sponsor('primary', spons_str)

    # Get actions from second myth table
    self._current_comm = None
    act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
    for row in act_rows:
        date, journal, raw_chamber, action = row.xpath('td')

        act_date = datetime.datetime.strptime(date.text_content().strip(),
                                              '%m/%d/%y')
        raw_chamber = raw_chamber.text_content().strip()
        action = action.text_content().strip()

        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"

        if re.match(r"\w+ Y(\d+)", action):
            vote_href = journal.xpath('.//a/@href')
            if vote_href:
                self.parse_vote(bill, action, act_chamber, act_date,
                                vote_href[0])

        action, atype = self.clean_action(action)

        match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
        if match:
            action = 'Prefile released'
            act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y')

        bill.add_action(act_chamber, action, act_date, type=atype)

    # Get subjects
    bill['subjects'] = []
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill['subjects'].append(subj.strip())

    # Get versions
    text_list_url = "http://www.legis.state.ak.us/" \
        "basis/get_fulltext.asp?session=%s&bill=%s" % (session, bill_id)
    bill.add_source(text_list_url)

    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
        name = link.xpath('../preceding-sibling::td/text()')[0].strip()
        text_url = link.get('href')
        bill.add_version(name, text_url, mimetype="text/html")

    # Get documents
    doc_list_url = "http://www.legis.state.ak.us/" \
        "basis/get_documents.asp?session=%s&bill=%s" % (session, bill_id)
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    for href in doc_list.xpath(
            '//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib['href']
        if h_name.strip():
            bill.add_document(h_name, h_href)

    self.save_bill(bill)
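# The action loop above calls self.clean_action, which is not included here; it
# evidently normalizes the raw action text and maps it onto openstates action
# types, returning an (action, types) pair. A minimal sketch of that kind of
# categorizer -- the specific patterns below are illustrative assumptions, not
# the rules used by the real Alaska scraper:

import re


def clean_action(self, action):
    """Return (cleaned_action, [action types]); patterns are illustrative."""
    action = re.sub(r'\s+', ' ', action).strip()
    atype = []
    upper = action.upper()
    if 'READ THE FIRST TIME' in upper:
        atype.append('bill:introduced')
    if upper.startswith('PASSED'):
        atype.append('bill:passed')
    if 'TRANSMITTED TO GOVERNOR' in upper:
        atype.append('governor:received')
    return action, atype or ['other']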
def scrape_bill(self, session, session_number, bill_id, title, sponsor, url):
    try:
        html = self.get(url).text
    except:
        return
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    bill = Bill(session, self.CHAMBERS[bill_id[0]], bill_id, title)
    bill.add_source(url)

    sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
    bill.add_sponsor('primary', sponsor)

    hist_table = page.xpath("//div[@id = 'tabBodyBillHistory']//table")[0]

    if bill_id.startswith('SB ') or \
            bill_id.startswith('HB ') or \
            bill_id.startswith('SPB ') or \
            bill_id.startswith('HPB '):
        bill_type = 'bill'
    elif bill_id.startswith('HR ') or bill_id.startswith('SR '):
        bill_type = 'resolution'
    elif bill_id.startswith('HJR ') or bill_id.startswith('SJR '):
        bill_type = 'joint resolution'
    elif bill_id.startswith('SCR ') or bill_id.startswith('HCR '):
        bill_type = 'concurrent resolution'
    elif bill_id.startswith('SM ') or bill_id.startswith('HM '):
        bill_type = 'memorial'
    else:
        raise Exception('Failed to identify bill type.')

    bill['type'] = [bill_type]

    for tr in hist_table.xpath("tbody/tr"):
        date = tr.xpath("string(td[1])")
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[2])")
        actor = {'Senate': 'upper', 'House': 'lower'}.get(actor, actor)

        if not actor:
            continue

        act_text = tr.xpath("string(td[3])").strip()
        for action in act_text.split(u'\u2022'):
            action = action.strip()
            if not action:
                continue

            action = re.sub(r'-(H|S)J\s+(\d+)$', '', action)

            atype = []
            if action.startswith('Referred to'):
                atype.append('committee:referred')
            elif action.startswith('Favorable by'):
                atype.append('committee:passed')
            elif action == "Filed":
                atype.append("bill:filed")
            elif action.startswith("Withdrawn"):
                atype.append("bill:withdrawn")
            elif action.startswith("Died"):
                atype.append("bill:failed")
            elif action.startswith('Introduced'):
                atype.append('bill:introduced')
            elif action.startswith('Read 2nd time'):
                atype.append('bill:reading:2')
            elif action.startswith('Read 3rd time'):
                atype.append('bill:reading:3')
            elif action.startswith('Adopted'):
                atype.append('bill:passed')
            elif action.startswith('CS passed'):
                atype.append('bill:passed')
            elif action.startswith('Approved by Gov'):
                atype.append('governor:signed')

            bill.add_action(actor, action, date, type=atype)

    try:
        version_table = page.xpath(
            "//div[@id = 'tabBodyBillText']/table")[0]
        for tr in version_table.xpath("tbody/tr"):
            name = tr.xpath("string(td[1])").strip()
            version_url = tr.xpath("td/a[1]")[0].attrib['href']
            if version_url.endswith('PDF'):
                mimetype = 'application/pdf'
            elif version_url.endswith('HTML'):
                mimetype = 'text/html'
            bill.add_version(name, version_url, mimetype=mimetype)
    except IndexError:
        self.log("No version table for %s" % bill_id)

    try:
        analysis_table = page.xpath(
            "//div[@id = 'tabBodyAnalyses']/table")[0]
        for tr in analysis_table.xpath("tbody/tr"):
            name = tr.xpath("string(td[1])").strip()
            name += " -- " + tr.xpath("string(td[3])").strip()
            name = re.sub(r'\s+', " ", name)
            date = tr.xpath("string(td[4])").strip()
            if date:
                name += " (%s)" % date
            analysis_url = tr.xpath("td/a")[0].attrib['href']
            bill.add_document(name, analysis_url)
    except IndexError:
        self.log("No analysis table for %s" % bill_id)

    vote_tables = page.xpath("//div[@id = 'tabBodyVoteHistory']//table")

    for vote_table in vote_tables:
        for tr in vote_table.xpath("tbody/tr"):
            vote_date = tr.xpath("string(td[3])").strip()
            if vote_date.isalpha():
                vote_date = tr.xpath("string(td[2])").strip()
            try:
                vote_date = datetime.datetime.strptime(
                    vote_date, "%m/%d/%Y %H:%M %p").date()
            except ValueError:
                msg = 'Got bogus vote date: %r'
                self.logger.warning(msg % vote_date)

            vote_url = tr.xpath("td[4]/a")[0].attrib['href']
            if "SenateVote" in vote_url:
                self.scrape_floor_vote('upper', bill, vote_date, vote_url)
            elif "HouseVote" in vote_url:
                self.scrape_floor_vote('lower', bill, vote_date, vote_url)
            else:
                self.scrape_uppper_committee_vote(bill, vote_date, vote_url)
    else:
        # note: this "else" is attached to the for loop, so it runs whenever
        # the loop completes without a break -- with no break above, it
        # always logs
        self.log("No vote table for %s" % bill_id)

    self.scrape_lower_committee_votes(session_number, bill)

    self.save_bill(bill)
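# The Bill constructor above looks up the chamber via self.CHAMBERS keyed on the
# first letter of the bill id; the class attribute is not shown in this excerpt,
# but from the bill ids handled above its assumed shape is:

CHAMBERS = {
    'S': 'upper',   # SB, SR, SJR, SCR, SM, SPB
    'H': 'lower',   # HB, HR, HJR, HCR, HM, HPB
}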
def scrape_bill(self, url, kw,
                re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                re_digits=re.compile(r'\d{,5}'),
                actions_categorize=actions.categorize,
                actions_get_actor=actions.get_actor):

    bill = Bill(**kw)
    bill.add_source(url)

    #---------------------------------------------------------------------
    # A few helpers.
    _url_2_lxml = self._url_2_lxml
    _cleanup_sponsors = self._cleanup_sponsors

    # Shortcut function partial to get text at a particular xpath:
    doc = _url_2_lxml(url)
    _get_text = partial(get_text, doc, 0)

    # Get session number--needed for fetching related documents (see below).
    xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
    session_num = doc.xpath(xpath)[0].text_content()
    session_num = re_digits.match(session_num).group()

    #---------------------------------------------------------------------
    # Sponsors
    chamber = bill['chamber']

    sponsor_types = {
        'Additional Sponsor(s):': 'cosponsor',
        'CoSponsors:': 'cosponsor',
        'Primary Sponsor:': 'primary'}

    xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
    headings = doc.xpath(xpath + '/text()')
    sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()')

    for h, s in zip(headings, sponsors):
        names = _cleanup_sponsors(s, chamber)
        type_ = sponsor_types[h.strip()]

        if names:
            for name, _chamber in names:
                bill.add_sponsor(type_, name, chamber=_chamber)

    #---------------------------------------------------------------------
    # Versions
    tmp = '/'.join([
        'http://www.legis.delaware.gov',
        'LIS/lis{session_num}.nsf/vwLegislation',
        '{moniker}/$file/{filename}{format_}?open'])

    documents = self.scrape_documents(source=url,
                                      docname="introduced",
                                      filename="Legis",
                                      tmp=tmp,
                                      session_num=session_num)

    for d in documents:
        bill.add_version(**d)

    # If bill is a substitution, add the original as a version.
    names = doc.xpath('//*[contains(text(), "Substituted '
                      'Legislation for Bill:")]/text()')
    urls = doc.xpath('//*[contains(text(), "Substituted '
                     'Legislation for Bill:")]'
                     '/following-sibling::a/@href')

    for name, url in zip(names, urls):
        name = re_substitution.match(name).group(1)
        bill.add_version(name, url, description='original bill')

    #---------------------------------------------------------------------
    # Actions
    actions = doc.xpath('//font[contains(., "Actions History")]'
                        '/../following-sibling::table/descendant::td[2]')
    actions = actions[0].text_content()
    actions = filter(None, actions.splitlines())

    for a in reversed(actions):
        date, action = a.split(' - ', 1)
        try:
            date = datetime.strptime(date, '%b %d, %Y')
        except ValueError:
            date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.

        actor = actions_get_actor(action, bill['chamber'])
        type_ = actions_categorize(action)
        bill.add_action(actor, action, date, type_)

    #---------------------------------------------------------------------
    # Votes
    vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

    # Sometimes vote strings are contained in weird, separate elements.
    # Probably hand edited.
    if not all(re.search(r'\d', string) for string in vote_strings):
        # Use the parent's text_content instead.
        vote_strings = []
        for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
            vote_strings.append(el.text_content())

    vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                          '/following-sibling::a/@href')
    for string, url in zip(vote_strings, vote_urls):
        vote_data = parse_votestring(string)
        vote = self.scrape_vote(url, **vote_data)
        if vote:
            bill.add_vote(vote)

    #---------------------------------------------------------------------
    # Amendments
    xpath = ("//font[contains(., 'Amendments')]/"
             "../../../td[2]/font/a")

    tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
           'vwLegislation/{id_}/$file/{filename}{format_}?open')

    for source, id_ in zip(doc.xpath(xpath + '/@href'),
                           doc.xpath(xpath + '/text()')):
        short_id = re_amendment.match(id_).group(1)

        documents = self.scrape_documents(
            source=source,
            docname='amendment (%s)' % short_id,
            filename='Legis',
            tmp=tmp, session_num=session_num,
            id_=id_)

        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Add any related "Engrossments".
    # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
    # an explanation of the engrossment process in DE.
    source = doc.xpath('//img[@alt="Engrossment"]/../@href')

    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
            '{moniker}/$file/{filename}{format_}?open'])

        documents = self.scrape_documents(
            source=source[0],
            docname="Engrossment",
            filename="Engross",
            tmp=tmp, session_num=session_num,
            id_=bill['bill_id'])

        for d in documents:
            bill.add_version(**d)

    # --------------------------------------------------------------------
    # Add any fiscal notes.
    source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/FiscalforLookup',
            '{docnum}/$file/{filename}{format_}?open'])

        documents = self.scrape_documents(
            source=source[0],
            docname="Fiscal Note",
            filename="Fiscal",
            tmp=tmp, session_num=session_num)

        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Extra fields

    # Helper to get the first td sibling of certain nodes.
    tmp = '//font[contains(., "%s")]/../../../td[2]'
    first_sibling_text = lambda heading: _get_text(tmp % heading)

    extra_fields = {
        # A long description of the legislation.
        "summary": "Synopsis",

        # Codification details for enacted legislation.
        "volume_chapter": "Volume Chapter",

        # Presumably the date of approval/veto.
        "date_governor_acted": "Date Governor Acted",

        "fiscal_notes": "Fiscal Notes"}

    for key, name in extra_fields.iteritems():
        try:
            bill[key] = first_sibling_text(name)
        except IndexError:
            # xpath lookup failed.
            pass

    self.save_bill(bill)
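# The "Extra fields" block above relies on a module-level get_text helper,
# wrapped with partial(get_text, doc, 0); from that call and the IndexError the
# caller catches, its signature is presumably get_text(doc, i, xpath), returning
# the text of the i-th matching node. A minimal sketch under that assumption
# (the argument interpretation is inferred, not taken from the original module):

def get_text(doc, i, xpath):
    """Return the stripped text content of the i-th node matching xpath.

    Raises IndexError when nothing matches, which the caller handles.
    """
    return doc.xpath(xpath)[i].text_content().strip()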
def parse_house_bill(self, url, session):
    # using the print page makes the page simpler, and also *drastically*
    # smaller (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self.senate_base_url, url)

    with self.urlopen(url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)

        bill_id = bill_page.xpath('//*[@class="entry-title"]')
        if len(bill_id) == 0:
            self.log("WARNING: bill summary page is blank! (%s)" % url)
            self.bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor, every row is pushed down one to make room
        # for the extra cosponsor row
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip() == \
                'Governor Action:':
            lastActionOffset = 1

        official_title_tag = table_rows[5 + cosponsorOffset +
                                        lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.
        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")
        if bid in self.subjects:
            subs = self.subjects[bid]
            self.log("With subjects for this bill")
            self.log(bid)

        bill = Bill(session, 'lower', bill_id, bill_desc,
                    bill_url=url, bill_lr=bill_lr,
                    official_title=official_title, type=bill_type,
                    subjects=subs)
        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        try:
            bill_sponsor_link = table_rows[0][1][0].attrib['href']
        except IndexError:
            return
        if bill_sponsor_link:
            bill_sponsor_link = '%s%s' % (self.senate_base_url,
                                          bill_sponsor_link)

        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # check for cosponsors
        if cosponsorOffset == 1:
            if len(table_rows[2][1]) == 1:
                # just a name
                cosponsor = table_rows[2][1][0]
                bill.add_sponsor(
                    'cosponsor', cosponsor.text_content(),
                    sponsor_link='%s/%s' % (self.senate_base_url,
                                            cosponsor.attrib['href']))
            else:
                # name ... et al.
                try:
                    cosponsor = table_rows[2][1][0]
                    bill.add_sponsor(
                        'cosponsor',
                        clean_text(cosponsor.text_content()),
                        sponsor_link='%s/%s' % (self.senate_base_url,
                                                cosponsor.attrib['href']))
                    self.parse_cosponsors_from_bill(
                        bill,
                        '%s/%s' % (self.senate_base_url,
                                   table_rows[2][1][1].attrib['href']))
                except scrapelib.HTTPError as e:
                    self.log("WARNING: " + str(e))
                    self.bad_urls.append(url)
                    self.log("WARNING: no bill summary page (%s)" % url)

        actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        actions_link = '%s/%s' % (self.senate_base_url,
                                  actions_link_tag.attrib['href'])
        actions_link = re.sub("content", "print", actions_link)

        self.parse_house_actions(bill, actions_link)

        # get bill documents
        doc_tags = bill_page.xpath('//div[@class="BillDocsSection"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (self.senate_base_url,
                                 doc_tag[0].attrib['href'])
            bill.add_document(doc, text_url, mimetype="text/html")

        # get bill versions
        version_tags = bill_page.xpath(
            '//div[@class="BillDocsSection"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            text_url = '%s%s' % (self.senate_base_url,
                                 version_tag[0].attrib['href'])
            pdf_url = '%s%s' % (self.senate_base_url,
                                version_tag[1].attrib['href'])
            if text_url.endswith('htm'):
                mimetype = 'text/html'
            elif text_url.endswith('pdf'):
                mimetype = 'application/pdf'
            bill.add_version(version, text_url, pdf_url=pdf_url,
                             on_duplicate='use_new', mimetype=mimetype)

        self.save_bill(bill)
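# clean_text is applied to nearly every string pulled from the Missouri pages
# but is not defined in this excerpt; it presumably just strips non-breaking
# spaces and collapses whitespace. A minimal sketch along those lines (behavior
# inferred from how the cleaned values are used, not copied from the original):

import re


def clean_text(text):
    """Collapse runs of whitespace (including non-breaking spaces) and trim."""
    text = text.replace(u'\xa0', u' ')
    return re.sub(r'\s+', ' ', text).strip()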