def scrape_bill(self, chamber, session, url):
    """Scrape one bill detail page and save the resulting Bill.

    chamber/session are passed through to the Bill constructor; url is
    the detail page to fetch.
    """
    page = self.urlopen(url)
    root = lxml.html.fromstring(page)
    bill_detail_el = root.xpath('//div[@class="col2"]//div[@class="Columns bg2717"]//div[@class="widgetContent"]')[0]
    title = bill_detail_el.xpath('.//p/text()')[0]
    bill_id = bill_detail_el.xpath('./p/b/text()')[1].strip()
    # Extract the canonical bill number from its label.
    # BUG FIX: re.search returns None on a miss, so the original
    # `len(m.groups())` raised AttributeError instead of falling back.
    m = re.search('Bill Number: ([HSD0-9]+)', bill_id)
    if m:
        bill_id = m.groups()[0]
    else:
        bill_id = None
    doctype = None
    if self._last_doctype:
        doctype = self._last_doctype.lower()
    bill = Bill(session, chamber, bill_id, title, type=doctype)
    # First listed sponsor is primary; the rest are cosponsors.
    sponsors_el = bill_detail_el.xpath('./p[2]/a/text()')
    for i, sponsor in enumerate(sponsors_el):
        sponsor_type = 'primary' if i == 0 else 'cosponsor'
        bill.add_sponsor(sponsor_type, sponsor)
    secondary_sponsors_el = bill_detail_el.xpath('.//div[@class="dataBlock"]//td/a/text()')
    for secondary_sponsor in secondary_sponsors_el:
        bill.add_sponsor('secondary', secondary_sponsor)
    # NOTE: removed stray Python-2 debug statement `print bill`.
    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_type, number):
    """Create and save a Bill for (session, bill_type, number).

    Four-character sessions map to the regular-session URL fragment
    ('<session>rs'); anything else is used verbatim.
    """
    if len(session) == 4:
        session_url = session + 'rs'
    else:
        session_url = session
    url = BILL_URL % (session_url, bill_type, number)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # title
        # find <a name="Title">, get parent dt, get parent dl, then get dd within dl
        title = doc.cssselect('a[name=Title]')[0] \
            .getparent().getparent().cssselect('dd')[0].text.strip()
        # NOTE: removed stray Python-2 debug print of the title.
        # create the bill object now that we have the title
        bill = Bill(session, chamber, "%s %d" % (bill_type, number), title)
        bill.add_source(url)
        self.parse_bill_sponsors(doc, bill)   # sponsors
        self.parse_bill_actions(doc, bill)    # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        self.parse_bill_votes(doc, bill)      # votes
        # add bill to collection
        self.save_bill(bill)
def parse_bill(self, chamber, session, special, link):
    """Parse one bill from its listing link and save it."""
    bill_num = link.text.strip()
    bill_type = re.search('type=(B|R|)', link.attrib['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_num)
    url = info_url(chamber, session, special, bill_type, bill_num)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        # The short title sits in the cell following its label.
        title_cell = page.xpath(
            "//td[text() = 'Short Title:']/following-sibling::td")[0]
        bill = Bill(session, chamber, bill_id, title_cell.text.strip())
        bill.add_source(url)
        self.parse_bill_versions(bill, page)
        self.parse_history(
            bill,
            history_url(chamber, session, special, bill_type, bill_num))
        self.parse_votes(
            bill,
            vote_url(chamber, session, special, bill_type, bill_num))
        self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    """Scrape a senate bill detail page: sponsors, actions, versions."""
    with self.urlopen(bill_url) as bill_page:
        bill_page = BeautifulSoup(bill_page)
        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string
        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)
        # Get the primary sponsor
        sponsor_tag = bill_page.find(id="hlSponsor")
        bill_sponsor = sponsor_tag.i.font.contents[0]
        # BUG FIX: BeautifulSoup tag attributes must be read via
        # ['href']/.get('href'); `.href` looks for a child <href> tag and
        # silently returned None, losing the sponsor link.
        bill_sponsor_link = sponsor_tag.get('href')
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)
        # cosponsors show up on their own page, if they exist
        # BUG FIX: `'href' in tag` tests the tag's *contents*, not its
        # attributes, so cosponsors were never followed.
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and cosponsor_tag.get('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])
        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)
        # stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])
        self.save_bill(bill)
def scrape_bill(self, bill_url, chamber, session):
    """Scrape one bill page; returns False when the bill does not
    exist, True after the bill has been saved."""
    with self.urlopen(bill_url) as text:
        if "Specified Bill could not be found" in text:
            return False
        page = lxml.html.fromstring(text)
        page.make_links_absolute(bill_url)
        # First word of the <h2> heading is the bill id.
        bill_id = page.xpath("string(//h2)").split()[0]
        summary = page.xpath(
            "string(//*[starts-with(text(), 'Summary: ')])")
        summary = summary.replace('Summary: ', '')
        # Summary is formatted "SUBJECT: title ..."; both parts required.
        match = re.match(r"^([^:]+): ([^(]+)", summary)
        if match:
            subjects = [match.group(1).strip()]
            title = match.group(2).strip()
        else:
            raise ScrapeError("Bad title")
        bill = Bill(session, chamber, bill_id, title,
                    subjects=subjects)
        bill.add_source(bill_url)
        history_link = page.xpath("//a[text() = 'History']")[0]
        history_url = history_link.attrib['href']
        self.scrape_history(bill, history_url)
        authors_link = page.xpath("//a[text() = 'Authors']")[0]
        authors_url = authors_link.attrib['href']
        self.scrape_authors(bill, authors_url)
        # Prefer the all-versions page; fall back to the current text.
        try:
            versions_link = page.xpath(
                "//a[text() = 'Text - All Versions']")[0]
            versions_url = versions_link.attrib['href']
            self.scrape_versions(bill, versions_url)
        except IndexError:
            # Only current version
            try:
                version_link = page.xpath(
                    "//a[text() = 'Text - Current']")[0]
                version_url = version_link.attrib['href']
                bill.add_version("%s Current" % bill_id,
                                 version_url)
            except IndexError:
                # Some bills don't have any versions :(
                pass
        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass
        self.save_bill(bill)
        return True
def parse_special_session_bill_status_page(self, bill_id, status_page, bill_table, session, chamber, sources):
    """Build and return a Bill from a special-session status page."""
    title = bill_table.xpath('//tr[3]/td[2]')[0].text_content()
    bill = Bill(session, chamber, bill_id, title)
    for src in sources:
        bill.add_source(src)
    sponsor_table = self.get_sponsor_table(status_page)
    self.add_sponsors(bill, sponsor_table)
    action_table = self.get_action_table(status_page)
    self.add_actions(bill, action_table)
    return bill
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('center table')
        # Bill
        name = tables[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)
        # Sponsorships
        for a in tables[1].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())
        # Actions
        center = page.cssselect('center table center')[0]
        for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)
        # Versions
        # BUG FIX: iterate the anchors themselves; the original looped
        # over `row` but read the stale `a` left over from the sponsors
        # loop, so every version got the last sponsor's text and href.
        for a in center.cssselect('table')[-1].cssselect('a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))
        self.save_bill(bill)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    # BUG FIX: the context result was bound as `lxml` (shadowing the
    # library module) while the body referenced an undefined `page`,
    # raising NameError on the first use; bind it as `page`.
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('table')
        # Bill
        name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)
        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))
        # Sponsorships
        for a in tables[2].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())
        # Actions: each row carries a senate date, the text, a house date.
        for row in tables[-1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)
        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape all bills for a session from the bill-listing table."""
    year = year_from_session(session)
    url = bills_url(year)
    with self.urlopen(url) as bills_page_html:
        bills_page = lxml.html.fromstring(bills_page_html)
        table_rows = bills_page.cssselect('tr')
        # Eliminate empty rows
        table_rows = table_rows[0:len(table_rows):2]
        for row in table_rows:
            row_elements = row.cssselect('td')
            bill_document = row_elements[0]
            bill_document.make_links_absolute(base_url())
            element, attribute, link, pos = bill_document.iterlinks().next()
            # BUG FIX: rstrip('.pdf') strips any trailing run of '.',
            # 'p', 'd', 'f' characters (mangling ids that end in those
            # letters); remove only the literal '.pdf' suffix.
            bill_id = element.text_content()
            if bill_id.endswith('.pdf'):
                bill_id = bill_id[:-4]
            bill_document_link = link
            title_and_sponsors = row_elements[1]
            title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                    title_and_sponsors.text_content())
            sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split('--')
            bill = Bill(session, chamber, bill_id, title)
            bill.add_version('current', bill_document_link)
            if separated_sponsors[1] == '(NONE)':
                bill.add_sponsor('primary', separated_sponsors[0])
            else:
                bill.add_sponsor('cosponsor', separated_sponsors[0])
                bill.add_sponsor('cosponsor', separated_sponsors[1])
            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute(base_url())
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)
            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            # Action/vote pages are behind a ?Open&target= frame wrapper.
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_actions(frame_link, bill)
            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_votes(frame_link, chamber, bill)
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('#legislation h1')[0].text_content().strip()
        bill_id = name.split(' - ')[0].strip()
        bill = Bill(session, chamberName, bill_id, name)
        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor('', a.text_content().strip())
        # Actions
        for row in page.cssselect('#history tr')[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)
        # Versions
        # BUG FIX: iterate the anchors themselves; the original looped
        # over `row` but read the stale `a` left over from the sponsors
        # loop, recording the wrong name/href for every version.
        for a in page.cssselect('#versions a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))
        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Alabama bills for one chamber of a session."""
    self.site_id = self.metadata['session_details'][session]['internal_id']
    chamber_piece = {'upper': 'Senate',
                     'lower': 'House+of+Representatives'}[chamber]
    # resolutions
    # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction
    url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece
    self.refresh_session()
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:
            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')
            # first <tr> has img, button, sponsor, topic, current house
            # current status, committee, committee2, last action
            _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')
            # pull bill_id out of script tag (gross)
            bill_id = bill_id_re.search(button.text_content()).group()
            oid = btn_re.search(button.text_content()).groups()[0]
            sponsor = sponsor.text_content()
            topic = topic.text_content()
            com1 = com1.text_content()
            com2 = com2.text_content()
            desc = desc.text_content()
            # create bill
            bill = Bill(session, chamber, bill_id, desc.strip(), topic=topic)
            # BUG FIX: add_sponsor takes (type, name) — the arguments
            # were swapped, storing the literal string 'primary' as the
            # sponsor's name (every other call in this file uses
            # add_sponsor('primary', name)).
            bill.add_sponsor('primary', sponsor)
            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)
            # craft bill URL
            session_fragment = '2010rs'  # TODO: derive from `session`
            type_fragment = 'bills'
            bill_id_fragment = bill_id.lower()
            bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                session_fragment, type_fragment, bill_id_fragment)
            bill.add_version('bill text', bill_text_url)
            self.save_bill(bill)
def scrape(self, chamber, year):
    """Scrape Colorado bills for a year from the bill-listing table."""
    # Data prior to 1997 is contained in pdfs
    if year < "1997":
        raise NoDataForYear(year)
    bills_url = "http://www.leg.state.co.us/CLICS/CLICS" + year + "A/csl.nsf/%28bf-1%29?OpenView&Count=2000"
    with self.lxml_context(bills_url) as bills_page:
        table_rows = bills_page.cssselect("tr")
        # Eliminate empty rows
        table_rows = table_rows[0:len(table_rows):2]
        for row in table_rows:
            # NOTE: removed stray Python-2 debug statement `print "row"`.
            row_elements = row.cssselect("td")
            bill_document = row_elements[0]
            bill_document.make_links_absolute("http://www.leg.state.co.us")
            element, attribute, link, pos = bill_document.iterlinks().next()
            # BUG FIX: rstrip(".pdf") strips any trailing run of '.',
            # 'p', 'd', 'f' characters; remove only the literal suffix.
            bill_id = element.text_content()
            if bill_id.endswith(".pdf"):
                bill_id = bill_id[:-4]
            bill_document_link = link
            title_and_sponsors = row_elements[1]
            title_match = re.search("([A-Z][a-z]+.+[a-z])[A-Z]",
                                    title_and_sponsors.text_content())
            sponsors_match = re.search("[a-z]([A-Z]+.+)",
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split("--")
            bill = Bill(year, chamber, bill_id, title)
            bill.add_version("current", bill_document_link)
            if separated_sponsors[1] == "(NONE)":
                bill.add_sponsor("primary", separated_sponsors[0])
            else:
                bill.add_sponsor("cosponsor", separated_sponsors[0])
                bill.add_sponsor("cosponsor", separated_sponsors[1])
            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute("http://www.leg.state.co.us")
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)
            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]
            self.scrape_actions(frame_link, bill)
            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]
            # BUG FIX: pass the resolved frame link; the raw `link` still
            # contains the ?Open&target= wrapper (matches the actions
            # handling above and the parallel scraper in this codebase).
            self.scrape_votes(frame_link, chamber, bill)
def parse_standard_bill_status_page(self, bill_id, status_page, session, chamber, sources):
    """Extract the title from a standard status page, then build and
    return the Bill with its sources, sponsors and actions attached."""
    # Standard layout first; some pages use an alternate /html/html
    # structure whose variant depends on how many such nodes exist.
    nodes = status_page.xpath("/div/form[1]/table[2]/tr[3]/td[2]")
    if nodes:
        title = nodes[0].text_content()
    elif len(status_page.xpath("/html/html")) == 2:
        title = status_page.xpath('/html/html[2]/tr[1]/td[2]')[0].text_content()
    else:
        title = status_page.xpath('/html/html[3]/tr[1]/td[2]')[0].text_content()
    bill = Bill(session, chamber, bill_id, title)
    for source in sources:
        bill.add_source(source)
    self.add_sponsors(bill, self.get_sponsor_table(status_page))
    self.add_actions(bill, self.get_action_table(status_page))
    return bill
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    # Normalize the MN chamber name to the upper/lower convention.
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'
    with self.urlopen(bill_detail_url) as bill_html:
        bill_soup = BeautifulSoup(bill_html)
        bill_id = self.extract_bill_id(bill_soup)
        bill_title = self.extract_bill_title(bill_soup)
        bill = Bill(session, chamber, bill_id, bill_title)
    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        version_soup = BeautifulSoup(version_html)
        # MN bills can have multiple versions. Get them all, and loop over
        # the results, adding each one.
        self.debug("Extracting bill versions from: " + version_list_url)
        bill_versions = self.extract_bill_versions(version_soup)
        for version in bill_versions:
            version_name = version['name']
            version_url = urlparse.urljoin(VERSION_URL_BASE, version['url'])
            bill.add_version(version_name, version_url)
    # grab primary and cosponsors
    # MN uses "Primary Author" to name a bill's primary sponsor.
    # Everyone else listed will be added as a 'cosponsor'.
    sponsors = self.extract_bill_sponsors(bill_soup)
    primary_sponsor = sponsors[0]
    cosponsors = sponsors[1:]
    bill.add_sponsor('primary', primary_sponsor)
    for leg in cosponsors:
        bill.add_sponsor('cosponsor', leg)
    # Add Actions performed on the bill.
    bill_actions = self.extract_bill_actions(bill_soup, chamber)
    for action in bill_actions:
        action_chamber = action['action_chamber']
        action_date = action['action_date']
        action_text = action['action_text']
        bill.add_action(action_chamber, action_text, action_date)
    self.save_bill(bill)
def scrape_session_2009(self, chamber, session):
    """Scrape every bill of this chamber's type from the 2009 session
    listing, including actions, votes and versions."""
    url, type = bills_url(chamber)
    with self.urlopen(url) as page_html:
        page = lxml.html.fromstring(page_html)
        for element, attribute, link, pos in page.iterlinks():
            # Only follow links that look like bill detail pages.
            if re.search("billtype=" + type + "&billnumber=[0-9]+", link) != None:
                bill_page_url = bill_url(link)
                with self.urlopen(bill_page_url) as bill_page_str:
                    bill_page = lxml.html.fromstring(bill_page_str)
                    splitted_link = link.split("=")
                    bill_number = splitted_link[-1]
                    bill_id = bill_page.cssselect('a[class="headerlink"]')
                    bill_id = bill_id[0]
                    bill_id = bill_id.text_content()
                    bill_title = bill_page.cssselect('td[style="color:Black"]')
                    bill_title = bill_title[0]
                    bill_title = bill_title.text_content()
                    bill = Bill(session, chamber, bill_id, bill_title)
                    bill.add_source(bill_page_url)
                    actions_table_list = bill_page.cssselect('table[rules="all"]')
                    actions_table = actions_table_list[0]
                    action_elements = actions_table.cssselect('tr')
                    # first element is not an action element
                    action_elements.pop(0)
                    for ae in action_elements:
                        action_element_parts = ae.cssselect('td')
                        action_date = action_element_parts[0]
                        actor_house = action_element_parts[1]
                        action_text = action_element_parts[2]
                        # look for acting comittees
                        match = re.search("(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})", action_text.text_content())
                        if(match != None):
                            actor = match.group(0)
                        # NOTE(review): actor_house is an lxml element, so
                        # comparing it to the strings 'H'/'S' is always
                        # False and the chamber fallback always wins —
                        # probably meant actor_house.text_content(); confirm.
                        elif(actor_house == 'H'):
                            actor = "lower"
                        elif (actor_house == 'S'):
                            actor = "upper"
                        else:
                            actor = chamber
                        action_date = dt.datetime.strptime(action_date.text_content(), '%m/%d/%Y')
                        if (re.search("The votes were as follows", action_text.text_content()) != None):
                            self.scrape_votes(action_text.text_content(), bill_page_url, actor_house, action_date, bill)
                        # NOTE(review): action_text here is the <td>
                        # element, not its text — confirm add_action
                        # tolerates an element.
                        bill.add_action(actor, action_text, action_date)
                    with self.urlopen(versions_page_url(type, bill_number)) as versions_page_html:
                        versions_page = lxml.html.fromstring(versions_page_html)
                        versions_elements = versions_page.cssselect('span[class="searchtitle"]')
                        for ve in versions_elements:
                            element_text = ve.text_content()
                            # NOTE(review): rstrip("_.HTM") strips a character
                            # set, not the suffix — may over-trim names; verify.
                            version_name = element_text.rstrip("_.HTM")
                            bill.add_version(version_name, bill_version_url(element_text))
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a bill page: title, primary sponsor, status, versions."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        header = page.xpath('//h3/br')[0].tail.replace(' ', ' ')
        title, primary_sponsor = header.split(' -- ')
        # Classify by bill-number prefix.
        if bill_id.startswith(('H.B.', 'S.B.')):
            bill_type = ['bill']
        elif bill_id.startswith(('H.C.R.', 'S.C.R.')):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith(('H.J.R.', 'S.J.R.')):
            bill_type = ['joint resolution']
        else:
            # BUG FIX: an unrecognized prefix previously left `bill_type`
            # unbound and crashed with NameError; treat it as a plain bill.
            bill_type = ['bill']
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)
        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])
        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):
            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'])
        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, url):
    """Scrape one bill page: actions, versions, fiscal documents."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        # Title is the tail text of the ninth <br>; bail out if absent.
        title = page.xpath("//br")[8].tail
        if not title:
            return
        bill = Bill(session, chamber, bill_id, title.strip())
        bill.add_source(url)
        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])
        # Each known version type gets the first matching link, if any.
        for version_type in ('Introduced Bill', 'House Bill',
                             'Senate Bill', 'Engrossed Bill',
                             'Enrolled Act'):
            matches = page.xpath("//a[contains(., '%s')]" % version_type)
            if matches:
                bill.add_version(version_type, matches[0].attrib['href'])
        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])
        bill['subjects'] = self.subjects[bill_id]
        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape all bills for one chamber of a session, following the
    paginated 'More...' links until exhausted."""
    # internal id for the session, store on self so all methods have access
    self.site_id = self.metadata['session_details'][session]['site_id']
    self.build_subject_map()
    # used for skipping bills from opposite chamber
    start_letter = 'H' if chamber == 'lower' else 'S'
    url = 'http://leg6.state.va.us/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id
    while url:
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            url = None  # no more unless we encounter 'More...'
            bills = doc.xpath('//ul[@class="linkSect"]/li')
            for bill in bills:
                link = bill.getchildren()[0]
                bill_id = link.text_content()
                # check if this is the 'More...' link
                if bill_id == 'More...':
                    url = BASE_URL + link.get('href')
                # skip bills from the other chamber
                elif not bill_id.startswith(start_letter):
                    continue
                else:
                    # create a bill; the second character of the id
                    # encodes the bill type
                    desc = bill.xpath('text()')[0].strip()
                    bill_type = {'B': 'bill',
                                 'J': 'joint resolution',
                                 'R': 'resolution'}[bill_id[1]]
                    bill = Bill(session, chamber, bill_id, desc,
                                type=bill_type)
                    bill_url = BASE_URL + link.get('href')
                    self.fetch_sponsors(bill)
                    self.scrape_bill_details(bill_url, bill)
                    bill['subjects'] = self.subject_map[bill_id]
                    bill.add_source(bill_url)
                    self.save_bill(bill)
def scrape1995(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
        bill = Bill(session, chamberName, number, name)
        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))
        # Sponsorships: anchors between the two section-heading rows.
        rows = page.cssselect('center table tr')
        for row in rows:
            if row.text_content().strip() == 'Sponsor and CoSponsors':
                continue
            if row.text_content().strip() == 'Links / Committees / Status':
                break
            for a in row.cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())
        # Actions
        # The actions are in a pre table that looks like:
        """   SENATE                         HOUSE
              -------------------------------------
              1/13/95   Read 1st time          2/6/95
              1/31/95   Favorably Reported
              2/1/95    Read 2nd Time          2/7/95
              2/3/95    Read 3rd Time
              2/3/95    Passed/Adopted                   """
        actions = page.cssselect('pre')[0].text_content().split('\n')
        actions = actions[2:]
        # Fixed-width columns: senate date, action text, house date.
        for action in actions:
            senate_date = action[:22].strip()
            action_text = action[23:46].strip()
            house_date = action[46:].strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)
        self.save_bill(bill)
def scrape_bill(self, bill_url, chamber, session):
    """Scrape one bill page; returns False when the bill does not
    exist, True after the bill has been saved."""
    with self.urlopen(bill_url) as text:
        if "Specified Bill could not be found" in text:
            return False
        page = lxml.html.fromstring(text)
        page.make_links_absolute(bill_url)
        # First word of the <h2> heading is the bill id.
        bill_id = page.xpath("string(//h2)").split()[0]
        summary = page.xpath(
            "string(//*[starts-with(text(), 'Summary: ')])")
        summary = summary.replace('Summary: ', '')
        bill = Bill(session, chamber, bill_id, summary)
        # CONSISTENCY FIX: record the scraped page as a source, as the
        # parallel scraper in this file does; the original never called
        # add_source, leaving the bill without provenance.
        bill.add_source(bill_url)
        history_link = page.xpath("//a[text() = 'History']")[0]
        history_url = history_link.attrib['href']
        self.scrape_history(bill, history_url)
        authors_link = page.xpath("//a[text() = 'Authors']")[0]
        authors_url = authors_link.attrib['href']
        self.scrape_authors(bill, authors_url)
        # Prefer the all-versions page; fall back to the current text.
        try:
            versions_link = page.xpath(
                "//a[text() = 'Text - All Versions']")[0]
            versions_url = versions_link.attrib['href']
            self.scrape_versions(bill, versions_url)
        except IndexError:
            # Only current version
            try:
                version_link = page.xpath(
                    "//a[text() = 'Text - Current']")[0]
                version_url = version_link.attrib['href']
                bill.add_version("%s Current" % bill_id, version_url)
            except IndexError:
                # Some bills don't have any versions :(
                pass
        self.save_bill(bill)
        return True
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    """Fetch a status page and build a Bill with sponsors and actions."""
    status_page = ElementTree(lxml.html.fromstring(self.urlopen(status_url)))

    def _cell_text(primary_xpath, fallback_xpath):
        # see 2007 HB 2... weird: some pages use an alternate layout,
        # so fall back when the standard xpath finds nothing.
        try:
            return status_page.xpath(primary_xpath)[0].text_content()
        except IndexError:
            return status_page.xpath(fallback_xpath)[0].text_content()

    bill_id = _cell_text("/div/form[1]/table[2]/tr[2]/td[2]",
                         "/html/html[2]/tr[1]/td[2]")
    title = _cell_text("/div/form[1]/table[2]/tr[3]/td[2]",
                       "/html/html[3]/tr[1]/td[2]")
    bill = Bill(session, chamber, bill_id, title)
    bill.add_source(bill_url)
    self.add_sponsors(bill, status_page)
    self.add_actions(bill, status_page)
    return bill
def scrape(self, chamber, session):
    """Scrape all Utah bills for one chamber of a session."""
    self.validate_session(session)
    if chamber == "lower":
        bill_abbr = "HB"
    else:
        bill_abbr = "SB"
    bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
        session.replace(' ', ''))
    self.log("Getting bill list for %s, %s" % (session, chamber))
    try:
        base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
    except Exception:
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt.
        # this session doesn't exist for this year
        return
    bill_list_link_re = re.compile('.*%s\d+ht.htm$' % bill_abbr)
    for link in base_bill_list.findAll('a', href=bill_list_link_re):
        bill_list = self.soup_parser(self.urlopen(link['href']))
        bill_link_re = re.compile('.*billhtm/%s.*.htm' % bill_abbr)
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.find(text=True).strip()
            bill_info_url = bill_link['href']
            bill_info = self.soup_parser(self.urlopen(bill_info_url))
            # Header is "<title> -- <sponsor>".
            bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                ' ', ' ').strip().split(' -- ')
            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)
            bill.add_sponsor('primary', primary_sponsor)
            status_re = re.compile('.*billsta/%s.*.htm' % bill_abbr.lower())
            status_link = bill_info.find('a', href=status_re)
            if status_link:
                self.parse_status(bill, status_link['href'])
            text_find = bill_info.find(
                text="Bill Text (If you are having trouble viewing")
            if text_find:
                text_link_re = re.compile('.*\.htm')
                for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                    version_name = text_link.previous.strip()
                    bill.add_version(version_name, text_link['href'])
            self.save_bill(bill)
def scrape(self, chamber, session):
    """Walk bill numbers sequentially until a page is missing."""
    self.validate_session(session)
    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'
    while True:
        bill_page = self.scrape_bill(session, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        # BUG FIX: test for the missing page *before* parsing; the
        # original parsed first and compared the soup to None, so the
        # sentinel could never fire.
        if bill_page is None:
            return
        bill_page = BeautifulSoup(bill_page)
        title = ''.join(self.flatten(
            bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)
        the_bill = Bill(session, chamber, bill_id, title)
        # sponsors: the first anchor is primary, the rest cosponsors
        first = 0
        for name in bill_page.findAll(
                id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
            first = 1
        # versions
        for doc in bill_page.findAll(
                id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)
        # documents
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        self.parse_actions(
            the_bill,
            bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
        self.save_bill(the_bill)
        bill_no = bill_no + 1
def scrape_bill(self, chamber, session, bill_type, number):
    """ Creates a bill object """
    if len(session) == 4:
        session_url = session + 'rs'
    else:
        session_url = session
    url = BILL_URL % (session_url, bill_type, number)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
        synopsis = doc.xpath('//font[@size="3"]/p/text()')[0].strip()
        if 'B' in bill_type:
            _type = ['bill']
        elif 'J' in bill_type:
            _type = ['joint resolution']
        else:
            # BUG FIX: previously fell through with `_type` unbound and
            # raised a confusing NameError below.
            raise ValueError('unexpected bill type: %r' % bill_type)
        bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                    type=_type, synopsis=synopsis)
        bill.add_source(url)
        self.parse_bill_sponsors(doc, bill)   # sponsors
        self.parse_bill_actions(doc, bill)    # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        self.parse_bill_votes(doc, bill)      # votes
        # subjects
        subjects = []
        for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
            subjects.append(subj.text.split('-see also-')[0])
        bill['subjects'] = subjects
        # add bill to collection
        self.save_bill(bill)
def scrape_info(self, session, bill_number):
    """Scrape one bill via the BillView POST endpoint.

    `bill_number` appears to be a (number, title) pair — TODO confirm
    against the caller.
    """
    bill_view_url = 'http://www.njleg.state.nj.us/bills/BillView.asp'
    bill_id = bill_number[0]
    bill_view_body = 'BillNumber=%s++++&LastSession=' % bill_number[0]
    with self.urlopen(bill_view_url, 'POST', bill_view_body) as bill_view_page:
        root = lxml.etree.fromstring(bill_view_page, lxml.etree.HTMLParser())
        title = bill_number[1]
        # NOTE(review): other scrapers store 'upper'/'lower' for chamber;
        # this one stores display names, and `chamber` stays unbound for
        # ids starting with neither 'A' nor 'S' — confirm intended.
        if bill_id[0] == 'A':
            chamber = 'General Assembly'
        elif bill_number[0][0] == 'S':
            chamber = 'Senate'
        bill = Bill(session, chamber, bill_id, title)
        #Grabbing sponsors
        sponsorship = root.xpath('string(//tr[1]/td[1]/div/font[3])').split()
        primary_count = sponsorship.count('Primary')
        sponsor_count = 1
        #Special case
        if session == 214 and bill_id == 'A101':
            sponsorship = root.xpath('string(//tr[1]/td[1]/div/font[5])').split()
            primary_count = sponsorship.count('Primary')
        for sp in root.xpath('//tr[1]/td[1]/div/font/a/font'):
            # Names render as "Last, First [Middle]"; rebuild as
            # "First [Middle] Last" and drop the trailing comma.
            # NOTE(review): `leg` stays unbound when the name has fewer
            # than 2 or more than 3 parts — add_sponsor would then raise
            # NameError; confirm inputs.
            sponsor = sp.xpath('string()').split()
            if len(sponsor) == 3:
                leg = sponsor[1] + " " + sponsor[2] + " " + sponsor[0]
                leg = leg[0: len(leg) - 1]
            elif len(sponsor) == 2:
                leg = sponsor[1] + " " + sponsor[0]
                leg = leg[0: len(leg) - 1]
            # The first `primary_count` entries are primary sponsors.
            if sponsor_count <= primary_count:
                sponsor_type = 'Primary'
            if sponsor_count > primary_count:
                sponsor_type = 'Co-sponsor'
            bill.add_sponsor(sponsor_type, leg)
            sponsor_count = sponsor_count + 1
        self.save_bill(bill)
def scrape_bill(self, chamber, session, billid, histurl, year):
    """Scrape a bill's history page: sponsor, versions and actions."""
    # NOTE(review): in the else branch year[0] == 'R', so int(year[0])
    # would raise ValueError — the condition or the index looks
    # inverted; confirm against real session codes.
    if year[0] != 'R':
        session = year
    else:
        session = self.metadata['session_details'][year][
            'sub_sessions'][int(year[0]) - 1]
    with self.urlopen(histurl) as data:
        soup = BeautifulSoup(cleansource(data))
        basicinfo = soup.findAll('div', id='bhistleft')[0]
        hist = basicinfo.table
        sponsor = None
        title = None
        # Title and sponsor are labeled by <b> headers.
        for b in basicinfo.findAll('b'):
            if b.next.startswith('SUMMARY'):
                title = b.findNextSiblings(text=True)[0].strip()
            elif b.next.startswith('SPONSOR'):
                for a in b.findNextSiblings('a'):
                    if not issponsorlink(a):
                        break
                    sponsor = cleansponsor(a.contents[0])
        bill = Bill(session, chamber, billid, title)
        if sponsor:
            bill.add_sponsor('primary', sponsor)
        # Each row of the basic-info table links one bill version.
        for row in hist.findAll('tr'):
            link = row.td.a
            vlink = urlbase % link['href']
            vname = link.contents[0].strip()
            bill.add_version(vname, vlink)
        history = soup.findAll('div', id='bhisttab')[0].table
        rows = history.findAll('tr')[1:]
        for row in rows:
            tds = row.findAll('td')
            if len(tds) < 2:
                # This is not actually an action
                continue
            date, action = row.findAll('td')[:2]
            date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
            action = action.contents[0].strip()
            if 'House' in action:
                actor = 'lower'
            elif 'Senate' in action:
                actor = 'upper'
            else:
                # for lack of a better
                actor = chamber
            bill.add_action(actor, action, date)
    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_type, number): """ Creates a bill object """ if len(session) == 4: session_url = session + "rs" else: session_url = session url = BILL_URL % (session_url, bill_type, number) with self.urlopen(url) as html: doc = lxml.html.fromstring(html) # find <a name="Title">, get parent dt, get parent dl, then dd n dl title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip() # create the bill object now that we have the title print "%s %d %s" % (bill_type, number, title) if "B" in bill_type: _type = ["bill"] elif "J" in bill_type: _type = ["joint resolution"] bill = Bill(session, chamber, "%s %d" % (bill_type, number), title, type=_type) bill.add_source(url) self.parse_bill_sponsors(doc, bill) # sponsors self.parse_bill_actions(doc, bill) # actions self.parse_bill_documents(doc, bill) # documents and versions self.parse_bill_votes(doc, bill) # votes # subjects subjects = [] for subj in doc.xpath('//a[contains(@href, "/subjects/")]'): subjects.append(subj.text.split("-see also-")[0]) bill["subjects"] = subjects # add bill to collection self.save_bill(bill)
def parse_bill(self, chamber, session, special, link):
    """Parse one bill from its index link: title, versions, history, votes.

    *link* is a BeautifulSoup anchor whose text is the bill number and whose
    href carries the bill type.
    """
    bill_number = link.contents[0]
    # FIX: this local was named `type`, shadowing the builtin; renamed to
    # bill_type (consistent with the lxml variant of this method).
    bill_type = re.search('type=(B|R|)', link['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_number)

    bill_info_url = info_url(chamber, session, special, bill_type, bill_number)
    with self.urlopen(bill_info_url) as info_page:
        info_page = BeautifulSoup(info_page)
        # The title is the cell following the 'Short Title:' label.
        title_label = info_page.find(text='Short Title:')
        title = title_label.findNext().contents[0]

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(bill_info_url)

        self.parse_bill_versions(bill, info_page)

        self.parse_history(bill, history_url(chamber, session, special,
                                             bill_type, bill_number))

        self.parse_votes(bill, vote_url(chamber, session, special,
                                        bill_type, bill_number))

        self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session):
    """Scrape all NV Assembly bills for a session.

    Iterates the history listings for document types 1, 3, 5 and 6, and for
    each linked bill page extracts id, title, sponsors, actions and votes.
    """
    doc_types = [1, 3, 5, 6]
    for doc in doc_types:
        parentpage_url = ('http://www.leg.state.nv.us/Session/%s/Reports/'
                          'HistListBills.cfm?DoctypeID=%s' % (insert, doc))
        links = self.scrape_links(parentpage_url)
        # FIX: removed the unused `count` counter that was incremented but
        # never read.
        for link in links:
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            with self.urlopen(page_path) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
                bill_id = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')
                bill = Bill(session, chamber, bill_id, title)

                primary, secondary = self.scrape_sponsors(page_path)
                # FIX: guard against an empty primary list before indexing.
                if primary and primary[0] == 'By:':
                    # A 'By:' prefix marks a committee sponsor whose name
                    # arrives as space-less word fragments.
                    primary.pop(0)
                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'
                    # FIX: join the fragments with single spaces instead of
                    # manual concatenation that left a trailing blank.
                    bill.add_sponsor('primary', ' '.join(primary))
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                self.scrape_actions(page_path, bill, "Assembly")
                self.scrape_votes(page_path, bill, "Assembly", insert, title)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_number, ga_num):
    """Scrape one TN bill: title, primary sponsor, summary document,
    action history and (when linked) votes, then save it."""
    bill_url = self.urls['info'] % (bill_number, ga_num)
    with self.urlopen(bill_url) as raw_html:
        page = lxml.html.fromstring(raw_html)
        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(session, chamber, bill_number, title)
        bill.add_source(bill_url)

        # Primary Sponsor: the text after "by", with the '*' marker removed.
        sponsor_text = page.xpath(
            "//span[@id='lblBillSponsor']")[0].text_content()
        sponsor = sponsor_text.split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        bill.add_sponsor('primary', sponsor)

        # Co-sponsors unavailable for scraping (loaded into page via AJAX)

        # Full summary doc
        summary_link = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary_link.get('href'))

        # Actions: history table, skipping the header row.
        history_tables = page.xpath(
            "//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
        for row in history_tables[0].xpath("tr[position()>1]"):
            cells = row.xpath("td")
            action_taken = cells[0].text
            action_date = datetime.datetime.strptime(
                cells[1].text.strip(), '%m/%d/%Y')
            bill.add_action(chamber, action_taken, action_date)

        vote_links = page.xpath("//span[@id='lblBillVotes']/a")
        if vote_links:
            votes_href = vote_links[0].get('href')
            bill = self.scrape_votes(
                bill, sponsor,
                'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_href,))

        self.save_bill(bill)