def scrape(self, chamber, session):
    """Scrape the bill index for *session* and save each bill.

    For every (non-empty) row of the bills index table this extracts the
    bill id, title and sponsors, records the current version document,
    then follows the row's links to scrape versions, actions and votes.

    :param chamber: chamber identifier passed through to ``Bill`` and
        the vote scraper.
    :param session: session identifier; the year is derived via
        ``year_from_session``.
    """
    year = year_from_session(session)
    url = bills_url(year)
    with self.urlopen(url) as bills_page_html:
        bills_page = lxml.html.fromstring(bills_page_html)
        table_rows = bills_page.cssselect('tr')
        # Eliminate empty rows (every other row in the table is blank)
        table_rows = table_rows[0:len(table_rows):2]
        for row in table_rows:
            row_elements = row.cssselect('td')

            bill_document = row_elements[0]
            bill_document.make_links_absolute(base_url())
            # next(it) rather than it.next(): the .next() method is
            # Python-2-only; the builtin works on 2.6+ and 3.x.
            element, attribute, link, pos = next(bill_document.iterlinks())
            # Strip the literal '.pdf' suffix.  NOTE: rstrip('.pdf') is
            # wrong here -- it removes any trailing run of the characters
            # '.', 'p', 'd', 'f' and would corrupt ids ending in them.
            bill_id = element.text_content()
            if bill_id.endswith('.pdf'):
                bill_id = bill_id[:-len('.pdf')]
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            # Hoist the text so it is computed once, not per-regex.
            cell_text = title_and_sponsors.text_content()
            # Title is the run starting at the first capitalized word and
            # ending just before the (all-caps) sponsor list begins.
            title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]', cell_text)
            sponsors_match = re.search('[a-z]([A-Z]+.+)', cell_text)
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            # Sponsors appear as "PRIMARY--COSPONSOR" or "NAME--(NONE)".
            separated_sponsors = sponsors.split('--')

            bill = Bill(session, chamber, bill_id, title)
            bill.add_version('current', bill_document_link)

            if separated_sponsors[1] == '(NONE)':
                bill.add_sponsor('primary', separated_sponsors[0])
            else:
                bill.add_sponsor('cosponsor', separated_sponsors[0])
                bill.add_sponsor('cosponsor', separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute(base_url())
            element, attribute, link, pos = next(
                versions_page_element.iterlinks())
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = next(
                actions_page_element.iterlinks())
            # The real target page is hidden behind an '?Open&target='
            # query parameter on the site's frame link.
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = next(
                votes_page_element.iterlinks())
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_votes(frame_link, chamber, bill)
def scrape(self, chamber, session):
    """Scrape legislators for *chamber* and save each one.

    Visits every legislator page linked from the chamber roster, pulls
    name, district and e-mail from the page text, then submits the site's
    legislator-search form to look up the party letter.

    :raises NoDataForPeriod: legislator data is only published for the
        current (2009) session.
    """
    # Legislator data only available for the current session
    if year_from_session(session) != 2009:
        raise NoDataForPeriod(session)

    with self.urlopen(legs_url(chamber)) as html:
        page = lxml.html.fromstring(html)
        page.make_links_absolute(BASE_URL)

        # Iterate through legislator pages; the roster repeats links, so
        # dedupe with a set comprehension (clearer than set([...])).
        for link in {a.get('href') for a in page.xpath('//b/a')}:
            with self.urlopen(link) as legislator_html:
                legislator_page = lxml.html.fromstring(legislator_html)
                leg_elements = legislator_page.cssselect('b')
                leg_name = leg_elements[0].text_content()

                # Hoist the page text: it was being re-extracted for
                # every regex search.
                page_text = legislator_page.text_content()

                district = ""
                district_match = re.search("District [0-9]+", page_text)
                if district_match is not None:
                    district = district_match.group(0)

                email = ""
                email_match = re.search('E-mail: (.*)', page_text)
                if email_match is not None:
                    email = email_match.group(1)

                # Party is not on the profile page; submit the site's
                # legislator-search form and read it from the results.
                form_page = lxml.html.parse(leg_form_url()).getroot()
                form_page.forms[0].fields['Query'] = leg_name
                result = lxml.html.parse(
                    lxml.html.submit_form(form_page.forms[0])).getroot()
                elements = result.cssselect('td')
                party_letter = elements[7].text_content()
                party = party_name(party_letter)

                leg = Legislator(session, chamber, district, leg_name,
                                 "", "", "", party,
                                 official_email=email)
                leg.add_source(link)
                self.save_legislator(leg)