def scrape(self, chamber, session):
    """Scrape legislator data for the requested chamber and session.

    Only the 2009 session has data; any other session raises
    ``NoDataForPeriod``. Any chamber value other than ``'upper'`` is
    treated as the lower chamber, matching the original branch logic.
    """
    if year_from_session(session) != 2009:
        raise NoDataForPeriod(session)
    # Normalize the chamber label, then delegate to the shared worker.
    target_chamber = 'upper' if chamber == 'upper' else 'lower'
    self.scrape_legislator_data(target_chamber, session)
def scrape(self, chamber, session):
    """Scrape bills for ``chamber`` in ``session`` from the WA daily-status page.

    Walks http://apps.leg.wa.gov/billinfo/dailystatus.aspx for the session
    year, follows each bill link whose number falls in the chamber's range
    (Senate bills 5xxx-9xxx, House bills 1xxx-4xxx), and for every bill
    extracts the id and title, scrapes actions, attaches documents/versions,
    resolves the primary sponsor, scrapes roll-call votes, and saves the bill.
    """
    sep = '<h1>House</h1>'
    if chamber == 'upper':
        # NOTE(review): `after` is computed but never used below — it looks
        # like separate_content() was meant to receive it; confirm.
        after = False
        reg = '[5-9]'
    else:
        after = True
        reg = '[1-4]'
    year = str(year_from_session(session))

    # Compile the bill-link pattern once instead of rebuilding it per link;
    # it matches links such as "bill=5123" within the chamber's number range.
    bill_link_re = re.compile("bill=" + reg + "[0-9]{3}")

    status_url = "http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + year
    with self.urlopen(status_url) as page_html:
        page = lxml.html.fromstring(separate_content(page_html, sep))

        for element, attribute, link, pos in page.iterlinks():
            if bill_link_re.search(link) is None:
                continue

            bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
            with self.urlopen(bill_page_url) as bill_page_html:
                bill_page = lxml.html.fromstring(bill_page_html)

                # The bill id is the first two tokens of the <title>,
                # e.g. "HB 1001". (str.split replaces the deprecated
                # string.split module function.)
                raw_title = bill_page.cssselect('title')
                split_title = raw_title[0].text_content().split(' ')
                bill_id = (split_title[0] + ' ' + split_title[1]).strip()

                title_element = bill_page.get_element_by_id(
                    "ctl00_ContentPlaceHolder1_lblSubTitle")
                title = title_element.text_content()

                bill = Bill(session, chamber, bill_id, title)
                bill.add_source(bill_page_url)
                self.scrape_actions(bill_page, bill)

                for element, attribute, link, pos in bill_page.iterlinks():
                    if re.search("billdocs", link):
                        # Document links: amendments and bill texts live
                        # under "billdocs"; classify by the link path.
                        if re.search("Amendments", link):
                            bill.add_document(
                                "Amendment: " + element.text_content(), link)
                        elif re.search("Bills", link):
                            bill.add_version(element.text_content(), link)
                        else:
                            bill.add_document(element.text_content(), link)
                    elif re.search("senators|representatives", link):
                        with self.urlopen(link) as senator_page_html:
                            senator_page = lxml.html.fromstring(
                                senator_page_html)
                            # Best-effort: sponsor pages are inconsistent, so
                            # skip on any parse failure — but catch Exception,
                            # not everything, so KeyboardInterrupt/SystemExit
                            # still propagate.
                            try:
                                name_tuple = self.scrape_legislator_name(
                                    senator_page)
                                bill.add_sponsor('primary', name_tuple[0])
                            except Exception:
                                pass
                    elif re.search("ShowRollCall", link):
                        # Roll-call links embed two comma-separated numeric
                        # ids used to build the votes URL.
                        id1, id2 = re.search(
                            "([0-9]+,[0-9]+)", link).group(0).split(',')
                        url = votes_url(id1, id2)
                        with self.urlopen(url) as vote_page_html:
                            vote_page = lxml.html.fromstring(vote_page_html)
                            self.scrape_votes(vote_page, bill, url)

                self.save_bill(bill)