def scrape_regular(self, chamber, session, url):
    """Scraper for Regular Sessions >= 2009."""
    year_label, session_type = get_session_details(session)
    base_url = url % year_label
    bill_types = {
        'lower': (('RptIntroHB.aspx', 'bill'),
                  ('RptHR.aspx', 'resolution'),
                  ('RptHCR.aspx', 'concurrent resolution')),
        'upper': (('RptIntroSB.aspx', 'bill'),
                  ('RptSR.aspx', 'resolution'),
                  ('RptSCR.aspx', 'concurrent resolution')),
    }
    for suffix, bill_type in bill_types[chamber]:
        bill_list_url = base_url + suffix
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            for row in page.xpath('//table/tr'):
                b = row.xpath('td//a[contains(@id, "HyperLink1")]')
                if b:  # Ignore rows with no status link.
                    bill_status_url = b[0].get('href')
                    self.scrape_bill(session, chamber, bill_type,
                                     bill_status_url)
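# Every scraper in this file calls get_session_details() to split a session
# name into a year label and a session type, but that helper is not part of
# this excerpt. The sketch below is an assumption based only on how its
# return values are used here (int(year_label), session_type == 'regular'),
# not the project's actual implementation.
def get_session_details(session):
    """Hypothetical: split '2009 Regular Session' into ('2009', 'regular')."""
    year_label, session_type = session.split(' ', 1)
    return year_label, session_type.replace(' Session', '').lower()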
def scrape_20091SS(self, chamber, session, url):
    """Scraper for 2009 First Special Session."""
    year_label, session_type = get_session_details(session)
    bill_list_url = url % year_label
    with self.urlopen(bill_list_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_list_url)
        table = page.xpath('//table[@id="ReportGridView"]')[0]
        for row in table.xpath('tr'):
            self.scrape_20091SS_row(chamber, session, row)
def scrape_20101SS(self, chamber, session, url):
    """Scraper for 2010 Special Sessions."""
    year_label, session_type = get_session_details(session)
    bill_list_url = url % year_label
    with self.urlopen(bill_list_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_list_url)
        table = page.xpath('//table[tr/th[contains(., "Measure Status")]]')[0]
        for row in table.xpath('tr'):
            self.scrape_20101SS_row(chamber, session, row)
def scrape_2008RS(self, chamber, session, url):
    """Scraper for Regular Sessions in 2008 and earlier."""
    year_label, session_type = get_session_details(session)
    chamber_string = get_chamber_string(url, chamber)
    bill_list_url = url % (year_label, chamber_string)
    with self.urlopen(bill_list_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_list_url)
        table = page.xpath('//table[tr/th[contains(., "Measure")]]')[0]
        for row in table.xpath('tr'):
            self.scrape_2008RS_row(chamber, session, row)
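# Several scrapers here (scrape_2008RS, scrape_2009RS, and the chamber-aware
# scrape_regular below) build their list URL with get_chamber_string(), which
# is also not defined in this excerpt. The sketch below is a guess at its
# contract: map 'upper'/'lower' to whatever token the site's report URLs use.
# The 'S'/'H' values and the unused url parameter are assumptions for
# illustration only.
def get_chamber_string(url, chamber):
    """Hypothetical: return the URL fragment identifying the chamber."""
    # The real helper may vary the token by page type (hence the url arg).
    return {'upper': 'S', 'lower': 'H'}[chamber]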
def scrape_regular(self, chamber, session, url):
    """Scraper for post 2009 Regular Sessions."""
    year_label, session_type = get_session_details(session)
    chamber_string = get_chamber_string(url, chamber)
    bill_list_url = url % (year_label, chamber_string)
    with self.urlopen(bill_list_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_list_url)
        table = page.xpath('//table[contains(@id, "ReportGridView")]')[0]
        for row in table.xpath('tr'):
            self.scrape_regular_row(chamber, session, row)
def scrape_2009RS(self, chamber, session, url):
    """Scraper for post 2009 Regular Sessions."""
    year_label, session_type = get_session_details(session)
    chamber_string = get_chamber_string(url, chamber)
    bill_list_url = url % (year_label, chamber_string)
    with self.urlopen(bill_list_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_list_url)
        table = page.xpath('//table[contains(@id, "ReportGridView")]')[0]
        for row in table.xpath('tr'):
            self.scrape_2009RS_row(chamber, session, row)
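# The per-row helpers these scrapers hand off to (scrape_2008RS_row,
# scrape_2009RS_row, scrape_regular_row, scrape_20091SS_row,
# scrape_20101SS_row) are not part of this excerpt. As an illustration only,
# a minimal row handler in the spirit of the inline loop in the first
# scrape_regular above might look like the following; the xpath and the
# hard-coded 'bill' type are assumptions about the page layout, not the
# project's actual code.
def scrape_2009RS_row(self, chamber, session, row):
    """Hypothetical: pull the status link out of one report-grid row."""
    links = row.xpath('td//a[contains(@id, "HyperLink")]')
    if not links:  # Header and spacer rows carry no link.
        return
    bill_status_url = links[0].get('href')
    self.scrape_bill(session, chamber, 'bill', bill_status_url)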
def scrape(self, chamber, session):
    self.validate_session(session)  # Check session is defined in init file.
    # Work out the appropriate scraper for year and session type.
    year_label, session_type = get_session_details(session)
    # Check if a session-specific scraper is already implemented.
    url, scraper = self.session_scraper.get(session, [None, None])
    # Configure for the general case.
    if scraper is None:
        url = "/session%s/lists/RptIntro%s.aspx"
        scraper = self.scrape_regular
    scraper(chamber, session, STATE_URL + url)
def scrape(self, chamber, session):
    self.validate_session(session)  # Check session is defined in init file.
    # Work out the appropriate scraper for year and session type.
    year_label, session_type = get_session_details(session)
    # Check if a session-specific scraper is already implemented.
    url, scraper = self.session_scraper.get(session, [None, None])
    # Configure for the general case.
    if scraper is None:
        url = "/session%s/lists/"
        scraper = self.scrape_regular
    scraper(chamber, session, STATE_URL + url)
def scrape(self, chamber, session):
    self.validate_session(session)  # Check session is defined in init file.
    # Work out the appropriate scraper for year and session type.
    year_label, session_type = get_session_details(session)
    # Check if a session-specific scraper is already implemented.
    url, scraper = self.session_scraper.get(session, [None, None])
    if scraper is not None:
        pass  # A session-specific scraper is configured, so just run it.
    # Configure for the general cases.
    elif int(year_label) >= 2009 and session_type == 'regular':
        # Use the 2009 scraper for newer sessions. Hopefully they won't change!
        url, scraper = self.session_scraper.get('2009 Regular Session',
                                                [None, None])
    elif int(year_label) <= 2008 and session_type == 'regular':
        # Pre-2009 pages have a different but regular layout.
        url, scraper = self.session_scraper.get('2008 Regular Session',
                                                [None, None])
    # If a scraper was found, get scraping.
    if scraper is not None:
        scraper(chamber, session, STATE_URL + url)
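# scrape() relies on two names this excerpt never defines: STATE_URL and
# self.session_scraper, a mapping from session name to an [url, scraper]
# pair. The sketch below shows one plausible way to wire them up; the helper
# name build_session_scraper, the host in STATE_URL, and every URL template
# other than the "/session%s/lists/" default used above are assumptions for
# illustration, not values taken from the real site or project.
STATE_URL = "http://www.capitol.hawaii.gov"  # assumed base URL

def build_session_scraper(self):
    """Hypothetical helper: map known sessions to (url template, scraper)."""
    self.session_scraper = {
        '2009 Regular Session': ["/session%s/lists/", self.scrape_regular],
        '2008 Regular Session': ["/session%s/lists/Rpt%s.aspx",  # template assumed
                                 self.scrape_2008RS],
        '2009 First Special Session': ["/session%s/lists/RptIntro.aspx",  # template assumed
                                       self.scrape_20091SS],
    }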