Example #1
    def scrape_regular(self, chamber, session, url):
        """Scraper for Regular Sessions >= 2009 """
        year_label, session_type = get_session_details(session)
        base_url = url % year_label

        bill_types = {
            'lower': (
                ('RptIntroHB.aspx', 'bill'),
                ('RptHR.aspx', 'resolution'),
                ('RptHCR.aspx', 'concurrent resolution')),
            'upper': (
                ('RptIntroSB.aspx', 'bill'),
                ('RptSR.aspx', 'resolution'),
                ('RptSCR.aspx', 'concurrent resolution')),
        }

        for suffix, bill_type in bill_types[chamber]:
            bill_list_url = base_url + suffix

            with self.urlopen(bill_list_url) as page:
                page = lxml.html.fromstring(page)
                page.make_links_absolute(bill_list_url)
                for row in page.xpath('//table/tr'):
                    b = row.xpath('td//a[contains(@id, "HyperLink1")]')
                    if b:  # Ignore if no match
                        bill_status_url = b[0].get('href')
                        self.scrape_bill(session, chamber, bill_type, bill_status_url)
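
Every example in this listing calls a get_session_details helper that is not shown. As a point of reference, here is a minimal sketch of what it might look like, assuming session names follow the "2009 Regular Session" / "2009 First Special Session" pattern used throughout; the parsing and return values are assumptions, not the project's actual helper.

    def get_session_details(session):
        """Split a session name like '2009 Regular Session' into parts.

        Returns (year_label, session_type), e.g. ('2009', 'regular').
        Sketch only: the real helper may label special sessions
        differently.
        """
        parts = session.split()
        year_label = parts[0]
        session_type = parts[-2].lower()  # 'Regular' -> 'regular'
        return year_label, session_type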
Example #2
    def scrape_regular(self, chamber, session, url):
        """Scraper for Regular Sessions >= 2009 """
        year_label, session_type = get_session_details(session)
        base_url = url % year_label

        bill_types = {
            'lower':
            (('RptIntroHB.aspx', 'bill'), ('RptHR.aspx', 'resolution'),
             ('RptHCR.aspx', 'concurrent resolution')),
            'upper':
            (('RptIntroSB.aspx', 'bill'), ('RptSR.aspx', 'resolution'),
             ('RptSCR.aspx', 'concurrent resolution')),
        }

        for suffix, bill_type in bill_types[chamber]:
            bill_list_url = base_url + suffix

            with self.urlopen(bill_list_url) as page:
                page = lxml.html.fromstring(page)
                page.make_links_absolute(bill_list_url)
                for row in page.xpath('//table/tr'):
                    b = row.xpath('td//a[contains(@id, "HyperLink1")]')
                    if b:  # Ignore if no match
                        bill_status_url = b[0].get('href')
                        self.scrape_bill(session, chamber, bill_type,
                                         bill_status_url)
Example #3
    def scrape_20091SS(self, chamber, session, url):
        """Scraper for 2009 First Special Session"""
        year_label, session_type = get_session_details(session)
        bill_list_url = url % year_label
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[@id="ReportGridView"]')[0]
            for row in table.xpath('tr'):
                self.scrape_20091SS_row(chamber, session, row)
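
The per-row handlers (scrape_20091SS_row and its siblings) are also outside this listing. A hedged sketch of the general shape such a handler could take, assuming each data row carries a status link in an anchor; the cell layout and the 'bill' type below are guesses, not the project's actual handler:

    def scrape_20091SS_row(self, chamber, session, row):
        """Hypothetical row handler: pull the status link out of one
        table row and hand off to the bill scraper."""
        links = row.xpath('td//a')
        if not links:
            return  # header or spacer row, nothing to scrape
        bill_status_url = links[0].get('href')
        self.scrape_bill(session, chamber, 'bill', bill_status_url)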
Example #4
    def scrape_20091SS(self, chamber, session, url):
        """Scraper for 2009 First Special Session"""
        year_label, session_type = get_session_details(session)
        bill_list_url = url % year_label
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[@id="ReportGridView"]')[0]
            for row in table.xpath('tr'):
                self.scrape_20091SS_row(chamber, session, row)
Example #5
    def scrape_20101SS(self, chamber, session, url):
        """Scraper for 2010 Special Sessions"""
        year_label, session_type = get_session_details(session)
        bill_list_url = url % year_label
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[tr/th[contains(., "Measure Status")]]')[0]
            for row in table.xpath('tr'):
                self.scrape_20101SS_row(chamber, session, row)
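
Selecting a table by the text of its header cells, as the XPath above does, is useful when the page gives its tables no stable id. A self-contained illustration of the same technique on a throwaway document:

    import lxml.html

    html = ('<table><tr><th>Measure Status</th></tr>'
            '<tr><td>HB 1</td></tr></table>')
    doc = lxml.html.fromstring(html)
    # Pick the table whose header row mentions "Measure Status".
    table = doc.xpath('//table[tr/th[contains(., "Measure Status")]]')[0]
    print(table.xpath('tr/td/text()'))  # ['HB 1']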
Example #6
    def scrape_2008RS(self, chamber, session, url):
        """Scraper for pre-2009 Regular Sessions"""
        year_label, session_type = get_session_details(session)
        chamber_string = get_chamber_string(url, chamber)
        bill_list_url = url % (year_label, chamber_string)
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[tr/th[contains(., "Measure")]]')[0]
            for row in table.xpath('tr'):
                self.scrape_2008RS_row(chamber, session, row)
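
The chamber-split scrapers additionally assume a get_chamber_string helper. One plausible sketch, assuming the site embeds a short chamber token in its list URLs; the token values here are illustrative guesses, not confirmed against the site:

    def get_chamber_string(url, chamber):
        """Map an internal chamber name to the token the site's URLs
        expect. The 'hb'/'sb' values are illustrative only."""
        return {'lower': 'hb', 'upper': 'sb'}[chamber]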
Example #7
    def scrape_2008RS(self, chamber, session, url):
        """Scraper for pre-2009 Regular Sessions"""
        year_label, session_type = get_session_details(session)
        chamber_string = get_chamber_string(url, chamber)
        bill_list_url = url % (year_label, chamber_string)
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[tr/th[contains(., "Measure")]]')[0]
            for row in table.xpath('tr'):
                self.scrape_2008RS_row(chamber, session, row)
Example #8
    def scrape_regular(self, chamber, session, url):
        """Scraper for Regular Sessions >= 2009."""
        year_label, session_type = get_session_details(session)
        chamber_string = get_chamber_string(url, chamber)
        bill_list_url = url % (year_label, chamber_string)
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[contains(@id, "ReportGridView")]')[0]
            for row in table.xpath('tr'):
                self.scrape_regular_row(chamber, session, row)
Example #9
    def scrape_2009RS(self, chamber, session, url):
        """Scraper for Regular Sessions >= 2009."""
        year_label, session_type = get_session_details(session)
        chamber_string = get_chamber_string(url, chamber)
        bill_list_url = url % (year_label, chamber_string)
        with self.urlopen(bill_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(bill_list_url)
            table = page.xpath('//table[contains(@id, "ReportGridView")]')[0]
            for row in table.xpath('tr'):
                self.scrape_2009RS_row(chamber, session, row)
Example #10
    def scrape(self, chamber, session):
        self.validate_session(session)  # Check session is defined in init file.
        # Work out the appropriate scraper for the year and session type.
        year_label, session_type = get_session_details(session)
        # Check whether a session-specific scraper is already implemented.
        url, scraper = self.session_scraper.get(session, [None, None])

        # Fall back to the general case.
        if scraper is None:
            url = "/session%s/lists/RptIntro%s.aspx"
            scraper = self.scrape_regular

        scraper(chamber, session, STATE_URL + url)
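
The session_scraper lookup implies a per-session table of (url, scraper) pairs. A minimal sketch of how that table might be wired up, assuming bound methods are stored so scrape() can call them directly; the class name, base class, and URL fragments are placeholders, not the project's real values:

    class HIBillScraper(BillScraper):
        def __init__(self, *args, **kwargs):
            super(HIBillScraper, self).__init__(*args, **kwargs)
            # One entry per session whose pages need a dedicated scraper.
            self.session_scraper = {
                '2009 First Special Session':
                    ('/session%s/lists/special.aspx', self.scrape_20091SS),
                '2010 First Special Session':
                    ('/session%s/lists/special.aspx', self.scrape_20101SS),
            }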
Example #11
    def scrape(self, chamber, session):
        self.validate_session(session)  # Check session is defined in init file.
        # Work out the appropriate scraper for the year and session type.
        year_label, session_type = get_session_details(session)
        # Check whether a session-specific scraper is already implemented.
        url, scraper = self.session_scraper.get(session, [None, None])

        # Fall back to the general case.
        if scraper is None:
            url = "/session%s/lists/"
            scraper = self.scrape_regular

        scraper(chamber, session, STATE_URL + url)
Example #12
    def scrape(self, chamber, session):
        self.validate_session(session)  # Check session is defined in init file.
        # Work out the appropriate scraper for the year and session type.
        year_label, session_type = get_session_details(session)
        # Check whether a session-specific scraper is already implemented.
        url, scraper = self.session_scraper.get(session, [None, None])
        if scraper is not None:
            pass  # A session-specific scraper exists, so just run it.
        # Configure for the general cases.
        elif int(year_label) >= 2009 and session_type == 'regular':
            # Use the 2009 scraper for new sessions. Hopefully they won't change!
            url, scraper = self.session_scraper.get('2009 Regular Session', [None, None])
        elif int(year_label) <= 2008 and session_type == 'regular':
            # Pre-2009 pages have a different but regular layout.
            url, scraper = self.session_scraper.get('2008 Regular Session', [None, None])
        # If a scraper was found, get scraping.
        if scraper is not None:
            scraper(chamber, session, STATE_URL + url)
Example #13
    def scrape(self, chamber, session):
        self.validate_session(session)  # Check session is defined in init file.
        # Work out the appropriate scraper for the year and session type.
        year_label, session_type = get_session_details(session)
        # Check whether a session-specific scraper is already implemented.
        url, scraper = self.session_scraper.get(session, [None, None])
        if scraper is not None:
            pass  # A session-specific scraper exists, so just run it.
        # Configure for the general cases.
        elif int(year_label) >= 2009 and session_type == 'regular':
            # Use the 2009 scraper for new sessions. Hopefully they won't change!
            url, scraper = self.session_scraper.get('2009 Regular Session',
                                                    [None, None])
        elif int(year_label) <= 2008 and session_type == 'regular':
            # Pre-2009 pages have a different but regular layout.
            url, scraper = self.session_scraper.get('2008 Regular Session',
                                                    [None, None])
        # If a scraper was found, get scraping.
        if scraper is not None:
            scraper(chamber, session, STATE_URL + url)
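
For completeness, a sketch of what the validate_session check amounts to, assuming the scraper base class compares the requested session against the sessions declared in the state's metadata (the "init file" the comment refers to); the attribute and exception names are illustrative, not necessarily the framework's own:

    def validate_session(self, session):
        """Raise if the requested session is not declared in the state
        metadata. Sketch only; the real base class may differ."""
        if session not in self.metadata['sessions']:
            raise NoDataForPeriod(session)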