Example #1
    def scrape(self, chamber, session):
        year = year_from_session(session)
        url = bills_url(year)
        with self.urlopen(url) as bills_page_html:
            bills_page = lxml.html.fromstring(bills_page_html)
            table_rows = bills_page.cssselect('tr')
            # Every other row is empty; the slice keeps the populated ones
            table_rows = table_rows[::2]
            for row in table_rows:
                row_elements = row.cssselect('td')

                # First cell links to the bill document (a PDF named after the bill id)
                bill_document = row_elements[0]
                bill_document.make_links_absolute(base_url())

                element, attribute, link, pos = bill_document.iterlinks().next()
                bill_id = element.text_content()
                if bill_id.endswith('.pdf'):
                    bill_id = bill_id[:-len('.pdf')]
                bill_document_link = link

                # Second cell holds the title followed by the sponsor names
                title_and_sponsors = row_elements[1]
                title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                        title_and_sponsors.text_content())
                sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                           title_and_sponsors.text_content())
                title = title_match.group(1)
                sponsors = sponsors_match.group(1)
                separated_sponsors = sponsors.split('--')

                bill = Bill(session, chamber, bill_id, title)
                bill.add_version('current', bill_document_link)

                if separated_sponsors[1] == '(NONE)':
                    bill.add_sponsor('primary', separated_sponsors[0])
                else:
                    bill.add_sponsor('cosponsor', separated_sponsors[0])
                    bill.add_sponsor('cosponsor', separated_sponsors[1])

                # Third cell links to the versions page
                versions_page_element = row_elements[2]
                versions_page_element.make_links_absolute(base_url())
                element, attribute, link, pos = versions_page_element.iterlinks().next()

                bill.add_source(link)

                self.scrape_versions(link, bill)

                # Fourth cell links to the actions page, which loads inside a frame
                actions_page_element = row_elements[3]
                element, attribute, link, pos = actions_page_element.iterlinks().next()
                frame_link = base_url() + link.split('?Open&target=')[1]

                self.scrape_actions(frame_link, bill)

                # Eighth cell links to the votes page, which also loads inside a frame
                votes_page_element = row_elements[7]
                element, attribute, link, pos = votes_page_element.iterlinks().next()
                frame_link = base_url() + link.split('?Open&target=')[1]
                self.scrape_votes(frame_link, chamber, bill)
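
The example above assumes a few module-level helpers (year_from_session, bills_url, base_url) plus scrape_versions, scrape_actions, and scrape_votes methods defined elsewhere on the scraper class, along with the re and lxml.html imports. A minimal sketch of what the URL helpers could look like, using placeholder URLs rather than the legislature's real endpoints:

def year_from_session(session):
    # Assumes the session name begins with the four-digit year, e.g. "2009A"
    return int(session[:4])

def base_url():
    # Placeholder domain standing in for the legislature's actual site
    return 'http://www.example-legislature.gov/'

def bills_url(year):
    # Placeholder path; the real helper builds the year-specific bill index URL
    return '%sbills/%s/' % (base_url(), year)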

Example #2
    def scrape(self, chamber, session):
        # Legislator data only available for the current session
        if year_from_session(session) != 2009:
            raise NoDataForPeriod(session)

        with self.urlopen(legs_url(chamber)) as html:
            page = lxml.html.fromstring(html)

            # Iterate through legislator names
            page.make_links_absolute(BASE_URL)
            for link in set([a.get('href') for a in page.xpath('//b/a')]):
                with self.urlopen(link) as legislator_html:
                    legislator_page = lxml.html.fromstring(legislator_html)

                    leg_elements = legislator_page.cssselect('b')
                    leg_name = leg_elements[0].text_content()

                    district = ""
                    district_match = re.search("District [0-9]+", legislator_page.text_content())
                    if district_match is not None:
                        district = district_match.group(0)

                    email = ""
                    email_match = re.search('E-mail: (.*)', legislator_page.text_content())
                    if email_match is not None:
                        email = email_match.group(1)

                    # Look up the legislator's party by submitting the lookup
                    # form with the legislator's name and reading the result table
                    form_page = lxml.html.parse(leg_form_url()).getroot()
                    form_page.forms[0].fields['Query'] = leg_name
                    result = lxml.html.parse(lxml.html.submit_form(form_page.forms[0])).getroot()
                    elements = result.cssselect('td')
                    party_letter = elements[7].text_content()

                    party = party_name(party_letter)

                    leg = Legislator(session, chamber, district, leg_name,
                                     "", "", "", party,
                                     official_email=email)
                    leg.add_source(link)

                    self.save_legislator(leg)
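
Like the first example, this method relies on helpers defined elsewhere (legs_url, leg_form_url, party_name) and on the Legislator and NoDataForPeriod classes from the scraping framework. One plausible sketch of party_name, assuming the lookup form reports a one-letter party code (the mapping shown is an assumption):

def party_name(party_letter):
    # Assumed mapping from a one-letter code to a full party name;
    # the real helper may recognize additional codes.
    parties = {'D': 'Democratic', 'R': 'Republican'}
    return parties.get(party_letter.strip(), party_letter)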