Example 1
    def scrape_senate(self, session):
        legislator_pages_dir = legislators_url('upper')

        for counter, leg_page_dir in enumerate(legislator_pages_dir):
            with self.urlopen(leg_page_dir) as leg_page_html:
                leg_page = lxml.html.fromstring(leg_page_html)
                tables = leg_page.cssselect('table')
                legislators_table = tables[62]
                leg_data = legislators_table.cssselect('tr')
                # drop the header rows (first five <tr> elements)
                leg_data = leg_data[5:]
                for ld in leg_data:
                    data_elements = ld.cssselect('td')

                    # iterlinks() yields (element, attribute, link, pos) tuples;
                    # index 2 is the URL itself
                    pic_link_part = next(data_elements[0].iterlinks())[2]
                    pic_link = 'http://www.senadopr.us' + pic_link_part
                    name = data_elements[1].text_content()
                    link_part = next(data_elements[1].iterlinks())[2]
                    link = 'http://www.senadopr.us' + link_part
                    leg_party = data_elements[2].text_content()
                    leg_phone_no = data_elements[3].text_content()
                    leg_email = data_elements[4].text_content()

                    # the first page holds at-large senators; subsequent pages
                    # map to numbered senatorial districts
                    if counter == 0:
                        dist = 'at-large'
                    else:
                        dist = counter

                    leg = Legislator(session, 'upper', str(dist), name,
                                     party=leg_party, head_shot=pic_link,
                                     phone=leg_phone_no, email=leg_email)
                    leg.add_source(link)
                    leg.add_source(leg_page_dir)
                    self.save_legislator(leg)
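
The scraper above leans on three lxml calls: parse the page with lxml.html.fromstring(), select elements with cssselect(), and pull URLs out of a cell with iterlinks(), whose tuples carry the link at index 2. Here is a minimal, self-contained sketch of that pattern; the HTML snippet and the base URL are made up for illustration and are not taken from senadopr.us.

    # Minimal sketch of the lxml calls used above (illustrative HTML and URLs).
    import lxml.html

    html = """
    <table>
      <tr>
        <td><a href="/fotos/1.jpg"><img src="/fotos/1.jpg"></a></td>
        <td><a href="/perfil/1">Jane Doe</a></td>
        <td>PNP</td>
      </tr>
    </table>
    """

    row = lxml.html.fromstring(html).cssselect('tr')[0]
    cells = row.cssselect('td')

    # iterlinks() yields (element, attribute, link, pos); index 2 is the URL
    pic_link = 'http://example.test' + next(cells[0].iterlinks())[2]
    profile_link = 'http://example.test' + next(cells[1].iterlinks())[2]
    name = cells[1].text_content()

    print(name, pic_link, profile_link)
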
Example 2
    def scrape_house(self, session):
        legislator_pages_dir = legislators_url('lower')

        with self.urlopen(legislator_pages_dir) as leg_page_html:
            leg_page = lxml.html.fromstring(leg_page_html)
            tables = leg_page.cssselect('table')
            leg_dist_table = tables[4]   # district representatives
            leg_acu_table = tables[5]    # at-large representatives
            legs_dist = leg_dist_table.cssselect('td')
            legs_acu = leg_acu_table.cssselect('td')
            # the last cell is empty
            legs_acu.pop()

            for cell in legs_dist:
                leg_data = cell.cssselect('font')
                # cell text looks like "<name> Distrito <number> ..."
                name_dist_party = leg_data[0].text_content()
                name, _, dist_party = name_dist_party.partition('Distrito')
                dist = re.search(r'[0-9]+', dist_party).group(0)
                leg_party = leg_data[1].text_content()
                name = name.lstrip()
                # iterlinks() yields (element, attribute, link, pos); index 2 is the URL
                link_part = next(cell.cssselect('a')[0].iterlinks())[2]
                link = 'http://www.camaraderepresentantes.org/' + link_part
                imgs = cell.cssselect('img')
                pic_link_part = next(imgs[0].iterlinks())[2]
                pic_link = 'http://www.camaraderepresentantes.org/' + pic_link_part

                leg = Legislator(session, 'lower', dist, name,
                                 party=leg_party, head_shot=pic_link)
                leg.add_source(link)
                leg.add_source(legislator_pages_dir)
                self.save_legislator(leg)

            for cell in legs_acu:
                link_part = next(cell.iterlinks())[2]
                link = 'http://www.camaraderepresentantes.org/' + link_part
                pic_link_part = next(cell.cssselect('img')[0].iterlinks())[2]
                pic_link = 'http://www.camaraderepresentantes.org/' + pic_link_part
                name_party = cell.text_content().lstrip()
                match = re.search(r'PNP|PPD', name_party)
                # drop the four-character party suffix (e.g. " PNP") from the text
                name = name_party[:-4]
                leg_party = match.group(0)

                leg = Legislator(session, 'lower', 'at-large', name,
                                 party=leg_party, head_shot=pic_link)
                leg.add_source(link)
                leg.add_source(legislator_pages_dir)
                self.save_legislator(leg)
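
The house scraper's trickiest part is plain string handling: partition() splits each district cell's text on the word 'Distrito', a regex pulls out the district number, and for the at-large table a second regex picks the party code out of the cell text. A tiny standalone sketch with invented cell text (not scraped data):

    # Illustrative values only; the real text comes from the scraped <td> cells.
    import re

    name_dist_party = '  Jane Doe Distrito 12'
    name, _, dist_part = name_dist_party.partition('Distrito')
    dist = re.search(r'[0-9]+', dist_part).group(0)    # -> '12'
    name = name.lstrip()                               # -> 'Jane Doe ' (leading spaces stripped)

    name_party = 'John Roe PNP'
    party = re.search(r'PNP|PPD', name_party).group(0)  # -> 'PNP'
    name = name_party[:-4]                              # -> 'John Roe'
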