Esempio n. 1
0
    def scrape(self, chamber, term):
        """Scrape the member list for ``chamber`` and save every legislator.

        The list page is located with ``legislators_url(chamber)``.  Every
        anchor whose ``href`` contains ``_bio.cfm`` is treated as one member:
        the anchor text supplies the name and party, and the text following
        the anchor supplies the district number.

        :param chamber: chamber identifier, passed through to
            ``legislators_url`` and ``Legislator``.
        :param term: legislative term; only ``'2009-2010'`` is supported.
        :raises NoDataForPeriod: if ``term`` is anything but ``'2009-2010'``.
        """
        # Pennsylvania doesn't make member lists easily available
        # for previous sessions, unfortunately
        if term != '2009-2010':
            raise NoDataForPeriod(term)

        leg_list_url = legislators_url(chamber)

        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        # Compiled once because it is reused for every anchor on the page.
        district_re = re.compile(r"District (\d+)")
        party_names = {'R': 'Republican', 'D': 'Democratic'}

        with self.urlopen(leg_list_url) as page:
            doc = lxml.html.fromstring(page)

            for link in doc.xpath("//a[contains(@href, '_bio.cfm')]"):
                # lxml reports missing text/tail as None; normalise to ''
                # and drop surrounding whitespace so the fixed-offset
                # slices below line up with the end of the label.
                label = (link.text or '').strip()
                district_match = district_re.search(link.tail or '')

                # The label is expected to end with a four-character party
                # suffix (presumably " (R)" / " (D)" -- confirm against the
                # live page).  Anchors that are too short to hold a name
                # plus that suffix, or that are not followed by a district,
                # are not member rows; skip them instead of aborting the
                # whole scrape with an AttributeError/IndexError.
                if len(label) <= 4 or district_match is None:
                    continue

                full_name = label[:-4]
                district = district_match.group(1)

                # Second-to-last character is the party code; unknown
                # codes are passed through unchanged.
                party_code = label[-2]
                party = party_names.get(party_code, party_code)

                legislator = Legislator(term, chamber, district,
                                        full_name, party=party)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
Esempio n. 2
0
    def scrape(self, chamber, year):
        """Scrape the member list for ``chamber`` and save every legislator.

        The list page is located with ``legislators_url(chamber)``.  Every
        anchor whose ``href`` matches ``_bio.cfm?id=`` is treated as one
        member: the anchor's first child supplies the name and party, and
        the second child of the anchor's parent supplies the district.

        :param chamber: chamber identifier, passed through to
            ``legislators_url`` and ``Legislator``.
        :param year: first year of the session (int or numeric string).
            Years before 2009 are silently ignored and nothing is saved.
        """
        # Pennsylvania doesn't make member lists easily available
        # for previous sessions, unfortunately.  Returning (rather than
        # raising) keeps a multi-year run going.
        if int(year) < 2009:
            return

        session = "%s-%d" % (year, int(year) + 1)
        leg_list_url = legislators_url(chamber)

        # Raw strings: "\." / "\?" / "\d" in plain literals are invalid
        # escape sequences.  Compiled once, outside the loop.
        bio_href_re = re.compile(r'_bio\.cfm\?id=')
        district_re = re.compile(r"District (\d+)")

        with self.urlopen(leg_list_url) as member_list_page:
            soup = BeautifulSoup(member_list_page)

            for link in soup.findAll('a', href=bio_href_re):
                if not link.contents:
                    # Empty anchor: nothing to parse a name from.
                    continue

                # Strip so the fixed-offset slices below line up with the
                # end of the label.  The label is expected to end with a
                # four-character party suffix (presumably " (R)" / " (D)"
                # -- confirm against the live page).
                label = link.contents[0].strip()

                siblings = link.parent.contents
                district_match = None
                if len(siblings) > 1:
                    district_match = district_re.search(siblings[1])

                # Skip rows that cannot hold a name plus party suffix or
                # that carry no district, instead of aborting the scrape.
                if len(label) <= 4 or district_match is None:
                    continue

                full_name = label[:-4]

                # Names are "Last, First [Middle ...]".  Split on the comma
                # first so that surnames containing spaces ("Van Horne")
                # do not shift the given-name tokens.
                last_name, _, given = full_name.partition(',')
                last_name = last_name.strip()
                given_parts = given.split()
                first_name = given_parts[0] if given_parts else ''
                if len(given_parts) > 1:
                    middle_name = given_parts[1].strip(',')
                else:
                    middle_name = ''

                # Second-to-last character is the party code; unknown
                # codes are passed through unchanged.
                party = label[-2]
                if party == 'R':
                    party = "Republican"
                elif party == 'D':
                    party = "Democrat"

                district = district_match.group(1)

                legislator = Legislator(session, chamber, district,
                                        full_name, first_name, last_name,
                                        middle_name, party)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)