Example 1
0
    def fetch_member(self, url, name, term, chamber):
        """Scrape one Virginia legislator's page and save the record.

        url: site-relative member page path (joined onto the leg6 host below).
        name: display name, possibly suffixed with '-Resigned mm/dd' or
              '-Member mm/dd' for mid-term departures/arrivals.
        term, chamber: passed through to the Legislator record.
        """
        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        url = 'http://leg6.state.va.us' + url

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            # BUG FIX: rsplit('-')[0] took the FIRST hyphen-separated segment,
            # truncating hyphenated surnames ('Smith-Jones-Resigned 1/2' ->
            # 'Smith'). maxsplit=1 drops only the trailing status suffix.
            name = name.rsplit('-', 1)[0]
            if action == 'Resigned':
                pass # TODO: set end date
            elif action == 'Member':
                pass # TODO: set start date

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # Header looks like '(R) - House District 42'.
            party_district_line = doc.xpath('//h3/font/text()')[0]
            party, district = party_district_re.match(party_district_line).groups()

            leg = Legislator(term, chamber, district, name.strip(),
                             party=party_map[party])
            leg.add_source(url)

            # First 'linkSect' list on the page holds committee assignments.
            for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
                leg.add_role('committee member', term=term, chamber=chamber,
                             committee=com)

            self.save_legislator(leg)
Example 2
0
    def scrape_legislator_data(self, url, chamber):
        """Scrape the roster page, then each member's detail page, and save
        a Legislator record (with committee roles) for every member found."""
        party_abbrevs = {'R' : 'Republican', 'D' : 'Democrat'}
        with self.urlopen(url) as page:
            page = BeautifulSoup(page)
            roster = page.find('table', id = 'ctl00_mainCopy_DataList1')
            for cell in roster('td'):
                name_spans = cell('span')
                # Layout cells carry no spans; skip them.
                if not name_spans:
                    self.debug('Found an empty cell in %s. Continuing' % url)
                    continue
                full_name = ' '.join([s.string.strip() for s in name_spans])
                given = name_spans[0].string.strip()
                pieces = given.split()
                if len(pieces) == 2:
                    first_name, middle_name = pieces
                else:
                    first_name, middle_name = given, ''
                last_name = name_spans[1].string.strip()

                details_url = get_abs_url(url, cell.find('a')['href'])
                with self.urlopen(details_url) as details:
                    details = BeautifulSoup(details)
                    district = details.find(
                        'a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                    party = party_abbrevs[details.find(
                        'span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                    leg = Legislator('2010', chamber, district, full_name,
                                     first_name, last_name, middle_name, party)
                    leg.add_source(details_url)

                    membership = details.find('table', id = 'ctl00_mainCopy_MembershipGrid')
                    # First row is the header; the rest are (role, committee) pairs.
                    for row in membership('tr')[1:]:
                        cells = row('td')
                        role = cells[0].string.strip()
                        committee = cells[1]('a')[0].string.strip()
                        leg.add_role(role, '2010', chamber = chamber, committee = committee)

                    self.save_legislator(leg)
Example 3
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape a single member's bio page and save the Legislator record.

        Pulls photo/name from the bio image, district from the District
        widget, party from the bio description, and committee roles from
        the Column5 widget.
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)
            photo_url = root.xpath('//div[@class="bioPicContainer"]/img/@src')[0]
            full_name = root.xpath('//div[@class="bioPicContainer"]/img/@alt')[0]

            # Best-effort first/middle/last split of the display name.
            name_parts = full_name.split(' ')
            first_name = last_name = middle_name = None
            if len(name_parts) == 2:
                first_name, last_name = name_parts
                middle_name = ''
            elif len(name_parts) == 3:
                first_name, middle_name, last_name = name_parts
            elif len(name_parts) > 3:
                first_name = name_parts[0]
                middle_name = name_parts[1]
                last_name = name_parts[2]

            district = root.xpath('//div[@id="District"]//div[@class="widgetContent"]')
            if district:
                district = district[0].text.strip()
                # Widget text is either 'NN - description', 'NN. description',
                # or free text (truncate the latter).
                if len(district.split(' - ')) > 1:
                    district = district.split(' - ')[0]
                elif len(district.split('. ')) > 1:
                    district = district.split('. ')[0]
                else:
                    district = district[0:32]
            else:
                district = 'NotFound'

            party = root.xpath('//div[@class="bioDescription"]/div')[0].text.strip().split(',')[0]
            # Normalize to the canonical party name; 'Republican' already is.
            # (Removed the old no-op `elif party == 'Republican'` branch.)
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url,
                             first_name=first_name, middle_name=middle_name,
                             last_name=last_name)

            leg.add_source(member_url)

            comm_div = root.xpath('//div[@id="Column5"]//div[@class="widgetContent"]')
            if len(comm_div):
                # BUG FIX: the original used absolute XPaths ('/ul/li' and
                # '/a/text()'), which search from the DOCUMENT root rather
                # than the <li>/widget and therefore never match, and called
                # .strip() directly on the list xpath() returns (AttributeError).
                # Use relative paths and index into the results first.
                for li in comm_div[0].xpath('.//ul/li'):
                    texts = li.xpath('text()')
                    role = texts[0].strip() if texts else ''
                    links = li.xpath('.//a/text()')
                    if not links:
                        continue
                    comm = links[0].strip().strip(',')
                    if role == 'Member':
                        role = 'committee member'
                    leg.add_role(role, term, chamber=chamber,
                                 committee=comm)

            self.save_legislator(leg)
Example 4
0
    def scrape_member(self, chamber, year, member_url):
        """Scrape a single member page for the 81st session and save the record.

        NOTE(review): the xpaths below start with '//', so they search the
        whole document rather than the context element — appears intended
        since each page shows one member; confirm against the site layout.
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            subtitle = root.xpath('//div[@class="subtitle"]')[0]
            table = subtitle.getnext()

            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            cell = table.xpath('//td[@valign="top"]')[0]
            full_name = cell.xpath('string(//div[2]/strong)').strip()

            district = cell.xpath('string(//div[3])').strip().replace('District ', '')

            # First character of the fourth div is the party letter.
            letter = cell.xpath('string(//div[4])').strip()[0]
            party = {'D': 'Democrat', 'R': 'Republican'}.get(letter, letter)

            leg = Legislator('81', chamber, district, full_name,
                             party=party, photo_url=photo_url)

            leg.add_source(member_url)

            memberships = root.xpath('//div[string() = "Committee Membership:"]'
                                     '/following-sibling::div'
                                     '[@class="rcwcontent"]')[0]

            # Committee names live in the tail text following each <br>.
            for br in memberships.xpath('*/br'):
                committee = br.tail
                if committee:
                    leg.add_role('committee member', '81', chamber=chamber,
                                 committee=committee.strip())

            self.save_legislator(leg)
Example 5
0
    def scrape(self, chamber, term):
        """Emit two hard-coded example legislators for the given chamber/term."""
        self.validate_term(term)

        smith = Legislator(term, chamber, '1st',
                           'Bob Smith', party='Democrat')
        # Chamber leader title differs between the two chambers.
        leader_title = ('President of the Senate' if chamber == 'upper'
                        else 'Speaker of the House')
        smith.add_role(leader_title, term)
        smith.add_source('http://example.com/Bob_Smith.html')

        johnson = Legislator(term, chamber, '2nd',
                             'Sally Johnson', party='Republican')
        johnson.add_role('Minority Leader', term)
        johnson.add_source('http://example.com/Sally_Johnson.html')

        self.save_legislator(smith)
        self.save_legislator(johnson)
Example 6
0
    def scrape(self, chamber, year):
        """Example scraper: only 2009 has data; emits two fixed legislators."""
        if year != '2009':
            raise NoDataForYear

        term = '2009-2010'

        smith = Legislator(term, chamber, '1st',
                           'Bob Smith', party='Democrat')
        # Chamber leader title differs between the two chambers.
        leader_title = ('President of the Senate' if chamber == 'upper'
                        else 'Speaker of the House')
        smith.add_role(leader_title, term)
        smith.add_source('http://example.com/Bob_Smith.html')

        johnson = Legislator(term, chamber, '2nd',
                             'Sally Johnson', party='Republican')
        johnson.add_role('Minority Leader', term)
        johnson.add_source('http://example.com/Sally_Johnson.html')

        self.save_legislator(smith)
        self.save_legislator(johnson)
Example 7
0
 def scrape(self, chamber, term):
     """Scrape the Arizona member roster for one chamber in one term.

     Each roster row yields name, district, party, email, room, phone and
     fax; a 'Vacated mm/dd/yyyy' marker in the email column flags a member
     who left mid-term (recorded as the role's end_date instead of contact
     info).
     """
     self.validate_term(term)
     session = self.get_session_for_term(term)
     try:
         session_id = self.get_session_id(session)
     except KeyError:
         raise NoDataForPeriod(session)

     body = {'lower': 'H', 'upper': 'S'}[chamber]
     url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
                                                            session_id, body)
     with self.urlopen(url) as page:
         root = html.fromstring(page)
         table_id = {'H': 'house', 'S': 'senate'}[body]
         # Skip the header row.
         rows = root.xpath('//table[@id="%s"]/tr' % table_id)[1:]
         for row in rows:
             position = ''
             vacated = ''
             name, district, party, email, room, phone, fax = row.getchildren()

             link = 'http://www.azleg.gov' + name.xpath('string(a/@href)')
             if len(name) == 1:
                 name = name.text_content().strip()
             else:
                 # Extra child means a leadership title follows the name link.
                 position = name.tail.strip()
                 name = name[0].text_content().strip()

             district = district.text_content()
             party = self.get_party(party.text_content().strip())
             email = email.text_content().strip()

             # 'Vacated mm/dd/yyyy' in the email cell marks a departed member.
             if re.match('Vacated', email):
                 vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
                 email = ''

             room = room.text_content().strip()

             phone = phone.text_content().strip()
             if not phone.startswith('602'):
                 phone = '602-' + phone
             fax = fax.text_content().strip()
             if not fax.startswith('602'):
                 fax = '602-' + fax

             if vacated:
                 end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                 leg = Legislator(term, chamber, district, full_name=name,
                                  party=party, url=link)
                 leg['roles'][0]['end_date'] = end_date
             else:
                 leg = Legislator(term, chamber, district, full_name=name,
                                  party=party, phone=phone, fax=fax,
                                  room=room, email=email, url=link)

             if position:
                 leg.add_role(position, term, chamber=chamber,
                              district=district, party=party)

             leg.add_source(url)

             #Probably just get this from the committee scraper
             #self.scrape_member_page(link, session, chamber, leg)
             self.save_legislator(leg)