Example #1
    def fetch_member(self, url, name, term, chamber):
        party_map = {"R": "Republican", "D": "Democratic", "I": "Independent"}
        party_district_re = re.compile(r"\((R|D|I)\) - (?:House|Senate) District\s+(\d+)")

        url = "http://leg6.state.va.us" + url

        # handle resignations, special elections
        match = re.search(r"-(Resigned|Member) (\d{1,2}/\d{1,2})?", name)
        if match:
            action, date = match.groups()
            name = name.rsplit("-")[0]
            if action == "Resigned":
                pass  # TODO: set end date
            elif action == "Member":
                pass  # TODO: set start date

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            party_district_line = doc.xpath("//h3/font/text()")[0]
            party, district = party_district_re.match(party_district_line).groups()

            leg = Legislator(term, chamber, district, name.strip(), party=party_map[party])
            leg.add_source(url)

            for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
                leg.add_role("committee member", term=term, chamber=chamber, committee=com)

            self.save_legislator(leg)
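Both TODO branches above are left open in every variant of fetch_member in this collection. A minimal sketch of one way to close them, assuming the 'M/D' fragment captured from the name can be anchored to a caller-supplied year, and that role entries accept start_date/end_date values as the test in Example #2 suggests:

from datetime import datetime

def _role_dates(action, date, year):
    # Hypothetical helper: turn the 'M/D' fragment parsed out of the
    # member's name into start/end dates for the role. The page only
    # gives month and day, so the caller must supply the year.
    start_date = end_date = None
    if date:
        parsed = datetime.strptime('%s/%s' % (date, year), '%m/%d/%Y')
        if action == 'Resigned':
            end_date = parsed
        elif action == 'Member':
            start_date = parsed
    return start_date, end_date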
Example #2
def test_legislator():
    l = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    assert_equal(l, {'_type': 'person', 'full_name': 'Adam Smith',
                     'first_name': 'Adam', 'last_name': 'Smith',
                     'middle_name': '', 'suffixes': '', 'roles': [
                         {'chamber': 'upper', 'term': 'T1',
                          'role': 'member', 'start_date': None,
                          'end_date': None, 'district': '1',
                          'party': ''}],
                     'offices': [], 'sources': []})

    l.add_role('committee member', 'T1', committee='Some Committee',
               position='chairman')
    assert_equal(l['roles'][1], {'role': 'committee member', 'term': 'T1',
                                 'start_date': None, 'end_date': None,
                                 'committee': 'Some Committee',
                                 'position': 'chairman'})

    l.add_office('capitol', 'Statehouse Office', '123 Main St', '123-456-7890',
                 '123-555-5555', '*****@*****.**')
    assert_equal(l['offices'], [{'type': 'capitol',
                                 'name': 'Statehouse Office',
                                 'address': '123 Main St',
                                 'phone': '123-456-7890',
                                 'fax': '123-555-5555',
                                 'email': '*****@*****.**'}])
Example #3
    def scrape_senators(self, chamber, term):
        url = 'http://www.ohiosenate.gov/directory.html'
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for el in page.xpath('//table[@class="fullWidth"]/tr/td'):
                sen_link = el.xpath('a[@class="senatorLN"]')[1]
                sen_url = sen_link.get('href')

                full_name = sen_link.text
                full_name = full_name[0:-2]
                if full_name == 'To Be Announced':
                    continue

                district = el.xpath('string(h3)').split()[1]

                party = el.xpath('string(a[@class="senatorLN"]/span)')

                if party == "D":
                    party = "Democratic"
                elif party == "R":
                    party = "Republican"

                office_phone = el.xpath("b[text() = 'Phone']")[0].tail
                office_phone = office_phone.strip(' :')

                office = ", ".join([x.strip() for x in \
                                    el.xpath("./text()")[2:-1]])

                photo_url = el.xpath("a/img")[0].attrib['src']
                email = el.xpath('.//span[@class="tan"]/text()')[1]

                leg = Legislator(term, chamber, district, full_name,
                                 party=party, photo_url=photo_url, url=sen_url,
                                 email="")

                committees = self.scrape_senate_committees(sen_url)

                leg.add_office('capitol',
                               'Capitol Office',
                               address=office,
                               phone=office_phone)

                leg.add_source(url)
                leg.add_source(sen_url)

                for committee in committees:
                    chmbr = chamber
                    if "joint" in committee['committee'].lower():
                        chmbr = "joint"

                    leg.add_role('committee member',
                        term=term,
                        chamber=chmbr,
                        committee=committee['committee'],
                        position=committee['title']
                    )

                self.save_legislator(leg)
Example #4
    def scrape(self, chamber, session):
        url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory( url, chamber, session )

        for person in people_pages:
            district = person
            p_url = people_pages[district]
            metainf = self.process_person( p_url )

            p = Legislator( session, chamber, district, metainf['name'],
                party=metainf['party'],
                # some additional things the website provides:
                occupation=metainf['occupation'],
                photo_url=metainf['photo_url'])
            p.add_source( p_url )

            if 'ctty' in metainf:
                for ctty in metainf['ctty']:
                    p.add_role( 'committee member',
                        term=session,
                        chamber=chamber,
                        committee=ctty,
                        position="member"
                    )
            self.save_legislator( p )
Example #5
    def fetch_member(self, url, name, term, chamber):
        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            name = name.rsplit('-', 1)[0]
            if action == 'Resigned':
                pass # TODO: set end date
            elif action == 'Member':
                pass # TODO: set start date

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            party_district_line = doc.xpath('//h3/font/text()')[0]
            party, district = party_district_re.match(party_district_line).groups()

            leg = Legislator(term, chamber, district, name.strip(),
                             party=party_map[party], url=url)
            leg.add_source(url)

            for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
                leg.add_role('committee member', term=term, chamber=chamber,
                             committee=com)

            self.save_legislator(leg)
Example #6
    def scrape(self, chamber, session):
        url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory( url, chamber, session )

        for person in people_pages:
            district = person
            p_url = people_pages[district]
            metainf = self.process_person( p_url )

            p = Legislator( session, chamber, district, metainf['name'],
                party=metainf['party'],
                # some additional things the website provides:
                occupation=metainf['occupation'],
                photo_url=metainf['photo_url'],
                url=metainf['homepage'])
            if "email" in metainf:
                p['email'] = metainf['email']
            if "number" in metainf:
                p.add_office('capitol', 'Capitol Office',
                             phone=metainf['number'],
                             address='200 E. Colfax\nDenver, CO 80203'
                            )

            p.add_source( p_url )

            if 'ctty' in metainf:
                for ctty in metainf['ctty']:
                    p.add_role( 'committee member',
                        term=session,
                        chamber=chamber,
                        committee=clean_committee(ctty),
                        position="member"
                    )
            self.save_legislator( p )
Example #7
    def scrape(self, term, chambers):
        url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

        option_map = {}
        html = self.urlopen('http://www.gencourt.state.nh.us/house/members/memberlookup.aspx')
        doc = lxml.html.fromstring(html)
        for opt in doc.xpath('//option'):
            option_map[opt.text] = opt.get('value')

        with self.urlopen(url) as data:
            for line in data.splitlines():
                if line.strip() == "":
                    continue

                (chamber, fullname, last, first, middle, county, district_num,
                 seat, party, street, street2, city, astate, zipcode,
                 home_phone, office_phone, fax, email, com1, com2, com3,
                 com4, com5, _, _) = line.split('*')

                chamber = chamber_map[chamber]

                # skip legislators from a chamber we aren't scraping
                if chamber not in chambers:
                    continue

                if middle:
                    full = '%s %s %s' % (first, middle, last)
                else:
                    full = '%s %s' % (first, last)

                address = street
                if street2:
                    address += (' ' + street2)
                address += '\n%s, %s %s' % (city, astate, zipcode)

                district = str(int(district_num))
                if county:
                    district = '%s %s' % (county, district)

                leg = Legislator(term, chamber, district, full, first, last,
                                 middle, party_map[party], email=email)
                leg.add_office('district', 'Home Address',
                               address=address, phone=home_phone or None)
                leg.add_office('district', 'Office Address',
                               phone=office_phone or None, fax=fax or None)

                if chamber == 'upper':
                    leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num)
                elif chamber == 'lower':
                    code = option_map.get('{0}, {1}'.format(last, first))
                    if code:
                        leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code

                for com in (com1, com2, com3, com4, com5):
                    if com:
                        leg.add_role('committee member', term=term,
                                      chamber=chamber, committee=com)

                leg.add_source(url)
                self.save_legislator(leg)
Example #8
    def scrape_senators(self, chamber, session, term):
        url = self.senator_url % (session[2:])
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            table = page.xpath('//*[@id="mainContent"]/table//table/tr')
            rowcount = 0
            for tr in table:
                rowcount += 1
                # skip header rows
                if rowcount < 2:
                    continue
                tds = tr.xpath('td')
                full_name = tds[0].xpath('div/a')[0].text_content().strip()
                party_and_district = tds[1].xpath(
                    'div')[0].text_content().strip().split('-')
                party = party_and_district[0]
                if party == 'D':
                    party = 'Democratic'
                elif party == 'R':
                    party = 'Republican'
                senator_key = "%s%s" % (party_and_district[0].lower(),
                                        party_and_district[1])
                district = party_and_district[1]
                phone = tds[3].xpath('div')[0].text_content().strip()
                leg = Legislator(term, chamber, district, full_name,
                                 '', '', '', party)
                leg.add_source(url)

                url = self.senator_details_url % (session[2:], int(district))
                with self.urlopen(url) as details_page:
                    leg.add_source(url)
                    page = lxml.html.fromstring(details_page)
                    photo_url = page.xpath('//html/body/div[2]/div/img/@src')[0]
                    committees = page.xpath(
                        '//html/body/div[2]//span[@class="style3"]/a')
                    for c in committees:
                        if c.attrib.get('href').find('info/comm/') == -1:
                            continue
                        parts = c.text_content().split('\n')
                        subcommittee = None
                        if len(parts) > 1:
                            subcommittee = (parts[1].strip()
                                            .replace('- ', '')
                                            .replace(', Vice-Chairman', '')
                                            .replace(', Chairman', ''))
                        committee = (parts[0].strip()
                                     .replace(', Vice-Chairman', '')
                                     .replace(', Chairman', ''))
                        if subcommittee:
                            leg.add_role('committee member', term,
                                         committee=committee,
                                         subcommittee=subcommittee,
                                         chamber=chamber)
                        else:
                            leg.add_role('committee member', term,
                                         committee=committee, chamber=chamber)

                url = self.senator_address_url % (session[2:],
                                                  int(senator_key[1:]))
                with self.urlopen(url) as details_page:
                    leg.add_source(url)
                    page = lxml.html.fromstring(details_page)
                    address = page.xpath(
                        '/html/body//span[2]')[0].text_content().split('\n')
                    email = page.xpath('/html/body/p/span[2]/a/@href')
                    # TODO: the href is a real address only if it doesn't
                    # contain 'mail_form'; if it does, the page offers only
                    # a web form, so there may be no email at all.
                    # TODO: a lot of these pages list fax numbers. Include?
                leg['office_phone'] = phone
                leg['office_address'] = "%s%s" % (address[0], address[1])
                leg['photo_url'] = photo_url
                if email and email[0] != 'mailto:':
                    leg['email'] = email[0].split(':')[1]
                self.save_legislator(leg)
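The email TODO above can be made explicit with a small guard. A sketch, assuming hrefs on these pages are either real mailto: links or links to a 'mail_form' contact page, as the comment describes:

def _extract_email(hrefs):
    # Hypothetical helper: return an address only for genuine mailto:
    # links, skipping 'mail_form' pages that expose no address.
    for href in hrefs:
        if href.startswith('mailto:') and 'mail_form' not in href:
            address = href.split(':', 1)[1]
            if address:
                return address
    return None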
Example #9
    def scrape_legislator_data(self, url, chamber):
        party_fulls = {'R' : 'Republican', 'D' : 'Democrat'}
        with self.urlopen(url) as page:
            page = BeautifulSoup(page)
            for data in page.find('table', id = 'ctl00_mainCopy_DataList1')('td'):
                spans = data('span')
                if len(spans) == 0:
                    self.debug('Found an empty cell in %s. Continuing' % url)
                    continue
                full_name = ' '.join([span.string.strip() for span in spans])
                if len(spans[0].string.strip().split()) == 2:
                    first_name, middle_name = spans[0].string.strip().split()
                else:
                    first_name, middle_name = spans[0].string.strip(), ''
                last_name = spans[1].string.strip()

                details_url = get_abs_url(url, data.find('a')['href'])
                with self.urlopen(details_url) as details:
                    details = BeautifulSoup(details)
                    district = details.find('a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                    party = party_fulls[details.find('span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                    leg = Legislator('2010', chamber, district, full_name, first_name, 
                            last_name, middle_name, party)
                    leg.add_source(details_url)

                    comms_table = details.find('table', id = 'ctl00_mainCopy_MembershipGrid')
                    for comms_raw_data in comms_table('tr')[1:]:
                        comm_data = comms_raw_data('td')
                        comm_role_type = comm_data[0].string.strip()
                        comm_name = comm_data[1]('a')[0].string.strip()
                        leg.add_role(comm_role_type, '2010', chamber = chamber, committee = comm_name)

                    self.save_legislator(leg)
Example #10
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_chamber_listing_url( chamber ))
        for leg in metainf:
            p = Legislator( session, chamber, leg['district'], leg['name'],
                party=leg['party'],
                # some additional things the website provides:
                photo_url=leg['image'],
                url=leg['homepage'],
                room=leg['room'],
                phone=leg['phone'],
                fax=leg['fax'],
                email=leg['email'],
                address=leg['addr'])

            for source in leg['source']:
                p.add_source( source )

            try:
                for ctty in leg['ctty']:
                    flag='Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role( 'committee member',
                        term=session,
                        chamber=ctty_chamber,
                        committee=ctty['name'],
                        position="member")
            except KeyError:
                self.log( "XXX: Warning, %s has no scraped Commities" %
                    leg['name'] )

            self.save_legislator( p )
Example #11
    def scrape_reps(self, chamber, session, term):
        url = (self.reps_url % (session))
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            # This is the ASP.net table container
            table_xpath = ('id("ContentPlaceHolder1_'
                            'gridMembers_DXMainTable")')
            table = page.xpath(table_xpath)[0]
            for tr in table.xpath('tr')[1:]:
                tds = tr.xpath('td')
                leg_code = tds[0].xpath('a[1]')[0].attrib.get('href')
                last_name = tds[0].text_content().strip()
                first_name = tds[1].text_content().strip()
                full_name = '%s %s' % (first_name, last_name)
                district = str(int(tds[2].text_content().strip()))
                party = tds[3].text_content().strip()
                if party == 'Democrat':
                    party = 'Democratic'
                phone = tds[4].text_content().strip()
                room = tds[5].text_content().strip()
                address = self.assumed_address_fmt % (room if room else '')
                if last_name == 'Vacant':
                    leg = Legislator(term, chamber, district, full_name=full_name,
                                first_name=first_name, last_name=last_name,
                                party=party, _code=leg_code, url=url)

                    leg.add_office('capitol', "Capitol Office",
                                   address=address,
                                   phone=phone)

                    leg.add_source(url)
                    self.save_vacant_legislator(leg)
                else:
                    leg = Legislator(term, chamber, district, full_name=full_name,
                              first_name=first_name, last_name=last_name,
                              party=party, _code=leg_code, url=url)

                    leg.add_office('capitol', 'Capitol Office',
                                   address=address,
                                   phone=phone)

                    url = (self.rep_details_url % (session,district))
                    leg.add_source(url)
                    with self.urlopen(url) as details_page:
                        page = lxml.html.fromstring(details_page)
                        picture = page.xpath('//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
                        email = page.xpath('//*[@id="ContentPlaceHolder1_lblAddresses"]/table/tr[4]/td/a/@href')
                        terms = page.xpath('//*[@id="ContentPlaceHolder1_lblElected"]')
                        committees = page.xpath('//*[@id="ContentPlaceHolder1_lblCommittees"]/li/a')
                        for c in committees:
                            leg.add_role('committee member', term, committee=c.text_content().strip(), chamber=chamber)
                        # TODO home address?
                        if len(email) > 0 and email[0] != 'mailto:':
                            #print "Found email : %s" % email[0]
                            leg['email'] = email[0].split(':')[1]
                        if len(picture) > 0:
                            #print "Found picture : %s" % picture[0]
                            leg['photo_url'] = picture[0]
                        #leg.add_source(url)
                        self.save_legislator(leg)
Example #12
    def scrape_reps(self, chamber, session, term):
        url = (self.reps_url % (session))
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        # This is the ASP.net table container
        table_xpath = ('id("ContentPlaceHolder1_'
                        'gridMembers_DXMainTable")')
        table = page.xpath(table_xpath)[0]
        for tr in table.xpath('tr')[1:]:
            tds = tr.xpath('td')
            leg_code = tds[0].xpath('a[1]')[0].attrib.get('href')
            last_name = tds[0].text_content().strip()
            first_name = tds[1].text_content().strip()
            full_name = '%s %s' % (first_name, last_name)
            district = str(int(tds[2].text_content().strip()))
            party = tds[3].text_content().strip()
            if party == 'Democrat':
                party = 'Democratic'
            phone = tds[4].text_content().strip()
            room = tds[5].text_content().strip()
            address = self.assumed_address_fmt % (room if room else '')
            if last_name == 'Vacant':
                leg = Legislator(term, chamber, district, full_name=full_name,
                            first_name=first_name, last_name=last_name,
                            party=party, _code=leg_code, url=url)

                leg.add_office('capitol', "Capitol Office",
                               address=address,
                               phone=phone)

                leg.add_source(url)
                self.save_vacant_legislator(leg)
            else:
                leg = Legislator(term, chamber, district, full_name=full_name,
                          first_name=first_name, last_name=last_name,
                          party=party, _code=leg_code, url=url)

                leg.add_office('capitol', 'Capitol Office',
                               address=address,
                               phone=phone)

                url = (self.rep_details_url % (session,district))
                leg.add_source(url)
                details_page = self.urlopen(url)
                page = lxml.html.fromstring(details_page)
                picture = page.xpath('//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
                email = page.xpath('//*[@id="ContentPlaceHolder1_lblAddresses"]/table/tr[4]/td/a/@href')
                terms = page.xpath('//*[@id="ContentPlaceHolder1_lblElected"]')
                committees = page.xpath('//*[@id="ContentPlaceHolder1_lblCommittees"]/li/a')
                for c in committees:
                    leg.add_role('committee member', term, committee=c.text_content().strip(), chamber=chamber)
                # TODO home address?
                if len(email) > 0 and email[0] != 'mailto:':
                    #print "Found email : %s" % email[0]
                    leg['email'] = email[0].split(':')[1]
                if len(picture) > 0:
                    #print "Found picture : %s" % picture[0]
                    leg['photo_url'] = picture[0]
                #leg.add_source(url)
                self.save_legislator(leg)
Example #13
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_chamber_listing_url( chamber ))
        for leg in metainf:
            p = Legislator( session, chamber, leg['district'], leg['name'],
                party=leg['party'],
                # some additional things the website provides:
                photo_url=leg['image'],
                url=leg['homepage'],
                email=leg['email'])
            p.add_office('capitol', 'Capitol Office', address=leg['addr'],
                         phone=leg['phone'], fax=leg['fax'] or None)

            for source in leg['source']:
                p.add_source( source )

            try:
                for ctty in leg['ctty']:
                    flag='Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role( 'committee member',
                        term=session,
                        chamber=ctty_chamber,
                        committee=ctty['name'],
                        position="member")
            except KeyError:
                self.log( "XXX: Warning, %s has no scraped Commities" %
                    leg['name'] )

            self.save_legislator( p )
Example #14
    def fetch_member(self, url, name, term, chamber):
        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            name = name.rsplit('-', 1)[0]
            if action == 'Resigned':
                pass  # TODO: set end date
            elif action == 'Member':
                pass  # TODO: set start date

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            party_district_line = doc.xpath('//h3/font/text()')[0]
            party, district = party_district_re.match(
                party_district_line).groups()

            leg = Legislator(term,
                             chamber,
                             district,
                             name.strip(),
                             party=party_map[party],
                             url=url)
            leg.add_source(url)

            for ul in doc.xpath('//ul[@class="linkNon"]'):
                address = []
                phone = None
                email = None
                for li in ul.getchildren():
                    text = li.text_content()
                    if re.match(r'\(\d{3}\)', text):
                        phone = text
                    elif text.startswith('email:'):
                        email = text.replace('email:', '', 1).strip()
                    else:
                        address.append(text)
                # classify the office once per list, after all lines are read
                office_type = ('capitol'
                               if 'Capitol Square' in address else 'district')
                office_name = ('Capitol Office'
                               if office_type == 'capitol' else 'District Office')
                leg.add_office(office_type,
                               office_name,
                               address='\n'.join(address),
                               phone=phone,
                               email=email)

            for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
                leg.add_role('committee member',
                             term=term,
                             chamber=chamber,
                             committee=com)

            self.save_legislator(leg)
Example #15
    def scrape(self, term, chambers):
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        data = self.get(leg_url)
        page = open_csv(data)

        for row in page:
            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]
            if chamber not in chambers:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, chamber, district,
                             name, first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'].strip(),
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol', 'Capitol Office',
                           address=office_address, phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            for comm in row['committee member1'].split(';'):
                if comm:
                    if ' (' in comm:
                        comm, role = comm.split(' (', 1)
                        role = role.strip(')').lower()
                    else:
                        role = 'member'
                    comm = comm.strip()
                    if comm == '':
                        continue

                    leg.add_role('committee member', term,
                                 chamber='joint',
                                 committee=comm,
                                 position=role)

            self.save_legislator(leg)
Example #16
    def scrape_legislator(self, chamber, term, url):
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)

            # most properties are easy to pull
            properties = {'first_name': 'FNAME', 'last_name': 'LNAME',
                          'party': 'PARTY', 'district': 'DISTRICT',
                          'county': 'COUNTY', 'start_year': 'STARTYEAR',
                          'occupation': 'OCCUPATION',
                          'capitol_phone': 'OFF_PHONE', 'office_phone': 'WKPH'}
            for key, value in properties.iteritems():
                id = 'ctl00_mainCopy_LegisInfo_%sLabel' % value
                try:
                    val = doc.get_element_by_id(id).text
                except KeyError:
                    self.warning('bad legislator page %s missing %s' %
                                 (url, id))
                    return
                if val:
                    properties[key] = val.strip()

            # image & email are a bit different
            properties['photo_url'] = doc.xpath('//img[@id="ctl00_mainCopy_LegisInfo_LegislatorPhoto"]/@src')[0]
            email = doc.get_element_by_id('ctl00_mainCopy_LegisInfo_lnkEmail').text
            if email:
                properties['email'] = email.strip()

            properties['url'] = url

            properties['chamber'] = chamber
            properties['term'] = term
            properties['full_name'] = '%(first_name)s %(last_name)s' % properties
            if '(D)' in properties['party']:
                properties['party'] = 'Democratic'
            elif '(R)' in properties['party']:
                properties['party'] = 'Republican'
            elif '(DTS)' in properties['party']:
                properties['party'] = 'Decline to State'
            else:
                raise Exception("unknown party encountered")

            leg = Legislator(**properties)
            leg.add_source(url)

            # committees
            # skip first header row
            for row in doc.xpath('//table[@id="ctl00_mainCopy_MembershipGrid"]/tr')[1:]:
                role, committee, note = [x.text_content()
                                         for x in row.xpath('td')]
                if 'Interim' in note:
                    role = 'interim ' + role.lower()
                else:
                    role = role.lower()
                leg.add_role('committee member', term, committee=committee,
                             position=role, chamber=chamber)

            self.save_legislator(leg)
Example #17
    def scrape(self, chamber, term):

        office_code = {'upper': 'S', 'lower': 'H'}[chamber]

        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        data = self.urlopen(leg_url)
        page = open_csv(data)

        for row in page:
            if office_code != row['office code']:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'],
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol',
                           'Capitol Office',
                           address=office_address,
                           phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            for comm_code in row['committee codes'].split(';'):
                if comm_code:
                    comm_name = self._committee_names[comm_code]
                    leg.add_role('committee member',
                                 term,
                                 chamber='joint',
                                 committee=comm_name)

            self.save_legislator(leg)
Example #18
    def fetch_member(self, url, name, term, chamber):
        if name in CHAMBER_MOVES:
            if chamber != CHAMBER_MOVES[name]:
                return  # Skip bad chambers.

        if "vacated" in name.lower():
            self.logger.warning("Seat seems to have been vacated: '{}'".format(name))
            return

        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            name = name.rsplit('-', 1)[0]

            if action == 'Resigned':
                pass # TODO: set end date
            elif action == 'Member':
                pass # TODO: set start date

        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(party_district_line).groups()

        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party], url=url)
        leg.add_source(url)

        for ul in doc.xpath('//ul[@class="linkNon"]'):
            address = []
            phone = None
            email = None
            for li in ul.getchildren():
                text = li.text_content()
                if re.match(r'\(\d{3}\)', text):
                    phone = text
                elif text.startswith('email:'):
                    email = text.replace('email:', '', 1).strip()
                else:
                    address.append(text)
            # classify the office once per list, after all lines are read
            office_type = ('capitol' if 'Capitol Square' in address
                           else 'district')
            office_name = ('Capitol Office' if office_type == 'capitol'
                           else 'District Office')
            leg.add_office(office_type, office_name, address='\n'.join(address),
                           phone=phone, email=email)

        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member', term=term, chamber=chamber,
                         committee=com)

        self.save_legislator(leg)
Example #19
    def scrape(self, chamber, term):
        url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

        self.validate_term(term, latest_only=True)

        with self.urlopen(url) as data:
            for line in data.splitlines():
                (body, fullname, last, first, middle, county, district, seat,
                 party, street, street2, city, astate, zipcode, home_phone,
                 office_phone, fax, email, com1, com2, com3, com4, com5, _,
                 _) = line.split('*')

                # skip legislators from other chamber
                if body != chamber_name[chamber]:
                    continue

                if middle:
                    full = '%s %s %s' % (first, middle, last)
                else:
                    full = '%s %s' % (first, last)

                address = street
                if street2:
                    address += (' ' + street2)
                address += '\n%s, %s %s' % (city, astate, zipcode)

                district = str(int(district))
                if county:
                    district = '%s %s' % (county, district)

                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full,
                                 first,
                                 last,
                                 middle,
                                 party_map[party],
                                 address=address,
                                 home_phone=home_phone,
                                 office_phone=office_phone,
                                 office_fax=fax,
                                 email=email)

                # use seat as a _code if chamber is lower
                if chamber == 'lower':
                    leg['_code'] = seat

                for com in (com1, com2, com3, com4, com5):
                    if com:
                        leg.add_role('committee member',
                                     term=term,
                                     chamber=chamber,
                                     committee=com)

                leg.add_source(url)
                self.save_legislator(leg)
Example #20
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            chamber_name = 'senate'
        else:
            chamber_name = 'house'

        url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@class="legis"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
            name = link.text.strip()
            leg_url = link.get('href')
            district = link.xpath("string(../../td[2])")
            party = link.xpath("string(../../td[3])")
            email = link.xpath("string(../../td[5])")

            if party == 'Democrat':
                party = 'Democratic'

            pid = re.search("PID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                         "?GA=84&PID=%s" % pid)

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email, photo_url=photo_url, url=url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))
            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    sub = re.match('^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
Example #21
    def scrape(self, term, chambers):
        # The links on http://www.sanjoseca.gov/index.aspx?NID=1187 may go off-
        # site, so use http://www.sanjoseca.gov/index.aspx?NID=146
        council_url = 'http://www.sanjoseca.gov/index.aspx?NID=146'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        tds = doc.xpath('//div[@id="Section1"]//td')
        assert len(tds) <= 11, 'expected at most 11 unique mayor and councilmember URLs, found %d' % len(tds)

        lines = []
        for text in doc.xpath('//div[@id="Section1"]/text()'):
            text = clean_string(text)
            if re.match('^(?:\d+|San) ', text):
                lines.append(text)
        address = '\n'.join(lines)

        emails = []
        for text in doc.xpath('//div[@id="Section1"]/script/text()'):
            # PhantomJS would be sweet here.
            emails.append(''.join(re.search('([^"]+)"\+"(@)"\+"([^"]+)', text).groups()))

        for index, td in enumerate(tds):
            for text in td.xpath('.//text()'):
                match = tel_regex.search(text.strip())
                if match:
                    phone = '-'.join(match.groups())
                    break

            url       = td.xpath('.//a[strong]/@href')[0]
            photo_url = td.xpath('.//img/@src')[0]
            text      = td.xpath('.//strong/text()')[0]

            if 'District' in text:
                district = re.search('District \d+', text).group(0)
                name     = re.sub(', District \d+$', '', text)
                role     = None
                if 'Vice Mayor' in text:
                    name = name.replace('Vice Mayor ', '')
                    role = 'Vice Mayor'
            elif 'Mayor' in text:
                district = 'Mayor'
                name     = text.replace('Mayor ', '')
                role     = 'Mayor'
            else:
                self.logger.warning('Skipped: ' + text)
                continue

            legislator = Legislator(term, 'upper', district, name, email=emails[index], url=url, photo_url=photo_url, party=None)
            legislator.add_office('capitol', 'Council Office', address=address, phone=phone)

            if role:
                legislator.add_role(role, term)

            legislator.add_source(url)

            self.save_legislator(legislator)
Example #22
    def scrape_legislator(self, chamber, term, name, url):
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        dist_link = page.xpath("//a[contains(@href, 'dist=')]")[0]
        district = dist_link.xpath('string()').strip().lstrip('0')

        mem_span = page.xpath("//span[contains(@class, 'memname')]")[0]
        mem_tail = mem_span.tail.strip()

        party = re.match(r'\((R|D)', mem_tail).group(1)
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']

        email = page.xpath("//a[contains(@href, 'mailto:')]"
                           )[1].attrib['href'].split('mailto:')[1]

        leg = Legislator(term,
                         chamber,
                         district,
                         name,
                         party=party,
                         photo_url=photo_url,
                         email=email,
                         url=url)
        leg.add_source(url)

        for link in page.xpath("//a[contains(@href, 'committee.cfm')]"):
            comm = link.xpath("string()").strip()

            committee_chamber = chamber
            if 'interims' in link.attrib['href']:
                committee_chamber = 'joint'

            sub_index = comm.find('Subcommittee')
            if sub_index > 0:
                sub = comm[sub_index:].strip()
                comm = comm[:sub_index].strip()
                leg.add_role('committee member',
                             term,
                             committee=comm,
                             subcommittee=sub,
                             chamber=committee_chamber)
            else:
                leg.add_role('committee member',
                             term,
                             committee=comm,
                             chamber=committee_chamber)

        self.scrape_offices(leg, page)
        self.save_legislator(leg)
Example #23
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib['value'])
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        if district == '[N/A]':
            msg = 'No district found for %r; skipping.'
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)

        # Scrape leg page.
        try:
            html = self.urlopen(url)
        except scrapelib.HTTPError as exc:
            # As of July 2014, this only happens when a page has
            # gone missing from their varnish server.
            # if exc.response.status_code is 503:
            self.logger.exception(exc)
            self.logger.warning('Skipping legislator at url: %s' % url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath(
                '//div[@class="legislator-committees-container"]//table//tr'):
            committee, committee_type, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            if committee != "Committee Name":
                leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(address=dist_office,
                       name='Capitol Office',
                       type='capitol',
                       phone=phone)

        self.save_legislator(leg)
Example #24
    def _parse_member(self, chamber, term, member):
        first_name = member.get('first-name')
        last_name = member.get('last-name')
        party = self.party_map[member.get('party')]

        # this is semi-safe because we validated term w/ latest_only=True
        session = self.metadata['terms'][-1]['sessions'][-1]

        # extra_fields
        extra_dict = {}
        for name, xpath in self.extra_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                extra_dict[name] = result[0]

        # address fields
        for name, xpath in self.addr_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                result = result[0]
                extra_dict[name] = '%s, %s, %s %s' % (
                    result.get('street-address'), result.get('city'),
                    result.get('state'), result.get('postal-code'))

        leg = Legislator(term,
                         chamber,
                         member.get('district-number'),
                         full_name=first_name + ' ' + last_name,
                         first_name=first_name,
                         last_name=last_name,
                         middle_name=member.get('middle-initial'),
                         party=party,
                         email=member.get('e-mail'),
                         url=member.get('website'),
                         oregon_member_id=member.get('leg-member-id'),
                         **extra_dict)

        # committees
        com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
        for com in member.xpath(com_xpath):
            cdict = {
                'position': com.get('title').lower(),
                'chamber': chamber,
            }
            com_name = com.get('name')
            com_class = com.get('committee-class')
            if com_class == 'sub-committee':
                cdict['committee'], cdict['subcommittee'] = \
                        com.get('name').split(' Subcommittee On ')
            else:
                cdict['committee'] = com.get('name')

            leg.add_role('committee member', term, **cdict)

        leg.add_source(self.source_url)
        return leg
Example #25
    def _parse_member(self, chamber, term, member):
        first_name = member.get('first-name')
        last_name = member.get('last-name')
        party = self.party_map[member.get('party')]

        # this is semi-safe because we validated term w/ latest_only=True
        session = self.metadata['terms'][-1]['sessions'][-1]

        # extra_fields
        extra_dict = {}
        for name, xpath in self.extra_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                extra_dict[name] = result[0]

        # address fields
        for name, xpath in self.addr_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                result = result[0]
                extra_dict[name] = '%s, %s, %s %s' % (
                    result.get('street-address'),
                    result.get('city'),
                    result.get('state'),
                    result.get('postal-code'))

        leg = Legislator(term, chamber, member.get('district-number'),
                         full_name=first_name+' '+last_name,
                         first_name=first_name,
                         last_name=last_name,
                         middle_name=member.get('middle-initial'),
                         party=party,
                         email=member.get('e-mail'),
                         website=member.get('website'),
                         oregon_member_id=member.get('leg-member-id'),
                         **extra_dict)

        # committees
        com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
        for com in member.xpath(com_xpath):
            cdict = {
                'position': com.get('title').lower(),
                'chamber': chamber,
            }
            com_name = com.get('name')
            com_class = com.get('committee-class')
            if com_class == 'sub-committee':
                cdict['committee'], cdict['subcommittee'] = \
                        com.get('name').split(' Subcommittee On ')
            else:
                cdict['committee'] = com.get('name')

            leg.add_role('committee member', term, **cdict)

        leg.add_source(self.source_url)
        return leg
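Examples #24 and #25 drive most of the extraction from two declarative name-to-XPath maps rather than inline parsing. A sketch of what such maps could look like; the field names and XPath expressions here are illustrative assumptions, not the scraper's real configuration:

# Hypothetical shapes for the two lookup tables consumed above.
extra_fields = {
    'phone': 'phone-numbers/phone-number[@title="Capitol Phone"]/@number',
}
addr_fields = {
    # must select an element carrying street-address/city/state/postal-code
    'capitol_address': 'addresses/address[@title="Capitol Address"]',
}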
Example #26
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
            except KeyError:
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            p = Legislator(
                session,
                chamber,
                leg['district'],
                leg['name'],
                party=leg['party'],
                # some additional things the website provides:
                photo_url=leg['image'],
                url=leg['homepage'])
            p.add_office('capitol',
                         'Capitol Office',
                         address=leg['addr'],
                         phone=leg['phone'],
                         fax=leg['fax'] or None,
                         email=leg['email'])

            for source in leg['source']:
                p.add_source(source)

            try:
                for ctty in leg['ctty']:
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role('committee member',
                               term=session,
                               chamber=ctty_chamber,
                               committee=ctty['name'],
                               position="member")
            except KeyError:
                self.log("XXX: Warning, %s has no scraped Commities" %
                         leg['name'])

            self.save_legislator(p)
Example #27
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        office_code = {'upper': 'S', 'lower': 'H'}[chamber]

        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = urllib2.urlopen(leg_url)
        page = csv.DictReader(page)

        for row in page:
            if office_code != row['office code']:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             office_address=office_address,
                             office_phone=row['capitol phone'])
            leg.add_source(leg_url)

            for comm_code in row['committee codes'].split(';'):
                if comm_code:
                    comm_name = self._committee_names[comm_code]
                    leg.add_role('committee member',
                                 term,
                                 chamber='joint',
                                 committee=comm_name)

            self.save_legislator(leg)
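The lookup self._committee_names used above is populated elsewhere in the scraper. A minimal sketch of the assumed shape; the committee CSV path and column names are guesses, not confirmed by the source:

    def _scrape_committee_names(self):
        # Hypothetical: build the code -> name mapping from a companion CSV.
        comm_url = "ftp://ftp.cga.ct.gov/pub/data/committee.csv"  # assumed path
        page = urllib2.urlopen(comm_url)
        self._committee_names = {}
        for row in csv.DictReader(page):
            # Column names here are assumptions; the real file may differ.
            self._committee_names[row['comm code']] = row['comm name'].strip()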
Beispiel #28
0
def test_legislator():
    l = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    assert_equal(l, {
        '_type': 'person',
        'full_name': 'Adam Smith',
        'first_name': 'Adam',
        'last_name': 'Smith',
        'middle_name': '',
        'suffixes': '',
        'roles': [{'chamber': 'upper', 'term': 'T1', 'role': 'member',
                   'start_date': None, 'end_date': None, 'district': '1',
                   'party': ''}],
        'offices': [],
        'sources': []
    })

    l.add_role('committee member',
               'T1',
               committee='Some Committee',
               position='chairman')
    assert_equal(
        l['roles'][1], {
            'role': 'committee member',
            'term': 'T1',
            'start_date': None,
            'end_date': None,
            'committee': 'Some Committee',
            'position': 'chairman'
        })

    l.add_office('capitol', 'Statehouse Office', '123 Main St', '123-456-7890',
                 '123-555-5555', '*****@*****.**')
    assert_equal(l['offices'], [{
        'type': 'capitol',
        'name': 'Statehouse Office',
        'address': '123 Main St',
        'phone': '123-456-7890',
        'fax': '123-555-5555',
        'email': '*****@*****.**'
    }])
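The test assumes an assert_equal helper in scope; nose's implementation is the usual choice:

from nose.tools import assert_equal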
Beispiel #29
0
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib["value"])
        name, party, district = re.split(r"\s*,\s*", option.text.strip())
        name = re.sub(r"^(Sen\.|Rep\.)\s+", "", name)
        district = re.sub(r"^District\s+", "", district)
        if district == "[N/A]":
            msg = "No district found for %r; skipping."
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        try:
            html = self.urlopen(url)
        except scrapelib.HTTPError as exc:
            # As of July 2014, this only happens when a page has
            # gone missing from their varnish server.
            # if exc.response.status_code is 503:
            self.logger.exception(exc)
            self.logger.warning("Skipping legislator at url: %s" % url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath("//table//tr"):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if "member" in role.lower():
                role = "committee member"
            elif "chair" in role.lower():
                role = "chair"
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath("//address")
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r" {2,}", "", dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(address=dist_office, name="District Office", type="district", phone=phone)

        self.save_legislator(leg)
Beispiel #30
0
    def scrape(self, chamber, term):

        office_code = {'upper': 'S', 'lower': 'H'}[chamber]

        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        data = self.urlopen(leg_url)
        page = open_csv(data)

        for row in page:
            if office_code != row['office code']:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, chamber, district,
                             name, first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'],
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol', 'Capitol Office',
                           address=office_address, phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            for comm_code in row['committee codes'].split(';'):
                if comm_code:
                    comm_name = self._committee_names[comm_code]
                    leg.add_role('committee member', term,
                                 chamber='joint',
                                 committee=comm_name)

            self.save_legislator(leg)
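Unlike Beispiel #27, this version fetches the CSV through self.urlopen() and hands the result to an open_csv() helper that is not shown. A minimal sketch, assuming the helper only needs to wrap the already-decoded text in a DictReader:

import csv
from io import StringIO

def open_csv(data):
    # Hypothetical: `data` is the CSV text returned by self.urlopen().
    return csv.DictReader(StringIO(data))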
Beispiel #31
0
    def scrape(self, chamber, term):
        url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

        self.validate_term(term, latest_only=True)

        with self.urlopen(url) as data:
            for line in data.splitlines():
                (body, fullname, last, first, middle, county, district_num,
                 seat, party, street, street2, city, astate, zipcode,
                 home_phone, office_phone, fax, email, com1, com2, com3,
                 com4, com5, _, _) = line.split('*')

                # skip legislators from other chamber
                if body != chamber_name[chamber]:
                    continue

                if middle:
                    full = '%s %s %s' % (first, middle, last)
                else:
                    full = '%s %s' % (first, last)

                address = street
                if street2:
                    address += (' ' + street2)
                address += '\n%s, %s %s' % (city, astate, zipcode)

                district = str(int(district_num))
                if county:
                    district = '%s %s' % (county, district)

                leg = Legislator(term, chamber, district, full, first, last,
                                 middle, party_map[party],
                                 address=address,
                                 home_phone=home_phone,
                                 office_phone=office_phone, office_fax=fax,
                                 email=email)

                if chamber == 'lower':
                    # use seat as a _code if chamber is lower
                    leg['_code'] = seat
                else:
                    # Senate URLs are guessable
                    leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num)

                for com in (com1, com2, com3, com4, com5):
                    if com:
                        leg.add_role('committee member', term=term,
                                      chamber=chamber, committee=com)

                leg.add_source(url)
                self.save_legislator(leg)
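Both chamber_name and party_map are module-level lookup tables that this snippet assumes. Plausible definitions; the exact codes used in the NH data file are an assumption:

# Assumed values of the "body" and "party" fields in the NH member file.
chamber_name = {'lower': 'H', 'upper': 'S'}
party_map = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}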
Beispiel #32
0
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        office_code = {'upper': 'S', 'lower': 'H'}[chamber]

        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = urllib2.urlopen(leg_url)
        page = csv.DictReader(page)

        for row in page:
            if office_code != row['office code']:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])

            leg = Legislator(term, chamber, district,
                             name, first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             office_address=office_address,
                             office_phone=row['capitol phone'])
            leg.add_source(leg_url)

            for comm_code in row['committee codes'].split(';'):
                if comm_code:
                    comm_name = self._committee_names[comm_code]
                    leg.add_role('committee member', term,
                                 chamber='joint',
                                 committee=comm_name)

            self.save_legislator(leg)
Beispiel #33
0
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            p = Legislator(session, chamber, leg['district'], leg['name'],
                party=leg['party'],
                # some additional things the website provides:
                photo_url=leg['image'],
                url=leg['homepage'],
                email=leg['email'])
            p.add_office('capitol', 'Capitol Office', address=leg['addr'],
                         phone=leg['phone'], fax=leg['fax'] or None)

            for source in leg['source']:
                p.add_source(source)

            try:
                for ctty in leg['ctty']:
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role('committee member',
                        term=session,
                        chamber=ctty_chamber,
                        committee=ctty['name'],
                        position="member")
            except KeyError:
                self.log("XXX: Warning, %s has no scraped Committees" %
                    leg['name'])

            self.save_legislator(p)
Beispiel #34
0
    def scrape(self, chamber, term):
        """
        Scrapes legislators for the current term only
        """
        self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')
            leg_url = inner.xpath('p/a/@href')[0]

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email)

            phones = get_phones(inner)
            leg.add_office('district', 'District Office',
                           address=get_address(inner), fax=get_fax(inner),
                           phone=phones.get('home') or phones.get('business'))
            leg.add_office('capitol', 'Capitol Office', phone=phones.get('office'))

            leg.add_source(url)
            leg['photo_url'] = img_url
            leg['url'] = leg_url

            for com in inner.xpath('p/a[contains(@href, "committees")]'):
                role = com.tail.strip()
                if not role:
                    role = 'member'
                leg.add_role('committee member',
                             term=term,
                             chamber=chamber,
                             committee=com.text,
                             position=role)

            self.save_legislator(leg)
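The helpers get_address(), get_fax() and get_phones() are not shown here, but Beispiel #47 below inlines the same parsing, so a sketch consistent with it follows. The dict keys returned by get_phones() are assumptions; the 'office' key used for the Capitol Office above presumably comes from a prefix not visible in the inline version:

import re

def get_address(inner):
    # The address is the <br> tail containing a ZIP code; prefix the ZIP
    # with "ID" as the inline version in Beispiel #47 does.
    for br in inner.xpath('p/br'):
        piece = (br.tail or '').strip()
        if re.findall(r', \d{5}', piece):
            return re.sub(r'(\d{5})', r'ID \1', piece).strip()

def get_fax(inner):
    for br in inner.xpath('p/br'):
        piece = (br.tail or '').strip()
        if piece.startswith('FAX '):
            return piece[4:]

def get_phones(inner):
    phones = {}
    for br in inner.xpath('p/br'):
        piece = (br.tail or '').strip()
        if piece.startswith('Home '):
            phones['home'] = piece[5:]
        elif piece.startswith('Bus '):
            phones['business'] = piece[4:]
    return phones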
Beispiel #35
0
    def scrape_legislator(self, chamber, term, name, url):
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        dist_link = page.xpath("//a[contains(@href, 'dist=')]")[0]
        district = dist_link.xpath('string()').strip().lstrip('0')

        mem_span = page.xpath("//span[contains(@class, 'memname')]")[0]
        mem_tail = mem_span.tail.strip()

        party = re.match(r'\((R|D)', mem_tail).group(1)
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']

        email = page.xpath(
            "//a[contains(@href, 'mailto:')]")[1].attrib['href'].split(
            'mailto:')[1]

        leg = Legislator(term, chamber, district, name, party=party,
                         photo_url=photo_url, email=email, url=url)
        leg.add_source(url)

        for link in page.xpath("//a[contains(@href, 'committee.cfm')]"):
            comm = link.xpath("string()").strip()

            committee_chamber = chamber
            if 'interims' in link.attrib['href']:
                committee_chamber = 'joint'

            sub_index = comm.find('Subcommittee')
            if sub_index > 0:
                sub = comm[sub_index:].strip()
                comm = comm[:sub_index].strip()
                leg.add_role('committee member', term, committee=comm,
                             subcommittee=sub, chamber=committee_chamber)
            else:
                leg.add_role('committee member', term, committee=comm,
                             chamber=committee_chamber)

        self.scrape_offices(leg, page)
        self.save_legislator(leg)
Beispiel #36
0
    def scrape_bio(self, term, chamber, district, name, url):
        # this opens the committee section without having to do another request
        url += '&TableRow=1.5.5'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # party is in one of these
        party = doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()')
        if '(D)' in party:
            party = 'Democratic'
        elif '(R)' in party:
            party = 'Republican'

        leg = Legislator(term, chamber, district, name, party=party, url=url)

        photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
        if photo_url:
            leg['photo_url'] = photo_url[0]

        roles = defaultdict(lambda: {})

        position = 'member'
        for text in doc.xpath('//td[@width="584"]/descendant::font/text()'):
            text = text.strip()
            if text == 'Committee Chair:':
                position = 'chair'
            elif text == 'Committee Co-chair:':
                position = 'co-chair'
            else:
                for committee in text.splitlines():
                    roles[committee].update(
                        role='committee member',
                        term=term,
                        chamber=chamber,
                        committee=committee,
                        party=party,
                        position=position)

        for role in roles.values():
            leg.add_role(**role)

        contact_info = self.scrape_contact_info(doc)
        leg.update(contact_info)

        return leg
Beispiel #38
0
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib['value'])
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        if district == '[N/A]':
            msg = 'No district found for %r; skipping.'
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath('//table//tr'):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(
            address=dist_office, name='District Office',
            type='district', phone=phone)

        self.save_legislator(leg)
Beispiel #39
0
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib['value'])
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath('//table//tr'):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(
            address=dist_office, name='District Office',
            type='district', phone=phone)

        self.save_legislator(leg)
Beispiel #40
0
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_chamber_listing_url(chamber))
        for leg in metainf:
            p = Legislator(
                session,
                chamber,
                leg["district"],
                leg["name"],
                party=leg["party"],
                # some additional things the website provides:
                photo_url=leg["image"],
                url=leg["homepage"],
                email=leg["email"],
            )
            p.add_office("capitol", "Capitol Office", address=leg["addr"], phone=leg["phone"], fax=leg["fax"] or None)

            for source in leg["source"]:
                p.add_source(source)

            try:
                for ctty in leg["ctty"]:
                    flag = "Joint Legislative"
                    if ctty["name"][: len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role(
                        "committee member",
                        term=session,
                        chamber=ctty_chamber,
                        committee=ctty["name"],
                        position="member",
                    )
            except KeyError:
                self.log("XXX: Warning, %s has no scraped Commities" % leg["name"])

            self.save_legislator(p)
Beispiel #41
0
    def fetch_member(self, url, name, term, chamber):
        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        url = 'http://leg6.state.va.us' + url

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            name = name.rsplit('-')[0]
            if action == 'Resigned':
                pass  # TODO: set end date
            elif action == 'Member':
                pass  # TODO: set start date

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            party_district_line = doc.xpath('//h3/font/text()')[0]
            party, district = party_district_re.match(
                party_district_line).groups()

            leg = Legislator(term,
                             chamber,
                             district,
                             name.strip(),
                             party=party_map[party])
            leg.add_source(url)

            for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
                leg.add_role('committee member',
                             term=term,
                             chamber=chamber,
                             committee=com)

            self.save_legislator(leg)
Beispiel #42
0
    def scrape(self, chamber, session):
        url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory(url, chamber, session)

        for person in people_pages:
            district = person
            p_url = people_pages[district]
            metainf = self.process_person(p_url)

            p = Legislator(
                session,
                chamber,
                district,
                metainf['name'],
                party=metainf['party'],
                # some additional things the website provides:
                occupation=metainf['occupation'],
                photo_url=metainf['photo_url'],
                url=metainf['homepage'])
            if "email" in metainf:
                p['email'] = metainf['email']
            if "number" in metainf:
                p.add_office('capitol',
                             'Capitol Office',
                             phone=metainf['number'],
                             address='200 E. Colfax\nDenver, CO 80203')

            p.add_source(p_url)

            if 'ctty' in metainf:
                for ctty in metainf['ctty']:
                    p.add_role('committee member',
                               term=session,
                               chamber=chamber,
                               committee=ctty,
                               position="member")
            self.save_legislator(p)
Beispiel #43
0
    def scrape(self, chamber, term):
        self.validate_term(term)

        l1 = Legislator(term, chamber, '1st',
                        'Bob Smith', party='Democrat')

        if chamber == 'upper':
            l1.add_role('President of the Senate', term)
        else:
            l1.add_role('Speaker of the House', term)

        l1.add_source('http://example.com/Bob_Smith.html')

        l2 = Legislator(term, chamber, '2nd',
                        'Sally Johnson', party='Republican')
        l2.add_role('Minority Leader', term)
        l2.add_source('http://example.com/Sally_Johnson.html')

        self.save_legislator(l1)
        self.save_legislator(l2)
Beispiel #44
0
    def scrape(self, chamber, term):
        self.validate_term(term)

        l1 = Legislator(term, chamber, '1st', 'Bob Smith', party='Democrat')

        if chamber == 'upper':
            l1.add_role('President of the Senate', term)
        else:
            l1.add_role('Speaker of the House', term)

        l1.add_source('http://example.com/Bob_Smith.html')

        l2 = Legislator(term,
                        chamber,
                        '2nd',
                        'Sally Johnson',
                        party='Republican')
        l2.add_role('Minority Leader', term)
        l2.add_source('http://example.com/Sally_Johnson.html')

        self.save_legislator(l1)
        self.save_legislator(l2)
Beispiel #45
0
    def scrape_legislator(self, chamber, term, url):
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)

            # most properties are easy to pull
            properties = {
                'first_name': 'FNAME',
                'last_name': 'LNAME',
                'party': 'PARTY',
                'district': 'DISTRICT',
                'county': 'COUNTY',
                'start_year': 'STARTYEAR',
                'occupation': 'OCCUPATION',
                'office_phone': 'WKPH'
            }

            for key, value in properties.iteritems():
                id_ = 'ctl00_mainCopy_LegisInfo_%sLabel' % value
                try:
                    val = doc.get_element_by_id(id_).text
                except KeyError:
                    self.warning('bad legislator page %s missing %s' %
                                 (url, id_))
                    return
                if val:
                    properties[key] = val.strip()

            # image & email are a bit different
            properties['photo_url'] = doc.xpath(
                '//img[@id="ctl00_mainCopy_LegisInfo_LegislatorPhoto"]/@src'
            )[0]
            email = doc.get_element_by_id(
                'ctl00_mainCopy_LegisInfo_lnkEmail').text
            if email:
                properties['email'] = email.strip()

            properties['url'] = url

            properties['chamber'] = chamber
            properties['term'] = term
            properties['full_name'] = '%(first_name)s %(last_name)s' % properties
            if '(D)' in properties['party']:
                properties['party'] = 'Democratic'
            elif '(R)' in properties['party']:
                properties['party'] = 'Republican'
            elif '(DTS)' in properties['party']:
                properties['party'] = 'Decline to State'
            else:
                raise Exception("unknown party encountered")

            leg = Legislator(**properties)
            leg.add_source(url)

            # committees
            # skip first header row
            for row in doc.xpath(
                    '//table[@id="ctl00_mainCopy_MembershipGrid"]/tr')[1:]:
                role, committee, note = [
                    x.text_content() for x in row.xpath('td')
                ]
                if 'Interim' in note:
                    role = 'interim ' + role.lower()
                else:
                    role = role.lower()
                leg.add_role('committee member',
                             term,
                             committee=committee,
                             position=role,
                             chamber=chamber)

            # Already have the photo url.
            try:
                del leg['image_url']
            except KeyError:
                pass

            self.save_legislator(leg)
Beispiel #46
0
    def scrape(self, chamber, term):
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod(session)

        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
                                                               session_id, body)
        page = self.get(url).text
        root = html.fromstring(page)
        path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            vacated = ''
            name, district, party, email, room, phone, fax = row.xpath('td')

            if email.attrib.get('class') == 'vacantmember':
                continue  # Skip any vacant members.

            link = name.xpath('string(a/@href)')
            link = "http://www.azleg.gov" + link
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()

            linkpage = self.get(link).text
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[@name='memberphoto']")

            if len(photos) != 1:
                raise Exception

            photo_url = photos[0].attrib['src']

            district = district.text_content()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if ('Vacated' in email or 'Resigned' in email or 
                'Removed' in email):
                # comment out the following 'continue' for historical
                # legislative sessions
                # for the current session, if a legislator has left we will
                # skip him/her to keep from overwriting their information
                continue
                vacated = re.search(r'[0-9]*/[0-9]*/\d{4}', email).group()
                email = ''

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = address + "1700 West Washington\n Room " + room  \
                              + "\nPhoenix, AZ 85007"

            phone = phone.text_content().strip()
            if not phone.startswith('602'):
                phone = "602-" + phone
            fax = fax.text_content().strip()
            if not fax.startswith('602'):
                fax = "602-" + fax
            if vacated:
                end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                leg = Legislator(term, chamber, district, full_name=name,
                                 party=party, url=link)
                leg['roles'][0]['end_date'] = end_date
            else:
                leg = Legislator(term, chamber, district, full_name=name,
                                 party=party, url=link,
                                 photo_url=photo_url)

            leg.add_office('capitol', 'Capitol Office', address=address,
                           phone=phone, fax=fax,  email=email)

            if position:
                leg.add_role(position, term, chamber=chamber,
                             district=district, party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            self.save_legislator(leg)
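self.get_party() normalizes the roster's party column before the Legislator is built. A minimal sketch, assuming the roster uses single-letter codes:

    def get_party(self, abbr):
        # Hypothetical mapping; extend it if the roster uses other codes.
        return {'R': 'Republican', 'D': 'Democratic',
                'I': 'Independent', 'L': 'Libertarian'}.get(abbr, abbr)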
Beispiel #47
0
    def scrape(self, chamber, term):
        """
        Scrapes legislators for the current term only
        """
        self.validate_term(term, latest_only=True)
        url = _BASE_URL % _CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

            name = inner.xpath('p/strong')[0].text.replace(u'\xa0',
                                                           ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = _PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')
            leg_url = inner.xpath('p/a/@href')[0]

            address = home_phone = office_phone = fax = None

            for br in inner.xpath('p/br'):
                piece = br.tail or ''
                piece = piece.strip()

                if re.findall(r', \d{5}', piece):
                    address = re.sub(r'(\d{5})', r'ID \1', piece).strip()
                elif piece.startswith('Home '):
                    home_phone = piece[5:]
                elif piece.startswith('Bus '):
                    office_phone = piece[4:]
                elif piece.startswith('FAX '):
                    fax = piece[4:]
                print(piece)

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             email=email)

            phone = home_phone or office_phone
            leg.add_office('district',
                           'District Office',
                           address=address,
                           fax=fax,
                           phone=phone)

            leg.add_source(url)
            leg['photo_url'] = img_url
            leg['url'] = leg_url

            for com in inner.xpath('p/a[contains(@href, "committees")]'):
                role = com.tail.strip()
                if not role:
                    role = 'member'
                leg.add_role('committee member',
                             term=term,
                             chamber=chamber,
                             committee=com.text,
                             position=role)

            self.save_legislator(leg)
Beispiel #48
0
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)
        session_id = self.metadata['session_details'][term]['number']

        if chamber == 'upper':
            chamber_name = 'senate'
        else:
            chamber_name = 'house'

        url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@class="legis"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
            name = link.text.strip()
            leg_url = link.get('href')
            district = link.xpath("string(../../td[2])")
            party = link.xpath("string(../../td[3])")
            email = link.xpath("string(../../td[5])")

            if party == 'Democrat':
                party = 'Democratic'

            pid = re.search(r"PID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                         "?GA=%s&PID=%s" % (session_id, pid))

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email, photo_url=photo_url, url=url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

            office_data = {
                "email": "ctl00_cphMainContent_divEmailLegis",
                "home_phone": "ctl00_cphMainContent_divPhoneHome",
                "home_addr": "ctl00_cphMainContent_divAddrHome",
                "office_phone": "ctl00_cphMainContent_divPhoneCapitol",
            }
            metainf = {}

            for attr in office_data:
                path = office_data[attr]
                info = leg_page.xpath("//div[@id='%s']" % path)
                if len(info) != 1:
                    continue
                info = info[0]

                _, data = [x.text_content() for x in info.xpath("./span")]
                data = data.strip()
                if data == "":
                    continue

                metainf[attr] = data

            if "home_phone" in metainf or "home_addr" in metainf:
                home_args = {}
                if "home_phone" in metainf:
                    home_args['phone'] = metainf['home_phone']
                if "home_addr" in metainf:
                    home_args['address'] = metainf['home_addr']
                leg.add_office('district',
                               'Home Office',
                               **home_args)

            if "email" in metainf or "office_phone" in metainf:
                cap_args = {}

                if "email" in metainf:
                    cap_args['email'] = metainf['email']
                if "office_phone" in metainf:
                    cap_args['phone'] = metainf['office_phone']

                leg.add_office('capitol',
                               'Capitol Office',
                               **cap_args)


            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    sub = re.match('^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
Beispiel #49
0
    def scrape(self, chamber, term):
        # CSS isn't there without this, it serves up a mobile version
        self.user_agent = 'Mozilla/5.0'

        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/member.php?chamber=H'
        else:
            url = 'http://www.scstatehouse.gov/member.php?chamber=S'

        data = self.urlopen(url)
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[contains(@href, "code=")]'):
            full_name = a.text
            leg_url = a.get('href')

            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px; margin: 0 0 0 0; padding: 0;"]/text()'
            )
            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]

            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    full_name,
                                    party=party,
                                    photo_url=photo_url,
                                    url=leg_url)
            # office address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left; width: 225px; margin: 10px 5px 0 20px; padding: 0;"]'
                )[0]
                addr = addr_div.xpath(
                    'p[@style="font-size: 13px; margin: 0 0 10px 0; padding: 0;"]'
                )[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px; margin: 0 0 0 0; padding: 0;"]/text()'
                )[0]
                phone = phone.strip()
                legislator.add_office('capitol',
                                      'Columbia Address',
                                      address=addr,
                                      phone=phone)
            except IndexError:
                self.warning('no address for {0}'.format(full_name))

            legislator.add_source(leg_url)
            legislator.add_source(url)

            # committees (skip first link)
            for com in leg_doc.xpath(
                    '//a[contains(@href, "committee.php")]')[1:]:
                if com.text.endswith(', '):
                    committee, role = com.text_content().rsplit(', ', 1)
                    # known roles
                    role = {
                        'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'
                    }[role]
                else:
                    committee = com.text
                    role = 'member'
                legislator.add_role('committee member',
                                    term=term,
                                    chamber=chamber,
                                    committee=committee,
                                    position=role)

            self.save_legislator(legislator)
Beispiel #50
0
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)
        session_id = self.metadata['session_details'][term]['number']

        if chamber == 'upper':
            chamber_name = 'senate'
        else:
            chamber_name = 'house'

        url = "https://www.legis.iowa.gov/legislators/%s" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@id="sortableTable"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator')]"):
            name = link.text.strip()
            leg_url = link.get('href')
            district = link.xpath("string(../../td[3])")
            party = link.xpath("string(../../td[4])")
            email = link.xpath("string(../../td[5])")

            if party == 'Democrat':
                party = 'Democratic'

            pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("https://www.legis.iowa.gov/photo"
                         "?action=getPhoto&ga=%s&pid=%s" % (session_id, pid))

            leg = Legislator(term, chamber, district, name, party=party,
                             photo_url=photo_url, url=url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

            office_data = {
                "Legislative Email:": "email",
                "Home Phone:": "home_phone",
                "Home Address:": "home_addr",
                "Capitol Phone:": "office_phone",
            }
            metainf = {}

            table, = leg_page.xpath(
                "//div[@class='legisIndent divideVert']/table"
            )
            for row in table.xpath(".//tr"):
                try:
                    key, value = (
                        x.text_content().strip() for x in row.xpath("./td")
                    )
                except ValueError:
                    continue

                try:
                    metainf[office_data[key]] = value
                except KeyError:
                    continue

            if "home_phone" in metainf or "home_addr" in metainf:
                home_args = {}
                if "home_phone" in metainf:
                    home_args['phone'] = metainf['home_phone']
                if "home_addr" in metainf:
                    home_args['address'] = metainf['home_addr']
                leg.add_office('district',
                               'Home Office',
                               **home_args)

            if "email" in metainf or "office_phone" in metainf:
                cap_args = {}

                if "email" in metainf:
                    cap_args['email'] = metainf['email']
                if "office_phone" in metainf:
                    cap_args['phone'] = metainf['office_phone']

                leg.add_office('capitol',
                               'Capitol Office',
                               **cap_args)


            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    sub = re.match('^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
Beispiel #51
0
    def scrape(self, chamber, term):
        # TODO: old AZ scraper allowed old sessions, they seem to be gone?
        self.validate_term(term, latest_only=True)

        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster/?body=' + body
        page = self.get(url).text

        # there is a bad comment closing tag on this page
        page = page.replace('--!>', '-->')

        root = html.fromstring(page)

        path = '//table//tr'
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            name, district, party, email, room, phone = row.xpath('td')

            if email.attrib.get('class') == 'vacantmember':
                continue  # Skip any vacant members.

            link = name.xpath('string(a/@href)')
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()
            if '--' in name:
                name = name.split('--')[0].strip()

            linkpage = self.get(link).text
            linkpage = linkpage.replace('--!>', '-->')
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

            if len(photos) != 1:
                self.warning('no photo on ' + link)
                photo_url = ''
            else:
                photo_url = photos[0].attrib['src']

            district = district.text_content()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if email.startswith('Email: '):
                email = email.replace('Email: ', '').lower() + '@azleg.gov'
            else:
                email = ''

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = address + "1700 West Washington\n Room " + room  \
                              + "\nPhoenix, AZ 85007"

            phone = phone.text_content().strip()
            if '602' not in re.findall(r'(\d+)', phone):
                phone = "602-" + phone

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name=name,
                             party=party,
                             url=link,
                             photo_url=photo_url)

            leg.add_office('capitol',
                           'Capitol Office',
                           address=address,
                           phone=phone,
                           email=email)

            if position:
                leg.add_role(position,
                             term,
                             chamber=chamber,
                             district=district,
                             party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            self.save_legislator(leg)
Beispiel #52
0
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)
        session_id = self.metadata['session_details'][term]['number']

        if chamber == 'upper':
            chamber_name = 'senate'
        else:
            chamber_name = 'house'

        url = "https://www.legis.iowa.gov/legislators/%s" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@id="sortableTable"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator')]"):
            name = link.text.strip()
            leg_url = link.get('href')
            district = link.xpath("string(../../td[3])")
            party = link.xpath("string(../../td[4])")
            email = link.xpath("string(../../td[5])")

            if party == 'Democrat':
                party = 'Democratic'

            pid = re.search(r"PID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                         "?GA=%s&PID=%s" % (session_id, pid))

            leg = Legislator(term, chamber, district, name, party=party,
                             photo_url=photo_url, url=url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

            office_data = {
                "email": "ctl00_cphMainContent_divEmailLegis",
                "home_phone": "ctl00_cphMainContent_divPhoneHome",
                "home_addr": "ctl00_cphMainContent_divAddrHome",
                "office_phone": "ctl00_cphMainContent_divPhoneCapitol",
            }
            metainf = {}

            for attr in office_data:
                path = office_data[attr]
                info = leg_page.xpath("//div[@id='%s']" % path)
                if len(info) != 1:
                    continue
                info = info[0]

                _, data = [x.text_content() for x in info.xpath("./span")]
                data = data.strip()
                if data == "":
                    continue

                metainf[attr] = data

            if "home_phone" in metainf or "home_addr" in metainf:
                home_args = {}
                if "home_phone" in metainf:
                    home_args['phone'] = metainf['home_phone']
                if "home_addr" in metainf:
                    home_args['address'] = metainf['home_addr']
                leg.add_office('district',
                               'Home Office',
                               **home_args)

            if "email" in metainf or "office_phone" in metainf:
                cap_args = {}

                if "email" in metainf:
                    cap_args['email'] = metainf['email']
                if "office_phone" in metainf:
                    cap_args['phone'] = metainf['office_phone']

                leg.add_office('capitol',
                               'Capitol Office',
                               **cap_args)


            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    sub = re.match('^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
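
A quick way to sanity-check the trailing-parenthesis convention used for committee roles above is to pull it into a standalone helper. A minimal sketch (split_committee_role is a hypothetical name and the committee strings are made up):

import re

def split_committee_role(comm):
    # Split a trailing '(Role)' off a committee name; default to plain member.
    match = re.search(r'\((.+)\)$', comm)
    if match:
        return re.sub(r'\((.+)\)$', '', comm).strip(), match.group(1).lower()
    return comm, 'member'

assert split_committee_role('Ways and Means (Chair)') == ('Ways and Means', 'chair')
assert split_committee_role('Education') == ('Education', 'member')
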
Example #53
    def scrape(self, chamber, term):
        # Without a browser-like User-Agent the site serves up a CSS-less mobile version
        self.user_agent = 'Mozilla/5.0'

        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/member.php?chamber=H'
        else:
            url = 'http://www.scstatehouse.gov/member.php?chamber=S'

        data = self.urlopen(url)
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[contains(@href, "code=")]'):
            full_name = a.text
            leg_url = a.get('href')

            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px; margin: 0 0 0 0; padding: 0;"]/text()')
            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]

            photo_url = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')[0]

            legislator = Legislator(term, chamber, district, full_name,
                                    party=party, photo_url=photo_url,
                                    url=leg_url)
            # office address / phone
            try:
                addr_div = leg_doc.xpath('//div[@style="float: left; width: 225px; margin: 10px 5px 0 20px; padding: 0;"]')[0]
                addr = addr_div.xpath('p[@style="font-size: 13px; margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath('p[@style="font-size: 13px; margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                legislator.add_office('capitol', 'Columbia Address',
                                      address=addr, phone=phone)
            except IndexError:
                self.warning('no address for {0}'.format(full_name))

            legislator.add_source(leg_url)
            legislator.add_source(url)

            # committees (skip first link)
            for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
                if com.text.endswith(', '):
                    committee, role = com.text_content().rsplit(', ', 1)
                    # known roles
                    role = {'Treas.': 'treasurer',
                            'Secy.': 'secretary',
                            'Secy./Treas.': 'secretary/treasurer',
                            'V.C.': 'vice-chair',
                            '1st V.C.': 'first vice-chair',
                            '2nd V.C.': 'second vice-chair',
                            '3rd V.C.': 'third vice-chair',
                            'Ex.Officio Member': 'ex-officio member',
                            'Chairman': 'chairman'}[role]
                else:
                    committee = com.text
                    role = 'member'
                legislator.add_role('committee member', term=term,
                                    chamber=chamber, committee=committee,
                                    position=role)

            self.save_legislator(legislator)
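
The hard dictionary lookup for role abbreviations above fails fast: a new abbreviation on the site raises KeyError immediately. A defensive variant (normalize_role is a hypothetical helper, not part of the scraper) would log and fall back to plain membership instead:

import logging

KNOWN_ROLES = {'Treas.': 'treasurer', 'Secy.': 'secretary',
               'Secy./Treas.': 'secretary/treasurer', 'V.C.': 'vice-chair',
               '1st V.C.': 'first vice-chair', '2nd V.C.': 'second vice-chair',
               '3rd V.C.': 'third vice-chair',
               'Ex.Officio Member': 'ex-officio member', 'Chairman': 'chairman'}

def normalize_role(abbrev):
    # Degrade unknown abbreviations to 'member' instead of raising;
    # the warning keeps some of the fail-fast signal.
    role = KNOWN_ROLES.get(abbrev)
    if role is None:
        logging.warning('unknown committee role %r, defaulting to member', abbrev)
        role = 'member'
    return role

assert normalize_role('V.C.') == 'vice-chair'
assert normalize_role('Hon.') == 'member'

Which behavior is right depends on whether a silently demoted chairman is worse than an aborted scrape; the original opts for the latter.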
Example #54
    def scrape(self, term, chambers):
        special_case_used = False

        url = 'http://gencourt.state.nh.us/downloads/Members.txt'

        option_map = {}
        html = self.get(
            'http://www.gencourt.state.nh.us/house/members/memberlookup.aspx'
        ).text
        doc = lxml.html.fromstring(html)
        for opt in doc.xpath('//option'):
            option_map[opt.text] = opt.get('value')

        data = self.get(url).text
        for line in data.splitlines():
            if line.strip() == "":
                continue

            (chamber, fullname, last, first, middle, county, district_num,
             seat, party, street, street2, city, astate, zipcode, home_phone,
             office_phone, fax, email, com1, com2, com3, com4, com5,
             com6) = line.split('\t')

            # chamber_map is a module-level lookup, presumably
            # {'H': 'lower', 'S': 'upper'}
            chamber = chamber_map[chamber]

            # skip legislators from a chamber we aren't scraping
            if chamber not in chambers:
                continue

            middle = middle.strip()
            last = last.strip('"')

            if middle:
                full = '%s %s %s' % (first, middle, last)
            else:
                full = '%s %s' % (first, last)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            district = str(int(district_num))
            if county:
                district = '%s %s' % (county, district)

            # When a candidate receives enough write-in votes in the
            # other party's primary, they are listed on the ballot as
            # being a nominee of both parties (eg, 'd+r')
            # Cross-reference this list for official party affiliation:
            # http://www.gencourt.state.nh.us/House/caljourns/journals/2015/HJ_4.pdf

            leg = Legislator(term,
                             chamber,
                             district,
                             full,
                             party=party_map[party])
            leg.add_office('district',
                           'Home Address',
                           address=address,
                           phone=home_phone or None)
            leg.add_office('district',
                           'Office Address',
                           phone=office_phone or None,
                           fax=fax or None,
                           email=email or None)

            if chamber == 'upper':
                leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(
                    district_num)
            elif chamber == 'lower':
                code = option_map.get('{0}, {1}'.format(last, first))
                if code:
                    leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code

            romans = r'(?i)\s([IXV]+)(?:\s|$)'
            for com in (com1, com2, com3, com4, com5, com6):
                com = com.strip('"')
                if com:
                    com_name = com.title()
                    com_name = re.sub(romans, lambda m: m.group().upper(),
                                      com_name)
                    leg.add_role('committee member',
                                 term=term,
                                 chamber=chamber,
                                 committee=com_name)

            if 'url' in leg:
                leg['photo_url'] = self.get_photo(leg['url'], chamber)

            leg.add_source(url)
            self.save_legislator(leg)
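
The romans regex above compensates for str.title(), which downcases Roman numerals ('FINANCE DIVISION II' becomes 'Finance Division Ii'). A minimal replay of that transformation (titlecase_committee is a hypothetical name; the committee string is made up):

import re

ROMANS = r'(?i)\s([IXV]+)(?:\s|$)'

def titlecase_committee(raw):
    # Title-case first, then restore any whitespace-delimited Roman numeral.
    name = raw.title()
    return re.sub(ROMANS, lambda m: m.group().upper(), name)

assert titlecase_committee('FINANCE DIVISION II') == 'Finance Division II'
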
Example #55
    def scrape(self, chamber, term):
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod(session)

        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
            session_id, body)
        page = self.get(url).text
        root = html.fromstring(page)
        path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            vacated = ''
            name, district, party, email, room, phone, fax = row.xpath('td')

            if email.attrib.get('class') == 'vacantmember':
                continue  # Skip any vacant members.

            link = name.xpath('string(a/@href)')
            link = "http://www.azleg.gov" + link
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()

            linkpage = self.get(link).text
            linkroot = html.fromstring(linkpage)
            linkroot.make_links_absolute(link)

            photos = linkroot.xpath("//img[@name='memberphoto']")

            if len(photos) != 1:
                raise Exception("expected exactly one member photo, got %d"
                                % len(photos))

            photo_url = photos[0].attrib['src']

            district = district.text_content()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if ('Vacated' in email or 'Resigned' in email
                    or 'Removed' in email):
                # Comment out the following 'continue' when scraping
                # historical sessions; for the current session we skip
                # legislators who have left so we don't overwrite their
                # information.
                continue
                vacated = re.search(r'\d{1,2}/\d{1,2}/\d{4}', email).group()
                email = ''

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address += ("1700 West Washington\nRoom " + room +
                        "\nPhoenix, AZ 85007")

            phone = phone.text_content().strip()
            if not phone.startswith('602'):
                phone = "602-" + phone
            fax = fax.text_content().strip()
            if not fax.startswith('602'):
                fax = "602-" + fax
            if vacated:
                end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name=name,
                                 party=party,
                                 url=link)
                leg['roles'][0]['end_date'] = end_date
            else:
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name=name,
                                 party=party,
                                 email=email,
                                 url=link,
                                 photo_url=photo_url)

            leg.add_office('capitol',
                           'Capitol Office',
                           address=address,
                           phone=phone,
                           fax=fax)

            if position:
                leg.add_role(position,
                             term,
                             chamber=chamber,
                             district=district,
                             party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            self.save_legislator(leg)
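
Two of the normalizations above restate cleanly as small helpers. A sketch under the same assumptions as the scraper (parse_vacated_date and add_area_code are hypothetical names, the date regex is tightened to require one- or two-digit fields, and the sample values are made up):

import datetime
import re

def parse_vacated_date(cell_text):
    # Pull an m/d/yyyy date such as 'Vacated 3/15/2013' out of the email cell.
    match = re.search(r'\d{1,2}/\d{1,2}/\d{4}', cell_text)
    return datetime.datetime.strptime(match.group(), '%m/%d/%Y') if match else None

def add_area_code(number, area='602'):
    # Roster listings sometimes omit the Phoenix area code.
    return number if number.startswith(area) else '%s-%s' % (area, number)

assert parse_vacated_date('Vacated 3/15/2013') == datetime.datetime(2013, 3, 15)
assert parse_vacated_date('someone@azleg.gov') is None
assert add_area_code('926-4129') == '602-926-4129'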