Beispiel #1
0
    def scrape_member(self, chamber, year, member_url):
        """Scrape one Kentucky legislator's detail page and save the record.

        Pulls the name, party, district, photo and Frankfort office
        address/phone from the member page at *member_url*.
        """
        doc = lxml.html.fromstring(self.urlopen(member_url))

        photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]

        # The name span reads like "Rep. First Last (R)": drop the leading
        # title token and the trailing party token.
        pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
        full_name = ' '.join(pieces[1:-1]).strip()

        # Expand the party abbreviation; an unrecognized token is kept as-is,
        # matching the original if/elif chain which left it untouched.
        party = {'(R)': 'Republican',
                 '(D)': 'Democratic',
                 '(I)': 'Independent'}.get(pieces[-1], pieces[-1])

        district = doc.xpath(
            '//span[@id="districtHeader"]/text()')[0].split()[-1]

        leg = Legislator(year, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url)
        leg.add_source(member_url)

        address = '\n'.join(doc.xpath(
            '//div[@id="FrankfortAddresses"]//span[@class="bioText"]/text()'))

        # Keep the last phone number labelled "Annex: ", if any.
        phone = None
        for num in doc.xpath(
                '//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()'):
            if num.startswith('Annex: '):
                phone = num.replace('Annex: ', '')

        leg.add_office('capitol', 'Capitol Office', address=address,
                       phone=phone)

        self.save_legislator(leg)
Beispiel #2
0
    def scrape_senators(self, chamber, term):
        """Scrape the Ohio Senate directory and save each senator.

        BUG FIX: the e-mail address scraped from the page was previously
        discarded and ``email=""`` passed to the Legislator; the scraped
        value is now stored.
        """
        url = 'http://www.ohiosenate.gov/directory.html'
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for el in page.xpath('//table[@class="fullWidth"]/tr/td'):
                sen_link = el.xpath('a[@class="senatorLN"]')[1]
                sen_url = sen_link.get('href')

                # The link text carries a two-character suffix; drop it.
                full_name = sen_link.text[0:-2]
                if full_name == 'To Be Announced':
                    continue

                district = el.xpath('string(h3)').split()[1]

                party = el.xpath('string(a[@class="senatorLN"]/span)')
                if party == "D":
                    party = "Democratic"
                elif party == "R":
                    party = "Republican"

                # The phone number is the tail text after the <b>Phone</b> label.
                office_phone = el.xpath("b[text() = 'Phone']")[0].tail
                office_phone = office_phone.strip(' :')

                # Address lines are the cell's loose text nodes between the
                # header and the contact labels.
                office = ", ".join(x.strip()
                                   for x in el.xpath("./text()")[2:-1])

                photo_url = el.xpath("a/img")[0].attrib['src']
                email = el.xpath('.//span[@class="tan"]/text()')[1]

                leg = Legislator(term, chamber, district, full_name,
                                 party=party, photo_url=photo_url,
                                 url=sen_url,
                                 # BUG FIX: was email="" — use scraped value
                                 email=email)

                leg.add_office('capitol',
                               'Capitol Office',
                               address=office,
                               phone=office_phone)

                leg.add_source(url)
                leg.add_source(sen_url)

                for committee in self.scrape_senate_committees(sen_url):
                    # Joint committees get their own chamber tag.
                    chmbr = chamber
                    if "joint" in committee['committee'].lower():
                        chmbr = "joint"

                    leg.add_role('committee member',
                                 term=term,
                                 chamber=chmbr,
                                 committee=committee['committee'],
                                 position=committee['title'])

                self.save_legislator(leg)
Beispiel #3
0
    def scrape_rep_info(self, url, term):
        """Scrape the Alabama House roster and save each representative.

        Each rep is matched to a sponsor id where possible so their photo
        and committee memberships can be scraped from the sponsor page.

        Fixes: removed dead code (`ln`/`fn`/`last_fi_key` were computed but
        never used) and hoisted the party-name table out of the loop.
        """
        district_to_sponsor_id = self.get_sponsor_ids()

        # Party abbreviation -> canonical name (hoisted out of the loop).
        PARTIES = {
            'R': "Republican",
            'D': "Democratic",
        }

        # get reps
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        reps = page.xpath("//table[contains(@id,'HseMainContent_tabByName_TabPanel')]//tr")
        for rep in reps:
            # get basic rep info; skip header/spacer rows with no cells
            info = rep.xpath(".//td")
            if len(info) == 0:
                continue
            rep_name, party, district, suite, phone = \
                [i.text_content() for i in info]
            district = district.replace("House District", "").strip()
            office_address = '{}\n11 S. Union Street\nMontgomery, AL 36130'.format(suite)

            # Names come as "Last, First"; reorder to "First Last".
            assert rep_name.count(",") == 1, "Unable to parse representative's name: {}".format(rep_name)
            full_name_parts = [x.strip() for x in rep_name.split(",")]
            full_name = "{0} {1}".format(full_name_parts[1], full_name_parts[0])

            party = PARTIES[party.strip()]

            # add basic leg info and main office
            leg = Legislator(term,
                             "lower",
                             district,
                             full_name,
                             party=party)
            leg.add_office('capitol',
                           'Capitol Office',
                           address=office_address,
                           phone=phone.strip())

            leg.add_source(url)

            try:
                sponsor_id = district_to_sponsor_id[district]
            except KeyError:
                # can't find rep's sponsor_id, do what we can and get out!
                self.logger.warning("Legislator {name} does not match any sponsor_id and thus will not be linked to bills or committees".format(name=rep_name))
                self.save_legislator(leg)
                continue

            # scrape rep's additional info from sponsor page
            rep_sponsor_url = "http://www.legislature.state.al.us/aliswww/Representative.aspx?OID_SPONSOR={}".format(sponsor_id)
            rep_html = self.get(rep_sponsor_url).text
            rep_page = lxml.html.fromstring(rep_html)

            leg["photo_url"] = rep_page.xpath("//input[contains(@id,'imgLEG')]/@src")[0]
            self.add_committees(rep_page, leg, "lower", term)
            leg.add_source(rep_sponsor_url)
            self.save_legislator(leg)
Beispiel #4
0
    def get_member(self, term, chamber, kpid):
        """Fetch one member record from the Kansas legislature API, grab the
        profile photo from the website, and save the Legislator."""
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        # The API says "Democrat"; our convention is "Democratic".
        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        term_slugs = {'2013-2014': 'b2013_14',
                      '2015-2016': 'b2015_16'}
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
            term_slugs[term], kpid)
        legislator_page = self.lxmlize(leg_url)
        # Exactly one profile picture is expected on the page.
        (photo_url, ) = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')

        legislator = Legislator(term, chamber, str(content['DISTRICT']),
                                content['FULLNAME'],
                                email=content['EMAIL'], party=party,
                                url=leg_url, photo_url=photo_url,
                                occupation=content['OCCUPATION'])

        address = '\n'.join(['Room %s' % content['OFFICENUM'],
                             'Kansas State Capitol Building',
                             '300 SW 10th St.',
                             'Topeka, KS 66612'])

        legislator.add_office('capitol', 'Capitol Office',
                              phone=content['OFFPH'] or None,
                              address=address)

        legislator.add_source(url)
        self.save_legislator(legislator)
Beispiel #5
0
    def scrape(self, chamber, session):
        """Walk the member directory for *chamber* and save each person."""
        directory_url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory(directory_url, chamber, session)

        # Directory maps district -> profile-page URL.
        for district, p_url in people_pages.items():
            metainf = self.process_person(p_url)

            person = Legislator(
                session,
                chamber,
                district,
                metainf["name"],
                party=metainf["party"],
                # some additional things the website provides:
                occupation=metainf["occupation"],
                photo_url=metainf["photo_url"],
                url=metainf["homepage"],
            )
            if "email" in metainf:
                person["email"] = metainf["email"]
            if "number" in metainf:
                person.add_office(
                    "capitol",
                    "Capitol Office",
                    phone=metainf["number"],
                    address="200 E. Colfax\nDenver, CO 80203",
                )

            person.add_source(p_url)
            self.save_legislator(person)
Beispiel #6
0
    def scrape_senators(self, chamber, session, term):
        """Scrape the Missouri Senate roster plus per-senator detail pages.

        For each senator the roster row supplies name/party/district/phone,
        the details page supplies the photo, and the address page supplies
        the office address and e-mail.

        BUG FIX: an unrecognized party letter previously left ``party``
        unbound and crashed later with a NameError; it now raises a clear
        ValueError immediately.
        """
        url = self.senator_url % (session[2:])
        root_url = url
        page = lxml.html.fromstring(self.urlopen(url))
        table = page.xpath('//*[@id="mainContent"]/table//table/tr')
        for rowcount, tr in enumerate(table, 1):
            # NOTE(review): the original comment claimed the first *two*
            # rows are headers, but only row 1 is actually skipped here —
            # behavior preserved; confirm against the live page.
            if rowcount < 2:
                continue
            tds = tr.xpath('td')
            full_name = tds[0].xpath('div/a')[0].text_content().strip()
            # Second cell looks like "D-14" / "R-3".
            party_and_district = tds[1].xpath(
                'div')[0].text_content().strip().split('-')
            try:
                party = {'D': 'Democratic',
                         'R': 'Republican'}[party_and_district[0]]
            except KeyError:
                raise ValueError(
                    'unknown party: %s' % party_and_district[0])
            # e.g. "d14" — the site keys some URLs off this token.
            senator_key = "%s%s" % (party_and_district[0].lower(),
                                    party_and_district[1])
            district = party_and_district[1]
            phone = tds[3].xpath('div')[0].text_content().strip()
            url = self.senator_details_url % (session[2:], int(district))
            leg = Legislator(term, chamber, district, full_name,
                             party=party, url=url)
            leg.add_source(root_url)

            # Details page: only the photo is taken from it.
            details_page = self.urlopen(url)
            leg.add_source(url)
            page = lxml.html.fromstring(details_page)
            photo_url = page.xpath("//div[@id='container']/div[1]/img")
            photo_url = photo_url[0].attrib['src']

            # Address page: office address and (maybe) e-mail.
            url = self.senator_address_url % (
                session[2:], int(senator_key[1:]))
            details_page = self.urlopen(url)
            leg.add_source(url)
            page = lxml.html.fromstring(details_page)
            address = page.xpath(
                '/html/body//span[2]')[0].text_content().split('\n')
            email = page.xpath('/html/body/p/span[2]/a/@href')
            # TODO This is only true if the href doesn't contain 'mail_form'. If it does,
            # then there is only a webform. So...no email?
            # TODO a lot of these have fax numbers. Include?

            kwargs = {
                "address": "%s%s" % (address[0], address[1])
            }

            if phone.strip() != "":
                kwargs['phone'] = phone

            leg.add_office("capitol", "Capitol Office",
                           **kwargs)

            leg['photo_url'] = photo_url
            if email and len(email) > 0 and email[0] != 'mailto:':
                leg['email'] = email[0].split(':')[1]
            self.save_legislator(leg)
Beispiel #7
0
    def scrape(self, chamber, session):
        """Scrape the member directory for *chamber*, saving each legislator
        along with any committee memberships their profile lists."""
        listing_url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory(listing_url, chamber, session)

        for district in people_pages:
            p_url = people_pages[district]
            metainf = self.process_person(p_url)

            leg = Legislator(session, chamber, district, metainf['name'],
                             party=metainf['party'],
                             # some additional things the website provides:
                             occupation=metainf['occupation'],
                             photo_url=metainf['photo_url'],
                             url=metainf['homepage'])
            if "email" in metainf:
                leg['email'] = metainf['email']
            if "number" in metainf:
                leg.add_office('capitol', 'Capitol Office',
                               phone=metainf['number'],
                               address='200 E. Colfax\nDenver, CO 80203')

            leg.add_source(p_url)

            for ctty in metainf.get('ctty', []):
                leg.add_role('committee member',
                             term=session,
                             chamber=chamber,
                             committee=clean_committee(ctty),
                             position="member")
            self.save_legislator(leg)
Beispiel #8
0
    def get_member(self, term, chamber, kpid):
        """Fetch one member from the Kansas legislature API and save them.

        BUG FIX: the OFFICENUM branches were inverted — a member *with* an
        office number got the generic Docking-building address, while one
        *without* got ``'Room %s' % <empty>``.  The room address is now
        used when an office number is present.
        """
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.urlopen(url))['content']

        # The API says "Democrat"; our convention is "Democratic".
        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        slug = {'2013-2014': 'b2013_14'}[term]
        leg_url = '%s/%s/members/%s/' % (LI, slug, kpid)
        photo_url = '%s/m/images/pics/%s.jpg' % (LI, kpid)

        legislator = Legislator(term, chamber, str(content['DISTRICT']),
                                content['FULLNAME'], email=content['EMAIL'],
                                party=party, url=leg_url, photo_url=photo_url,
                                occupation=content['OCCUPATION'])

        # just do office address for now, can get others from api
        if content['OFFICENUM']:
            address = ('Room %s\n'
                       'Kansas State Capitol Building\n'
                       '300 SW 10th St.\n'
                       'Topeka, KS 66612') % content['OFFICENUM']
        else:
            address = ('Kansas House of Representatives\n'
                       'Docking State Office Building\n'
                       '901 SW Harrison Street\n'
                       'Topeka, KS 66612')
        legislator.add_office('capitol', 'Capitol Office',
                              phone=content['OFFPH'] or None,
                              address=address)

        legislator.add_source(url)
        self.save_legislator(legislator)
Beispiel #9
0
    def scrape(self, term, chambers):
        """Pull representatives from the Represent (Open North) API."""
        represent_url = ('http://represent.opennorth.ca/representatives/'
                         '%s/?limit=500' % self.representative_set)
        data = json.load(urllib2.urlopen(represent_url))
        for rep in data['objects']:
            leg = Legislator(term, 'lower', rep['district_name'],
                             rep['name'],
                             party=rep.get('party_name'),
                             photo_url=rep.get('photo_url'),
                             url=rep.get('url'),
                             email=rep.get('email'))
            leg.add_source(rep['source_url'])

            for rep_office in rep.get('offices', []):
                # Prefer the first line of the postal address as the office
                # name; otherwise derive one from the office type.
                office_name = rep_office.get('postal', '').split('\n')[0]
                if not office_name:
                    office_name = (rep_office.get('type', '').title() +
                                   ' office').strip()
                office_kind = ('capitol'
                               if rep_office.get('type') == 'legislature'
                               else 'district')
                leg.add_office(office_kind,
                               office_name,
                               phone=rep_office.get('tel'),
                               fax=rep_office.get('fax'),
                               address=rep_office.get('postal'))
            self.save_legislator(leg)
Beispiel #10
0
    def scrape_upper(self, term):
        """Scrape the Utah Senate roster and each senator's photo page.

        Fixes: removed a dead re-assignment of ``numbers`` that was never
        used, and narrowed a bare ``except:`` (which also swallowed
        KeyboardInterrupt/SystemExit) to ``except Exception``.
        """
        url = 'http://www.utahsenate.org/aspx/roster.aspx'
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//tr')[1:]:
            tds = row.xpath('td')

            # 1st has district
            district = tds[0].text_content()

            # 3rd has name and email
            person = tds[2].xpath('span[@class="person"]')[0]
            if '(D)' in person.text_content():
                party = 'Democratic'
            elif '(R)' in person.text_content():
                party = 'Republican'
            else:
                raise ValueError('unknown party')
            a = person.xpath('a')[0]
            name = a.text_content()
            leg_url = a.get('href')
            email = tds[2].xpath('span[@class="email"]/a/text()')
            if email:
                email = email[0]
            else:
                email = ''

            # office address
            # text is split by br in 4th td, join with a space
            address = ' '.join(tds[3].xpath('font/text()'))
            numbers = tds[4].xpath('text()')
            phone = None
            fax = None
            for num in numbers:
                # entries look like "Cell\xa0555-1234"; keep the first
                # Cell/Home/Work entry as the phone number
                if num.startswith(('Cell', 'Home', 'Work')) and not phone:
                    phone = num.split(u'\xa0')[-1]
                elif num.startswith('Fax'):
                    fax = num.split(u'\xa0')[-1]

            # get photo; the roster is still saved if the page fails
            try:
                leg_html = self.urlopen(leg_url)
                leg_doc = lxml.html.fromstring(leg_html)
                leg_doc.make_links_absolute(leg_url)
                photo_url = leg_doc.xpath('//p[@class="photo"]/img/@src')[0]
            except Exception:
                self.warning('could not fetch %s' % leg_url)
                photo_url = ''

            leg = Legislator(term, 'upper', district, name,
                             party=party, email=email, address=address,
                             photo_url=photo_url, url=leg_url)
            leg.add_office('district', 'Home', address=address, phone=phone,
                           fax=fax)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Beispiel #11
0
    def scrape_senate(self, term):
        """Scrape the Puerto Rico Senate (at-large page plus 8 districts).

        BUG FIX: the photo-probe line previously ended with a stray colon
        (``... = self.urlopen(picture_filename):``), a syntax error; the
        unused assignment is gone and the probe is a plain call.
        """
        urls = (
         'http://www.senadopr.us/senadores/Pages/Senadores%20Acumulacion.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx')

        for counter, url in enumerate(urls):
            leg_page_html = self.urlopen(url)
            doc = lxml.html.fromstring(leg_page_html)
            doc.make_links_absolute(url)
            table = doc.xpath('//table[@summary="Listado de Senadores"]')[0]

            # skip first row (headers)
            for row in table.xpath('tr')[1:]:
                tds = row.xpath('td')

                name = tds[0].text_content().title().replace('Hon.', '', 1).strip()
                party = tds[1].text_content()
                phone = tds[2].text_content()
                email = tds[3].text_content()
                # shapefiles denote 0 as At-Large Districts
                if counter == 0:
                    district = 'At-Large'
                else:
                    district = str(counter)

                # Code to guess the picture filename
                namefixed = unicode(name.replace(".", ". "))  # Those middle names abbreviations are sometimes weird.
                namefixed = unicodedata.normalize('NFKD', namefixed).encode('ascii', 'ignore')  # Remove the accents
                nameparts = namefixed.split()
                if nameparts[1].endswith('.'):
                    lastname = nameparts[2]
                else:
                    lastname = nameparts[1]

                # Construct the photo url
                picture_filename = 'http://www.senadopr.us/Fotos%20Senadores/sen_' + (nameparts[0][0] + lastname).lower() + '.jpg'

                try:
                    # Probe to see if the file is there; 404 raises below.
                    self.urlopen(picture_filename)
                    leg = Legislator(term, 'upper', district, name,
                                     party=party,
                                     email=email, url=url,
                                     photo_url=picture_filename)
                except scrapelib.HTTPError:
                    # If not, leave out the photo_url
                    leg = Legislator(term, 'upper', district, name,
                                     party=party, phone=phone, email=email,
                                     url=url)

                leg.add_office('capitol', 'Oficina del Capitolio',
                               phone=phone)
                leg.add_source(url)
                self.save_legislator(leg)
Beispiel #12
0
    def scrape(self, chamber, term):
        """Scrape Wyoming legislators for one chamber.

        BUG FIXES:
        * the first e-mail extraction (``td[1]``) was dead code that was
          immediately overwritten by the ``td[2]`` lookup; removed.
        * ``str.strip('Home - ')`` strips a *character set*, not a prefix,
          and could eat leading/trailing dashes or letters of the number;
          labels are now removed by splitting on the first dash.
        """
        chamber_abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        url = ("http://legisweb.state.wy.us/LegislatorSummary/LegislatorList"
               ".aspx?strHouse=%s&strStatus=N" % chamber_abbrev)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'LegDetail')]"):
            name = link.text.strip()
            leg_url = link.get('href')

            # e-mail lives in the second column as a Mailto: link
            email_address = link.xpath("../../../td[2]//a")[0].attrib['href']
            email_address = email_address.split('Mailto:')[1]

            party = link.xpath("string(../../../td[3])").strip()
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            # district codes look like "H05"/"S12"; keep the bare number
            district = link.xpath(
                "string(../../../td[4])").strip().lstrip('HS0')

            leg_page = lxml.html.fromstring(self.urlopen(leg_url))
            leg_page.make_links_absolute(leg_url)
            img = leg_page.xpath(
                "//img[contains(@src, 'LegislatorSummary/photos')]")[0]
            photo_url = img.attrib['src']

            # Contact table mixes address lines with labelled phone/fax
            # entries like "Home - 307-777-1234".
            office_tds = leg_page.xpath(
                '//table[@id="ctl00_cphContent_tblContact"]/tr/td/text()')
            address = []
            phone = None
            fax = None
            for td in office_tds:
                if td.startswith('Home -'):
                    phone = td.split('-', 1)[1].strip()
                # only use cell if home isn't present
                elif td.startswith('Cell -') and not phone:
                    phone = td.split('-', 1)[1].strip()
                elif td.startswith('Fax -'):
                    fax = td.split('-', 1)[1].strip()
                else:
                    address.append(td)

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email_address, photo_url=photo_url,
                             url=leg_url)

            adr = " ".join(address)
            if adr.strip() != "":
                leg.add_office('district', 'Contact Information',
                               address=adr, phone=phone, fax=fax)

            leg.add_source(url)
            leg.add_source(leg_url)

            self.save_legislator(leg)
Beispiel #13
0
    def scrape(self, chamber, session):
        """Scrape Hawaii legislator listings and committee roles."""
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        chamber_map = {"House": "lower", "Senate": "upper"}
        for leg in metainf:
            chamber = chamber_map[leg['chamber']]
            person = Legislator(session, chamber, leg['district'],
                                leg['name'],
                                party=leg['party'],
                                # some additional things the website provides:
                                photo_url=leg['image'],
                                url=leg['homepage'],
                                email=leg['email'])
            person.add_office('capitol', 'Capitol Office',
                              address=leg['addr'],
                              phone=leg['phone'],
                              fax=leg['fax'] or None)

            for source in leg['source']:
                person.add_source(source)

            try:
                for ctty in leg['ctty']:
                    # Joint committees are flagged by their name prefix.
                    if ctty['name'].startswith('Joint Legislative'):
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    person.add_role('committee member',
                                    term=session,
                                    chamber=ctty_chamber,
                                    committee=ctty['name'],
                                    position="member")
            except KeyError:
                self.log("XXX: Warning, %s has no scraped Commities" %
                         leg['name'])

            self.save_legislator(person)
Beispiel #14
0
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        """Fetch one Mississippi legislator's XML detail record and save it.

        :param leg_link: member-specific URL fragment; an empty value is
            tolerated only for vacant seats (name contains "Vacancy").
        Note: uses Python 2 ``except ..., e`` syntax.
        """
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.urlopen(url)
            # The endpoint returns XML; string() xpaths yield '' when the
            # element is absent, so every field below is a plain string.
            root = lxml.etree.fromstring(details_page.bytes)
            party = root.xpath('string(//PARTY)')
            district = root.xpath('string(//DISTRICT)')
            # photo path is relative to the member page's directory
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')
            bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)')
            cap_room = root.xpath('string(//CAP_ROOM)')

            # NOTE(review): any code other than 'D' — including blanks — is
            # recorded as Republican; confirm this is intended.
            if party == 'D':
                party = 'Democratic'
            else:
                party = 'Republican'

            leg = Legislator(term, chamber, district, leg_name,
                             party=party,
                             role=role,
                             org_info=org_info,
                             url=url,
                             photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            # The feed gives only the local-part; append the chamber's
            # domain to form a full address.
            if email_name.strip() != "":
                email = '%s@%s.ms.gov' % (email_name, {
                    "upper": "senate",
                    "lower": "house"
                }[chamber])
                kwargs['email'] = email

            if capital_phone != "":
                kwargs['phone'] = capital_phone

            # Prefix the capitol address with the room number when known.
            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol',
                           'Capitol Office',
                           **kwargs)

            self.save_legislator(leg)
        except scrapelib.HTTPError, e:
            # scrapelib raises on HTTP error statuses; log and skip member
            self.warning(str(e))
Beispiel #15
0
    def scrape(self, chamber, session):
        """Scrape the member directory for *chamber* and save every person."""
        listing_url = self.get_district_list(chamber, session)
        directory = self.scrape_directory(listing_url, chamber, session)

        for district in directory:
            p_url = directory[district]
            metainf = self.process_person(p_url)

            person = Legislator(session, chamber, district, metainf['name'],
                                party=metainf['party'],
                                # some additional things the website provides:
                                occupation=metainf['occupation'],
                                photo_url=metainf['photo_url'],
                                url=metainf['homepage'])

            # phone/email are optional; default to None when absent
            person.add_office('capitol', 'Capitol Office',
                              phone=metainf.get('number'),
                              address='200 E. Colfax\nDenver, CO 80203',
                              email=metainf.get('email'))

            person.add_source(p_url)
            self.save_legislator(person)
Beispiel #16
0
    def scrape_upper(self, chamber, term):
        """Scrape the Michigan Senate member list (rows 2-39 of the table)."""
        url = 'http://www.senate.michigan.gov/members/memberlist.htm'
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            for row in doc.xpath('//table[@width=550]/tr')[1:39]:
                # columns: party, district, member, phone, fax, office location
                cells = row.xpath('td')
                party = abbr[cells[0].xpath('text()')[0]]
                district = cells[1].xpath('a/text()')[0]
                leg_url = cells[2].xpath('a/@href')[0]
                name = cells[2].xpath('a/text()')[0]
                office_phone = cells[3].xpath('text()')[0]
                office_fax = cells[4].xpath('text()')[0]
                office_loc = cells[5].xpath('text()')[0]

                leg = Legislator(term=term, chamber=chamber,
                                 district=district,
                                 full_name=name,
                                 party=party,
                                 url=leg_url)
                leg.add_office('capitol', 'Capitol Office',
                               address=office_loc,
                               fax=office_fax,
                               phone=office_phone)

                leg.add_source(url)
                self.save_legislator(leg)
Beispiel #17
0
    def scrape(self, chamber, term):
        """Scrape Rhode Island legislators from the published Excel rosters."""
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator '
        elif chamber == 'lower':
            url = (
             'http://webserver.rilin.state.ri.us/Documents/Representatives.xls')
            rep_type = 'Representative '

        self.urlretrieve(url, 'ri_leg.xls')

        workbook = xlrd.open_workbook('ri_leg.xls')
        sheet = workbook.sheet_by_index(0)

        # party labels as they appear in the sheet -> canonical names
        party_map = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent",
        }

        for rownum in xrange(1, sheet.nrows):
            row = {}
            for field, col_num in excel_mapping.iteritems():
                row[field] = sheet.cell(rownum, col_num).value

            district_name = str(int(row['district']))
            # Cell text is like "Senator Jane Doe"; drop the title.
            full_name = re.sub(rep_type, '', row['full_name']).strip()

            leg = Legislator(term, chamber, district_name, full_name,
                             '', '', '',
                             party_map[row['party']],
                             town_represented=row['town_represented'],
                             email=row['email'])
            leg.add_office('district', 'Address', address=row['address'])
            leg.add_source(url)
            self.save_legislator(leg)
Beispiel #18
0
def test_legislator():
    """Exercise Legislator construction, add_role and add_office."""
    leg = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    expected = {
        '_type': 'person',
        'full_name': 'Adam Smith',
        'first_name': 'Adam',
        'last_name': 'Smith',
        'middle_name': '',
        'suffixes': '',
        'roles': [{'chamber': 'upper', 'term': 'T1',
                   'role': 'member', 'start_date': None,
                   'end_date': None, 'district': '1',
                   'party': ''}],
        'offices': [],
        'sources': [],
    }
    assert_equal(leg, expected)

    leg.add_role('committee member', 'T1', committee='Some Committee',
                 position='chairman')
    assert_equal(leg['roles'][1],
                 {'role': 'committee member', 'term': 'T1',
                  'start_date': None, 'end_date': None,
                  'committee': 'Some Committee',
                  'position': 'chairman'})

    leg.add_office('capitol', 'Statehouse Office', '123 Main St',
                   '123-456-7890', '123-555-5555', '*****@*****.**')
    assert_equal(leg['offices'],
                 [{'type': 'capitol',
                   'name': 'Statehouse Office',
                   'address': '123 Main St',
                   'phone': '123-456-7890',
                   'fax': '123-555-5555',
                   'email': '*****@*****.**'}])
def table_row_to_legislator_and_profile_url(table_row_element, chamber, term):
    """Derive a Legislator from an HTML table row lxml Element.

    Returns a (Legislator, profile_url) tuple, where profile_url points at
    the member's detail page.
    """
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element, party_element,
     phone_element, email_element) = td_elements

    full_name = name_element.text_content().strip()
    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    # normalize the party label to the form used project-wide
    if party == 'Democrat':
        party = 'Democratic'

    legislator = Legislator(term, chamber, district, full_name, party=party)

    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    legislator.add_office(
        'capitol',
        'Capitol Office',
        address=address,
        phone=phone,
        email=email,
    )

    # exactly one profile link is expected; unpacking fails loudly otherwise
    (profile_url, ) = name_element.xpath('a/@href')

    return legislator, profile_url
Beispiel #20
0
    def scrape_reps(self, chamber, session, term):
        """Scrape House members for a session from the ASP.net member grid.

        Vacant seats are saved via save_vacant_legislator; filled seats get
        extra detail (photo, email, committees) from the member detail page.
        """
        url = (self.reps_url % (session))
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        # This is the ASP.net table container
        table_xpath = ('id("ContentPlaceHolder1_'
                        'gridMembers_DXMainTable")')
        table = page.xpath(table_xpath)[0]
        for tr in table.xpath('tr')[1:]:
            tds = tr.xpath('td')
            leg_code = tds[0].xpath('a[1]')[0].attrib.get('href')
            last_name = tds[0].text_content().strip()
            first_name = tds[1].text_content().strip()
            full_name = '%s %s' % (first_name, last_name)
            district = str(int(tds[2].text_content().strip()))
            party = tds[3].text_content().strip()
            if party == 'Democrat':
                party = 'Democratic'
            phone = tds[4].text_content().strip()
            room = tds[5].text_content().strip()
            address = self.assumed_address_fmt % (room if room else '')

            # the vacant and filled cases share the same base record
            leg = Legislator(term, chamber, district, full_name=full_name,
                             first_name=first_name, last_name=last_name,
                             party=party, _code=leg_code, url=url)
            leg.add_office('capitol', 'Capitol Office',
                           address=address,
                           phone=phone)

            if last_name == 'Vacant':
                leg.add_source(url)
                self.save_vacant_legislator(leg)
            else:
                # BUG FIX: the original reassigned `url` (and `page`) here,
                # so later loop iterations used a stale detail-page URL as
                # the roster URL; use dedicated names instead.
                details_url = (self.rep_details_url % (session, district))
                leg.add_source(details_url)
                details_page = self.urlopen(details_url)
                details_doc = lxml.html.fromstring(details_page)
                picture = details_doc.xpath('//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
                email = details_doc.xpath('//*[@id="ContentPlaceHolder1_lblAddresses"]/table/tr[4]/td/a/@href')
                committees = details_doc.xpath('//*[@id="ContentPlaceHolder1_lblCommittees"]/li/a')
                for c in committees:
                    leg.add_role('committee member', term, committee=c.text_content().strip(), chamber=chamber)
                # TODO home address?
                # a bare "mailto:" href means no address was published
                if len(email) > 0 and email[0] != 'mailto:':
                    leg['email'] = email[0].split(':')[1]
                if len(picture) > 0:
                    leg['photo_url'] = picture[0]
                self.save_legislator(leg)
Beispiel #21
0
    def scrape(self, term, chambers):
        """Scrape Nebraska's 49 unicameral ('upper') legislators."""
        base_url = 'http://news.legislature.ne.gov/dist'

        # district pages are numbered dist01 .. dist49
        for district in range(1, 50):
            rep_url = '%s%02d/biography/' % (base_url, district)

            try:
                html = self.urlopen(rep_url)
                page = lxml.html.fromstring(html)

                header_link = page.xpath(
                    '//div[@class="content_header_right"]/a')[0]
                full_name = header_link.text.split(' ', 1)[1].strip()

                # This is hacky, are lis always the same?
                sidebar_li = '//div[@id="sidebar"]/ul[1]/li[%d]'
                address_lines = [page.xpath(sidebar_li % i)[0].text.strip()
                                 for i in (3, 4, 5)]
                address = '\n'.join(address_lines)

                phone_parts = page.xpath(sidebar_li % 6)[0].text.split()
                phone = '%s-%s' % (phone_parts[1], phone_parts[2])
                email = page.xpath(sidebar_li % 7 + '/a')[0].text or ''

                # Nebraska is officially nonpartisan
                leg = Legislator(term, 'upper', str(district), full_name,
                                 party='Nonpartisan', email=email,
                                 url=rep_url)
                leg.add_source(rep_url)
                leg.add_office('capitol', 'Capitol Office', address=address,
                               phone=phone)
                self.save_legislator(leg)
            except scrapelib.HTTPError:
                self.warning('could not retrieve %s' % rep_url)
Beispiel #22
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one legislator's detail page and save the record."""
        page = self.urlopen(member_url)
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        # (duplicate photo_url assignment removed)
        photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace("mailto:", "")

        district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            district = district[0].text_content().strip()
            district = clean_district(district)
        else:
            self.logger.warning("No district tab found for this hot garbage. Skipping.")
            return

        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]

        if party == "D":
            party = "Democratic"
        elif party == "R":
            party = "Republican"
        else:
            party = "Other"

        leg = Legislator(
            term, chamber, district, full_name, party=party, photo_url=photo_url, url=member_url, email=email
        )
        leg.add_source(member_url)

        # offices
        for dl in root.xpath('//dl[@class="address"]'):
            office_name = phone = fax = email = None
            address = []
            for child in dl.getchildren():
                text = child.text_content()
                if child.tag == "dt":
                    office_name = text
                else:
                    # BUG FIX: slice the label off instead of str.strip(),
                    # which strips a *character set* and could eat
                    # legitimate leading/trailing data.
                    if text.startswith("Phone:"):
                        phone = text[len("Phone:"):].strip() or None
                    elif text.startswith("Fax:"):
                        fax = text[len("Fax:"):].strip() or None
                    elif text.startswith("Email:"):
                        email = text[len("Email:"):].strip() or None
                    else:
                        address.append(text)
            # all pieces collected; guard against a <dl> with no <dt>,
            # which previously raised TypeError on `in None`
            if office_name and "District" in office_name:
                otype = "district"
            else:
                otype = "capitol"
            leg.add_office(otype, office_name, phone=phone, fax=fax, address="\n".join(address), email=email)

        self.save_legislator(leg)
Beispiel #23
0
    def scrape_upper(self, chamber, term):
        """Scrape Michigan senators from the member-list table."""
        url = 'http://www.senate.michigan.gov/members/memberlist.htm'
        doc = lxml.html.fromstring(self.urlopen(url))
        # columns: party, district, member, office phone, office fax, location
        for row in doc.xpath('//table[@width=550]/tr')[1:39]:
            (party_cell, dist_cell, member_cell,
             phone_cell, fax_cell, loc_cell) = row.getchildren()

            party = abbr[party_cell.text]
            district = dist_cell.text_content().strip()
            name = member_cell.text_content().strip()
            if name == 'Vacant':
                self.info('district %s is vacant', district)
                continue

            leg = Legislator(term=term, chamber=chamber,
                             district=district,
                             full_name=name,
                             party=party,
                             url=member_cell.xpath('a/@href')[0])
            leg.add_office('capitol', 'Capitol Office',
                           address=loc_cell.text,
                           fax=fax_cell.text,
                           phone=phone_cell.text)
            leg.add_source(url)
            self.save_legislator(leg)
Beispiel #24
0
    def scrape(self, term, chambers):
        """Scrape NH legislators from the asterisk-delimited member dump.

        The House member-lookup form is scraped first to map
        "Last, First" names onto the member codes used in profile URLs.
        """
        url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

        # map "Last, First" -> member code (the value of each <option>)
        option_map = {}
        html = self.urlopen('http://www.gencourt.state.nh.us/house/members/memberlookup.aspx')
        doc = lxml.html.fromstring(html)
        for opt in doc.xpath('//option'):
            option_map[opt.text] = opt.get('value')

        with self.urlopen(url) as data:
            for line in data.splitlines():
                if line.strip() == "":
                    continue

                # 25 asterisk-separated fields per member row; the last
                # two are discarded
                (chamber, fullname, last, first, middle, county, district_num,
                 seat, party, street, street2, city, astate, zipcode,
                 home_phone, office_phone, fax, email, com1, com2, com3,
                 com4, com5, _, _) = line.split('*')

                chamber = chamber_map[chamber]

                # skip legislators from a chamber we aren't scraping
                if chamber not in chambers:
                    continue

                if middle:
                    full = '%s %s %s' % (first, middle, last)
                else:
                    full = '%s %s' % (first, last)

                address = street
                if street2:
                    address += (' ' + street2)
                address += '\n%s, %s %s' % (city, astate, zipcode)

                # districts with a county field are qualified by it
                district = str(int(district_num))
                if county:
                    district = '%s %s' % (county, district)

                leg = Legislator(term, chamber, district, full, first, last,
                                 middle, party_map[party], email=email)
                leg.add_office('district', 'Home Address',
                               address=address, phone=home_phone or None)
                leg.add_office('district', 'Office Address',
                               phone=office_phone or None, fax=fax or None)

                # senate profile URLs are derivable; house URLs need the
                # member code collected from the lookup form above
                if chamber == 'upper':
                    leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num)
                elif chamber == 'lower':
                    code = option_map.get('{0}, {1}'.format(last, first))
                    if code:
                        leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code

                # up to five committee assignments per row
                for com in (com1, com2, com3, com4, com5):
                    if com:
                        leg.add_role('committee member', term=term,
                                      chamber=chamber, committee=com)

                leg.add_source(url)
                self.save_legislator(leg)
Beispiel #25
0
    def scrape(self, chamber, term):
        """Scrape Wisconsin legislators for one chamber from the roster."""
        if chamber == 'upper':
            url = "http://legis.wisconsin.gov/Pages/leg-list.aspx?h=s"
        else:
            url = "http://legis.wisconsin.gov/Pages/leg-list.aspx?h=a"

        body = self.urlopen(url)
        page = lxml.html.fromstring(body)
        page.make_links_absolute(url)

        for row in page.xpath("//table[@class='legis-list']/tr")[1:]:
            # hoisted: evaluate the link xpath once instead of twice
            links = row.xpath(".//a/@href")
            if links:
                rep_url = links[0]
                rep_doc = lxml.html.fromstring(self.urlopen(rep_url))
                rep_doc.make_links_absolute(rep_url)

                first_name = rep_doc.xpath('//h2[@class="given-name"]/text()')[0]
                last_name = rep_doc.xpath('//h2[@class="family-name"]/text()')[0]
                full_name = '%s %s' % (first_name, last_name)
                party = rep_doc.xpath('//div[@class="party"]/text()')[0]
                if party == 'Democrat':
                    party = 'Democratic'

                district = str(int(row.getchildren()[2].text_content()))

                # email (may be absent from the page)
                email = rep_doc.xpath('//a[starts-with(@href, "mailto")]/text()')
                if email:
                    email = email[0]
                else:
                    email = ''

                leg = Legislator(term, chamber, district, full_name,
                                 first_name=first_name, last_name=last_name,
                                 party=party, url=rep_url, email=email)

                img = rep_doc.xpath('//img[@class="photo"]/@src')
                if img:
                    leg['photo_url'] = img[0]

                # office #### -- Madison (capitol) office
                address = '\n'.join(rep_doc.xpath('//dt[text()="Madison Office"]/following-sibling::dd/div/text()'))
                phone = rep_doc.xpath('//dt[text()="Telephone"]/following-sibling::dd/div/text()')
                if phone:
                    # raw strings for regex ('\s' in a plain literal is a
                    # deprecated escape sequence)
                    phone = re.sub(r'\s+', ' ', phone[0]).strip()
                else:
                    phone = None
                fax = rep_doc.xpath('//dt[text()="Fax"]/following-sibling::dd/div/text()')
                if fax:
                    fax = re.sub(r'\s+', ' ', fax[0]).strip()
                else:
                    fax = None

                leg.add_office('capitol', 'Madison Office', address=address,
                               phone=phone, fax=fax)

                # save legislator
                leg.add_source(rep_url)
                self.save_legislator(leg)
Beispiel #26
0
    def scrape(self, term, chambers):
        """Scrape NJ legislators from the ROSTER and LEGBIO DBF files."""
        year_abr = term[0:4]

        file_url, db = self.get_dbf(year_abr, 'ROSTER')
        bio_url, bio_db = self.get_dbf(year_abr, 'LEGBIO')

        # map roster key -> photo URL from the bio table
        photos = {}
        for rec in bio_db:
            photos[rec['roster_key']] = rec['urlpicture']

        for rec in db:
            first_name = rec["firstname"]
            middle_name = rec["midname"]
            last_name = rec["lastname"]
            suffix = rec["suffix"]
            full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
            full_name = full_name.replace('  ', ' ')
            # drop the trailing space left when suffix is empty.
            # NOTE(review): this also chops the last character when a
            # suffix *is* present -- presumably suffixes are blank in this
            # data; confirm before changing.
            full_name = full_name[0: len(full_name) - 1]

            district = int(rec["district"])
            # parties other than R/D pass through unchanged
            party = rec["party"]
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democratic"

            chamber = rec["house"]
            if chamber == 'A':
                chamber = "lower"
            elif chamber == 'S':
                chamber = "upper"

            title = rec["title"]
            legal_position = rec["legpos"]
            leg_status = rec["legstatus"]
            if 'email' in rec:
                email = rec["email"]
            else:
                email = ''
            photo_url = photos[rec['roster_key']]
            address = '{0}\n{1}, {2} {3}'.format(rec['address'], rec['city'],
                                                 rec['state'], rec['zipcode'])

            leg = Legislator(term, chamber, str(district), full_name,
                             first_name, last_name, middle_name, party,
                             suffixes=suffix, title=title,
                             legal_position=legal_position,
                             leg_status=leg_status, email=email,
                             photo_url=photo_url)
            leg.add_source(file_url)
            leg.add_office('district', 'District Office', address=address,
                           phone=rec['phone'])
            self.save_legislator(leg)
Beispiel #27
0
    def scrape(self, term, chambers):
        """Scrape CT legislators from the official CSV roster."""
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        data = self.get(leg_url)
        page = open_csv(data)

        chamber_codes = {'H': 'lower', 'S': 'upper'}
        for row in page:
            chamber = chamber_codes[row['office code']]
            if chamber not in chambers:
                continue

            district = row['dist'].lstrip('0')

            # assemble "First [M] Last [Suffix]", skipping empty pieces
            pieces = [row['first name']]
            mid = row['middle initial'].strip()
            if mid:
                pieces.append(mid)
            pieces.append(row['last name'])
            suffix = row['suffix'].strip()
            if suffix:
                pieces.append(suffix)
            name = ' '.join(pieces)

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, chamber, district,
                             name, first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'].strip(),
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol', 'Capitol Office',
                           address=office_address, phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            # committees are semicolon-separated, optionally suffixed with
            # a "(role)" annotation
            for comm in row['committee member1'].split(';'):
                if not comm:
                    continue
                if ' (' in comm:
                    comm, role = comm.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm = comm.strip()
                if comm == '':
                    continue
                leg.add_role('committee member', term,
                             chamber='joint',
                             committee=comm,
                             position=role)

            self.save_legislator(leg)
Beispiel #28
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one legislator's detail page and save the record."""
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

        # (duplicate photo_url assignment removed)
        photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:','')

        district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            district = district[0].text.strip()
            district = clean_district(district)
        else:
            self.logger.warning('No district tab found for this hot garbage. Skipping.')
            return

        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]

        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url, email=email)
        leg.add_source(member_url)

        # offices
        for dl in root.xpath('//dl[@class="address"]'):
            office_name = phone = fax = email = None
            address = []
            for child in dl.getchildren():
                text = child.text_content()
                if child.tag == 'dt':
                    office_name = text
                else:
                    # BUG FIX: slice the label off instead of str.strip(),
                    # which strips a *character set* and could eat
                    # legitimate leading/trailing data.
                    if text.startswith('Phone:'):
                        phone = text[len('Phone:'):].strip() or None
                    elif text.startswith('Fax:'):
                        fax = text[len('Fax:'):].strip() or None
                    elif text.startswith('Email:'):
                        email = text[len('Email:'):].strip() or None
                    else:
                        address.append(text)
            # all pieces collected; guard against a <dl> with no <dt>,
            # which previously raised TypeError on `in None`
            if office_name and 'District' in office_name:
                otype = 'district'
            else:
                otype = 'capitol'
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address='\n'.join(address), email=email)

        self.save_legislator(leg)
Beispiel #29
0
    def _scrape_senator(self, url, term):
        """Scrape a single Texas senator's page and save the record."""
        page = lxml.html.fromstring(self.get(url).text)
        name_district = page.xpath('//div[@class="memtitle"]/text()')[0]
        name, district = re.search(r'Senator (.+): District (\d+)',
                                   name_district).group(1, 2)

        try:
            party_text = re.search(
                r'Party: ?(.+)',
                page.xpath('//p[@class="meminfo"][1]')[0].text_content()) \
                      .group(1).strip()
            party = {
                'Democrat': 'Democratic',
                'Republican': 'Republican'
            }[party_text]
        except (AttributeError, KeyError, IndexError):
            # Narrowed from a bare `except:` to the actual failure modes
            # (regex miss -> AttributeError; unknown label -> KeyError;
            # missing <p> -> IndexError). A handful of senate pages don't
            # list the legislators' parties, so check the parties' own
            # listings:
            party = self._get_party('upper', district)

        legislator = Legislator(term, 'upper', district, name,
                                party=party, url=url)

        legislator.add_source(url)

        # one newline-joined text blob per office <td>
        offices_text = [
            '\n'.join(line.strip() for line in office_td.itertext())
            for office_td in page.xpath('//td[@class="memoffice"]')
        ]

        for office_text in offices_text:
            # prefer the explicit "Mailing Address:" span when present
            mailing_address = next(
                iter(re.findall(
                    r'Mailing Address:.+?7\d{4}', office_text,
                    flags=re.DOTALL | re.IGNORECASE)),
                office_text
            )

            try:
                address = re.search(
                    r'(?:\d+ |P\.?\s*O\.?).+7\d{4}', mailing_address,
                    flags=re.DOTALL | re.IGNORECASE).group()
            except AttributeError:
                # No address was found; skip office.
                continue

            phone = extract_phone(office_text)
            fax = extract_fax(office_text)

            # Austin zip codes mark the capitol office
            office_type = 'capitol' if any(
                zip_code in address for zip_code in ('78701', '78711')
            ) else 'district'
            office_name = office_type.title() + ' Office'

            legislator.add_office(office_type, office_name,
                                  address=address.strip(), phone=phone,
                                  fax=fax)

        self.save_legislator(legislator)
Beispiel #30
0
    def scrape_chamber(self, chamber, term):
        """Scrape one NC chamber from the no-picture member list page."""
        url = "http://www.ncga.state.nc.us/gascripts/members/"\
            "memberListNoPic.pl?sChamber="

        if chamber == 'lower':
            url += 'House'
        else:
            url += 'Senate'

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute('http://www.ncga.state.nc.us')
        rows = doc.xpath('//div[@id="mainBody"]/table/tr')

        # first row is the header
        for row in rows[1:]:
            party, district, full_name, counties = row.getchildren()

            party = party.text_content().strip("()")
            party = party_map[party]

            district = district.text_content().replace("District","").strip()

            # a <span> inside the name cell flags members who have left
            notice = full_name.xpath('span')
            if notice:
                notice = notice[0].text_content()
                # skip resigned legislators
                if 'Resigned' in notice or 'Deceased' in notice:
                    continue
            else:
                notice = None
            link = full_name.xpath('a/@href')[0]
            full_name = full_name.xpath('a')[0].text_content()
            # normalize non-breaking spaces
            full_name = full_name.replace(u'\u00a0', ' ')

            # scrape legislator page details
            lhtml = self.get(link).text
            ldoc = lxml.html.fromstring(lhtml)
            ldoc.make_links_absolute('http://www.ncga.state.nc.us')
            photo_url = ldoc.xpath('//a[contains(@href, "pictures")]/@href')[0]
            phone = get_table_item(ldoc, 'Phone:') or None
            address = get_table_item(ldoc, 'Address:') or None
            # capitol contact info sits in the two table rows immediately
            # preceding the mailto link's row
            email = ldoc.xpath('//a[starts-with(@href, "mailto:")]')[0]
            capitol_email = email.text
            capitol_phone = email.xpath('ancestor::tr[1]/preceding-sibling::tr[1]/td/span')[0].text
            capitol_address = email.xpath('ancestor::tr[1]/preceding-sibling::tr[2]/td/text()')
            capitol_address = [x.strip() for x in capitol_address]
            capitol_address = '\n'.join(capitol_address) or None
            capitol_phone = capitol_phone.strip() or None

            # save legislator
            legislator = Legislator(term, chamber, district, full_name,
                                    photo_url=photo_url, party=party,
                                    url=link, notice=notice)
            legislator.add_source(link)
            legislator.add_office('district', 'District Office',
                                  address=address, phone=phone)
            legislator.add_office('capitol', 'Capitol Office',
                                  address=capitol_address, phone=capitol_phone, email=capitol_email)
            self.save_legislator(legislator)
Beispiel #31
0
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.get(url)
            root = lxml.etree.fromstring(details_page.content)
            party = root.xpath('string(//PARTY)')

            district = root.xpath('string(//DISTRICT)')

            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')

            home_address = root.xpath('string(//H_ADDRESS)')
            home_address2 = root.xpath('string(//H_ADDRESS2)')
            home_city = root.xpath('string(//H_CITY)')
            home_zip = root.xpath('string(//H_ZIP)')

            home_address_total = "%s\n%s\n%s\n%s" % (
                home_address, home_address2, home_city, home_zip)

            bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
            cap_room = root.xpath('string(//CAP_ROOM)')

            if leg_name in ('Oscar Denton', 'Lataisha Jackson',
                            'John G. Faulkner'):
                assert not party, "Remove special-casing for this Democrat without a listed party: {}".format(
                    leg_name)
                party = 'Democratic'
            elif leg_name in ('James W. Mathis'):
                assert not party, "Remove special-casing for this Republican without a listed party: {}".format(
                    leg_name)
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise AssertionError(
                    "A member with no identifiable party was found: {}".format(
                        leg_name))

            leg = Legislator(term,
                             chamber,
                             district,
                             leg_name,
                             party=party,
                             role=role,
                             org_info=org_info,
                             url=url,
                             photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            if email_name != "":
                if "@" in email_name:
                    email = email_name
                else:
                    email = '%s@%s.ms.gov' % (email_name, {
                        "upper": "senate",
                        "lower": "house"
                    }[chamber])
                kwargs['email'] = email

            if capital_phone != "":
                kwargs['phone'] = capital_phone

            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol', 'Capitol Office', **kwargs)

            kwargs = {}
            if home_phone != "":
                kwargs['phone'] = home_phone

            if home_address_total != "":
                kwargs['address'] = home_address_total

            if kwargs != {}:
                leg.add_office('district', 'District Office', **kwargs)

            self.save_legislator(leg)
        except scrapelib.HTTPError, e:
            self.warning(str(e))
Beispiel #32
0
    def scrape_senators(self, chamber, term):
        """Scrape Maine senators from the published roster spreadsheet."""
        # legislative session number, derived from the term's start year
        session = ((int(term[0:4]) - 2009) / 2) + 124

        # spreadsheet column index for each field we extract
        mapping = {
            'district': 1,
            'first_name': 2,
            'middle_name': 3,
            'last_name': 4,
            # 'suffix': 6,
            'party': 6,
            'resident_county': 5,
            'street_addr': 7,
            'city': 8,
            'state': 9,
            'zip_code': 10,
            'phone1': 12,
            'phone2': 13,
            'email': 11,
        }

        url = (
            'http://legisweb1.mainelegislature.org/wp/senate/'
            'wp-content/uploads/sites/2/2013/09/%sthSenatorsList.xlsx' % session)

        # fall back to the older .xls location when the .xlsx is missing
        try:
            fn, result = self.urlretrieve(url)
        except scrapelib.HTTPError:
            url = 'http://www.maine.gov/legis/senate/%dthSenatorsList.xls'
            url = url % session
            fn, result = self.urlretrieve(url)

        wb = xlrd.open_workbook(fn)
        sh = wb.sheet_by_index(0)

        # row 0 is the header
        for rownum in xrange(1, sh.nrows):
            # get fields out of mapping
            d = {}
            for field, col_num in mapping.iteritems():
                try:
                    d[field] = str(sh.cell(rownum, col_num).value)
                except IndexError:
                    # This col_num doesn't exist in the sheet.
                    pass

            full_name = " ".join((d['first_name'], d['middle_name'],
                                  d['last_name']))
            full_name = re.sub(r'\s+', ' ', full_name).strip()

            address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

            # For matching up legs with votes
            district_name = d['city']

            phone = d['phone1']

            # spreadsheet numbers come back as floats ("12.0"); keep the
            # integer part only
            district = d['district'].split('.')[0]

            leg_url = 'http://www.maine.gov/legis/senate/bio%02ds.htm' % int(district)

            leg = Legislator(term, chamber, district, full_name,
                             d['first_name'], d['middle_name'], d['last_name'],
                             _party_map[d['party']],
                             resident_county=d['resident_county'],
                             office_address=address,
                             office_phone=phone,
                             email=None,
                             district_name=district_name,
                             url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            # pull the photo from the member's bio page
            html = self.urlopen(leg_url)
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            xpath = '//td[@class="XSP_MAIN_PANEL"]/descendant::img/@src'
            photo_url = doc.xpath(xpath)
            if photo_url:
                photo_url = photo_url.pop()
                leg['photo_url'] = photo_url
            else:
                photo_url = None

            # NOTE(review): ''.join(address) on a string is a no-op
            # (joins its characters with ''); presumably `address` was
            # once a list -- confirm before simplifying.
            office = dict(
                name='District Office', type='district',
                fax=None, email=None,
                address=''.join(address))

            leg['email'] = d['email']
            leg.add_office(**office)
            self.save_legislator(leg)
Beispiel #33
0
    def scrape_upper(self, term):
        """Scrape Utah state senators from the Senate roster page.

        Builds one Legislator per table row with party, email, address,
        phone/fax, and a photo scraped from each member's detail page.

        Raises ValueError if a row's party marker is not (D) or (R).
        """
        url = 'http://www.utahsenate.org/aspx/roster.aspx'
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # First row is the header; every following row is one senator.
        for row in doc.xpath('//tr')[1:]:
            tds = row.xpath('td')

            # 1st cell has the district number.
            district = tds[0].text_content()

            # 3rd cell has name (with party marker) and email.
            person = tds[2].xpath('span[@class="person"]')[0]
            if '(D)' in person.text_content():
                party = 'Democratic'
            elif '(R)' in person.text_content():
                party = 'Republican'
            else:
                raise ValueError('unknown party')
            a = person.xpath('a')[0]
            name = a.text_content()
            leg_url = a.get('href')
            email = tds[2].xpath('span[@class="email"]/a/text()')
            email = email[0] if email else ''

            # Office address: text is split by <br> in the 4th cell;
            # join the pieces with a space.
            address = ' '.join(tds[3].xpath('font/text()'))

            # 5th cell: phone/fax entries look like "Cell\xa0801-555-1234";
            # take the first Cell/Home/Work number as the phone.
            numbers = tds[4].xpath('text()')
            phone = None
            fax = None
            for num in numbers:
                if num.startswith(('Cell', 'Home', 'Work')) and not phone:
                    phone = num.split(u'\xa0')[-1]
                elif num.startswith('Fax'):
                    fax = num.split(u'\xa0')[-1]
            # (Removed a dead `numbers = [num.split(...)]` statement whose
            # result was never used.)

            # Fetch the member page for a photo; tolerate fetch/parse
            # failures, but don't swallow KeyboardInterrupt/SystemExit
            # the way the previous bare `except:` did.
            try:
                leg_html = self.urlopen(leg_url)
                leg_doc = lxml.html.fromstring(leg_html)
                leg_doc.make_links_absolute(leg_url)
                photo_url = leg_doc.xpath('//p[@class="photo"]/img/@src')[0]
            except Exception:
                self.warning('could not fetch %s' % leg_url)
                photo_url = ''

            leg = Legislator(term,
                             'upper',
                             district,
                             name,
                             party=party,
                             email=email,
                             address=address,
                             photo_url=photo_url,
                             url=leg_url)
            leg.add_office('district',
                           'Home',
                           address=address,
                           phone=phone,
                           fax=fax)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Beispiel #34
0
    def scrape_chamber(self, chamber, term):
        """Scrape North Carolina legislators for one chamber.

        Walks the member-list table, then visits each member's page for
        photo, phone, mailing address, and email.
        """
        base = ("http://www.ncga.state.nc.us/gascripts/members/"
                "memberList.pl?sChamber=")
        url = base + ('House' if chamber == 'lower' else 'Senate')

        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute('http://www.ncga.state.nc.us')

        # Skip the header row; each remaining row is one member.
        for row in doc.xpath('//div[@id="mainBody"]/table/tr')[1:]:
            party_cell, district_cell, name_cell, _counties = row.getchildren()

            party = party_map[party_cell.text_content()]
            district = district_cell.text_content()

            # A <span> inside the name cell flags members who left office.
            notice = name_cell.xpath('span')
            if notice:
                notice = notice[0].text_content()
                # skip resigned legislators
                if 'Resigned' in notice or 'Deceased' in notice:
                    continue
            else:
                notice = None

            link = name_cell.xpath('a/@href')[0]
            full_name = name_cell.xpath('a')[0].text_content()
            full_name = full_name.replace(u'\u00a0', ' ')

            # Pull details from the member's own page.
            ldoc = lxml.html.fromstring(self.urlopen(link))
            ldoc.make_links_absolute('http://www.ncga.state.nc.us')
            photo_url = ldoc.xpath('//a[contains(@href, "pictures")]/@href')[0]
            phone = get_table_item(ldoc, 'Phone:')
            address = get_table_item(
                ldoc, 'Legislative Mailing Address:') or None
            email = ldoc.xpath(
                '//a[starts-with(@href, "mailto:")]')[0].text or ''

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    full_name,
                                    photo_url=photo_url,
                                    party=party,
                                    url=link,
                                    notice=notice,
                                    email=email)
            legislator.add_source(link)
            legislator.add_office('capitol',
                                  'Capitol Office',
                                  address=address,
                                  phone=phone)
            self.save_legislator(legislator)
Beispiel #35
0
    def _parse_member(self, chamber, term, member):
        """Build a Legislator from one member XML element.

        `member` is an lxml element whose attributes ('first-name',
        'party', 'district-number', ...) follow the legislature's member
        feed schema; returns the Legislator (caller saves it).
        """
        first_name = member.get('first-name')
        last_name = member.get('last-name')
        party = self.party_map[member.get('party')]

        # this is semi-safe because we validated term w/ latest_only=True
        session = self.metadata['terms'][-1]['sessions'][-1]

        # extra_fields: optional scalar values pulled via per-field xpaths.
        extra_dict = {}
        for name, xpath in self.extra_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                extra_dict[name] = result[0]

        # address fields: flatten a structured address element into one
        # "street, city, state zip" string.
        for name, xpath in self.addr_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                result = result[0]
                extra_dict[name] = '%s, %s, %s %s' % (
                    result.get('street-address'),
                    result.get('city'),
                    result.get('state'),
                    result.get('postal-code'))

        leg = Legislator(term, chamber, member.get('district-number'),
                         full_name=first_name+' '+last_name,
                         first_name=first_name,
                         last_name=last_name,
                         middle_name=member.get('middle-initial'),
                         party=party,
                         email=member.get('e-mail'),
                         url=member.get('website'),
                         oregon_member_id=member.get('leg-member-id'))

        # add offices.  NOTE(review): capitol_address/phone are indexed
        # directly, so a member missing either raises KeyError -- assumed
        # always present in the feed; district office is optional.
        leg.add_office('capitol', 'Capitol Office',
                       address=extra_dict['capitol_address'],
                       phone=extra_dict['phone'])
        if 'district_address' in extra_dict or 'district_phone' in extra_dict:
            leg.add_office('district', 'District Office',
                           address=extra_dict.get('district_address', None),
                           phone=extra_dict.get('district_phone', None))

        # committees: membership for the current session only.
        com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
        for com in member.xpath(com_xpath):
            cdict = {
                'position': com.get('title').lower(),
                'chamber': chamber,
            }
            com_name = com.get('name')
            com_class = com.get('committee-class')
            if com_class == 'sub-committee':
                # Subcommittee names look like "X Subcommittee On Y".
                cdict['committee'], cdict['subcommittee'] = \
                        com.get('name').split(' Subcommittee On ')
            else:
                cdict['committee'] = com.get('name')

            leg.add_role('committee member', term, **cdict)

        leg.add_source(self.source_url)
        return leg
Beispiel #36
0
    def scrape(self, term, chambers):
        """Scrape Connecticut legislators from the published CSV roster."""
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = open_csv(self.urlopen(leg_url))

        for row in page:
            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]
            if chamber not in chambers:
                continue

            district = row['dist'].lstrip('0')

            # Assemble the display name: first [middle] last [suffix].
            pieces = [row['first name']]
            mid = row['middle initial'].strip()
            if mid:
                pieces.append(mid)
            pieces.append(row['last name'])
            suffix = row['suffix'].strip()
            if suffix:
                pieces.append(suffix)
            name = " ".join(pieces)

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'].strip(),
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol',
                           'Capitol Office',
                           address=office_address,
                           phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            # Committee assignments: "Name (role);Name;..." — a bare name
            # means plain membership.
            for comm in row['committee member1'].split(';'):
                if not comm:
                    continue
                if ' (' in comm:
                    comm, role = comm.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm = comm.strip()
                if comm == '':
                    continue

                leg.add_role('committee member',
                             term,
                             chamber='joint',
                             committee=comm,
                             position=role)

            self.save_legislator(leg)
Beispiel #37
0
    def scrape(self, chamber, term):
        """Scrape Tennessee legislators for one chamber.

        Reads the chamber roster table (current, or archived for older
        terms), then visits each member page for the display name.
        """
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican',
            'I': 'Independent'
        }

        # Chamber-specific URL segment and member-page filename prefix.
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'
        # Older terms live under an /archives/ path.
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name
            chamber_url += '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        page = self.urlopen(chamber_url)
        page = lxml.html.fromstring(page)

        for row in page.xpath("//tr")[1:]:

            # Skip any header row (all children are <th>).
            if set(child.tag for child in row) == set(['th']):
                continue

            partyInit = row.xpath('td[2]')[0].text.split()[0]
            party = parties[partyInit]
            district = row.xpath('td[4]/a')[0].text.split()[1]
            address = row.xpath('td[5]')[0].text_content()
            # Expand the roster's room abbreviations into full mailing
            # addresses under 301 6th Avenue North.
            address = address.replace(
                'LP', 'Legislative Plaza\nNashville, TN 37243')
            address = address.replace(
                'WMB', 'War Memorial Building\nNashville, TN 37243')
            address = '301 6th Avenue North\nSuite ' + address
            phone = row.xpath('td[6]')[0].text
            # Special case (e.g. Karen D. Camper): phone nested in a div.
            # (was `phone == None`; identity check is the correct idiom)
            if phone is None:
                phone = row.xpath('td[6]/div')[0].text
            phone = '615-' + phone.split()[0]
            email = row.xpath('td[7]/a')[0].text
            member_url = (root_url + url_chamber_name + '/members/' + abbr +
                          district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district + '.jpg')

            member_page = self.urlopen(member_url)
            member_page = lxml.html.fromstring(member_page)
            name = member_page.xpath('//div[@id="membertitle"]/h2')[0].text
            # Strip the leading title ("Speaker ", "Lt. ...", "Rep. "/
            # "Sen. ") by fixed offsets -- assumes the site's exact
            # prefixes; TODO confirm against current pages.
            if 'Speaker' in name:
                full_name = name[8:]
            elif 'Lt.' in name:
                full_name = name[13:]
            elif abbr == 'h':
                full_name = name[5:]
            else:
                full_name = name[8:]

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name.strip(),
                             party=party,
                             email=email,
                             url=member_url,
                             photo_url=member_photo_url)
            leg.add_source(chamber_url)
            leg.add_source(member_url)

            # TODO: add district address from this page

            leg.add_office('capitol',
                           'Nashville Address',
                           address=address,
                           phone=phone,
                           email=email)

            self.save_legislator(leg)
Beispiel #38
0
    def scrape(self, term, chambers):
        """Scrape Connecticut legislators from the published CSV roster."""
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        for row in open_csv(page):
            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

            district = row['dist'].lstrip('0')
            assert district.isdigit(), "Invalid district found: {}".format(district)

            # Assemble the display name: first [middle] last [suffix].
            pieces = [row['first name']]
            mid = row['middle initial'].strip()
            if mid:
                pieces.append(mid)
            pieces.append(row['last name'])
            suffix = row['suffix'].strip()
            if suffix:
                pieces.append(suffix)
            name = " ".join(pieces)

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, chamber, district, name,
                             party=party,
                             url=row['URL'])

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row['capitol street address'], row['room number'])
            email = row['email'].strip()
            if "@" not in email:
                # Members without a published address link a contact form.
                assert email.endswith("mailform.php"), "Problematic email found: {}".format(email)
                email = None
            leg.add_office('capitol', 'Capitol Office',
                           address=office_address,
                           phone=row['capitol phone'],
                           fax=(row['fax'].strip() or None),
                           email=email)

            home_address = "{}\n{}, {} {}".format(
                row['home street address'],
                row['home city'],
                row['home state'],
                row['home zip code'],
            )
            # An LOB address is a capitol office, not a district one.
            if "Legislative Office Building" not in home_address:
                leg.add_office('district', 'District Office',
                               address=home_address,
                               phone=row['home phone'] if row['home phone'].strip() else None)

            leg.add_source(leg_url)

            # Committee assignments: "Name (role);Name;..." — a bare name
            # means plain membership.
            for comm in row['committee member1'].split(';'):
                if not comm:
                    continue
                if ' (' in comm:
                    comm, role = comm.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm = comm.strip()
                if comm == '':
                    continue

                leg.add_role(role, term,
                             chamber='joint',
                             committee=comm)

            self.save_legislator(leg)
Beispiel #39
0
    def scrape_legislator(self, name, chamber, term, url):
        """Build and save a Legislator from a South Dakota member page."""
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        party = doc.xpath("string(//span[contains(@id, 'Party')])").strip()
        if party == 'Democrat':
            party = 'Democratic'

        district = doc.xpath(
            "string(//span[contains(@id, 'District')])").strip().lstrip('0')

        occupation = doc.xpath(
            "string(//span[contains(@id, 'Occupation')])").strip()

        (photo_url, ) = doc.xpath('//img[contains(@id, "_imgMember")]/@src')

        office_phone = doc.xpath(
            "string(//span[contains(@id, 'CapitolPhone')])").strip()

        email = None
        email_link = doc.xpath('//a[@id="lnkMail"]')
        if email_link:
            email = email_link[0].attrib['href'].split(":")[1]

        legislator = Legislator(term, chamber, district, name,
                                party=party,
                                occupation=occupation,
                                photo_url=photo_url,
                                url=url)

        capitol_kwargs = {}
        if office_phone.strip() != "":
            capitol_kwargs['phone'] = office_phone
        if email and email.strip() != "":
            # South Dakota protects their email addresses from scraping using
            # some JS code that runs on page load
            # Until that code is run, all their email addresses are listed as
            # *@example.com; so, fix this
            capitol_kwargs['email'] = re.sub(
                r'@example\.com$', '@sdlegislature.gov', email)
        if capitol_kwargs:
            legislator.add_office('capitol', 'Capitol Office',
                                  **capitol_kwargs)

        home_lines = [
                line.strip() for line in
                doc.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
                if line.strip()
                ]
        if home_lines:
            home_phone = doc.xpath(
                "string(//span[contains(@id, 'HomePhone')])").strip()
            legislator.add_office(
                    'district',
                    'District Office',
                    address="\n".join(home_lines),
                    phone=home_phone or None
                    )

        legislator.add_source(url)

        comm_url = doc.xpath("//a[. = 'Committees']")[0].attrib['href']
        self.scrape_committees(legislator, comm_url)

        self.save_legislator(legislator)
Beispiel #40
0
    def scrape_senators(self, chamber, term):
        """Scrape Maine senators from the Senate roster spreadsheet.

        Reads one senator per row of an .xlsx roster, finds each member's
        page via the online roster (matched by district number), and
        saves a Legislator with address, phone, email, and photo.
        """
        # Sessions are numbered from the 124th (2009-2010); one session
        # per two-year term.  NOTE(review): relies on Python 2 integer
        # division -- `/` floors for ints here.  `session` is computed
        # but the download URL below is hard-coded to the 127th list.
        session = ((int(term[0:4]) - 2009) / 2) + 124

        # Spreadsheet column index for each field of interest.
        mapping = {
            'district': 0,
            'first_name': 2,
            'middle_name': 3,
            'last_name': 4,
            'suffixes': 5,
            'party': 1,
            'street_addr': 6,
            'city': 7,
            'state': 8,
            'zip_code': 9,
            'phone1': 10,
            'phone2': 11,
            'email': 12
        }

        list_location = '2014/12/127th-Senate-Members2'
        url = ('http://legisweb1.mainelegislature.org/wp/senate/'
               'wp-content/uploads/sites/2/{}.xlsx'.format(list_location))
        fn, result = self.urlretrieve(url)

        wb = xlrd.open_workbook(fn)
        sh = wb.sheet_by_index(0)

        # Row 0 is the header row; data starts at row 1.
        for rownum in xrange(1, sh.nrows):
            # get fields out of mapping
            d = {}
            for field, col_num in mapping.iteritems():
                try:
                    d[field] = str(sh.cell(rownum, col_num).value).strip()
                except IndexError:
                    # This col_num doesn't exist in the sheet.
                    pass

            full_name = " ".join(
                (d['first_name'], d['middle_name'], d['last_name']))
            # Collapse the double space left by an empty middle name.
            full_name = re.sub(r'\s+', ' ', full_name).strip()

            address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

            # For matching up legs with votes
            district_name = d['city']

            # Prefer phone1, fall back to phone2, else no phone at all.
            phone = d['phone1']
            if not phone:
                phone = d['phone2']
            if not phone:
                phone = None

            # Numeric cells read back as floats ("12.0"); keep the
            # integer part only.
            district = d['district'].split('.')[0]

            # Determine legislator's URL to get their photo.
            # NOTE(review): this roster page is re-fetched for every row;
            # it could be hoisted above the loop.
            LEGISLATOR_ROSTER_URL = \
                    'http://legisweb1.mainelegislature.org/wp/senate/senators/'
            html = self.get(LEGISLATOR_ROSTER_URL).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(LEGISLATOR_ROSTER_URL)

            # Exactly one roster entry must match this district, or the
            # tuple unpack raises.
            URL_XPATH = '//address[contains(text(), "(District {})")]/a/@href'. \
                    format(district)
            (leg_url, ) = doc.xpath(URL_XPATH)

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             first_name=d['first_name'],
                             middle_name=d['middle_name'],
                             last_name=d['last_name'],
                             party=d['party'],
                             suffixes=d['suffixes'],
                             district_name=district_name,
                             url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            # Take the last .png image on the member page as the portrait.
            html = self.get(leg_url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            xpath = '//img[contains(@src, ".png")]/@src'
            photo_url = doc.xpath(xpath)
            if photo_url:
                photo_url = photo_url.pop()
                leg['photo_url'] = photo_url
            else:
                # No portrait found; value is unused on this branch.
                photo_url = None

            office = dict(name='District Office',
                          type='district',
                          phone=phone,
                          fax=None,
                          email=d['email'],
                          address=address)

            leg['email'] = d['email']
            leg.add_office(**office)
            self.save_legislator(leg)
Beispiel #41
0
    def scrape(self, chamber, term):
        """Scrape legislators from per-chamber roster pages.

        Each legislator appears as a profile block; key/value pairs from
        the block's <p> elements are collected into `info` and used to
        build the Legislator.

        Raises ValueError if a profile row has more elements than the
        expected key/value pair.
        """
        url = self.URLs[chamber]
        page = self.lxmlize(url)

        for block in page.xpath("//div[@class='ms-rtestate-field']")[1:-1]:
            # Each legislator block.

            photo_block = block.xpath("ancestor::td/preceding-sibling::td")
            if len(photo_block) == 0:
                continue

            photo_block, = photo_block
            # (The <td> before ours was the photo)
            img, = photo_block.xpath("*")
            img = img.attrib['src']

            h2, = block.xpath(".//h2/a")
            name = h2.text

            info = {}
            # Right, now let's get info out of their little profile box.
            for entry in block.xpath(".//p"):
                for kvpair in itergraphs(entry.xpath("./*"), 'br'):
                    # OK. We either get the tail or the next element
                    # (usually an <a> tag)
                    if len(kvpair) == 1:
                        key, = kvpair
                        value = key.tail.strip() if key.tail else None
                        if value:
                            # raw string: "\s" is an invalid escape in a
                            # plain string literal (fixed)
                            value = re.sub(r"\s+", " ", value).strip()
                    elif len(kvpair) == 2:
                        key, value = kvpair
                    else:
                        # Never seen text + an <a> tag, perhaps this can happen.
                        raise ValueError(
                            "Too many elements. Something changed")

                    key = key.text_content().strip(" :")
                    if value is None:
                        # A page has the value in a <strong> tag. D'oh.
                        key, value = (x.strip() for x in key.rsplit(":", 1))

                    key = re.sub(r"\s+", " ", key).strip()

                    info[key] = value

            info['District'] = info['District'].encode('ascii',
                                                       'ignore').strip()

            info['Party'] = info['Party'].strip(": ")

            leg = Legislator(term=term,
                             url=h2.attrib['href'],
                             chamber=chamber,
                             full_name=name,
                             party=info['Party'],
                             district=info['District'],
                             photo_url=img)
            leg.add_source(url)

            # Some pages misspell the label as "apitol Phone".
            phone = info.get('Capitol Phone', info.get('apitol Phone'))
            if hasattr(phone, 'text_content'):
                phone = phone.text_content()

            leg.add_office(type='capitol',
                           name='Capitol Office',
                           address=info['Capitol Address'],
                           phone=phone,
                           email=info['Email'].attrib['href'].replace(
                               "mailto:", ""))

            self.save_legislator(leg)
Beispiel #42
0
    def scrape(self, chamber, term):
        """Scrape Washington legislators for one chamber.

        Email addresses live on a separate lookup page and are matched
        to members by district number and email-link position.
        """
        if chamber == 'upper':
            index_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
        else:
            index_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'
        doc = self.lxmlize(index_url)

        # Email addresses are listed on a separate page.
        email_list_url = 'http://app.leg.wa.gov/memberemail/Default.aspx'
        email_doc = self.lxmlize(email_list_url)

        for member in doc.xpath(
                '//div[@id="allMembers"]/div[@class="memberInformation"]'):
            (photo_url,
             ) = member.xpath('.//a[text()="Print Quality Photo"]/@href')

            # "Senator Jane Doe (D)" -> name + single-letter party.
            (title_name_party,
             ) = member.xpath('.//span[@class="memberName"]/text()')
            (name, party) = re.search(
                r'^(?:Senator|Representative)\s(.+)\s\(([RD])\)$',
                title_name_party).groups()
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democratic"

            # The district link appears twice; they must agree.
            (
                district_name,
                _district_name,
            ) = member.xpath(
                './/a[contains(text(), " Legislative District")]/text()')
            assert district_name == _district_name
            district_num = re.search(r'(\d{1,2})\w{2} Legislative District',
                                     district_name).group(1)

            leg = Legislator(full_name=name,
                             term=term,
                             chamber=chamber,
                             district=district_num,
                             party=party,
                             photo_url=photo_url)
            leg['url'] = member.xpath(
                './/a[contains(text(), "Home Page")]/@href')[0]

            capitol_office = member.xpath(
                './/div[@class="memberColumnTitle" and text()=" Olympia Office"]/parent::div[1]/text()'
            )
            capitol_office = [l.strip() for l in capitol_office if l.strip()]

            capitol_fax = None
            capitol_phone = None
            capitol_address = None

            # Can't capture any information anyway if office data is empty,
            # so we can skip if that's the case.
            if capitol_office:
                # Retrieve capitol office fax number.
                if capitol_office[-1].startswith('Fax: '):
                    capitol_fax = capitol_office.pop().replace('Fax: ', "")

                # Retrieve capitol office phone number.
                capitol_phone = capitol_office.pop()

                # Retrieve capitol office address.
                capitol_address = '\n'.join(capitol_office)

            # Retrieve the member's position from the email link. We need it
            # to find the member's email address. These positions are enough
            # to discriminate the chamber too (0 = upper, 1,2 = lower).
            email_link_url = member.xpath(
                './/a[contains(@href, "memberEmail")]')[0].get('href')
            # FIX: the previous pattern r'/([[0-9]+)$' had a stray '[' inside
            # the character class, which also matched a literal '['.
            position = re.search(r'/([0-9]+)$', email_link_url).group(1)

            # Need to get the email from the email page by matching with the
            # member's district and position.
            email = self.get_node(
                email_doc,
                './/tr/td/a[contains(@href, "memberEmail/{}/{}")]/parent::td/'
                'following-sibling::td[1]/text()'.format(
                    district_num, position)).strip()

            leg.add_office('capitol',
                           'Capitol Office',
                           address=capitol_address,
                           phone=capitol_phone,
                           email=email,
                           fax=capitol_fax)

            _has_district_office = member.xpath(
                './/div[@class="memberColumnTitle" and text()=" District Office"]'
            )
            if _has_district_office:
                # Out of both chambers, only one member has multiple district
                # offices, so ignore that.  Also ignore the few members who
                # have separate mailing addresses.
                district_office = member.xpath(
                    './/div[@class="memberColumnTitle" and text()=" District Office"]/parent::div[1]/text()'
                )
                district_office = [
                    l.strip() for l in district_office if l.strip()
                ]
                _end_of_first_address = district_office.index([
                    l for l in district_office
                    if re.search(r'\,\s*WA\s*\d{5}', l)
                ][0])
                district_address = '\n'.join(
                    district_office[0:(_end_of_first_address + 1)])
                # FIX: initialize before the try; the IndexError path
                # previously left the name unbound and add_office below
                # raised NameError for members with no district phone line.
                district_phone = None
                try:
                    district_phone = district_office[(_end_of_first_address +
                                                      1)]
                    assert re.match(r'\(\d{3}\) \d{3} \- \d{4}',
                                    district_phone)
                except IndexError:
                    pass
                except AssertionError:
                    # Keep the non-matching value, as before.
                    pass

                leg.add_office('district',
                               'District Office',
                               address=district_address,
                               phone=district_phone)

            leg.add_source(index_url)

            self.save_legislator(leg)
Beispiel #43
0
    def scrape(self, term, chambers):
        """Scrape New Hampshire legislators from the asterisk-delimited
        Members.txt dump, one Legislator per non-blank line."""
        url = 'http://gencourt.state.nh.us/downloads/Members.txt'

        # Map "Last, First" -> member code; used to build lower-chamber URLs.
        lookup_html = self.urlopen('http://www.gencourt.state.nh.us/house/members/memberlookup.aspx')
        lookup_doc = lxml.html.fromstring(lookup_html)
        option_map = {opt.text: opt.get('value')
                      for opt in lookup_doc.xpath('//option')}

        for line in self.urlopen(url).splitlines():
            if not line.strip():
                continue

            (chamber, fullname, last, first, middle, county, district_num,
             seat, party, street, street2, city, astate, zipcode,
             home_phone, office_phone, fax, email, com1, com2, com3,
             com4, com5, com6, com7) = line.split('*')

            chamber = chamber_map[chamber]

            # Skip legislators from a chamber we aren't scraping.
            if chamber not in chambers:
                continue

            name_parts = [first, middle, last] if middle else [first, last]
            full = ' '.join(name_parts)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            # District numbers are zero-padded in the dump; normalize, and
            # prefix the county name for house districts that carry one.
            district = str(int(district_num))
            if county:
                district = '%s %s' % (county, district)

            leg = Legislator(term, chamber, district, full, first, last,
                             middle, party_map[party], email=email)
            leg.add_office('district', 'Home Address',
                           address=address, phone=home_phone or None)
            leg.add_office('district', 'Office Address',
                           phone=office_phone or None, fax=fax or None)

            if chamber == 'upper':
                leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num)
            elif chamber == 'lower':
                code = option_map.get('{0}, {1}'.format(last, first))
                if code:
                    leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code

            # Re-uppercase roman numerals that str.title() lowercased
            # (e.g. "Ways And Means Ii" -> "... II").
            romans = r'(?i)\s([IXV]+)(?:\s|$)'
            for com in (com1, com2, com3, com4, com5, com6, com7):
                com = com.strip('"')
                if not com:
                    continue
                com_name = re.sub(romans, lambda m: m.group().upper(),
                                  com.title())
                leg.add_role('committee member', term=term,
                             chamber=chamber, committee=com_name)

            if 'url' in leg:
                leg['photo_url'] = self.get_photo(leg['url'], chamber)

            leg.add_source(url)
            self.save_legislator(leg)
Beispiel #44
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape a single legislator's detail page and save the record.

        Fixes over the previous version: the bare ``except`` around the
        email lookup now catches only ``IndexError``; the whitespace
        regex is a raw string; the lookahead variable no longer shadows
        the ``next`` builtin; and an unrecognized office heading gets an
        explicit default instead of reusing a stale/unbound ``otype``.
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
        full_name = root.xpath('//h1/span')[0].tail.strip()

        try:
            email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
            email = email.replace('mailto:', '')
        except IndexError:
            # No mailto link on the page -- the seat may be vacant.
            email = ''
            self.info("seat may be vacant")

        party, district = root.xpath('//h1/span')[1].text.split('-')
        party = party.strip()
        district = clean_district(district.strip())

        if party in ('D', 'Democrat', 'Democratic'):
            party = 'Democratic'
        elif party in ('R', 'Republican'):
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url)
        leg.add_source(member_url)

        # Attach the email to the first office only; if no office rows are
        # found at all, fall back to a bare capitol office holding it.
        email_stored = not email

        for addr in root.xpath('//address/div[@class="contactGroup"]'):
            office_name = addr.xpath('../preceding-sibling::h4/text()'
                                     )[0].strip()
            address = addr.xpath('a')[0].text_content()
            address = re.sub(r'\s{2,}', '\n', address)

            # "Phone:"/"Fax:" labels precede their values, so remember
            # which kind of value the following row holds.
            phone = fax = expecting = None
            for phonerow in addr.xpath('./div/div'):
                phonerow = phonerow.text_content().strip()
                if phonerow == 'Phone:':
                    expecting = 'phone'
                elif phonerow == 'Fax:':
                    expecting = 'fax'
                elif expecting == 'phone':
                    phone = phonerow
                    expecting = None
                elif expecting == 'fax':
                    fax = phonerow
                    expecting = None
                else:
                    self.warning('unknown phonerow %s', phonerow)

            # All pieces collected; classify the office type.
            if 'District' in office_name:
                otype = 'district'
            elif 'State' in office_name:
                otype = 'capitol'
            else:
                # Unrecognized heading: default rather than reusing a
                # stale value from a previous iteration (or NameError).
                otype = 'capitol'

            if not email_stored:
                email_stored = True
                leg.add_office(otype, office_name, phone=phone, fax=fax,
                               address=address, email=email)
            else:
                leg.add_office(otype, office_name, phone=phone, fax=fax,
                               address=address)

        if not email_stored:
            leg.add_office('capitol', 'Capitol Office', email=email)

        self.save_legislator(leg)
Beispiel #45
0
    def scrape(self, chamber, term):
        """Scrape Vermont legislators from the member-data export.

        What Vermont claims are Word and Excel files are actually just
        HTML tables, and what it claims is a CSV file is one row of
        comma-separated values followed by a ColdFusion error -- so we
        parse the "Word" export as HTML.
        """
        url = ("http://www.leg.state.vt.us/legdir/"
               "memberdata.cfm/memberdata.doc?FileType=W")

        doc = lxml.html.fromstring(self.urlopen(url))

        for row in doc.xpath("//tr")[1:]:
            # Helper: string value of the n-th <td> in this row.
            cell = lambda n: row.xpath("string(td[%d])" % n)

            # Skip rows belonging to the other chamber.
            row_chamber = cell(4)
            if (row_chamber == 'S' and chamber == 'lower') or \
                    (row_chamber == 'H' and chamber == 'upper'):
                continue

            district = cell(7).replace('District', '').strip()
            if not district:
                continue

            first_name = cell(8)
            middle_name = cell(9)
            last_name = cell(10)

            # Some first-name cells repeat the middle initial; drop it.
            middle_suffix = " %s." % middle_name
            if first_name.endswith(middle_suffix):
                first_name = first_name.split(middle_suffix)[0]

            if middle_name:
                full_name = "%s %s. %s" % (first_name, middle_name, last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)

            email = cell(11)

            # Fusion candidates are listed like "Democratic/Republican";
            # pick a primary party and record the rest as other_parties.
            party = re.sub(r'Democrat\b', 'Democratic', cell(6))
            parties = party.split('/')
            if 'Republican' in parties:
                if 'Democratic' not in parties:
                    party = 'Republican'
                    parties.remove('Republican')
                # else: keep the combined label for R/D fusion candidates
            elif 'Democratic' in parties:
                party = 'Democratic'
                parties.remove('Democratic')
            else:
                party = parties.pop(0)

            leg = Legislator(
                term,
                chamber,
                district,
                full_name,
                first_name=first_name,
                middle_name=middle_name,
                last_name=last_name,
                party=party,
                email=email,
                # closest thing we have to a page for legislators, not ideal
                url='http://www.leg.state.vt.us/legdir/LegDirMain.cfm')
            leg['roles'][0]['other_parties'] = parties
            leg.add_source(url)

            # 12-16: MailingAddress: 1,2,City,State,ZIP
            mail = '%s\n%s\n%s, %s %s' % (
                cell(12), cell(13), cell(14), cell(15), cell(16))
            leg.add_office('district', 'Mailing Address', address=mail)

            # 17-21: HomeAddress: 1,2,City,State,ZIP, Email, Phone
            home = '%s\n%s\n%s, %s %s' % (
                cell(17), cell(18), cell(19), cell(20), cell(21))
            leg.add_office('district',
                           'Home Address',
                           address=home,
                           email=cell(22) or None,
                           phone=cell(23) or None)

            self.save_legislator(leg)
Beispiel #46
0
    def fetch_member(self, url, name, term, chamber):
        """Scrape one Virginia legislator's detail page and save them.

        Fixes over the previous version: the office type/name were being
        recomputed inside the per-line loop using list membership
        (``'Capitol Square' in address`` on a *list* never matches a
        partial line, and left ``type``/``name`` unbound for empty
        lists); the email prefix was removed with ``str.strip`` (a
        character-set operation that can also truncate the *end* of the
        address); and the builtin ``type`` / the ``name`` parameter are
        no longer shadowed.
        """
        if name in CHAMBER_MOVES:
            if chamber != CHAMBER_MOVES[name]:
                return  # Skip bad chambers.

        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            name = name.rsplit('-')[0]
            if action == 'Resigned':
                pass  # TODO: set end date
            elif action == 'Member':
                pass  # TODO: set start date

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(party_district_line).groups()

        leg = Legislator(term,
                         chamber,
                         district,
                         name.strip(),
                         party=party_map[party],
                         url=url)
        leg.add_source(url)

        for ul in doc.xpath('//ul[@class="linkNon"]'):
            address = []
            phone = None
            email = None
            for li in ul.getchildren():
                text = li.text_content()
                if re.match(r'\(\d{3}\)', text):
                    phone = text
                elif text.startswith('email:'):
                    # Slice off the literal prefix instead of strip(),
                    # which would also eat trailing e/m/a/i/l characters.
                    email = text[len('email:'):].strip()
                else:
                    address.append(text)

            # Classify the office once all lines are collected: any line
            # mentioning Capitol Square is the capitol office.
            office_type = ('capitol'
                           if any('Capitol Square' in line for line in address)
                           else 'district')
            office_name = ('Capitol Office'
                           if office_type == 'capitol' else 'District Office')
            leg.add_office(office_type,
                           office_name,
                           address='\n'.join(address),
                           phone=phone,
                           email=email)

        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member',
                         term=term,
                         chamber=chamber,
                         committee=com)

        self.save_legislator(leg)
Beispiel #47
0
    def scrape(self, chamber, term):
        """Scrape Wyoming legislators: walk the chamber roster list, then
        each member's detail page for a photo and contact information.

        Fixes over the previous version: a dead first assignment to
        ``email_address`` (immediately overwritten) is removed; phone/fax
        prefixes are sliced off instead of removed with ``str.strip``
        (a character-set operation, not a prefix remover); and the cell
        number is actually captured into ``cell`` (it was declared but
        never assigned, so ``cell=None`` was always passed).
        """
        chamber_abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

        url = ("http://legisweb.state.wy.us/LegislatorSummary/LegislatorList"
               ".aspx?strHouse=%s&strStatus=N" % chamber_abbrev)
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'LegDetail')]"):
            name = link.text.strip()
            leg_url = link.get('href')

            # The mailto link lives in the second cell of the member's row.
            email_address = link.xpath("../../../td[2]//a")[0].attrib['href']
            email_address = email_address.split('Mailto:')[1]

            party = link.xpath("string(../../../td[3])").strip()
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            district = link.xpath("string(../../../td[4])").strip().lstrip(
                'HS0')

            leg_page = lxml.html.fromstring(self.urlopen(leg_url))
            leg_page.make_links_absolute(leg_url)
            img = leg_page.xpath(
                "//img[contains(@src, 'LegislatorSummary/photos')]")[0]
            photo_url = img.attrib['src']

            office_tds = leg_page.xpath(
                '//table[@id="ctl00_cphContent_tblContact"]/tr/td/text()')
            address = []
            phone = None
            cell = None
            fax = None

            for td in office_tds:
                if td.startswith('Home -'):
                    phone = td[len('Home -'):].strip()

                if td.startswith('Cell -'):
                    cell = td[len('Cell -'):].strip()
                    # Fall back to the cell number when no home phone yet.
                    if not phone:
                        phone = cell

                if td.startswith('Fax -'):
                    fax = td[len('Fax -'):].strip()

                elif ' - ' not in td:
                    # Unlabeled lines are part of the mailing address.
                    address.append(td)

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             email=email_address,
                             photo_url=photo_url,
                             url=leg_url)

            adr = " ".join(address)
            if adr.strip() != "":
                leg.add_office('district',
                               'Contact Information',
                               cell=cell,
                               address=adr,
                               phone=phone,
                               fax=fax)

            leg.add_source(url)
            leg.add_source(leg_url)

            self.save_legislator(leg)
Beispiel #48
0
    def scrape(self, chamber, term):
        """Scrape Arizona legislators for one chamber of the given term.

        Resolves the term to a session id, fetches the member roster
        table for the chamber, and saves a Legislator per non-vacant row.
        """
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod(session)

        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
            session_id, body)
        with self.urlopen(url) as page:
            root = html.fromstring(page)
            path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
            # [1:] skips the header row.
            roster = root.xpath(path)[1:]
            for row in roster:
                position = ''
                vacated = ''
                name, district, party, email, room, phone, fax = row.xpath(
                    'td')

                if email.attrib.get('class') == 'vacantmember':
                    continue  # Skip any vacant members.

                link = name.xpath('string(a/@href)')
                link = "http://www.azleg.gov" + link
                if len(name) == 1:
                    name = name.text_content().strip()
                else:
                    # Extra children in the name cell mean a leadership
                    # position follows the name link.
                    position = name.tail.strip()
                    name = name[0].text_content().strip()

                district = district.text_content()
                party = party.text_content().strip()
                email = email.text_content().strip()

                if ('Vacated' in email or 'Resigned' in email
                        or 'Removed' in email):
                    # comment out the following 'continue' for historical
                    # legislative sessions
                    # for the current session, if a legislator has left we will
                    # skip him/her to keep from overwriting their information
                    continue
                    # NOTE: the two lines below are intentionally
                    # unreachable; they only take effect when the
                    # 'continue' above is commented out for historical
                    # scrapes (they extract the vacancy date).
                    vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
                    email = ''

                party = self.get_party(party)
                room = room.text_content().strip()
                if chamber == 'lower':
                    address = "House of Representatives\n"
                else:
                    address = "Senate\n"
                address = address + "1700 West Washington\n Room " + room  \
                                  + "\nPhoenix, AZ 85007"

                # The roster sometimes omits the 602 area code.
                phone = phone.text_content().strip()
                if not phone.startswith('602'):
                    phone = "602-" + phone
                fax = fax.text_content().strip()
                if not fax.startswith('602'):
                    fax = "602-" + fax
                if vacated:
                    # Departed member (historical scrape only): record an
                    # end date on their role and skip the email.
                    end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     full_name=name,
                                     party=party,
                                     url=link)
                    leg['roles'][0]['end_date'] = end_date
                else:
                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     full_name=name,
                                     party=party,
                                     email=email,
                                     url=link)

                leg.add_office('capitol',
                               'Capitol Office',
                               address=address,
                               phone=phone,
                               fax=fax)

                if position:
                    leg.add_role(position,
                                 term,
                                 chamber=chamber,
                                 district=district,
                                 party=party)

                leg.add_source(url)

                #Probably just get this from the committee scraper
                #self.scrape_member_page(link, session, chamber, leg)
                self.save_legislator(leg)
Beispiel #49
0
    def scrape(self, chamber, term):
        """Scrape Washington legislators from the sponsor web service,
        augmented with a photo and office list from each member's page.

        Fixes over the previous version: ``photo_url`` is initialized
        before the try block (it was only assigned inside ``if
        photo_link:`` or the ``except`` handler, so a page that loaded
        but had no photo link raised ``NameError`` at the Legislator
        call), and the address lines are filtered with a list
        comprehension so ``.pop()`` works regardless of whether
        ``filter`` returns a list (Py2) or an iterator (Py3).
        """
        biennium = "%s-%s" % (term[0:4], term[7:9])

        url = ("http://wslwebservices.leg.wa.gov/SponsorService.asmx/"
               "GetSponsors?biennium=%s" % biennium)

        # these pages are useful for checking if a leg is still in office
        if chamber == 'upper':
            cur_members = self.urlopen(
                'http://www.leg.wa.gov/senate/senators/Pages/default.aspx')
        else:
            cur_members = self.urlopen(
                'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'
            )

        page = self.urlopen(url)
        page = lxml.etree.fromstring(page.bytes)

        for member in xpath(page, "//wa:Member"):

            mchamber = xpath(member, "string(wa:Agency)")
            mchamber = {'House': 'lower', 'Senate': 'upper'}[mchamber]

            if mchamber != chamber:
                continue

            name = xpath(member, "string(wa:Name)").strip()

            # if the legislator isn't in the listing, skip them
            if name not in cur_members:
                self.warning('%s is no longer in office' % name)
                continue

            party = xpath(member, "string(wa:Party)")
            party = {'R': 'Republican', 'D': 'Democratic'}.get(party, party)

            district = xpath(member, "string(wa:District)")
            if district == '0':
                # Skip phony district 0.
                continue

            email = xpath(member, "string(wa:Email)")
            leg_id = xpath(member, "string(wa:Id)")
            phone = xpath(member, "string(wa:Phone)")

            last = xpath(member, "string(wa:LastName)")
            last = last.lower().replace(' ', '')

            if chamber == 'upper':
                leg_url = ("http://www.leg.wa.gov/senate/senators/"
                           "Pages/%s.aspx" % last)
            else:
                leg_url = ("http://www.leg.wa.gov/house/"
                           "representatives/Pages/%s.aspx" % last)
            scraped_offices = []
            # Default before the try so a page with no photo link (or a
            # 404 below) never leaves the name unbound at Legislator().
            photo_url = ''

            try:
                leg_page = self.urlopen(leg_url)
                leg_page = lxml.html.fromstring(leg_page)
                leg_page.make_links_absolute(leg_url)

                photo_link = leg_page.xpath(
                    "//a[contains(@href, 'publishingimages')]")
                if photo_link:
                    photo_url = photo_link[0].attrib['href']
                offices = leg_page.xpath(
                    "//table[@cellspacing='0']/tr/td/b[contains(text(), 'Office')]"
                )
                for office in offices:
                    office_block = office.getparent()
                    office_name = office.text_content().strip().rstrip(":")
                    # <br>-separated lines: the last is the phone number,
                    # the rest form the address.
                    address_lines = [
                        x.tail for x in office_block.xpath(".//br")
                        if x.tail is not None
                    ]
                    phone = address_lines.pop()
                    address = "\n".join(address_lines)
                    obj = {"name": office_name, "phone": phone}
                    if address.strip() != '':
                        obj['address'] = address

                    scraped_offices.append(obj)
            except scrapelib.HTTPError:
                # Sometimes the API and website are out of sync
                # with respect to legislator resignations/appointments
                photo_url = ''

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             '',
                             '',
                             '',
                             party,
                             _code=leg_id,
                             photo_url=photo_url,
                             url=leg_url)
            leg.add_source(leg_url)

            for office in scraped_offices:
                typ = 'district' if 'District' in office['name'] else 'capitol'
                leg.add_office(typ, office.pop('name'), **office)

            self.save_legislator(leg)
Beispiel #50
0
    def _scrape_individual_legislator_page(self, url, term, chamber, district=None):
        """Scrape a specific lower house legislators page. The function will actually
        call one of three functions as there is 2 different bio templates and a completely
        separate one for the speaker of the house.

        Example url: http://www1.legis.ga.gov/legis/2009_10/house/bios/abdulsalaamRoberta/abdulsalaamRoberta.htm

        Fixes over the previous version: the capitol office is given the
        joined ``address`` string (the raw ``address_info`` list was
        being passed while ``address`` went unused), and the bare
        ``except:`` clauses are narrowed to the index/attribute errors
        they were guarding against.
        """
        if 'speaker/index.htm' in url:
            return self._scrape_speaker_of_the_house(url, term, chamber)

        with self.lxml_context(url) as page:
            # page == None == 404
            if page is None:
                return None

            page.make_links_absolute(url)

            # first check to see if this is the 'original' template or the new one
            stylesheet_path = '//link[@rel="stylesheet"]'
            stylesheets = page.xpath(stylesheet_path)

            for style_sheet in stylesheets:
                if 'legis.ga.gov.house.factsheet.css' in style_sheet.get('href') or \
                   'legis.ga.gov.house.bio.css' in style_sheet.get('href'):
                    return self._scrape_individual_legislator_page_second_template(page, term, chamber, district=district)

            path = '//table[@id="hoverTable"]/tr'
            legislator_info = page.xpath(path)

            # There is one page, "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" that has
            # malformed HTML, going to manually do that one:
            if "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" in url:
                legislator = Legislator(term,
                                        chamber,
                                        district,
                                        '"Coach" Williams',
                                        party="Democratic",
                                        url=url)
                legislator.add_source(url)
                return legislator

            # See if we got to the first row, some templates don't start with their table as 'hoverTable'
            # in this case let's just get the first table on the page as that is seeming to work well.
            if not legislator_info:
                path = '//table'
                tables = page.xpath(path)
                legislator_info = tables[0].getchildren()
            first_row = legislator_info[0]

            td_elements = first_row.getchildren()[0]
            name = td_elements[0].text_content().split('\n')[0].strip()
            party = td_elements[1].text_content().strip()[0:1].upper()
            # There was some cases where the party wasn't in a <p> it was after the
            # <h2>name</h2> foo <br />, seriously wtf
            if party not in self.PARTY_DICT:
                elements = td_elements.text_content().split('\n')
                for ele in elements:
                    ele = ele.strip()
                    if " - " in ele:
                        party = ele[0:1]
                        break
                    elif ele.upper() == 'REPUBLICAN':
                        party = 'R'
                        break
                    elif ele.upper() == 'DEMOCRAT':
                        party = 'D'
                        break
                if party == '':
                    party = td_elements.text_content().split('\n')[1].strip()[0:1]

            if not district:
                if len(td_elements) < 3 or "District" not in td_elements[2].text_content():
                    text_content = first_row[1].text_content().split('\n')
                    district = text_content[0].strip()[len("District "):]
                else:
                    district = td_elements[2].text_content().strip()[len("District "):]

            # Not every legislator has a sworn in date or facebook url, so attempt to parse
            # and just pass if it fails
            sworn_in = None
            try:
                sworn_in = td_elements[4].text_content().strip()[len("Sworn in "):]
            except (IndexError, AttributeError):
                pass

            facebook_url = ''
            try:
                facebook_url = td_elements[5].get('href')
            except (IndexError, AttributeError):
                pass

            photo_url = ''
            try:
                td_elements = first_row.getchildren()[1]
                photo_url = td_elements[0].getchildren()[0].get('src') or ''
            except (IndexError, AttributeError):
                pass

            # Second row: mailing address lines followed by the phone number.
            second_row = legislator_info[1]
            address_info = second_row.getchildren()[0].text_content().split("<br />")[0].split("\n")
            phone_number = address_info.pop()
            address = " ".join(address_info)

            email = ''
            try:
                text_content = second_row.text_content().split('\n')
                for content in text_content:
                    if '@' in content.strip():
                        email = content.strip()
            except IndexError:
                try:
                    email = second_row.getchildren()[1].getchildren()[0].text_content()
                except (IndexError, AttributeError):
                    pass

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    name,
                                    party=self.PARTY_DICT[party],
                                    email=email,
                                    photo_url=photo_url,
                                    facebook_url=facebook_url,
                                    sworn_in_date=sworn_in,
                                    url=url)
            # Pass the joined address string, not the raw line list.
            legislator.add_office('capitol', 'Capitol Address', address=address,
                                  phone=phone_number)
            legislator.add_source(url)
            return legislator
Beispiel #51
0
    def scrape(self, chamber, term):
        """Scrape Tennessee legislators for one chamber of the given term.

        Reads the chamber's member roster table (current or archived,
        depending on the term), then each member's detail page for the
        full name, and saves a Legislator per non-vacant row.
        """
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican',
            'I': 'Independent'
        }

        #testing for chamber
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'
        # Older terms live under an /archives/ path keyed by General Assembly.
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name
            chamber_url += '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        page = self.get(chamber_url).text
        page = lxml.html.fromstring(page)

        for row in page.xpath("//tr"):

            # Skip any a header row.
            if set(child.tag for child in row) == set(['th']):
                continue

            vacancy_check = row.xpath('./td/text()')[1]
            if 'Vacant' in vacancy_check:
                self.logger.warning("Vacant Seat")
                continue

            partyInit = row.xpath('td[3]')[0].text.split()[0]
            party = parties[partyInit]
            district = row.xpath('td[5]/a')[0].text.split()[1]
            address = row.xpath('td[6]')[0].text_content()
            # 301 6th Avenue North Suite
            # Expand the building abbreviations into full mailing addresses.
            address = address.replace(
                'LP', 'Legislative Plaza\nNashville, TN 37243')
            address = address.replace(
                'WMB', 'War Memorial Building\nNashville, TN 37243')
            address = '301 6th Avenue North\nSuite ' + address
            phone = [
                x.strip() for x in row.xpath('td[7]//text()') if x.strip()
            ][0]

            email = HTMLParser.HTMLParser().unescape(
                row.xpath('td[1]/a/@href')[0][len("mailto:"):])
            member_url = (root_url + url_chamber_name + '/members/' + abbr +
                          district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district + '.jpg')

            # Some scraper backends don't accept follow_redirects and
            # raise TypeError; retry without it, and skip members whose
            # detail page 404s.
            try:
                member_page = self.get(member_url, follow_redirects=False).text
            except TypeError:
                try:
                    member_page = self.get(member_url).text
                except HTTPError:
                    self.logger.warning("page doesn't exist")
                    continue
            member_page = lxml.html.fromstring(member_page)
            try:
                name = member_page.xpath('body/div/div/h1/text()')[0]
            except IndexError:
                # Alternate page template places the name in #membertitle.
                name = member_page.xpath(
                    '//div[@id="membertitle"]/h2/text()')[0]

            # Strip the honorific prefix from the displayed name.
            if 'Speaker' in name:
                full_name = name[8:len(name)]
            elif 'Lt.' in name:
                full_name = name[13:len(name)]
            elif abbr == 'h':
                full_name = name[len("Representative "):len(name)]
            else:
                full_name = name[8:len(name)]

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name.strip(),
                             party=party,
                             url=member_url,
                             photo_url=member_photo_url)
            leg.add_source(chamber_url)
            leg.add_source(member_url)

            # TODO: add district address from this page

            leg.add_office('capitol',
                           'Nashville Address',
                           address=address,
                           phone=phone,
                           email=email)

            self.save_legislator(leg)
Beispiel #52
0
    def scrape_reps(self, chamber, term_name):
        """Scrape Maine House members from the district membership roster.

        Parses each district paragraph on the roster page, then visits the
        member's personal page for a photo and contact details, and saves
        one Legislator per occupied district.
        """
        url = 'http://www.maine.gov/legis/house/dist_mem.htm'
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Link text looks like "Representative Jane Doe (D-Portland)".
        # Compile the pattern once here instead of re-parsing it on every
        # loop iteration.
        info_re = re.compile(
            r'''
            Representative\s
            (?P<member_name>.+?)
            \s\(
            (?P<party>[DRUI])
            -
            (?P<district_name>.+?)
            \)
            ''', re.VERBOSE)

        # These do not include the non-voting tribal representatives
        # They do not have numbered districts, and lack a good deal of
        # the standard profile information about representatives
        districts = [
            x for x in page.xpath('/html/body/p') if len(x.xpath('a')) == 3
        ]
        for district in districts:
            if "- Vacant" in district.text_content():
                self.warning("District is vacant: '{}'".format(
                    district.text_content()))
                continue

            district_number = district.xpath('a[1]/@name')[0]

            leg_url = district.xpath('a[3]/@href')[0]
            leg_info = district.xpath('a[3]/text()')[0]

            info_search = info_re.search(leg_info)
            if info_search is None:
                # Previously unguarded: .group() on None raised
                # AttributeError. Log and skip unparseable link text.
                self.warning(
                    "Could not parse legislator link text {!r}".format(
                        leg_info))
                continue

            member_name = info_search.group('member_name')
            party = _party_map[info_search.group('party')]
            district_name = info_search.group('district_name')

            leg = Legislator(term_name,
                             chamber,
                             str(district_number),
                             member_name,
                             party=party,
                             url=leg_url,
                             district_name=district_name)
            leg.add_source(url)
            leg.add_source(leg_url)

            # Get the photo url.
            html = self.get(leg_url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            (photo_url, ) = doc.xpath('//img[contains(@src, ".jpg")]/@src')
            leg['photo_url'] = photo_url

            # Add contact information from personal page
            office_address = re.search(r'<B>Address:  </B>(.+?)<P>', html,
                                       re.IGNORECASE).group(1)

            office_email = doc.xpath(
                '//a[starts-with(@href, "mailto:")]/text()')
            office_email = office_email[0] if office_email else None

            business_phone = re.search(r'<B>Business Telephone:  </B>(.+?)<P>',
                                       html, re.IGNORECASE)
            home_phone = re.search(r'<B>Home Telephone:  </B>(.+?)<P>', html,
                                   re.IGNORECASE)
            cell_phone = re.search(r'<B>Cell Telephone:  </B>(.+?)<P>', html,
                                   re.IGNORECASE)

            # Prefer business, then home, then cell phone.
            if business_phone:
                office_phone = business_phone.group(1)
            elif home_phone:
                office_phone = home_phone.group(1)
            elif cell_phone:
                office_phone = cell_phone.group(1)
            else:
                office_phone = None

            district_office = {
                'name': "District Office",
                'type': "district",
                'address': office_address,
                'fax': None,
                'email': office_email,
                'phone': office_phone
            }
            leg.add_office(**district_office)

            # Add state party office to member's addresses
            if party == "Democratic":
                DEM_PARTY_OFFICE = dict(name='House Democratic Office',
                                        type='capitol',
                                        address='\n'.join([
                                            'Room 333, State House',
                                            '2 State House Station',
                                            'Augusta, Maine 04333-0002'
                                        ]),
                                        fax=None,
                                        email=None,
                                        phone='(207) 287-1430')
                leg.add_office(**DEM_PARTY_OFFICE)
            elif party == "Republican":
                REP_PARTY_OFFICE = dict(name='House GOP Office',
                                        type='capitol',
                                        address='\n'.join([
                                            'Room 332, State House',
                                            '2 State House Station',
                                            'Augusta, Maine 04333-0002'
                                        ]),
                                        fax=None,
                                        email=None,
                                        phone='(207) 287-1440')
                leg.add_office(**REP_PARTY_OFFICE)

            # Save legislator
            self.save_legislator(leg)
Beispiel #53
0
    def scrape_legislator_page(self, term, url):
        """Scrape a single legislator's profile page.

        Pulls name, district, party, photo, address, and the labeled
        contact fields, then builds and saves the Legislator.
        """
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        name = page.xpath("//h1[@id='page-title']/text()")[0]
        district = page.xpath("//a[contains(@href, 'district')]/text()")[0]
        district = district.replace("District", "").strip()

        # Committee links are collected but only used by the commented-out
        # role code below.
        committees = page.xpath("//a[contains(@href, 'committees')]/text()")

        party = page.xpath("//div[contains(text(), 'Political Party')]"
                           )[0].getnext().text_content().strip()

        photo = page.xpath("//div[@class='field-person-photo']/img/@src")
        photo = photo[0] if len(photo) else None

        address = page.xpath("//div[@class='adr']")[0]
        # Raw string: "\s" in a plain literal is an invalid escape sequence
        # (DeprecationWarning on modern Pythons).
        address = re.sub(r"\s+", " ", address.text_content()).strip()

        # Maps the page's field labels to our internal metadata keys.
        item_mapping = {
            "email": "email",
            "home telephone": "home-telephone",
            "cellphone": "cellphone",
            "office telephone": "office-telephone",
            "political party": "party",
            "chamber": "chamber",
            "fax": "fax"
        }
        metainf = {}

        for block in page.xpath(
                "//div[contains(@class, 'field-label-inline')]"):
            label, items = block.xpath("./*")
            key = label.text_content().strip().lower()
            if key.endswith(":"):
                key = key[:-1]

            metainf[item_mapping[key]] = items.text_content().strip()

        chamber = {"Senate": "upper", "House": "lower"}[metainf['chamber']]

        kwargs = {
            "party": {
                "Democrat": "Democratic",
                "Republican": "Republican"
            }[metainf['party']]
        }
        if photo:
            kwargs['photo_url'] = photo

        leg = Legislator(term, chamber, district, name, **kwargs)

        # Re-use kwargs for the office; only copy over the contact fields
        # that were actually present on the page.
        kwargs = {"address": address, "url": url}

        for key, leg_key in [
            ('email', 'email'),
            ('home-telephone', 'home_phone'),
            ('cellphone', 'cellphone'),
            ('fax', 'fax'),
            ('office-telephone', 'office_phone'),
        ]:
            if key in metainf:
                kwargs[leg_key] = metainf[key]

        leg.add_office('district', 'District Office', **kwargs)

        #for committee in committees:
        #    leg.add_role('committee member',
        #                 term=term,
        #                 chamber=chamber,
        #                 committee=committee)

        leg.add_source(url)
        self.save_legislator(leg)
Beispiel #54
0
    def scrape_legislator(self, chamber, term, url):
        """Scrape one legislator's detail page (ASP.NET FormView layout).

        Each property maps to one or more candidate element ids; missing
        required properties abort the page with a warning.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # most properties are easy to pull; a list value means several
        # candidate element ids to try in order
        optional = ["home_phone"]
        properties = {
            'start_year': 'lblStartYear',
            'district': "linkDistrict",
            'occupation': "lblOccupation",
            'header': "lblHeader",
            'addr_street': "lblAddress",
            'office_phone': ["lblCapitolPhone", "lblOfficePhone"],
            'home_phone': "lblHomePhone",
        }

        for key, value in properties.iteritems():
            if isinstance(value, list):
                values = value
            else:
                values = [value]

            found = False
            # Initialize val so a missing first id doesn't raise
            # NameError at the "if val" check below.
            val = None
            for value in values:
                id_ = 'ctl00_mainCopy_formViewLegislator_%s' % value
                try:
                    val = "\n".join(doc.get_element_by_id(id_).itertext())
                    found = True
                except KeyError:
                    pass
                if val:
                    properties[key] = val.strip()
                else:
                    properties[key] = None

            if found is False and key not in optional:
                self.warning('bad legislator page %s missing %s' % (url, id_))
                return

        # image & email are a bit different
        properties['photo_url'] = doc.xpath(
            '//img[@id="ctl00_mainCopy_formViewLegislator_imgLegislator"]/@src'
        )[0]
        email = doc.get_element_by_id(
            'ctl00_mainCopy_formViewLegislator_linkEmail').text
        if email:
            properties['email'] = email.strip()

        properties['url'] = url

        properties['chamber'] = chamber
        properties['term'] = term

        # Header looks like "Name - (P)"; split from the right so names
        # containing hyphens survive.
        full_name, party = properties['header'].rsplit("-", 1)

        properties['full_name'] = full_name
        properties['party'] = party

        if '(D)' in properties['party']:
            properties['party'] = 'Democratic'
        elif '(R)' in properties['party']:
            properties['party'] = 'Republican'
        elif '(DTS)' in properties['party']:
            # decline to state = independent
            properties['party'] = 'Independent'
        else:
            raise Exception("unknown party encountered")

        address = properties.pop('addr_street')

        # Pop both phone keys so neither leaks into the Legislator kwargs
        # (the old short-circuit `pop(...) or pop(...)` left 'home_phone'
        # in properties whenever office_phone was set); prefer the office
        # number over the home number.
        office_phone = properties.pop('office_phone')
        home_phone = properties.pop('home_phone')
        phone = office_phone or home_phone

        leg = Legislator(**properties)
        leg.add_source(url)

        leg.add_office('district',
                       'District Address',
                       address=address,
                       phone=phone)

        # committees
        # skip first header row
        for row in doc.xpath(
                '//table[@id="ctl00_mainCopy_MembershipGrid"]/tr')[1:]:
            role, committee, note = [x.text_content() for x in row.xpath('td')]
            committee = committee.title()
            if 'Interim' in note:
                role = 'interim ' + role.lower()
            else:
                role = role.lower()
            leg.add_role('committee member',
                         term,
                         committee=committee,
                         position=role,
                         chamber=chamber)

        # Already have the photo url.
        try:
            del leg['image_url']
        except KeyError:
            pass

        self.save_legislator(leg)
    def scrape(self, term, chambers):
        """Scrape every Vermont legislator for *term* via the JSON dump."""
        year_slug = term[5:]

        # Load all members via the private API
        legislator_dump_url = (
            'http://legislature.vermont.gov/people/loadAll/{}'.format(
                year_slug))
        members = json.loads(self.get(legislator_dump_url).text)['data']

        for raw_info in members:
            # Every field arrives with stray whitespace; normalize once.
            info = {k: v.strip() for k, v in raw_info.iteritems()}

            # The member's own page carries the photo...
            member_url = (
                'http://legislature.vermont.gov/people/single/{}/{}'.format(
                    year_slug, info['PersonID']))
            page = self.lxmlize(member_url)
            (photo_url, ) = page.xpath('//img[@class="profile-photo"]/@src')

            # ...and the state email address, when one is listed.
            emails = page.xpath(
                '//dl[@class="summary-table profile-summary"]/'
                'dt[text()="Email"]/following-sibling::dd[1]/a/text()')
            if emails:
                (state_email, ) = emails
            else:
                state_email = None

            full_name = "{0} {1}".format(info['FirstName'], info['LastName'])
            district = info['District'].replace(" District", "")
            party = info['Party'].replace("Democrat", "Democratic")

            leg = Legislator(
                term=term,
                chamber=self.CHAMBERS[info['Title']],
                district=district,
                party=party,
                full_name=full_name,
                photo_url=photo_url)

            leg.add_office(
                type='capitol',
                name='Capitol Office',
                address=
                'Vermont State House\n115 State Street\nMontpelier, VT 05633',
                email=state_email)

            # District contact info comes straight from the JSON record.
            address2 = info['MailingAddress2']
            mailing_address = "{0}{1}\n{2}, {3} {4}".format(
                info['MailingAddress1'],
                ("\n" + address2 if address2.strip() else ""),
                info['MailingCity'], info['MailingState'],
                info['MailingZIP'])
            district_email = (info['Email'].strip()
                              or info['HomeEmail'].strip()
                              or info['WorkEmail'].strip() or None)

            leg.add_office(type='district',
                           name='District Office',
                           address=mailing_address,
                           phone=(info['HomePhone'].strip() or None),
                           email=district_email)

            leg.add_source(legislator_dump_url)
            leg.add_source(member_url)

            self.save_legislator(leg)
Beispiel #56
0
    def scrape(self, chamber, term):
        """Scrape Montana legislators from the session's member CSV.

        Looks up the session metadata for *term*, downloads the chamber's
        member file, augments each row with details from the member's page,
        and saves a Legislator per row.
        """
        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = [
            'last_name', 'first_name', 'party', 'district', 'address', 'city',
            'state', 'zip'
        ]
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(csv_parser)

        for entry in csv_parser:
            if not entry:
                continue

            # District arrives like "HD 12" or "SD 3"; pop it so it isn't
            # merged into the legislator below.
            hd_or_sd, district = entry.pop('district').split()

            # Map the party letter; pop it for the same reason (the old
            # code set entry['party'] and then immediately deleted it).
            party = {'D': 'Democratic', 'R': 'Republican'}[entry.pop('party')]

            # Get full name properly capped.
            fullname = '%s %s' % (entry['first_name'].capitalize(),
                                  entry['last_name'].capitalize())

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]

            # Get the office.
            address = '\n'.join([
                entry['address'],
                '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])
            ])

            office = dict(name='District Office',
                          type='district',
                          phone=None,
                          fax=None,
                          email=None,
                          address=address)

            try:
                deets = self._scrape_details(detail_url)
            except NoDetails:
                self.logger.warning("No details found at %r" % detail_url)
                continue

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            office['phone'] = deets.get('phone')
            office['fax'] = deets.get('fax')
            legislator.add_office(**office)

            self.save_legislator(legislator)
Beispiel #57
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape a Massachusetts member's profile page and office blocks."""
        page = self.urlopen(member_url)
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath(
            '//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath(
            '//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')

        district = root.xpath(
            '//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            district = district[0].text_content().strip()
            district = clean_district(district)
        else:
            self.logger.warning(
                'No district tab found for this hot garbage. Skipping.')
            return

        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]

        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term,
                         chamber,
                         district,
                         full_name,
                         party=party,
                         photo_url=photo_url,
                         url=member_url,
                         email=email)
        leg.add_source(member_url)

        # offices: each <dl class="address"> holds one office's name (dt)
        # and its address/phone/fax lines (dd).
        for dl in root.xpath('//dl[@class="address"]'):
            office_name = phone = fax = None
            address = []
            for child in dl.getchildren():
                text = child.text_content()
                if child.tag == 'dt':
                    office_name = text
                else:
                    # NOTE: str.strip('Phone: ') strips a *character set*,
                    # which can eat legitimate characters; slice off the
                    # label prefix instead.
                    if text.startswith('Phone:'):
                        phone = text[len('Phone:'):].strip() or None
                    elif text.startswith('Fax:'):
                        fax = text[len('Fax:'):].strip() or None
                    elif text.startswith('Email:'):
                        pass
                    else:
                        address.append(text)
            # all pieces collected; guard against a dl with no <dt>
            if office_name and 'District' in office_name:
                otype = 'district'
            else:
                otype = 'capitol'

            # Collapse internal whitespace and drop empty lines.
            address = [
                line for line in (re.sub(r'\s+', ' ', s).strip()
                                  for s in address) if line
            ]

            if address:
                leg.add_office(otype,
                               office_name,
                               phone=phone,
                               fax=fax,
                               address='\n'.join(address),
                               email=None)

        self.save_legislator(leg)
Beispiel #58
0
    def scrape(self, chamber, term):
        """Scrape Rhode Island legislators from the chamber's XLS roster.

        Downloads the spreadsheet for the chamber and cross-references
        each member against the chamber's web directory for a homepage URL.
        """
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator '
            source_url = 'http://www.rilin.state.ri.us/senators/default.aspx'
            source_url_title_replacement = rep_type
        elif chamber == 'lower':
            url = (
                'http://webserver.rilin.state.ri.us/Documents/Representatives.xls'
            )
            rep_type = 'Representative '
            source_url = 'http://www.rilin.state.ri.us/representatives/default.aspx'
            source_url_title_replacement = 'Rep. '

        self.urlretrieve(url, 'ri_leg.xls')

        wb = xlrd.open_workbook('ri_leg.xls')
        sh = wb.sheet_by_index(0)

        # This isn't perfect but it's cheap and better than using the
        # XLS doc as the source URL for all legislators.
        # 374: RI: legislator url
        leg_source_url_map = {}
        leg_page = lxml.html.fromstring(self.urlopen(source_url))
        leg_page.make_links_absolute(source_url)

        for link in leg_page.xpath('//td[@class="ms-vb2"]'):
            leg_name = link.text_content().replace(
                source_url_title_replacement, '')
            leg_url = link.xpath("..//a")[0].attrib['href']
            leg_source_url_map[leg_name] = leg_url

        # Loop-invariant party-name normalization table; hoisted out of
        # the per-row loop.
        translate = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }

        for rownum in xrange(1, sh.nrows):
            d = {}
            for field, col_num in excel_mapping.iteritems():
                d[field] = sh.cell(rownum, col_num).value
            district_name = str(int(d['district']))
            full_name = re.sub(rep_type, '', d['full_name']).strip()

            # dict.get replaces the old `in leg_source_url_map.keys()`
            # membership-then-lookup pattern.
            homepage_url = leg_source_url_map.get(full_name)

            kwargs = {
                "town_represented": d['town_represented'],
                "email": d['email']
            }

            if homepage_url is not None:
                kwargs['url'] = homepage_url

            leg = Legislator(term, chamber, district_name, full_name, '', '',
                             '', translate[d['party']], **kwargs)

            leg.add_office('district', 'Address', address=d['address'])
            leg.add_source(source_url)
            if homepage_url:
                leg.add_source(homepage_url)
            self.save_legislator(leg)
Beispiel #59
0
    def scrape_legislator(self, chamber, term, url):
        """Scrape one New Mexico legislator's detail page.

        Extracts name/party from the page header, photo, email, capitol
        and district contact details, and saves the Legislator. Vacant
        seats are skipped with a warning.
        """
        # Initialize default values for legislator attributes.
        full_name = None
        party = None
        photo_url = None
        email = None
        capitol_address = None
        capitol_phone = None
        district = None
        district_address = None
        district_phone = None

        if chamber == 'upper':
            title_prefix = 'Senator '
        elif chamber == 'lower':
            title_prefix = 'Representative '
        else:
            title_prefix = ''

        # Fallback area code when the page omits one.
        santa_fe_area_code = '(505)'

        page = self.lxmlize(url)

        info_node = self.get_node(
            page,
            '//table[@id="MainContent_formViewLegislator"]')
        if info_node is None:
            raise ValueError('Could not locate legislator data.')

        district_node = self.get_node(
            info_node,
            './/a[@id="MainContent_formViewLegislator_linkDistrict"]')
        if district_node is not None:
            district = district_node.text.strip()

        name_node = self.get_node(
            page,
            './/span[@id="MainContent_formViewLegislatorName'
            '_lblLegislatorName"]')

        if name_node is not None:
            if name_node.text.strip().endswith(' Vacant'):
                self.warning("Found vacant seat for {} district {}; skipping".format(chamber, district))
                return

            # Header looks like "Senator Jane Doe - (D)".
            n_head, n_sep, n_party = name_node.text.rpartition(' - ')

            full_name = re.sub(r'^{}'.format(title_prefix), '', n_head.strip())

            if '(D)' in n_party:
                party = 'Democratic'
            elif '(R)' in n_party:
                party = 'Republican'
            elif '(DTS)' in n_party:
                # decline to state = independent
                party = 'Independent'
            else:
                # Report the raw party text; `party` is still None here,
                # so formatting it produced a useless message.
                raise AssertionError('Unknown party {} for {}'.format(
                    n_party,
                    full_name))

        photo_node = self.get_node(
            info_node,
            './/img[@id="MainContent_formViewLegislator_imgLegislator"]')
        if photo_node is not None:
            photo_url = photo_node.get('src')

        email_node = self.get_node(
            info_node,
            './/a[@id="MainContent_formViewLegislator_linkEmail"]')
        if email_node is not None and email_node.text:
            email = email_node.text.strip()

        capitol_address_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblCapitolRoom"]')
        if capitol_address_node is not None:
            capitol_address_text = capitol_address_node.text
            if capitol_address_text is not None:
                capitol_address = 'Room {} State Capitol\nSanta Fe, NM 87501'\
                    .format(capitol_address_text.strip())

        capitol_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblCapitolPhone"]')
        if capitol_phone_node is not None:
            capitol_phone_text = capitol_phone_node.text
            if capitol_phone_text:
                capitol_phone_text = capitol_phone_text.strip()
                area_code, phone = extract_phone_number(capitol_phone_text)
                if phone:
                    capitol_phone = '{} {}'.format(
                        area_code.strip() if area_code else santa_fe_area_code,
                        phone)

        district_address_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblAddress"]')
        if district_address_node is not None:
            district_address = '\n'.join(district_address_node.xpath("text()"))

        office_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblOfficePhone"]')

        home_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblHomePhone"]')

        # Prefer the office number; fall back to the home number.
        if office_phone_node is not None and office_phone_node.text:
            district_phone_text = office_phone_node.text
        elif home_phone_node is not None and home_phone_node.text:
            district_phone_text = home_phone_node.text
        else:
            district_phone_text = None
        if district_phone_text:
            d_area_code, d_phone = extract_phone_number(district_phone_text)
            # Apply the same Santa Fe fallback as the capitol phone above
            # instead of calling .strip() on a possibly-None area code.
            district_phone = '{} {}'.format(
                d_area_code.strip() if d_area_code else santa_fe_area_code,
                d_phone)

        legislator = Legislator(
            term=term,
            chamber=chamber,
            district=district,
            full_name=full_name,
            party=party,
            photo_url=photo_url)

        if email:
            legislator['email'] = email

        legislator.add_source(url)

        legislator.add_office(
            'district',
            'District Office',
            address=district_address,
            phone=district_phone)
        legislator.add_office(
            'capitol',
            'Capitol Office',
            address=capitol_address,
            phone=capitol_phone,
            email=email)

        self.save_legislator(legislator)
Beispiel #60
0
    def scrape(self, chamber, term):
        #the url for each rep is unfindable (by me)
        #and the parts needed to make it up do not appear in the html or js.
        #we can find basic information on the main rep page, and sponsor
        #info on a version of their indivdual page called using only their
        #sponsor ID (which we have to scrape from ALISON)
        #we can't get detailed information without another ID
        #which I have not been able to find.

        if chamber == 'upper':
            member_list_url = self._base_url + 'Senate/ALSenators.aspx'
            legislator_base_url = self._base_url + 'ALSenator.aspx'
        elif chamber == 'lower':
            member_list_url = self._base_url + 'House/ALRepresentatives.aspx'
            legislator_base_url = self._base_url + 'ALRepresentative.aspx'

        # Fetch and parse the chamber's member roster page.
        page = self.lxmlize(member_list_url)

        # Each member on the roster is represented by an <input> (image
        # button) inside the main table; one node per seat.
        legislator_nodes = self.get_nodes(
            page, '//div[@class="container container-main"]/table/tr/td/input')

        # Detail-page URL pattern: both AlisonDB identifiers are required.
        legislator_url_template = legislator_base_url + '?OID_SPONSOR='\
            '{oid_sponsor}&OID_PERSON={oid_person}'

        # Used to unescape HTML entities embedded in member names
        # (legacy Python 2 HTMLParser API).
        html_parser = HTMLParser.HTMLParser()

        for legislator_node in legislator_nodes:
            # Set identifiers internal to AlisonDB.
            # Have to do this to OID_SPONSOR because they don't know
            # how to HTML and I'm making links absolute out of convenience.
            # OID_SPONSOR is the trailing path segment of the node's
            # 'longdesc' attribute; OID_PERSON is stored in 'alt'.
            try:
                oid_sponsor = legislator_node.attrib['longdesc'].split('/')[-1]
                oid_person = legislator_node.attrib['alt']
            except KeyError:
                # Node without both identifiers isn't a member link; skip it.
                continue

            legislator_url = legislator_url_template.format(
                oid_sponsor=oid_sponsor, oid_person=oid_person)

            # Fetch the member's individual detail page.
            legislator_page = self.lxmlize(legislator_url)

            # NOTE(review): .encode('utf-8') yields bytes — this code
            # targets Python 2, where subsequent str operations still work.
            name_text = self.get_node(
                legislator_page,
                '//span[@id="ContentPlaceHolder1_lblMember"]').text_content()\
                .encode('utf-8')

            # This just makes processing the text easier.
            name_text = name_text.lower()

            # Skip vacant seats.
            if 'vacant' in name_text:
                continue

            # Removes titles and nicknames.
            # Strips a leading "Representative"/"Senator" title and any
            # &quot;-quoted nickname, then title-cases the remainder.
            name = html_parser.unescape(
                re.sub(r'(?i)(representative|senator|'
                       '&quot.*&quot)', '', name_text).strip().title())

            # Assemble full name by reversing last name, first name format.
            name_parts = [x.strip() for x in name.split(',')]
            full_name = '{0} {1}'.format(name_parts[1], name_parts[0])

            # Info table on the detail page; rows are positional:
            # tr[1]=party, tr[2]=district, tr[4]=phone, tr[5]=fax,
            # tr[7]=suite, tr[11]=email (per the accesses below).
            info_node = self.get_node(
                legislator_page,
                '//div[@id="ContentPlaceHolder1_TabSenator_body"]//table')

            district_text = self.get_node(
                info_node, './tr[2]/td[2]').text_content().encode('utf-8')

            # Reduce e.g. "Senate District 12" to just the number/identifier.
            if chamber == 'upper':
                district = district_text.replace('Senate District', '').strip()
            elif chamber == 'lower':
                district = district_text.replace('House District', '').strip()

            party_text = self.get_node(
                info_node, './tr[1]/td[2]').text_content().encode('utf-8')

            # Empty name plus a bare "()" party marker indicates an
            # unfilled seat that wasn't labeled "vacant".
            # NOTE(review): party_text is compared unstripped here but
            # stripped below — confirm the site emits exactly '()'.
            if not full_name.strip() and party_text == '()':
                self.warning(
                    'Found empty seat, for district {}; skipping'.format(
                        district))
                continue

            # Map the site's party label to the canonical party name
            # via the scraper's lookup table.
            party = self._parties[party_text.strip()]

            phone_number_text = self.get_node(
                info_node, './tr[4]/td[2]').text_content().encode('utf-8')

            phone_number = phone_number_text.strip()

            fax_number_text = self.get_node(
                info_node, './tr[5]/td[2]').text_content().encode('utf-8')

            fax_number = fax_number_text.strip()

            suite_text = self.get_node(
                info_node, './tr[7]/td[2]').text_content().encode('utf-8')

            # All members share the same statehouse street address;
            # only the suite varies.
            office_address = '{}\n11 S. Union Street\nMontgomery, AL 36130'\
                .format(suite_text)

            email_text = self.get_node(
                info_node, './tr[11]/td[2]').text_content().encode('utf-8')

            email_address = email_text.strip()

            photo_url = self.get_node(
                legislator_page,
                '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
                '/@src')

            #add basic leg info and main office
            legislator = Legislator(term=term,
                                    district=district,
                                    chamber=chamber,
                                    full_name=full_name,
                                    party=party,
                                    email=email_address,
                                    photo_url=photo_url)

            legislator.add_office('capitol',
                                  'Capitol Office',
                                  address=office_address,
                                  phone=phone_number,
                                  fax=fax_number)

            #match rep to sponsor_id if possible
            # NOTE(review): ln/fn are never used in the remainder of this
            # loop body — this split looks vestigial; confirm before removal.
            ln, fn = name.split(',')

            # Attach committee memberships parsed from the detail page.
            self.add_committees(legislator_page, legislator, chamber, term)

            # Record both the roster page and the detail page as sources.
            legislator.add_source(member_list_url)
            legislator.add_source(legislator_url)

            self.save_legislator(legislator)