Example #1
0
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        """Scrape one Mississippi member's XML detail page and save the record.

        :param chamber: 'upper' or 'lower' (selects the email domain)
        :param term: term identifier passed through to Legislator
        :param leg_name: display name from the member listing
        :param leg_link: member-page path fragment; falsy only for vacancies
        :param role: role string passed through to Legislator
        """
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.urlopen(url)
            # Detail pages are XML; fields are pulled out with string() xpaths.
            root = lxml.etree.fromstring(details_page.bytes)
            party = root.xpath('string(//PARTY)')
            district = root.xpath('string(//DISTRICT)')
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            # NOTE(review): home/business/other phones are scraped here but
            # never used below.
            home_phone = root.xpath('string(//H_PHONE)')
            bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)')
            cap_room = root.xpath('string(//CAP_ROOM)')

            # NOTE(review): any party code other than 'D' is labeled
            # Republican — an independent would be mislabeled here.
            if party == 'D':
                party = 'Democratic'
            else:
                party = 'Republican'

            leg = Legislator(term, chamber, district, leg_name,
                             party=party,
                             role=role,
                             org_info=org_info,
                             url=url,
                             photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            # Email domain depends on the chamber.
            if email_name.strip() != "":
                email = '%s@%s.ms.gov' % (email_name, {
                    "upper": "senate",
                    "lower": "house"
                }[chamber])
                kwargs['email'] = email

            if capital_phone != "":
                kwargs['phone'] = capital_phone

            # CAP_ADDRESS is defined elsewhere in this module.
            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol',
                           'Capitol Office',
                           **kwargs)

            self.save_legislator(leg)
        # Python 2 except syntax; missing member pages are logged and skipped.
        except scrapelib.HTTPError, e:
            self.warning(str(e))
    def scrape_legislator(self, chamber, term, name, url):
        """Scrape a single legislator's profile page and save the record.

        Removes an unused local (`xpath`) that was assigned and never read.

        :param chamber: 'upper' or 'lower'
        :param term: term identifier
        :param name: legislator's display name
        :param url: absolute URL of the member's profile page
        """
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        # District number comes from the heading, e.g. "DISTRICT 07";
        # lstrip('0') drops leading zero padding.
        district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop().split()[1].strip().lstrip('0')

        # Party letter is embedded in the <h2> text, e.g. "(R - ..." .
        party = page.xpath('//h2').pop().text_content()
        party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)

        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        elif party == 'I':
            party = 'Independent'

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']

        leg = Legislator(term, chamber, district, name, party=party,
                         photo_url=photo_url, url=url)
        leg.add_source(url)
        self.scrape_offices(leg, page)
        self.save_legislator(leg)
Example #3
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape one member page and save the Legislator.

        Removes a duplicated photo_url xpath lookup present in the original.

        :param chamber: 'upper' or 'lower'
        :param term: term identifier
        :param member_url: absolute URL of the member's page
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

        # Name and portrait both come from the bio-picture <img>.
        photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')

        district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            # clean_district is defined elsewhere in this module.
            district = district[0].text.strip()
            district = clean_district(district)
        # NOTE(review): if the District widget is missing, `district` stays
        # an empty list — presumably every member page has it; verify.

        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]

        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url, email=email)
        leg.add_source(member_url)

        self.save_legislator(leg)
Example #4
0
    def scrape_lower(self, term):
        """Scrape Utah House members from the representatives roster page."""
        url = "http://le.utah.gov/house2/representatives.jsp"
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # First <tr> is the header row; skip it.
        for row in doc.xpath("//tr")[1:]:
            cells = row.xpath("td")

            district = cells[0].text_content()
            if cells[1].text_content() == "Empty":
                self.log("district %s is empty" % district)
                continue

            member_a = cells[1].xpath("a")[0]
            name = member_a.text_content()
            leg_url = member_a.get("href")

            party = cells[2].text_content()
            party_names = {"D": "Democratic", "R": "Republican"}
            if party not in party_names:
                raise ValueError("unknown party")
            party = party_names[party]

            # Fetch the member page just for the portrait image.
            leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//img[@alt="photo"]/@src')[0]

            leg = Legislator(term, "lower", district, name, party=party, photo_url=photo_url, url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Example #5
0
    def scrape(self, chamber, term):
        """Scrape South Carolina legislators for one chamber.

        Fixes two latent bugs: rows whose name carries neither "[D]" nor
        "[R]" previously left full_name/party unbound (NameError, or stale
        values from the prior row) and are now skipped; office address and
        phone are now reset per row so one member's contact info cannot
        leak into the next member's record.

        :param chamber: 'upper' or 'lower'
        :param term: term identifier
        """
        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/html-pages/housemembers.html'
        else:
            url = 'http://www.scstatehouse.gov/html-pages/senatemembersd.html'

        with self.urlopen(url) as data:
            doc = lxml.html.fromstring(data)
            rows = doc.xpath('//pre/div[@class="sansSerifNormal"]')

            for row in rows:
                member_a = row.xpath('a')[0]

                name_party = member_a.text_content()
                if name_party.find('[D]') != -1:
                    party = 'Democratic'
                    full_name = name_party.partition('[D]')[0].strip()
                elif name_party.find('[R]') != -1:
                    party = 'Republican'
                    full_name = name_party.partition('[R]')[0].strip()
                else:
                    # Unknown or missing party tag — skip rather than crash.
                    continue

                # Photo is keyed by the numeric id in the member link.
                photo_url = 'http://www.scstatehouse.gov/members/gif/' + re.search('(\d+)\.html', member_a.attrib['href']).group(1) + '.jpg'

                other_data = row.text_content().encode('ascii', 'ignore')
                od_result = re.search('^.+District (\d+) - (.+)Count.+$', other_data)
                district = od_result.group(1)

                # Reset per row; not every row carries contact details.
                office_address = None
                office_phone = None
                contentb = re.search('^.+\(C\) (.+,.*\d+).*Bus. (\(\d+\) \d+-\d+).+$', other_data)
                if contentb is not None:
                    office_address = contentb.group(1)
                    office_phone = contentb.group(2)

                legislator = Legislator(term, chamber, district, full_name, party=party, photo_url=photo_url, office_address=office_address, office_phone=office_phone)
                legislator.add_source(url)
                self.save_legislator(legislator)
Example #6
0
 def _scrape_speaker_of_the_house(self, url, term, chamber):
     """Scrape the Georgia Speaker of the House from his dedicated bio page.

     The speaker has a special page. The main page url looks like
     http://www1.legis.ga.gov/legis/2011_12/house/speaker/index.htm but the
     data to scrape lives at .../house/speaker/bio.html.

     Returns a Legislator, or None when the expected title div is missing.
     """
     if url.endswith("index.htm"):
         url = url.replace("index.htm", "bio.html")
     with self.lxml_context(url) as page:
         path = '//div[@id="title"]'
         speaker_info_div = page.xpath(path)
         if speaker_info_div and len(speaker_info_div) == 1:
             # Positional parsing of the title text — brittle, but the
             # best/quickest solution for this one-off page.
             speaker_info = speaker_info_div[0].text_content().split()
             name = speaker_info[2] + " " + speaker_info[3]
             party = None
             if "R-" in speaker_info[4]:
                 party = "Republican"
             elif "D-" in speaker_info[4]:
                 # "Democratic" for consistency with the other scrapers
                 # (was "Democrat").
                 party = "Democratic"
             elif "I-" in speaker_info[4]:
                 party = "Independent"

             district = None
             if "district" in speaker_info[6].lower():
                 district = speaker_info[7].strip(")")

             legislator = Legislator(term,
                                     chamber,
                                     district,
                                     name,
                                     party=party)
             legislator.add_source(url)
             return legislator
Example #7
0
    def scrape_senator(self, name, term, url):
        """Scrape a Louisiana senator's page and save the record."""
        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            # The heading starting with "Senator " carries the district.
            heading = page.xpath(
                "string(//*[starts-with(text(), 'Senator ')])")
            district = re.search(r'District (\d+)', heading).group(1)

            try:
                party_label = page.xpath(
                    "//b[contains(text(), 'Party')]")[0]
                party = party_label.getnext().tail
                party = party.strip()
            except IndexError:
                # No party label on the page.
                party = 'N/A'

            # Normalize to the project-wide party names.
            party = {'No Party (Independent)': 'Independent',
                     'Democrat': 'Democratic'}.get(party, party)

            leg = Legislator(term, 'upper', district, name, party=party,
                             url=url)
            leg.add_source(url)
            self.save_legislator(leg)
Example #8
0
    def scrape_rep(self, name, term, url):
        """Scrape a Louisiana representative's page and save the record.

        Fixes a bug where the str.replace results were discarded (strings
        are immutable), so the ", Jr., " / ", III, " rewrites never applied.

        :param name: display name (may be rewritten for name_tools quirks)
        :param term: term identifier
        :param url: member page URL
        """
        # special case names that confuses name_tools
        if name == "Franklin, A.B.":
            name = "Franklin, A. B."
        elif ", Jr., " in name:
            # replace returns a new string — it must be assigned back.
            name = name.replace(", Jr., ", " ")
            name += ", Jr."
        elif ", III, " in name:
            name = name.replace(", III, ", " ")
            name += ", III"

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            # District number comes from the map link's PDF filename.
            district = page.xpath("//a[contains(@href, 'Maps')]")[0].attrib["href"]
            district = re.search("district(\d+).pdf", district).group(1)

            # Party appears literally as e.g. "Democrat&nbsp;District" in
            # the raw HTML source.
            if "Democrat&nbsp;District" in text:
                party = "Democratic"
            elif "Republican&nbsp;District" in text:
                party = "Republican"
            elif "Independent&nbsp;District" in text:
                party = "Independent"
            else:
                party = "Other"

            leg = Legislator(term, "lower", district, name, party=party)
            leg.add_source(url)
            self.save_legislator(leg)
    def scrape_bio(self, term, chamber, district, name, url):
        """Scrape a member's bio frame page; return a Legislator (not saved).

        Raises AssertionError when no party marker is found in the header.
        """
        # this opens the committee section without having to do another request
        url += '&TableRow=1.5.5'
        frameset = self.lxmlize(url)
        inner_url = frameset.xpath("//frame[@name='right']/@src")[0]
        doc = self.lxmlize(inner_url)

        # The last few characters of the page header carry the party tag.
        header_tail = doc.xpath('//div[@id="page_header"]')[0].text.strip()[-3:]
        if '(D)' in header_tail:
            party = 'Democratic'
        elif '(R)' in header_tail:
            party = 'Republican'
        else:
            raise AssertionError("No party found for {name}".format(name=name))

        leg = Legislator(term, chamber, district, name, party=party)

        photos = doc.xpath('//img[contains(@src, "jpg")]/@src')
        if photos:
            leg['photo_url'] = photos[0]

        leg.update(self.scrape_contact_info(doc))
        return leg
Example #10
0
    def scrape_lower(self, term):
        """Scrape New York Assembly members from the email listing page.

        :param term: term identifier passed through to Legislator
        """
        url = "http://assembly.state.ny.us/mem/?sh=email"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # Pair each member-page link with the matching mailto link.
            # NOTE(review): assumes the two node lists are aligned
            # one-to-one on this page — verify against the markup.
            for link, email in zip(
                page.xpath("//a[contains(@href, '/mem/')]"), page.xpath("//a[contains(@href, 'mailto')]")
            ):
                name = link.text.strip()
                if name == "Assembly Members":
                    continue
                # empty seats
                if "Assembly District" in name:
                    continue
                leg_url = link.get("href")

                # District text looks like "123rd"; rstrip with the
                # character set "rthnds" strips the ordinal suffix
                # (st/nd/rd/th), leaving the number.
                district = link.xpath("string(../following-sibling::" "div[@class = 'email2'][1])")
                district = district.rstrip("rthnds")

                legislator = Legislator(term, "lower", district, name, party="Unknown", url=leg_url)
                legislator.add_source(url)

                email = email.text_content().strip()
                if email:
                    legislator["email"] = email
                self.save_legislator(legislator)
Example #11
0
    def scrape_upper(self, term):
        """Scrape New York Senate members from the senators listing page."""
        url = "http://www.nysenate.gov/senators"
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        member_links_xpath = (
            '//div[contains(@class, "views-row")]/'
            'div[contains(@class, "last-name")]/'
            'span[contains(@class, "field-content")]/a')
        for member_link in doc.xpath(member_links_xpath):
            # The listing mixes navigation anchors into the result set.
            if member_link.text in (None, 'Contact', 'RSS'):
                continue
            name = member_link.text.strip()

            district_text = member_link.xpath("string(../../../div[3]/span[1])")
            district = re.match(r"District (\d+)", district_text).group(1)

            photo_img = member_link.xpath("../../../div[1]/span/a/img")[0]
            photo_url = photo_img.attrib['src']

            legislator = Legislator(term, 'upper', district,
                                    name, party="Unknown",
                                    photo_url=photo_url)
            legislator.add_source(url)

            contact_a = member_link.xpath("../span[@class = 'contact']/a")[0]
            contact_url = contact_a.attrib['href']
            self.scrape_upper_offices(legislator, contact_url)

            # The member's main page is the contact URL minus "/contact".
            legislator['url'] = contact_url.replace('/contact', '')

            self.save_legislator(legislator)
Example #12
0
    def scrape(self, chamber, term):
        """Scrape Wisconsin legislators for one chamber from the contact list.

        :param chamber: 'upper' (senate) or 'lower' (assembly)
        :param term: term identifier, validated before scraping
        """
        self.validate_term(term)

        if chamber == 'upper':
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
        else:
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

        with self.urlopen(url) as body:
            page = lxml.html.fromstring(body)

            for row in page.cssselect("#ctl00_C_dgLegData tr"):
                # Data rows have a link in the first cell; header rows don't.
                if len(row.cssselect("td a")) > 0:
                    rep_url = list(row)[0].cssselect("a[href]")[0].get("href")
                    rep_url = 'http://legis.wi.gov/w3asp/contact/' + rep_url

                    # Cell text looks like "Last, First (P)"; capture the
                    # name and the single-letter party code.
                    legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)', list(row)[0].text_content())
                    if legpart:
                        full_name, party = legpart[0]

                        # skip if the legislator is vacant (occurred in 2011 session)
                        if full_name == 'Vacant':
                            continue

                        # PARTY_DICT is defined elsewhere in this module and
                        # maps the letter code to the full party name.
                        party = PARTY_DICT[party]

                        district = str(int(list(row)[2].text_content()))

                        leg = Legislator(term, chamber, district, full_name,
                                         party=party, url=rep_url)
                        leg.add_source(rep_url)

                        # add_committees returns the (possibly new) object.
                        leg = self.add_committees(leg, rep_url, term, chamber)
                        self.save_legislator(leg)
Example #13
0
    def scrape_upper(self, term):
        """Scrape New York senators and their contact info from the roster."""
        url = "http://www.nysenate.gov/senators"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for anchor in page.xpath('//a[contains(@href, "/senator/")]'):
                # Skip navigation anchors mixed into the results.
                if anchor.text in (None, "Contact", "RSS"):
                    continue
                name = anchor.text.strip()

                district_text = anchor.xpath("string(../../../div[3]/span[1])")
                district = re.match(r"District (\d+)", district_text).group(1)

                photo_url = anchor.xpath(
                    "../../../div[1]/span/a/img")[0].attrib["src"]

                legislator = Legislator(term, "upper", district, name, party="Unknown", photo_url=photo_url)
                legislator.add_source(url)

                contact_url = anchor.xpath(
                    "../span[@class = 'contact']/a")[0].attrib["href"]
                self.scrape_upper_contact_info(legislator, contact_url)

                # The member's main page is the contact URL minus "/contact".
                legislator["url"] = contact_url.replace("/contact", "")

                self.save_legislator(legislator)
Example #14
0
    def scrape_legislator(self, chamber, term, name, url):
        """Scrape one Alaska legislator page and save the record."""
        with self.urlopen(url) as raw:
            # Alaska fails at unicode: some pages contain broken characters.
            # They're not in data we care about, so just replace them.
            html = raw.decode('utf8', 'replace')
            doc = lxml.html.fromstring(html)

            # Collapse whitespace runs in the passed-in name.
            name = re.sub(r'\s+', ' ', name)

            info = doc.xpath('string(//div[@id = "fullpage"])')

            district = re.search(r'District ([\w\d]+)', info).group(1)
            party = re.search(r'Party: (.+) Toll-Free', info).group(1).strip()
            email = re.search(r'Email: ([\w_]+@legis\.state\.ak\.us)',
                              info).group(1)

            # Normalize for consistency with the other scrapers.
            party = 'Democratic' if party == 'Democrat' else party

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email, url=url)
            leg.add_source(url)

            self.save_legislator(leg)
Example #15
0
    def scrape_legislator_data(self, url, chamber):
        """Scrape New Mexico legislators (plus committee roles) for one chamber.

        :param url: chamber roster page listing all members
        :param chamber: chamber string passed through to Legislator
        """
        party_fulls = {'R' : 'Republican', 'D' : 'Democrat'}
        with self.urlopen(url) as page:
            page = BeautifulSoup(page)
            # Calling a BeautifulSoup tag is shorthand for findAll, so
            # page.find(...)('td') iterates the roster table's cells.
            for data in page.find('table', id = 'ctl00_mainCopy_DataList1')('td'):
                spans = data('span')
                if len(spans) == 0:
                    self.debug('Found an empty cell in %s. Continuing' % url)
                    continue
                full_name = ' '.join([span.string.strip() for span in spans])
                # First span holds "First [Middle]"; second span the last name.
                if len(spans[0].string.strip().split()) == 2:
                    first_name, middle_name = spans[0].string.strip().split()
                else:
                    first_name, middle_name = spans[0].string.strip(), ''
                last_name = spans[1].string.strip()

                details_url = get_abs_url(url, data.find('a')['href'])
                with self.urlopen(details_url) as details:
                    details = BeautifulSoup(details)
                    district = details.find('a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                    party = party_fulls[details.find('span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                    # NOTE(review): term is hard-coded to '2010' here.
                    leg = Legislator('2010', chamber, district, full_name, first_name, 
                            last_name, middle_name, party)
                    leg.add_source(details_url)

                    # Committee memberships: first row is the header.
                    comms_table = details.find('table', id = 'ctl00_mainCopy_MembershipGrid')
                    for comms_raw_data in comms_table('tr')[1:]:
                        comm_data = comms_raw_data('td')
                        comm_role_type = comm_data[0].string.strip()
                        comm_name = comm_data[1]('a')[0].string.strip()
                        leg.add_role(comm_role_type, '2010', chamber = chamber, committee = comm_name)

                    self.save_legislator(leg)
Example #16
0
    def scrape_upper(self, term):
        """Scrape Oklahoma senators from the Senate roster page."""
        url = "http://oksenate.gov/Senators/Default.aspx"
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        roster_table = doc.xpath('//table[@summary]')[1]
        for a in roster_table.xpath('.//td//a[contains(@href, "biographies")]'):
            # Link text looks like "Last, First (D)"; split off the tag.
            name, party = a.text.rsplit(None, 1)
            party = {'(D)': 'Democratic', '(R)': 'Republican'}.get(party, party)

            # District number follows the link, either in the anchor's tail
            # text or in a sibling span.
            tail = a.xpath('..')[0].tail
            if tail:
                district = tail.split()[1]
            else:
                district = a.xpath('../../span')[1].text.split()[1]

            member_url = a.get('href')

            leg = Legislator(term, 'upper', district, name, party=party, url=member_url)
            leg.add_source(member_url)
            self.scrape_upper_offices(leg, member_url)
            self.save_legislator(leg)
Example #17
0
    def scrape_lower(self, term):
        url = "http://www.okhouse.gov/Members/Default.aspx"
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for tr in page.xpath("//table[@class='rgMasterTable']/tbody/tr")[1:]:
            name = tr.xpath('.//td[1]/a')[0].text.strip()
            district = tr.xpath('.//td[3]')[0].text_content().strip()
            party = tr.xpath('.//td[4]')[0].text_content().strip()
            party = {'R': 'Republican', 'D': 'Democratic'}[party]

            leg_url = 'http://www.okhouse.gov/District.aspx?District=' + district
            leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//a[contains(@href, "HiRes")]/@href')[0]

            if name.startswith('House District'):
                self.warning("skipping %s %s" % (name, leg_url))
                continue

            leg = Legislator(term, 'lower', district, name, party=party,
                             photo_url=photo_url, url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            # Scrape offices.
            self.scrape_lower_offices(leg_doc, leg)

            self.save_legislator(leg)
Example #18
0
    def scrape(self, chamber, term):
        """Scrape Rhode Island legislators from the published Excel rosters.

        :param chamber: 'upper' or 'lower'
        :param term: term identifier
        """
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator '
        elif chamber == 'lower':
            url = (
             'http://webserver.rilin.state.ri.us/Documents/Representatives.xls')
            rep_type = 'Representative '
        # NOTE(review): any other chamber value leaves url/rep_type unbound
        # and raises NameError below — confirm callers only pass these two.

        self.urlretrieve(url, 'ri_leg.xls')

        wb = xlrd.open_workbook('ri_leg.xls')
        sh = wb.sheet_by_index(0)

        # excel_mapping (defined elsewhere in this module) maps field name
        # -> spreadsheet column index; row 0 is the header.
        for rownum in xrange(1, sh.nrows):
            d = {}
            for field, col_num in excel_mapping.iteritems():
                d[field] = sh.cell(rownum, col_num).value
            dist = str(int(d['district']))
            district_name = dist
            # Strip the honorific prefix ("Senator "/"Representative ").
            full_name = re.sub(rep_type, '', d['full_name']).strip()
            translate = {
                "Democrat"    : "Democratic",
                "Republican"  : "Republican",
                "Independent" : "Independent"
            }
            leg = Legislator(term, chamber, district_name, full_name,
                             '', '', '',
                             translate[d['party']],
                             town_represented=d['town_represented'],
                             email=d['email'])
            leg.add_office('district', 'Address', address=d['address'])
            leg.add_source(url)
            self.save_legislator(leg)
Example #19
0
    def scrape_2011Leg(self, chamber, term, url):
        """Scrape 2011-session legislators from the roster GridView table.

        :param chamber: 'upper' or 'lower'
        :param term: term identifier
        :param url: roster page containing the GridView1 table
        """
        parties = {'(D)': 'Democratic', '(R)': 'Republican'}
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)
            table = page.xpath('//table[contains(@id, "GridView1")]')[0]
            # Only rows linking to a member page are data rows.
            for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
                params = {}
                district = row.xpath('td/span[contains(@id, "LabelDistrict")]/font')[0].text
                last_name_a = row.xpath('td/a[contains(@id, "HyperLinkLast")]')[0]
                member_url = last_name_a.get('href')
                last_name = last_name_a.text_content().strip()
                # First cell may carry first + middle names together.
                first_names = row.xpath('td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
                first_name = first_names.split()[0]
                middle_name = ' '.join(first_names.split()[1:])
                party = row.xpath('td/span[contains(@id, "LabelParty")]/font')[0].text
                party = parties[party]
                # Office room number is split across two spans.
                params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                    " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
                params['photo_url'] = row.xpath('td/a[contains(@id, "HyperLinkChairJPG")]/img')[0].attrib['src']
                params['email'] = row.xpath('td/a[contains(@id, "HyperLinkEmail")]')[0].text
                params['phone'] = row.xpath('td/span[contains(@id, "LabelPhone2")]')[0].text

                full_name = first_names + " " + last_name
                leg = Legislator(term, chamber, district, full_name,
                                 first_name, last_name, middle_name, party,
                                 url=member_url, **params)
                leg.add_source(url)
                self.save_legislator(leg)
Example #20
0
    def scrape(self, chamber, session):
        """Scrape one chamber's legislators from the pre-parsed listing data."""
        metainf = self.scrape_leg_page(get_chamber_listing_url(chamber))
        for leg in metainf:
            p = Legislator(session, chamber, leg['district'], leg['name'],
                           party=leg['party'],
                           # additional fields the website provides
                           photo_url=leg['image'],
                           url=leg['homepage'],
                           room=leg['room'],
                           phone=leg['phone'],
                           fax=leg['fax'],
                           email=leg['email'],
                           address=leg['addr'])

            for source in leg['source']:
                p.add_source(source)

            try:
                joint_prefix = 'Joint Legislative'
                for ctty in leg['ctty']:
                    # Joint committees belong to the "joint" chamber.
                    if ctty['name'].startswith(joint_prefix):
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role('committee member',
                               term=session,
                               chamber=ctty_chamber,
                               committee=ctty['name'],
                               position="member")
            except KeyError:
                # Some members simply have no committee data scraped.
                self.log("XXX: Warning, %s has no scraped Commities" %
                    leg['name'])

            self.save_legislator(p)
Example #21
0
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        """Scrape a Mississippi member detail page and save the Legislator.

        Missing pages (HTTPError) are logged as warnings and skipped.

        :param chamber: chamber string; also used as the email domain part
        :param term: term identifier
        :param leg_name: display name from the member listing
        :param leg_link: member-page path fragment
        :param role: role string passed through to Legislator
        """
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            with self.urlopen(url) as details_page:
                # Pages are latin-1; re-encode to utf-8 and drop anything
                # that can't be represented.
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                party = root.xpath('string(//party)')
                district = root.xpath('string(//district)')
                first_name, middle_name, last_name = "", "", ""

                home_phone = root.xpath('string(//h_phone)')
                bis_phone = root.xpath('string(//b_phone)')
                capital_phone = root.xpath('string(//cap_phone)')
                other_phone = root.xpath('string(//oth_phone)')
                org_info = root.xpath('string(//org_info)')
                email_name = root.xpath('string(//email_address)')
                email = '%s@%s.ms.gov' % (email_name, chamber)
                # NOTE(review): any party code other than 'D' is labeled
                # Republican — independents would be mislabeled.
                if party == 'D':
                    party = 'Democratic'
                else:
                    party = 'Republican'

                leg = Legislator(term, chamber, district, leg_name, first_name,
                                 last_name, middle_name, party, role=role,
                                 home_phone = home_phone, bis_phone=bis_phone,
                                 capital_phone=capital_phone,
                                 other_phone=other_phone, org_info=org_info,
                                 email=email, url=url)
                leg.add_source(url)
                self.save_legislator(leg)
        # Python 2 except syntax; this module targets Python 2.
        except scrapelib.HTTPError, e:
            self.warning(str(e))
Example #22
0
def test_legislator():
    """Exercise Legislator construction plus add_role and add_office."""
    leg = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    expected = {
        '_type': 'person',
        'full_name': 'Adam Smith',
        'first_name': 'Adam',
        'last_name': 'Smith',
        'middle_name': '',
        'suffixes': '',
        'roles': [{'chamber': 'upper', 'term': 'T1', 'role': 'member',
                   'start_date': None, 'end_date': None, 'district': '1',
                   'party': ''}],
        'offices': [],
        'sources': [],
    }
    assert_equal(leg, expected)

    # A committee role is appended after the implicit 'member' role.
    leg.add_role('committee member', 'T1', committee='Some Committee',
                 position='chairman')
    assert_equal(leg['roles'][1],
                 {'role': 'committee member', 'term': 'T1',
                  'start_date': None, 'end_date': None,
                  'committee': 'Some Committee', 'position': 'chairman'})

    # Offices are stored as flat dicts on the 'offices' key.
    leg.add_office('capitol', 'Statehouse Office', '123 Main St',
                   '123-456-7890', '123-555-5555', '*****@*****.**')
    assert_equal(leg['offices'],
                 [{'type': 'capitol', 'name': 'Statehouse Office',
                   'address': '123 Main St', 'phone': '123-456-7890',
                   'fax': '123-555-5555', 'email': '*****@*****.**'}])
Example #23
0
    def scrape(self, chamber, term):
        """Scrape Pennsylvania legislators for the current term.

        Pennsylvania doesn't make member lists easily available for
        previous sessions, unfortunately, so only the latest term works.
        """
        self.validate_term(term, latest_only=True)

        leg_list_url = legislators_url(chamber)

        with self.urlopen(leg_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(leg_list_url)

            for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
                full_name = link.text
                # District number lives in the tail text of the next sibling.
                district = link.getparent().getnext().tail.strip()
                # Raw string so \d is a regex class, not an (invalid) string
                # escape under Python 3.
                district = re.search(r"District (\d+)", district).group(1)

                # Party initial is the second-to-last char, e.g. "... (R)".
                party = link.text[-2]
                if party == 'R':
                    party = 'Republican'
                elif party == 'D':
                    party = 'Democratic'

                url = link.get('href')

                legislator = Legislator(term, chamber, district,
                                        full_name, party=party, url=url)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
Example #24
0
    def scrape(self, chamber, session):
        """Scrape Colorado legislators from the per-district directory."""
        url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory(url, chamber, session)

        for district in people_pages:
            p_url = people_pages[district]
            metainf = self.process_person(p_url)

            p = Legislator(session, chamber, district, metainf['name'],
                party=metainf['party'],
                # some additional things the website provides:
                occupation=metainf['occupation'],
                photo_url=metainf['photo_url'],
                url=metainf['homepage'])

            # dict.get already yields None for missing keys -- no need for
            # the explicit "x if k in d else None" membership dance.
            p.add_office('capitol', 'Capitol Office',
                         phone=metainf.get('number'),
                         address='200 E. Colfax\nDenver, CO 80203',
                         email=metainf.get('email'))

            p.add_source(p_url)
            self.save_legislator(p)
Example #25
0
  def scrape(self, chamber, term):
    """Download the RI member roster spreadsheet and save each legislator.

    Rhode Island publishes member rosters as .xls files; the sheet is
    written to a local file (named 'ri_senate.xls' regardless of chamber)
    and then parsed with xlrd.
    """
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
      url = ('http://www.rilin.state.ri.us/Documents/Senators.xls')
      rep_type = 'Senator '
    elif chamber == 'lower':
      url = ('http://www.rilin.state.ri.us/Documents/Representatives.xls')
      rep_type = 'Representative '

    # NOTE(review): the .xls payload is binary but is written in text
    # mode ('w'); this likely corrupts the file on Windows -- confirm.
    with self.urlopen(url) as senator_xls:
      with open('ri_senate.xls', 'w') as f:
        f.write(senator_xls)

    wb = xlrd.open_workbook('ri_senate.xls')
    sh = wb.sheet_by_index(0)

    # Row 0 is the header; excel_mapping maps field name -> column index.
    for rownum in xrange(1, sh.nrows):
      d = {}
      for field, col_num in excel_mapping.iteritems():
        d[field] = str(sh.cell(rownum, col_num).value)
      district_name = "District " + d['district']
      # Strip the leading 'Senator ' / 'Representative ' title.
      full_name = re.sub(rep_type, '', d['full_name']).strip()
      leg = Legislator(term, chamber, district_name, full_name,
                       '', '', '',
                       d['party'], 
                       office_address=d['address'],
                       town_represented=d['town_represented'],
                       email=d['email'])
      leg.add_source(url)

      self.save_legislator(leg)
Example #26
0
    def scrape_upper(self, chamber, term):
        """Scrape Michigan senators from the Senate member-list table."""
        url = 'http://www.senate.michigan.gov/members/memberlist.htm'
        doc = lxml.html.fromstring(self.urlopen(url))

        # Rows 1..38 of the 550px-wide table hold member data (row 0 is
        # the header).  Columns: party, district, member, phone, fax, room.
        for row in doc.xpath('//table[@width=550]/tr')[1:39]:
            cells = row.getchildren()
            party_cell, dist_cell, member_cell, phone_cell, fax_cell, loc_cell = cells

            party = abbr[party_cell.text]
            district = dist_cell.text_content().strip()
            name = member_cell.text_content().strip()
            if name == 'Vacant':
                self.info('district %s is vacant', district)
                continue

            leg = Legislator(term=term, chamber=chamber,
                             district=district,
                             full_name=name,
                             party=party,
                             url=member_cell.xpath('a/@href')[0])

            leg.add_office('capitol', 'Capitol Office',
                           address=loc_cell.text,
                           fax=fax_cell.text,
                           phone=phone_cell.text)

            leg.add_source(url)
            self.save_legislator(leg)
Example #27
0
    def scrape_reps(self, chamber, term):
        """Scrape Ohio House members, one roster page per district.

        There are 99 House districts.
        """
        for district in xrange(1, 100):
            rep_url = ("http://www.house.state.oh.us/components/"
                       "com_displaymembers/page.php?district=%d" % district)

            with self.urlopen(rep_url) as page:
                page = lxml.html.fromstring(page)

                for el in page.xpath('//table[@class="page"]'):
                    rep_link = el.xpath("tr/td/title")[0]
                    full_name = rep_link.text
                    # Name ends with " (X)" where X is the party initial.
                    party = full_name[-2]
                    full_name = full_name[0:-3]

                    if full_name == "Vacant Posit":
                        continue

                    if party == "D":
                        party = "Democratic"
                    elif party == "R":
                        party = "Republican"

                    leg = Legislator(term, chamber, str(district), full_name,
                                     party=party, url=rep_url)
                    leg.add_source(rep_url)
                    # BUG FIX: save inside the loop.  Previously the save
                    # sat one level out, so only the last legislator found
                    # on each page was saved -- and a page with no matches
                    # re-saved (or NameError'd on) a stale `leg`.
                    self.save_legislator(leg)
Example #28
0
 def scrape_2011Leg(self, chamber, term, url):
     """Scrape 2011 Hawaii legislators from the GridView roster at *url*.

     Builds full names from the split first/last columns, expands the
     party initial, and collects office/contact details into kwargs.
     """
     titles = {'lower': 'Representative', 'upper': 'Senator'}
     parties = {'D': 'Democrat', 'R': 'Republican'}
     with self.urlopen(url) as page:
         page = lxml.html.fromstring(page)
         page.make_links_absolute(url)
         table = page.xpath('//table[contains(@id, "GridView1")]')[0]
         # Only rows that link to a member page are legislators.
         for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
             params = {}
             # District label is split across two spans.
             district = row.xpath('td/span[contains(@id, "LabelDis")]/font')[0].text + " " + \
                 row.xpath('td/span[contains(@id, "LabelDistrict2")]/font')[0].text
             # Replace any / in district name to allow json file to save.
             district = district.replace('/', '-')
             params['title'] = titles.get(chamber, '')
             last_name = row.xpath('td/a[contains(@id, "HyperLinkLast")]/font')[0].text.strip()
             first_names = row.xpath('td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
             # Anything after the first word counts as middle name(s).
             first_name = first_names.split()[0]
             middle_name = ' '.join(first_names.split()[1:])
             party = row.xpath('td/span[contains(@id, "LabelParty")]/font')[0].text
             party = party.replace('(', '')
             party = party.replace(')', '')
             party = parties.get(party, '') # Expand party from initial letter.
             # Office address is also split across two spans.
             params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                 " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
             params['photo_url'] = row.xpath('td/a[contains(@id, "HyperLinkChairJPG")]/img')[0].attrib['src']
             params['email'] = row.xpath('td/a[contains(@id, "HyperLinkEmail")]')[0].text
             params['phone'] = row.xpath('td/span[contains(@id, "LabelPhone2")]')[0].text
             
             full_name = first_names + " " + last_name
             leg = Legislator(term, chamber, district, full_name, 
                     first_name, last_name, middle_name, party, **params)
             leg.add_source(url)
             self.save_legislator(leg)
Example #29
0
    def scrape(self, chamber, term):
        """Scrape Illinois legislators from the member list for *term*."""
        # Roster URLs are keyed on the term minus its two-char suffix.
        url = MEMBER_LIST_URL[chamber] % term[:-2]

        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        party_map = {'D': 'Democratic', 'R': 'Republican',
                     'I': 'Independent'}

        # The fifth table holds the roster; skip the two header rows.
        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name_cell, _, _, district_cell, party_cell = row.xpath('td')
            district = district_cell.text
            party = party_map[party_cell.text]
            leg_url = name_cell.xpath('a/@href')[0]
            name = name_cell.text_content().strip()

            # A trailing '*' marks an inactive legislator; skip for now.
            if name.endswith('*'):
                continue

            leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]

            leg = Legislator(term, chamber, district, name, party=party,
                             url=leg_url, photo_url=photo_url)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Example #30
0
    def scrape_rep(self, name, term, url):
        """Scrape one Louisiana House member's page and save them."""
        # special case names that confuse name_tools
        if name == 'Franklin, A.B.':
            name = 'Franklin, A. B.'
        elif ', Jr., ' in name:
            name = name.replace(', Jr., ', ' ')
            name += ', Jr.'
        elif ', III, ' in name:
            name = name.replace(', III, ', ' ')
            name += ', III'

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            # District number comes from the linked districtNN.pdf file.
            district = page.xpath(
                "//a[contains(@href, 'district')]")[0].attrib['href']
            # Raw string with escaped dot: \d is a regex class (an invalid
            # string escape under py3) and '.' before pdf must be literal.
            district = re.search(r"district(\d+)\.pdf", district).group(1)

            if "Democrat&nbsp;District" in text:
                party = "Democratic"
            elif "Republican&nbsp;District" in text:
                party = "Republican"
            elif "Independent&nbsp;District" in text:
                party = "Independent"
            else:
                party = "Other"

            leg = Legislator(term, 'lower', district, name, party=party,
                             url=url)
            leg.add_source(url)
            self.save_legislator(leg)
Example #31
0
    def scrape_lower_chamber(self, term):
        """Scrape Oklahoma House members from the RadGrid roster table.

        For each row, parses the 'Last, First' name, district and party,
        then fetches the member's district page for a photo and offices.
        """
        url = "http://www.okhouse.gov/Members/Default.aspx"

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page,
            '//table[@id="ctl00_ContentPlaceHolder1_RadGrid1_ctl00"]/tbody/tr')

        for legislator_node in legislator_nodes:
            name_node = self.get_node(
                legislator_node,
                './/td[1]/a')

            # NOTE(review): if name_node is None, name_text/name from a
            # previous iteration (or an unbound name) are used below --
            # confirm whether such rows can occur.
            if name_node is not None:
                name_text = name_node.text.strip()

                # Names come as 'Last, First'; partition never fails, so
                # the ValueError branch below is effectively unreachable.
                last_name, delimiter, first_name = name_text.partition(',')

                if last_name is not None and first_name is not None:
                    first_name = first_name.strip()
                    last_name = last_name.strip()
                    name = ' '.join([first_name, last_name])
                else:
                    raise ValueError('Unable to parse name: {}'.format(
                        name_text))

                # Vacant seats are listed as 'House District NN'.
                if name.startswith('House District'):
                    continue

            district_node = self.get_node(
                legislator_node,
                './/td[3]')

            if district_node is not None:
                district = district_node.text.strip()

            party_node = self.get_node(
                legislator_node,
                './/td[4]')

            if party_node is not None:
                party_text = party_node.text.strip()

            # Map the scraped party label through the class's party table.
            party = self._parties[party_text]

            legislator_url = 'http://www.okhouse.gov/District.aspx?District=' + district

            legislator_page = self.lxmlize(legislator_url)

            photo_url = self.get_node(
                legislator_page,
                '//a[@id="ctl00_ContentPlaceHolder1_imgHiRes"]/@href')

            legislator = Legislator(
                _scraped_name=name_text,
                full_name=name,
                term=term,
                chamber='lower',
                district=district,
                party=party,
                photo_url=photo_url,
                url=legislator_url
            )

            legislator.add_source(url)
            legislator.add_source(legislator_url)

            # Scrape offices.
            self.scrape_lower_offices(legislator_page, legislator)

            self.save_legislator(legislator)
Example #32
0
    def scrape(self, chamber, term):
        """Scrape Arizona legislators from the member roster for *term*."""
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod(session)

        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
                                                               session_id, body)
        with self.urlopen(url) as page:
            root = html.fromstring(page)
            path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
            # Skip the header row.
            roster = root.xpath(path)[1:]
            for row in roster:
                position = ''
                vacated = ''
                name, district, party, email, room, phone, fax = row.getchildren()

                link = name.xpath('string(a/@href)')
                link = "http://www.azleg.gov" + link
                # One child == just the name link; extra tail text is a
                # leadership position (e.g. 'Speaker').
                if len(name) == 1:
                    name = name.text_content().strip()
                else:
                    position = name.tail.strip()
                    name = name[0].text_content().strip()

                district = district.text_content()
                party = party.text_content().strip()
                email = email.text_content().strip()

                if 'Vacated' in email:
                    # comment out the following 'continue' for historical
                    # legislative sessions
                    # for the current session, if a legislator has left we will
                    # skip him/her to keep from overwriting their information
                    continue
                    # NOTE: the two lines below are intentionally dead code;
                    # they only run when the 'continue' above is commented out.
                    vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
                    email = ''

                party = self.get_party(party)
                room = room.text_content().strip()
                if chamber == 'lower':
                    address = "House of Representatives\n"
                else:
                    address = "Senate\n"
                address = address + "1700 West Washington\n" + room  \
                                  + "\nPhoenix, AZ 85007"

                # Normalize phone/fax to include the 602 area code.
                phone = phone.text_content().strip()
                if not phone.startswith('602'):
                    phone = "602-" + phone
                fax = fax.text_content().strip()
                if not fax.startswith('602'):
                    fax = "602-" + fax
                # 'vacated' is only set on the dead path above; record an
                # end_date for departed members when that path is enabled.
                if vacated:
                    end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                    leg = Legislator( term, chamber, district, full_name=name,
                                      party=party, url=link)
                    leg['roles'][0]['end_date'] = end_date
                else:
                    leg = Legislator( term, chamber, district, full_name=name,
                                      party=party, office_phone=phone,
                                      office_fax=fax, office_address=address,
                                      email=email, url=link)

                if position:
                    leg.add_role( position, term, chamber=chamber,
                                 district=district, party=party)

                leg.add_source(url)

                #Probably just get this from the committee scraper
                #self.scrape_member_page(link, session, chamber, leg)
                self.save_legislator(leg)
Example #33
0
    def scrape(self, chamber, term):
        """Scrape Montana legislators from the session member CSV files."""

        # Find the start year and session number for this term.
        # NOTE(review): if *term* isn't in metadata, year/session_number
        # stay unbound and the url build below raises NameError -- confirm
        # callers always pass a known term.
        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Scrape committees. Also produce a name dictionary that can be
        # used for fuzzy matching between the committee page names and the
        # all-caps csv names.
        for name_dict, _ in scrape_committees(year, chamber):
            pass

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = [
            'last_name', 'first_name', 'party', 'district', 'address', 'city',
            'state', 'zip'
        ]
        csv_parser = csv.DictReader(data, fieldnames)

        # Maps 'HD'/'SD' -> district number -> detail-page URL.
        district_leg_urls = self._district_legislator_dict()

        for entry in csv_parser:
            if not entry:
                continue

            # City.
            entry['city'] = entry['city'].title()

            # Address.
            entry['address'] = entry['address'].title()

            # District. CSV value looks like 'HD 12' / 'SD 3'.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party.
            party_letter = entry['party']
            party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
            entry['party'] = party
            # NOTE(review): the assignment above is immediately undone by
            # this del; 'party' is passed to Legislator separately below.
            del entry['party']

            # Get full name properly capped.
            _fullname = '%s %s' % (entry['first_name'].capitalize(),
                                   entry['last_name'].capitalize())

            city_lower = entry['city'].lower()
            fullname = difflib.get_close_matches(_fullname,
                                                 name_dict[city_lower],
                                                 cutoff=0.5)

            # If there are no close matches with the committee page,
            # use the title-capped first and last name.
            if len(fullname) < 1:
                fullname = _fullname
                # msg = 'No matches found for "%s" with "%s" from %r'
                # self.debug(msg % (_fullname, fullname,
                #                   name_dict[city_lower]))
            else:
                fullname = fullname[0]
                # if _fullname != fullname:
                #     msg = 'matched "%s" with "%s" from %r'
                #     self.debug(msg % (_fullname, fullname,
                #                       name_dict[city_lower]))

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]
            deets = self._scrape_details(detail_url)

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    fullname,
                                    party=party)
            # Fold remaining CSV columns (address, city, state, zip, ...)
            # straight into the legislator record.
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            self.save_legislator(legislator)
Example #34
0
    def scrape(self, chamber, term):
        """Scrape Maryland legislators for *term* from the MD Manual.

        Parses the alphabetical member list, then each member's detail
        page for a photo plus capitol/district office contact details.
        """
        urls = {'lower': "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
                'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html"}
        # Raw string: \( and \w are regex escapes, not string escapes
        # (non-raw \w is an invalid escape under Python 3).
        detail_re = re.compile(r'\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)')

        with self.urlopen(urls[chamber]) as html:
            doc = lxml.html.fromstring(html)

            # rest of data on this page is <li>s that have anchor tags
            for a in doc.cssselect('li a'):
                link = a.get('href')
                # tags don't close so we get the <li> and <a> content and diff them
                name_text = a.text_content()
                detail_text = a.getparent().text_content().replace(name_text, '')

                # ignore if it is not a valid link
                if link:
                    # handle names
                    names = name_text.split(',')
                    last_name = names[0]
                    first_name = names[1].strip()
                    # TODO: try to trim first name to remove middle initial
                    if len(names) > 2:
                        suffixes = names[2]
                    else:
                        suffixes = ''

                    # handle details
                    details = detail_text.strip()
                    party, district = detail_re.match(details).groups()
                    party = PARTY_DICT[party]

                    leg_url = BASE_URL+link

                    leg = Legislator(term, chamber, district,
                                     ' '.join((first_name, last_name)),
                                     first_name, last_name,
                                     party=party, suffixes=suffixes,
                                     url=leg_url)
                    leg.add_source(url=leg_url)

                    with self.urlopen(leg_url) as leg_html:
                        leg_doc = lxml.html.fromstring(leg_html)
                        img_src = leg_doc.xpath('//img[@align="left"]/@src')
                        if img_src:
                            leg['photo_url'] = BASE_URL + img_src[0]

                        # address extraction
                        # this is pretty terrible, we get address in a format that looks
                        # like:
                        #   James Senate Office Building, Room 322
                        #   11 Bladen St., Annapolis, MD 21401
                        #   (410) 841-3565, (301) 858-3565; 1-800-492-7122, ext. 3565 (toll free)
                        #   e-mail: [email protected]
                        #   fax: (410) 841-3552, (301) 858-3552
                        #
                        #   Western Maryland Railway Station, 13 Canal St., Room 304, Cumberland, MD 21502
                        #   (301) 722-4780; 1-866-430-9553 (toll free)
                        #   e-mail: [email protected]
                        #   fax: (301) 722-4790
                        # usually first ul, sometimes first p
                        try:
                            addr_lines = leg_doc.xpath('//ul')[0].text_content().strip().splitlines()
                        except IndexError:
                            addr_lines = leg_doc.xpath('//p')[0].text_content().strip().splitlines()
                        addr_pieces = {'capitol': defaultdict(str),
                                       'district': defaultdict(str)}
                        addr_type = 'capitol'
                        for line in addr_lines:
                            # BUG FIX: Maryland phone numbers use area codes
                            # 410/301 (see samples above); the old test for
                            # '(401)' -- a Rhode Island code -- never matched.
                            if '(410)' in line or '(301)' in line:
                                addr_pieces[addr_type]['phone'] = line
                            elif 'toll free' in line:
                                pass # skip stand alone 1-800 numbers
                            elif 'e-mail' in line:
                                # BUG FIX: pages spell it 'e-mail: ' (see
                                # samples); stripping 'email: ' never matched
                                # and left the prefix on every address.
                                addr_pieces[addr_type]['email'] = line.replace('e-mail: ',
                                                                               '')
                            elif 'fax' in line:
                                addr_pieces[addr_type]['fax'] = line.replace('fax: ', '')
                            elif line == '':
                                # blank line separates capitol/district offices
                                addr_type = 'district'
                            else:
                                addr_pieces[addr_type]['address'] += '{0}\n'.format(line)
                        if addr_pieces['capitol']:
                            leg.add_office('capitol', 'Capitol Office',
                                           **addr_pieces['capitol'])
                            leg['email'] = (addr_pieces['capitol']['email'] or
                                            addr_pieces['district']['email'] or
                                            None)
                        if addr_pieces['district']:
                            leg.add_office('district', 'District Office',
                                           **addr_pieces['district'])

                    self.save_legislator(leg)
Example #35
0
    def scrape(self, chamber, term):
        """Scrape Tennessee legislators for *term* (current or archived)."""
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican',
            'I': 'Independent'
        }

        #testing for chamber
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'
        # Older terms live under an /archives/ path.
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name
            chamber_url += '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        page = self.lxmlize(chamber_url)

        for row in page.xpath("//tr"):

            # Skip any a header row.
            if set(child.tag for child in row) == set(['th']):
                continue

            vacancy_check = row.xpath('./td/text()')[1]
            if 'Vacant' in vacancy_check:
                self.logger.warning("Vacant Seat")
                continue

            partyInit = row.xpath('td[3]')[0].text.split()[0]
            party = parties[partyInit]
            district = row.xpath('td[5]/a')[0].text.split()[1]
            address = row.xpath('td[6]')[0].text_content()
            # 301 6th Avenue North Suite
            # Expand the LP / WMB building abbreviations into full addresses.
            address = address.replace(
                'LP', 'Legislative Plaza\nNashville, TN 37243')
            address = address.replace(
                'WMB', 'War Memorial Building\nNashville, TN 37243')
            address = '301 6th Avenue North\nSuite ' + address
            phone = [
                x.strip() for x in row.xpath('td[7]//text()') if x.strip()
            ][0]

            # Member e-mail is hidden behind a mailto: link in column 1.
            email = HTMLParser.HTMLParser().unescape(
                row.xpath('td[1]/a/@href')[0][len("mailto:"):])
            member_url = (root_url + url_chamber_name + '/members/' + abbr +
                          district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district + '.jpg')

            try:
                member_page = self.get(member_url, allow_redirects=False).text
            except (TypeError, HTTPError):
                # The guessed URL failed; fall back to the roster row link.
                try:
                    member_url = row.xpath('td[2]/a/@href')[0]
                    member_page = self.get(member_url,
                                           allow_redirects=False).text
                except (TypeError, HTTPError):
                    self.logger.warning("Valid member page does not exist.")
                    continue

            member_page = lxml.html.fromstring(member_page)
            try:
                name = member_page.xpath('//div/div/h1/text()')[0]
            except IndexError:
                name = member_page.xpath(
                    '//div[@id="membertitle"]/h2/text()')[0]

            # Strip the title prefix from the page heading: 'Speaker '
            # (8 chars), 'Lt. Governor ' (13), 'Representative ', or
            # 'Senator ' (8) -- hence the magic slice offsets.
            if 'Speaker' in name:
                full_name = name[8:len(name)]
            elif 'Lt.' in name:
                full_name = name[13:len(name)]
            elif abbr == 'h':
                full_name = name[len("Representative "):len(name)]
            else:
                full_name = name[8:len(name)]

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name.strip(),
                             party=party,
                             url=member_url,
                             photo_url=member_photo_url)
            leg.add_source(chamber_url)
            leg.add_source(member_url)

            # TODO: add district address from this page

            leg.add_office('capitol',
                           'Nashville Address',
                           address=address,
                           phone=phone,
                           email=email)

            self.save_legislator(leg)
Example #36
0
    def scrape(self, chamber, term):
        """Scrape Illinois legislators for *term*, including their offices."""
        # Roster URLs are keyed on the term minus its two-char suffix.
        term_slug = term[:-2]
        url = MEMBER_LIST_URL[chamber] % term_slug

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # The fifth table is the roster; the first two rows are headers.
        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name, _, _, district, party = row.xpath('td')
            district = district.text
            party = {
                'D': 'Democratic',
                'R': 'Republican',
                'I': 'Independent'
            }[party.text]
            leg_url = name.xpath('a/@href')[0]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                name = name.strip('*')
                continue

            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             url=leg_url)
            leg.add_source(url)

            hotgarbage = ('Senate Biography Information for the 98th General '
                          'Assembly is not currently available.')
            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning('No legislator bio available for ' + name)
                self.save_legislator(leg)
                continue

            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]
            # Percent-encode the path so photo URLs containing spaces
            # and other reserved characters stay valid.
            photo_url_parsed = urlparse(photo_url)
            encoded_path = quote(photo_url_parsed.path)
            photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
            leg.update(photo_url=photo_url)
            leg.add_source(leg_url)

            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                leg['email'] = email[0].tail

            # function for turning an IL contact info table to office details
            # (closes over `leg` above and adds an office to it)
            def _table_to_office(table, office_type, office_name):
                addr = ''
                phone = ''
                fax = None
                for row in table.xpath('tr'):
                    row = row.text_content().strip()
                    # skip rows that aren't part of address
                    if 'Office:' in row or row == 'Cook County':
                        continue
                    # fax number row ends with FAX
                    elif 'FAX' in row:
                        fax = row.replace(' FAX', '')
                    # phone number starts with ( [make it more specific?]
                    elif row.startswith('('):
                        phone = row
                    # everything else is an address
                    else:
                        addr += (row + '\n')
                if addr.strip() != ',':
                    leg.add_office(office_type,
                                   office_name,
                                   address=addr.strip(),
                                   phone=phone,
                                   fax=fax)

            # extract both offices from tables
            table = leg_doc.xpath(
                '//table[contains(string(), "Springfield Office")]')
            if table:
                _table_to_office(table[3], 'capitol', 'Springfield Office')
            table = leg_doc.xpath(
                '//table[contains(string(), "District Office")]')
            if table:
                _table_to_office(table[3], 'district', 'District Office')

            self.save_legislator(leg)
Example #37
0
    def scrape(self, chamber, term):
        """Scrape Washington legislators for one chamber and term.

        Pulls the member list from the WSL sponsor web service, filters out
        members no longer shown on the chamber's current-members page, and
        scrapes each member's home page for photo and office details.

        :param chamber: 'upper' or 'lower'.
        :param term: term identifier like '2013-2014'; split to build the
            biennium string (e.g. '2013-14').
        """
        biennium = "%s-%s" % (term[0:4], term[7:9])

        url = ("http://wslwebservices.leg.wa.gov/SponsorService.asmx/"
               "GetSponsors?biennium=%s" % biennium)

        # these pages are useful for checking if a leg is still in office
        if chamber == 'upper':
            cur_member_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
        else:
            cur_member_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'

        cur_members = self.get(cur_member_url).text
        cur_members_doc = lxml.html.fromstring(cur_members)
        cur_members_doc.make_links_absolute(cur_member_url)

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)

        for member in xpath(page, "//wa:Member"):

            # The service returns both chambers; keep only the requested one.
            mchamber = xpath(member, "string(wa:Agency)")
            mchamber = {'House': 'lower', 'Senate': 'upper'}[mchamber]

            if mchamber != chamber:
                continue

            name = xpath(member, "string(wa:Name)").strip()
            if name == "":
                continue

            # if the legislator isn't in the listing, skip them
            if name not in cur_members:
                self.warning('%s is no longer in office' % name)
                continue
            else:
                # Exactly one "Home Page" link is expected for this member;
                # the single-element unpack asserts that.
                leg_url, = set(cur_members_doc.xpath(
                    '//span[contains(text(), "%s")]/../..//'
                    'a[text()="Home Page"]/@href' % (
                        name
                    )))

            party = xpath(member, "string(wa:Party)")
            party = {'R': 'Republican', 'D': 'Democratic'}.get(
                party, party)

            district = xpath(member, "string(wa:District)")
            if district == '0':
                # Skip phony district 0.
                continue

            # NOTE(review): `email` is never used below, and `phone` is
            # overwritten by the office-parsing loop — confirm intended.
            email = xpath(member, "string(wa:Email)")
            phone = xpath(member, "string(wa:Phone)")

            last = xpath(member, "string(wa:LastName)")
            last = last.lower().replace(' ', '')

            scraped_offices = []
            photo_url = ""

            try:
                leg_page = self.get(leg_url).text
                leg_page = lxml.html.fromstring(leg_page)
                leg_page.make_links_absolute(leg_url)

                photo_link = leg_page.xpath(
                    "//a[contains(@href, 'publishingimages')]")
                if photo_link:
                    photo_url = photo_link[0].attrib['href']
                # Office blocks: <br>-separated lines; last line is dropped,
                # the one before it is the phone number, the rest the address.
                # (Relies on Python 2 `filter` returning a list for .pop().)
                offices = leg_page.xpath("//table[@cellspacing='0']/tr/td/b[contains(text(), 'Office')]")
                for office in offices:
                    office_block = office.getparent()
                    office_name = office.text_content().strip().rstrip(":")
                    address_lines = [x.tail for x in office_block.xpath(".//br")]
                    address_lines = filter(lambda a: a is not None, address_lines)
                    _ = address_lines.pop(len(address_lines) - 1)
                    phone = address_lines.pop(len(address_lines) - 1)
                    address = "\n".join(address_lines)
                    obj = {
                        "name": office_name,
                        "phone": phone
                    }
                    if address.strip() != '':
                        obj['address'] = address

                    scraped_offices.append(obj)

            except scrapelib.HTTPError:
                # Sometimes the API and website are out of sync
                # with respect to legislator resignations/appointments
                pass
            except requests.exceptions.ConnectionError:
                # Sometimes the API and website are out of sync
                # with respect to legislator resignations/appointments
                pass

            leg = Legislator(term, chamber, district,
                             name, '', '', '', party,
                             photo_url=photo_url, url=leg_url)
            leg.add_source(leg_url)

            for office in scraped_offices:
                typ = 'district' if 'District' in office['name'] else 'capitol'
                leg.add_office(typ, office.pop('name'), **office)

            self.save_legislator(leg)
Example #38
0
    def scrape_legislator(self, chamber, term, option):
        """Scrape a single legislator from one <option> of the member picker.

        :param chamber: 'upper' or 'lower'.
        :param term: term identifier, passed through to Legislator/add_role.
        :param option: lxml <option> element whose @value is the relative
            member-page URL and whose text is "Name, Party, District N".

        Returns early (without saving) when the district is '[N/A]' or the
        member page 404s/503s.
        """
        url = urlparse.urljoin(self.url, option.attrib['value'])
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        if district == '[N/A]':
            msg = 'No district found for %r; skipping.'
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        try:
            html = self.urlopen(url)
        except scrapelib.HTTPError as exc:
            # As of July 2014, this only happens when a page has
            # gone missing from their varnish server.
            # if exc.response.status_code is 503:
            self.logger.exception(exc)
            self.logger.warning('Skipping legislator at url: %s' % url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees: each table row is (committee, role).
        for tr in doc.xpath('//table//tr'):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            # Normalize the role labels used on the site.
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices: the page carries two <address> blocks, the
        # district office address and its phone number.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(address=dist_office,
                       name='District Office',
                       type='district',
                       phone=phone)

        self.save_legislator(leg)
Example #39
0
    def scrape(self, term, chambers):
        """Scrape Connecticut legislators from the state's CSV export.

        Builds one Legislator per CSV row (skipping chambers that weren't
        requested), attaches the capitol office and any joint-committee
        roles, and saves the record.
        """
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        data = self.urlopen(leg_url)
        page = open_csv(data)

        # CSV "office code" letter -> our chamber name.
        chamber_by_code = {'H': 'lower', 'S': 'upper'}

        for row in page:
            chamber = chamber_by_code[row['office code']]
            if chamber not in chambers:
                continue

            district = row['dist'].lstrip('0')

            # Display name: first [middle] last [suffix], blanks omitted.
            pieces = [row['first name']]
            mid = row['middle initial'].strip()
            if mid:
                pieces.append(mid)
            pieces.append(row['last name'])
            suffix = row['suffix'].strip()
            if suffix:
                pieces.append(suffix)
            name = " ".join(pieces)

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'],
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol',
                           'Capitol Office',
                           address=office_address,
                           phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            # Committee memberships look like "Name (role);Name;..."
            for comm in row['committee member1'].split(';'):
                if not comm:
                    continue
                role = 'member'
                if ' (' in comm:
                    comm, role = comm.split(' (')
                    role = role.strip(')').lower()
                leg.add_role('committee member',
                             term,
                             chamber='joint',
                             committee=comm.strip(),
                             position=role)

            self.save_legislator(leg)
Example #40
0
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        """Scrape one Mississippi legislator's detail XML and save a record.

        :param chamber: 'upper' or 'lower' (also selects the email domain).
        :param term: term identifier.
        :param leg_name: display name; used for party special-casing.
        :param leg_link: relative member-detail path; falsy for vacant seats.
        :param role: passed through to the Legislator record.
        :raises AssertionError: when no party can be determined for a member.
        """
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.get(url)
            root = lxml.etree.fromstring(details_page.content)
            party = root.xpath('string(//PARTY)')

            district = root.xpath('string(//DISTRICT)')

            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')

            home_address = root.xpath('string(//H_ADDRESS)')
            home_address2 = root.xpath('string(//H_ADDRESS2)')
            home_city = root.xpath('string(//H_CITY)')
            home_zip = root.xpath('string(//H_ZIP)')

            home_address_total = "%s\n%s\n%s\n%s" % (
                home_address,
                home_address2,
                home_city,
                home_zip
            )

            bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
            cap_room = root.xpath('string(//CAP_ROOM)')

            # Members missing a party in the feed are special-cased; the
            # asserts force removal of the special case once upstream fixes it.
            if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
                assert not party, "Remove special-casing for this Democrat without a listed party: {}".format(leg_name)
                party = 'Democratic'
            elif leg_name in ('James W. Mathis', 'John Glen Corley'):
                assert not party, "Remove special-casing for this Republican without a listed party: {}".format(leg_name)
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise AssertionError(
                    "A member with no identifiable party was found: {}".format(leg_name))

            leg = Legislator(term, chamber, district, leg_name, party=party, role=role,
                             org_info=org_info, url=url, photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            # Feed sometimes holds a full address, sometimes just the
            # local part of an @senate/@house address.
            if email_name != "":
                if "@" in email_name:
                    email = email_name
                else:
                    email = '%s@%s.ms.gov' % (email_name,
                                              {"upper": "senate", "lower": "house"}[chamber])
                kwargs['email'] = email

            if capital_phone != "":
                kwargs['phone'] = capital_phone

            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol', 'Capitol Office', **kwargs)

            kwargs = {}
            if home_phone != "":
                kwargs['phone'] = home_phone

            # BUG FIX: home_address_total is built with embedded newlines,
            # so it was never equal to "" even when every component was
            # empty; strip before testing so all-blank addresses are skipped.
            if home_address_total.strip() != "":
                kwargs['address'] = home_address_total

            if kwargs != {}:
                leg.add_office('district', 'District Office', **kwargs)

            self.save_legislator(leg)
        except scrapelib.HTTPError as e:
            self.warning(str(e))
Example #41
0
    def scrape(self, chamber, term):
        """Scrape Tennessee legislators for one chamber and term.

        Reads the chamber roster table (current or archived, depending on
        the term), then each member page for the display name, and saves
        one Legislator per row.

        :param chamber: 'upper' or 'lower'.
        :param term: term identifier; validated against the metadata.
        """
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican'
        }

        #testing for chamber
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'
        # Older terms live under an archive path.
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name + '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        with self.urlopen(chamber_url) as page:
            page = lxml.html.fromstring(page)

            for row in page.xpath("//tr")[1:]:
                partyInit = row.xpath('td[2]')[0].text.split()[0]
                party = parties[partyInit]
                district = row.xpath('td[4]/a')[0].text.split()[1]
                phone = row.xpath('td[6]')[0].text
                #special case for Karen D. Camper
                # (use `is None`, not `== None`, for the missing-cell check)
                if phone is None:
                    phone = row.xpath('td[6]/div')[0].text
                phone = '615-' + phone.split()[0]
                email = row.xpath('td[7]/a')[0].text
                member_url = (root_url + url_chamber_name + '/members/' +
                              abbr + district + '.html')
                member_photo_url = (root_url + url_chamber_name +
                                    '/members/images/' + abbr + district +
                                    '.jpg')

                with self.urlopen(member_url) as member_page:
                    member_page = lxml.html.fromstring(member_page)
                    name = member_page.xpath(
                        '//div[@id="membertitle"]/h2')[0].text
                    # Strip the title prefix from the heading; slice widths
                    # match "Speaker ", "Lt. Governor ", "Rep. ", "Senator ".
                    if 'Speaker' in name:
                        full_name = name[8:len(name)]
                    elif 'Lt.' in name:
                        full_name = name[13:len(name)]
                    elif abbr == 'h':
                        full_name = name[5:len(name)]
                    else:
                        full_name = name[8:len(name)]

                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     full_name,
                                     party=party,
                                     email=email,
                                     phone=phone,
                                     url=member_url,
                                     photo_url=member_photo_url)
                    leg.add_source(chamber_url)
                    leg.add_source(member_url)
                    self.save_legislator(leg)
Example #42
0
    def scrape_session(self, term, chambers, session):
        """Scrape Georgia legislators for one session via the member service.

        :param term: term identifier passed through to Legislator.
        :param chambers: unused here; kept for interface compatibility.
        :param session: key into self.metadata['session_details'].
        """
        session = self.metadata['session_details'][session]
        sid = session['_guid']
        members = self.sservice.GetMembersBySession(sid)['MemberListing']
        for member in members:
            guid = member['Id']
            nick_name, first_name, middle_name, last_name = (
                member['Name'][x]
                for x in ['Nickname', 'First', 'Middle', 'Last'])
            chamber, district = (member['District'][x]
                                 for x in ['Type', 'Number'])

            party = member['Party']
            if party == 'Democrat':
                party = 'Democratic'

            # Prefer the nickname when one is present.
            first_name = nick_name if nick_name else first_name

            # Upstream sometimes stuffs the middle name into first_name,
            # so the display name is deliberately just "first last".
            name = "%s %s" % (first_name, last_name)

            chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

            if party.strip() == '':
                party = 'other'

            # first_name/middle_name/last_name are deliberately not stored:
            # upstream doesn't keep first names first-only.
            legislator = Legislator(
                term,
                chamber,
                str(district),
                name,
                party=party,
                _guid=guid)

            # Only add a district office when every address component is set.
            ainfo = [
                member['DistrictAddress'][x]
                for x in ['Street', 'City', 'State', 'Zip']
            ]
            if None not in ainfo:
                ainfo = [x.strip() for x in ainfo]
                address = " ".join(ainfo)
                email = member['DistrictAddress']['Email']
                legislator.add_office('district',
                                      'District Address',
                                      address=address,
                                      email=email)

            legislator.add_source(self.ssource)
            self.save_legislator(legislator)
Example #43
0
    def scrape(self, chamber, term):
        """Scrape legislators from the chamber's SharePoint member listing.

        Each member sits in a 'ms-rtestate-field' block; the preceding table
        cell holds the photo, and key/value pairs (Party, District, Capitol
        Address/Phone, Email) are parsed out of <p> tags whose entries are
        separated by <br> elements.

        :param chamber: 'upper' or 'lower'; selects self.URLs entry.
        :param term: term identifier passed to Legislator.
        """
        url = self.URLs[chamber]
        page = self.lxmlize(url)

        for block in page.xpath("//div[@class='ms-rtestate-field']")[1:-1]:
            # Each legislator block.

            photo_block = block.xpath("ancestor::td/preceding-sibling::td")
            if len(photo_block) == 0:
                continue

            h2s = block.xpath(".//h2/a")
            if len(h2s) != 1:
                # We've got a Vacant person.
                print("Found a Vacant position. Skipping block.")
                continue

            h2, = h2s
            name = h2.text.strip()

            photo_block, = photo_block
            # (The <td> before ours was the photo)
            img, = photo_block.xpath("*")
            img = img.attrib['src']

            info = {}
            # Right, now let's get info out of their little profile box.
            # itergraphs yields the elements between successive <br> tags;
            # each group is one key/value pair in one of three shapes.
            for entry in block.xpath(".//p"):
                key = None
                for kvpair in itergraphs(entry.xpath("./*"), 'br'):
                    # OK. We either get the tail or the next element
                    # (usually an <a> tag)
                    if len(kvpair) == 1:
                        key, = kvpair
                        value = key.tail.strip() if key.tail else None
                        if value:
                            value = re.sub("\s+", " ", value).strip()
                    elif len(kvpair) == 2:
                        key, value = kvpair
                        # Markup quirk: the leading 'P' of "Party:" can end
                        # up outside the element, leaving "arty:" here.
                        if value.text_content().strip() == "arty:":
                            key = value
                            value = value.tail
                    elif len(kvpair) == 3:
                        k1, k2, value = kvpair
                        # As seen with a <stong><strong>Email:</strong></strong>
                        t = lambda x: x.text_content().strip()
                        assert t(k1) == "" or t(k2) == ""
                        if t(k1) != "":
                            key = k1
                        else:
                            key = k2
                    else:
                        # Never seen text + an <a> tag, perhaps this can happen.
                        raise ValueError(
                            "Too many elements. Something changed")

                    key = key.text_content().strip(" :")
                    if value is None:
                        # A page has the value in a <strong> tag. D'oh.
                        key, value = (x.strip() for x in key.rsplit(":", 1))

                    key = re.sub("\s+", " ", key).strip()
                    key = key.replace(":", "")
                    # Repair the truncated "Party" key (see quirk above).
                    if key == "arty":
                        key = "Party"

                    info[key] = value

            info['District'] = info['District'].encode('ascii',
                                                       'ignore').strip()

            info['Party'] = info['Party'].strip(": ").replace(u"\u00a0", "")

            leg = Legislator(term=term,
                             url=h2.attrib['href'],
                             chamber=chamber,
                             full_name=name,
                             party=info['Party'],
                             district=info['District'],
                             photo_url=img)
            leg.add_source(url)

            # The phone key may also be truncated ("apitol Phone"), and the
            # value may still be an lxml element rather than text.
            phone = info.get('Capitol Phone', info.get('apitol Phone'))
            if hasattr(phone, 'text_content'):
                phone = phone.text_content()

            leg.add_office(type='capitol',
                           name='Capitol Office',
                           address=info['Capitol Address'],
                           phone=phone,
                           email=info['Email'].attrib['href'].replace(
                               "mailto:", ""))

            self.save_legislator(leg)
Example #44
0
    def scrape(self, chamber, term):
        """Scrape Rhode Island legislators for one chamber from the XLS roster.

        Cross-references the roster spreadsheet with the chamber listing page
        (for homepage URLs) and the email-list page (for district phone
        numbers), then saves one Legislator per non-vacant row.

        :param chamber: 'upper' or 'lower'.
        :param term: term identifier passed to Legislator.
        """
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator'
            source_url = 'http://www.rilin.state.ri.us/senators/default.aspx'
            source_url_title_replacement = rep_type
            contact_url = 'http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp'
        elif chamber == 'lower':
            url = (
                'http://webserver.rilin.state.ri.us/Documents/Representatives.xls'
            )
            rep_type = 'Representative'
            source_url = 'http://www.rilin.state.ri.us/representatives/default.aspx'
            source_url_title_replacement = 'Rep. '
            contact_url = 'http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp'

        self.urlretrieve(url, 'ri_leg.xls')

        wb = xlrd.open_workbook('ri_leg.xls')
        sh = wb.sheet_by_index(0)

        # This isn't perfect but it's cheap and better than using the
        # XLS doc as the source URL for all legislators.
        # 374: RI: legislator url
        leg_source_url_map = {}
        leg_page = self.lxmlize(source_url)

        for link in leg_page.xpath('//td[@class="ms-vb2"]'):
            leg_name = link.text_content().replace(
                source_url_title_replacement, '')
            leg_url = link.xpath("..//a")[0].attrib['href']
            leg_source_url_map[leg_name] = leg_url

        # These pages are identical for every row, so fetch them once up
        # front instead of re-downloading them once per legislator.
        url_names = lxml.html.fromstring(self.get(source_url).text)
        url_names = url_names.xpath('//td[@class="ms-vb2"]/a/@href')

        contact = self.lxmlize(contact_url)
        contact_phone = contact.xpath(
            '//tr[@valign="TOP"]//td[@class="bodyCopy"]/text() | //td[@class="bodyCopy"]//center/text()'
        )

        for rownum in xrange(1, sh.nrows):
            d = {}
            for field, col_num in excel_mapping.iteritems():
                d[field] = sh.cell(rownum, col_num).value

            if d['full_name'].upper() == "VACANT":
                self.warning("District {}'s seat is vacant".format(
                    int(d['district'])))
                continue

            slug = re.match(
                "(?P<class>sen|rep)-(?P<slug>.*)@(rilin\.state\.ri\.us|rilegislature\.gov)",
                d['email'])

            if 'asp' in d['email']:
                d['email'] = None

            if d['email'] is not None:
                # NOTE(review): if a non-.asp email doesn't match the slug
                # pattern, `slug` is None and this raises AttributeError —
                # confirm all real addresses use the expected domains.
                info = slug.groupdict()
                info['chamber'] = "senators" if info[
                    'class'] == 'sen' else "representatives"

                url = ("http://www.rilin.state.ri.us/{chamber}/"
                       "{slug}/Pages/Biography.aspx".format(**info))

            dist = str(int(d['district']))
            district_name = dist

            assert d['full_name'].startswith(rep_type), "Improper name found"
            full_name = re.sub(r"^{}(?=\s?[A-Z].*$)".format(rep_type), '',
                               d['full_name']).strip()
            translate = {
                "Democrat": "Democratic",
                "Republican": "Republican",
                "Independent": "Independent"
            }

            # Match this row against the listing-page hrefs to recover the
            # legislator's homepage URL.
            homepage_url = None
            modified_name = re.sub(r'[^\w\s]', '', full_name)
            modified_name = modified_name.replace(' ', '').lower()

            for el in url_names:
                if 'default.aspx' in el:
                    el = el.replace('default.aspx', '')
                if el[-1] == '/':
                    el = el[:-1]
                el = el.lower()
                url_name_array = el.split('/')
                if url_name_array[-1] in modified_name:
                    # remove '/default.aspx' and add last name
                    homepage_url = source_url[:-12] + url_name_array[-1]

            kwargs = {
                "town_represented": d['town_represented'],
            }

            # The phone number sits two cells after the district number on
            # the contact page.
            phone = None
            for el in contact_phone:
                if len(el) <= 2 and dist == el:
                    number = contact_phone.index(el)
                    phone = contact_phone[number + 2]
                    phone = phone.strip()

            email = None
            if d['email'] is not None:
                email = d['email']

            if homepage_url is not None:
                kwargs['url'] = homepage_url

            # BUG FIX: was `d['address'] is ''` — an identity check that
            # only worked via CPython string interning; compare by value.
            if d['address'] == '':
                d['address'] = 'No Address Found'

            leg = Legislator(term, chamber, district_name, full_name, '', '',
                             '', translate[d['party']], **kwargs)

            # (Label typo fixed: previously saved as "Dictrict Office".)
            leg.add_office('district',
                           'District Office',
                           address=d['address'],
                           phone=phone,
                           email=email)
            leg.add_source(source_url)
            leg.add_source(contact_url)
            if homepage_url:
                leg.add_source(homepage_url)
            self.save_legislator(leg)
Example #45
0
    def scrape(self, term, chambers):
        """Scrape New Jersey legislators from the Roster/LegBio Access tables.

        Initializes the MDB export for the term's year, joins photo URLs
        from the LegBio table onto Roster rows, and saves one Legislator per
        active member.

        :param term: term identifier; first four characters are the year.
        :param chambers: unused here; kept for interface compatibility.
        """
        year_abr = term[0:4]

        self._init_mdb(year_abr)

        roster_csv = self.access_to_csv('Roster')
        bio_csv = self.access_to_csv('LegBio')

        # Roster Key -> photo URL, from the bio table.
        photos = {}
        for rec in bio_csv:
            photos[rec['Roster Key']] = rec['URLPicture']

        for rec in roster_csv:
            first_name = rec["Firstname"]
            middle_name = rec["MidName"]
            last_name = rec["LastName"]
            suffix = rec["Suffix"]
            # NOTE(review): the final slice drops the last character — it
            # removes the trailing space left when Suffix is empty, but also
            # truncates a non-empty suffix by one character; confirm intended.
            full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
            full_name = full_name.replace('  ', ' ')
            full_name = full_name[0: len(full_name) - 1]

            district = int(rec["District"])
            party = rec["Party"]
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democratic"
            chamber = rec["House"]
            if chamber == 'A':
                chamber = "lower"
            elif chamber == 'S':
                chamber = "upper"

            leg_status = rec["LegStatus"]
            # skip Deceased/Retired members
            if leg_status != 'Active':
                continue
            title = rec["Title"]
            legal_position = rec["LegPos"]
            phone = rec["Phone"] or None
            email = None
            if rec["Email"]:
                email = rec["Email"]
            try:
                photo_url = photos[rec['Roster Key']]
            except KeyError:
                photo_url = ''
                self.warning('no photo url for %s', rec['Roster Key'])
            url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
                   str(int(rec['Roster Key'])))
            address = '{0}\n{1}, {2} {3}'.format(rec['Address'], rec['City'],
                                                 rec['State'], rec['Zipcode'])
            gender = {'M': 'Male', 'F': 'Female'}[rec['Sex']]

            leg = Legislator(term, chamber, str(district), full_name,
                             first_name, last_name, middle_name, party,
                             suffixes=suffix, title=title,
                             legal_position=legal_position,
                             url=url, photo_url=photo_url,
                             gender=gender)
            leg.add_office('district', 'District Office', address=address,
                           phone=phone, email=email)
            leg.add_source(url)
            leg.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_legislator(leg)
Example #46
0
    def scrape_reps(self, chamber, term):
        """Scrape Ohio House members for the given term.

        Visits each of the 99 district pages, pulls the capitol office
        contact details, and records the member plus any committee roles
        listed on the page.
        """
        # There are 99 House districts
        for district in xrange(1, 100):
            rep_url = ('http://www.house.state.oh.us/components/'
                       'com_displaymembers/page.php?district=%d' % district)

            with self.urlopen(rep_url) as page:
                page = lxml.html.fromstring(page)

                # Split the contents of the "info" cell into runs of
                # elements delimited by <strong> headers; each run is one
                # labeled section of the member's info box.
                ranges = []
                cur = []
                info = page.xpath('//td[@class="info"]/*')
                for r in info:
                    if r.tag == 'strong':
                        ranges.append(cur)
                        cur = []
                    else:
                        cur.append(r)
                ranges.append(cur)

                # NOTE(review): section index 4 is assumed to hold the
                # address lines (last element dropped) -- confirm against
                # the live page layout.
                block = ranges[4][:-1]

                # Address lines are the tail text after each element.
                address = ", ".join(
                    [ x.tail.strip() for x in block ])

                # Phone/fax numbers are the text immediately following
                # their <strong> labels.
                phone = page.xpath(
                    "//strong[contains(text(), 'Phone')]")[0].tail

                fax = page.xpath(
                    "//strong[contains(text(), 'Fax')]")[0].tail

                for el in page.xpath('//table[@class="page"]'):
                    rep_link = el.xpath('tr/td/title')[0]
                    full_name = rep_link.text
                    # The name ends with a parenthesized party letter,
                    # e.g. "Jane Doe (R)"; take the letter and trim the
                    # " (X)" suffix. (Assumed format -- verify.)
                    party = full_name[-2]
                    full_name = full_name[0:-3]

                    if full_name == 'Vacant Posit':
                        continue

                    if party == "D":
                        party = "Democratic"
                    elif party == "R":
                        party = "Republican"


                    leg = Legislator(term, chamber, str(district),
                                     full_name, party=party, url=rep_url)
                    leg.add_office('capitol',
                                   'Capitol Office',
                                    address=address,
                                    phone=phone,
                                    fax=fax)  # Yet, no email.

                    committees = page.xpath("//table[@class='billLinks']")[0]
                    for committee in committees.xpath(".//tr"):
                        td = committee.xpath(".//td")
                        # Stop at the first row that is not a two-cell
                        # (name, role) pair.
                        if len(td) != 2:
                            break

                        name, role = td
                        name, role = name.text_content(), role.text_content()
                        name, role = name.strip(), role.strip()
                        if name[0] == "|":
                            continue

                        if name.strip() == "Committee Name":
                            continue

                        # Joint committees are recorded under the "joint"
                        # chamber rather than the member's own chamber.
                        chmbr = chamber
                        if "joint" in name.lower():
                            chmbr = "joint"

                        if name in JOINT_COMMITTEE_OVERRIDE:
                            chmbr = "joint"

                        leg.add_role('committee member',
                            term=term,
                            chamber=chmbr,
                            committee=name,
                            position=role
                        )

                    leg.add_source(rep_url)
                    self.save_legislator(leg)
Example #47
0
    def scrape(self, chamber, term):
        """Scrape Tennessee legislators for one chamber and term.

        Reads the roster table for the chamber (archived URL for past
        terms), then visits each member page to extract the display name.
        Adds a Nashville (capitol) office for every member.
        """
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican',
            'I': 'Independent'
        }

        # Map the chamber to the site's URL naming and member-page prefix.
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'

        # Older terms live under an archives path; the current term uses
        # the plain members index.
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name
            chamber_url += '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        page = self.urlopen(chamber_url)
        page = lxml.html.fromstring(page)

        for row in page.xpath("//tr")[1:]:

            # Skip any header row (every cell is a <th>).
            if set(child.tag for child in row) == set(['th']):
                continue

            party_init = row.xpath('td[2]')[0].text.split()[0]
            party = parties[party_init]
            district = row.xpath('td[4]/a')[0].text.split()[1]
            address = row.xpath('td[5]')[0].text_content()
            # Expand the site's building abbreviations into full mailing
            # addresses at 301 6th Avenue North.
            address = address.replace(
                'LP', 'Legislative Plaza\nNashville, TN 37243')
            address = address.replace(
                'WMB', 'War Memorial Building\nNashville, TN 37243')
            address = '301 6th Avenue North\nSuite ' + address
            phone = row.xpath('td[6]')[0].text
            # Special case for Karen D. Camper: the phone is wrapped in a
            # div. Use an identity check for None (was ``== None``).
            if phone is None:
                phone = row.xpath('td[6]/div')[0].text
            phone = '615-' + phone.split()[0]
            email = row.xpath('td[7]/a')[0].text
            member_url = (root_url + url_chamber_name + '/members/' + abbr +
                          district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district + '.jpg')

            member_page = self.urlopen(member_url)
            member_page = lxml.html.fromstring(member_page)
            name = member_page.xpath('//div[@id="membertitle"]/h2')[0].text
            # Strip the title prefix from the displayed name; the offsets
            # match the literal prefixes used on the member pages
            # ("Speaker ", "Lt. Governor ", "Rep. ", "Sen. " -- assumed,
            # confirm against the live pages).
            if 'Speaker' in name:
                full_name = name[8:]
            elif 'Lt.' in name:
                full_name = name[13:]
            elif abbr == 'h':
                full_name = name[5:]
            else:
                full_name = name[8:]

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name.strip(),
                             party=party,
                             email=email,
                             url=member_url,
                             photo_url=member_photo_url)
            leg.add_source(chamber_url)
            leg.add_source(member_url)

            # TODO: add district address from this page

            leg.add_office('capitol',
                           'Nashville Address',
                           address=address,
                           phone=phone)

            self.save_legislator(leg)
Example #48
0
    def scrape(self, chamber, term):
        """Scrape Iowa legislators (2011-2012 term only) for a chamber.

        Reads the roster table, then follows each member page to collect
        committee memberships; Appropriations subcommittees are recorded
        under the parent committee with a ``subcommittee`` field.

        Raises NoDataForPeriod for any term other than 2011-2012.
        """
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            chamber_name = 'senate'
        else:
            chamber_name = 'house'

        url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@class="legis"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
            name = link.text.strip()
            district = link.xpath("string(../../td[2])")
            party = link.xpath("string(../../td[3])")
            email = link.xpath("string(../../td[5])")

            # Normalize the site's party label to the canonical name.
            if party == 'Democrat':
                party = 'Democratic'

            # Raw string fixes the invalid escape sequence in "\d".
            pid = re.search(r"PID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                         "?GA=84&PID=%s" % pid)

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             email_address=email,
                             photo_url=photo_url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))
            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                # A trailing "(...)" holds the member's position, e.g.
                # "(Chair)"; default to plain membership otherwise.
                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    # Record subcommittees under the parent committee.
                    sub = re.match(r'^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member',
                                 term,
                                 chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member',
                                 term,
                                 chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
Example #49
0
    def scrape(self, term, chambers):
        """Scrape New Jersey legislators for ``term`` from the ROSTER dbf.

        Photo URLs come from the companion LEGBIO table, joined on
        ``roster_key``. Deceased/retired (non-Active) members are skipped.
        """
        year_abr = term[0:4]

        file_url, db = self.get_dbf(year_abr, 'ROSTER')
        bio_url, bio_db = self.get_dbf(year_abr, 'LEGBIO')

        # roster_key -> photo URL
        photos = {}
        for rec in bio_db:
            photos[rec['roster_key']] = rec['urlpicture']

        for rec in db:
            first_name = rec["firstname"]
            middle_name = rec["midname"]
            last_name = rec["lastname"]
            suffix = rec["suffix"]
            full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
            full_name = full_name.replace('  ', ' ')
            # NOTE(review): unconditionally drops the final character --
            # trims the trailing space left by an empty suffix, but also
            # shortens a non-empty suffix; behavior preserved as-is.
            full_name = full_name[:-1]

            district = int(rec["district"])
            party = rec["party"]
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democratic"
            # Any other party code is kept verbatim.

            chamber = rec["house"]
            if chamber == 'A':
                chamber = "lower"
            elif chamber == 'S':
                chamber = "upper"

            # Skip Deceased/Retired members.
            if rec["legstatus"] != 'Active':
                continue

            title = rec["title"]
            legal_position = rec["legpos"]
            if 'email' in rec:
                email = rec["email"]
            else:
                email = ''
            photo_url = photos[rec['roster_key']]
            url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
                   str(int(rec['roster_key'])))
            address = '{0}\n{1}, {2} {3}'.format(rec['address'], rec['city'],
                                                 rec['state'], rec['zipcode'])
            gender = {'M': 'Male', 'F': 'Female'}[rec['sex']]

            leg = Legislator(term,
                             chamber,
                             str(district),
                             full_name,
                             first_name,
                             last_name,
                             middle_name,
                             party,
                             suffixes=suffix,
                             title=title,
                             legal_position=legal_position,
                             email=email,
                             url=url,
                             photo_url=photo_url,
                             gender=gender)
            leg.add_source(url)
            leg.add_source(file_url)
            leg.add_office('district',
                           'District Office',
                           address=address,
                           phone=rec['phone'])
            self.save_legislator(leg)
Example #50
0
    def _scrape_representative(self, url, term, parties):
        """
        Returns a Legislator object representing a member of the lower
        legislative chamber.
        """
        #url = self.get(url).text.replace('<br>', '')
        member_page = self.lxmlize(url)

        photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
        if photo_url.endswith('/.jpg'):
            photo_url = None

        scraped_name, district_text = member_page.xpath(
            '//div[@class="member-info"]/h2')
        scraped_name = scraped_name.text_content().strip().replace('Rep. ', '')
        scraped_name = ' '.join(scraped_name.split())

        name = scraped_name

        district_text = district_text.text_content().strip()
        district = str(self.district_re.search(district_text).group(1))

        # Vacant house "members" are named after their district numbers:
        if re.match(r'^\d+$', scraped_name):
            return None

        party = parties[district]

        legislator = Legislator(term,
                                'lower',
                                district,
                                name,
                                party=party,
                                url=url,
                                _scraped_name=scraped_name)
        if photo_url is not None:
            legislator['photo_url'] = photo_url

        legislator.add_source(url)

        def office_name(element):
            """Returns the office address type."""
            return element.xpath('preceding-sibling::h4[1]/text()')[0] \
                .rstrip(':')

        offices_text = [{
            'name':
            office_name(p_tag),
            'type':
            office_name(p_tag).replace(' Address', '').lower(),
            'details':
            p_tag.text_content()
        } for p_tag in member_page.xpath(
            '//h4/following-sibling::p[@class="double-space"]')]

        for office_text in offices_text:
            details = office_text['details'].strip()

            # A few member pages have blank office listings:
            if details == '':
                continue

            # At the time of writing, this case of multiple district
            # offices occurs exactly once, for the representative at
            # District 43:
            if details.count('Office') > 1:
                district_offices = [
                    district_office.strip() for district_office in re.findall(
                        r'(\w+ Office.+?(?=\w+ Office|$))',
                        details,
                        flags=re.DOTALL)
                ]
                offices_text += [{
                    'name':
                    re.match(r'\w+ Office', office).group(),
                    'type':
                    'district',
                    'details':
                    re.search(r'(?<=Office).+(?=\w+ Office|$)?', office,
                              re.DOTALL).group()
                } for office in district_offices]

            match = self.address_re.search(details)
            if match is not None:
                address = re.sub(' +$',
                                 '',
                                 match.group().replace('\r', '').replace(
                                     '\n\n', '\n'),
                                 flags=re.MULTILINE)
            else:
                # No valid address found in the details.
                continue

            phone_number = extract_phone(details)
            fax_number = extract_fax(details)

            legislator.add_office(office_text['type'],
                                  office_text['name'],
                                  address=address,
                                  phone=phone_number,
                                  fax=fax_number)

        return legislator
Example #51
0
    def scrape_member(self, chamber, year, member_url):
        """Scrape a single Kentucky member page into a Legislator.

        Extracts the photo, name/party from the page header, the district
        and any extra roles from the bold items, and the Frankfort office
        address and annex phone from the contact table.
        """
        with self.urlopen(member_url) as member_page:

            member = {}
            member_root = lxml.html.fromstring(member_page)

            table = member_root.xpath('//body/div[2]/table')[0]

            imgtag = member_root.xpath('//body/div[2]/table//img')

            member['photo_url'] = imgtag[0].get('src')

            # First <strong> holds "<title> First ... Last (P)"
            # (assumed page format -- confirm against live pages).
            name_list = [
                mem.text for mem in table.iterdescendants(tag='strong')
            ][0].split(' ')

            member['full_name'] = ' '.join(name_list[1:-1]).strip()

            # The trailing token is the parenthesized party letter.
            party = name_list[-1]
            party = re.sub(r'\(|\)', '', party)
            if party == 'R':
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'I':
                party = 'Independent'

            member['party'] = party

            boldList = [bold.text for bold in table.iterdescendants(tag='b')]

            for item in boldList:
                # Identity check is the correct None test (was ``== None``).
                if item is None:
                    continue
                elif 'District' in item:
                    district = item.split(' ')[-1]
                    member['district'] = district.strip()
                else:
                    # ``dict.has_key`` was removed in Python 3; setdefault
                    # handles both the first role and subsequent ones.
                    member.setdefault('additionalRoles', []).append(item)

            contact_rows = member_root.xpath(
                '//body/div[2]/div[1]/table/tr/td/table[1]/tr')

            for row in contact_rows:
                row_text = self.get_child_text(row)

                if len(row_text) > 0:
                    if row_text[0] == 'Frankfort Address(es)':
                        member['office_address'] = '\n'.join(row_text[1:])

                    if row_text[0] == 'Phone Number(s)':
                        for item in row_text:
                            # Use the first capitol annex phone
                            if item.startswith('Annex:'):
                                member['office_phone'] = item.replace(
                                    'Annex:', '').strip()
                                break

            leg = Legislator(year,
                             chamber,
                             member['district'],
                             member['full_name'],
                             party=member['party'],
                             photo_url=member['photo_url'],
                             office_address=member['office_address'],
                             office_phone=member['office_phone'])
            leg.add_source(member_url)

            if 'additionalRoles' in member:
                for role in member['additionalRoles']:
                    leg.add_role(role, year, member['party'])

            self.save_legislator(leg)
Example #52
0
    def scrape_house(self, term):
        """Scrape Puerto Rico House members (district and at-large).

        The page has two tables: the first is district-based, the second
        lists at-large members; the column layout differs between them.
        """
        url = 'http://www.camaraderepresentantes.org/cr_legs.asp'

        party_map = {'PNP': 'Partido Nuevo Progresista',
                     'PPD': u'Partido Popular Democr\xe1tico'}

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
            tables = doc.xpath('//table[@width="90%"]')

            # first table is district-based, second is at-large
            for table, at_large in zip(tables, [False, True]):

                for tr in table.xpath('.//tr')[1:]:
                    tds = tr.getchildren()
                    if not at_large:
                        # tds: name, district, addr, phone, office, email
                        name = tds[0]
                        district = tds[1].text_content().lstrip('0')
                        capitol_office = tds[2]
                        phone = tds[3]
                        email = tds[5]
                        # district offices: the cell mixes address lines
                        # with labeled Tel/Fax numbers
                        district_office = tds[4]
                        district_addr = []
                        district_phone = None
                        district_fax = None
                        pieces = district_office.xpath('.//text()')
                        for piece in pieces:
                            if piece.startswith('Tel'):
                                district_phone = PHONE_RE.findall(piece)[0]
                            elif piece.startswith('Fax'):
                                district_fax = PHONE_RE.findall(piece)[0]
                            else:
                                district_addr.append(piece)
                        if district_addr:
                            district_addr = ' '.join(district_addr)
                    else:
                        # name, addr, phone, email
                        name = tds[0]
                        district = 'At-Large'
                        capitol_office = tds[1]
                        phone = tds[2]
                        email = tds[3]
                        district_addr = None

                    # cleanup is same for both tables
                    name = re.sub(r'\s+', ' ',
                                  name.text_content().strip().replace(u'\xa0', ' '))
                    # BUG FIX: str.strip('mailto:') strips any of the
                    # characters {m,a,i,l,t,o,:} from BOTH ends, mangling
                    # addresses that start or end with those letters.
                    # Remove the exact 'mailto:' prefix instead.
                    email = email.xpath('.//a/@href')[0]
                    if email.startswith('mailto:'):
                        email = email[len('mailto:'):]

                    numbers = {}
                    for b in phone.xpath('b'):
                        numbers[b.text] = b.tail.strip()

                    # capitol_office as provided is junk
                    # things like 'Basement', and '2nd Floor'

                    # urls @ http://www.camaraderepresentantes.org/legs2.asp?r=BOKCADHRTZ
                    # where random chars are tr's id
                    leg_url = 'http://www.camaraderepresentantes.org/legs2.asp?r=' + tr.get('id')

                    leg = Legislator(term, 'lower', district, name,
                                     party='unknown', email=email, url=url)
                    leg.add_office('capitol', 'Oficina del Capitolio',
                                   phone=numbers.get('Tel:') or None,
                                   # could also add TTY
                                   #tty=numbers.get('TTY:') or None,
                                   fax=numbers.get('Fax:') or None)
                    if district_addr:
                        leg.add_office('district', 'Oficina de Distrito',
                                       address=district_addr,
                                       phone=district_phone,
                                       fax=district_fax)

                    leg.add_source(url)
                    self.save_legislator(leg)
Example #53
0
    def scrape(self, chamber, term):
        """Collect Illinois legislators for ``term`` in ``chamber``.

        Walks the member-list table, then follows each member's page for
        a photo, email, and the Springfield/District office details.
        """
        term_slug = term[:-2]
        url = MEMBER_LIST_URL[chamber] % term_slug

        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        party_names = {
            'D': 'Democratic',
            'R': 'Republican',
            'I': 'Independent'
        }

        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name_td, _, _, district_td, party_td = row.xpath('td')
            district = district_td.text
            party = party_names[party_td.text]
            leg_url = name_td.xpath('a/@href')[0]
            name = name_td.text_content().strip()

            # A trailing '*' marks an inactive legislator; skip for now.
            if name.endswith('*'):
                continue

            leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             url=leg_url,
                             photo_url=photo_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            # Email, when present, is the tail text of its bold label.
            email_label = leg_doc.xpath('//b[text()="Email: "]')
            if email_label:
                leg['email'] = email_label[0].tail

            # Turn one IL contact-info table into an office on ``leg``.
            def _table_to_office(table, office_type, office_name):
                addr = ''
                phone = ''
                fax = None
                for line in table.xpath('tr'):
                    line = line.text_content().strip()
                    # skip rows that aren't part of the address
                    if 'Office:' in line or line == 'Cook County':
                        continue
                    # a fax number row ends with FAX
                    elif 'FAX' in line:
                        fax = line.replace(' FAX', '')
                    # a phone number starts with (
                    elif line.startswith('('):
                        phone = line
                    # everything else is an address line
                    else:
                        addr += (line + '\n')
                leg.add_office(office_type,
                               office_name,
                               address=addr.strip(),
                               phone=phone,
                               fax=fax)

            # Extract both offices from their tables.
            _table_to_office(
                leg_doc.xpath(
                    '//table[contains(string(), "Springfield Office")]')[3],
                'capitol', 'Springfield Office')
            _table_to_office(
                leg_doc.xpath(
                    '//table[contains(string(), "District Office")]')[3],
                'district', 'District Office')

            self.save_legislator(leg)
Example #54
0
    def parse_legislator(self, tr, term, chamber,

            strip=methodcaller('strip'),

            xpath='td[contains(@class, "views-field-field-%s-%s")]%s',

            xp={'url':      ('lname-value-1', '/a/@href'),
                'district': ('district-value', '/text()'),
                'party':    ('party-value', '/text()'),
                'full_name':     ('feedbackurl-value', '/a/text()'),
                'address':  ('feedbackurl-value', '/p/text()')},

            titles={'upper': 'senator', 'lower': 'member'},

            funcs={
                'full_name': lambda s: s.replace('Contact Senator', '').strip(),
                'address': parse_address,
                }):
        '''
        Given a tr element, get specific data from it.

        The keyword defaults act as injectable configuration: each ``xp``
        entry names a field and the (css-class suffix, trailing xpath)
        pair used to locate it; ``funcs`` maps field names to cleanup
        callables. NOTE(review): the mutable defaults are shared across
        calls -- safe only while they are never mutated here. Relies on
        Python 2 semantics where ``map``/``filter`` return lists.
        '''
        rubberstamp = lambda _: _
        tr_xpath = tr.xpath
        res = {}
        for k, v in xp.items():

            # Apply the field's cleanup function (identity by default)
            # to each stripped match of the composed xpath.
            f = funcs.get(k, rubberstamp)
            v = (titles[chamber],) + v
            v = map(f, map(strip, tr_xpath(xpath % v)))

            # A single match collapses to a scalar; several stay a list.
            if len(v) == 1:
                res[k] = v[0]
            else:
                res[k] = v

        # Photo.
        try:
            res['photo_url'] = tr_xpath('td/p/img/@src')[0]
        except IndexError:
            pass

        # Addresses.
        addresses = map(dict, filter(None, res['address']))
        for x in addresses:
            try:
                x['zip'] = x['zip'].replace('CA ', '')
            except KeyError:
                # No zip? Toss.
                # NOTE(review): removing from a list while iterating it
                # skips the next element -- confirm this is benign here.
                addresses.remove(x)

        # Re-key the addresses
        # The first address is the capitol office; the rest are district.
        addresses[0].update(type='capitol', name='Capitol Office')
        offices = [addresses[0]]
        for office in addresses[1:]:
            office.update(type='district', name='District Office')
            offices.append(office)

        for office in offices:
            # Collapse street/city/zip into one formatted address block.
            street = office['street']
            street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                        office['zip'])
            office['address'] = street
            office['fax'] = None
            office['email'] = None

            del office['street'], office['city'], office['zip']
        res['offices'] = offices
        del res['address']

        # Remove junk from assembly member names.
        junk = 'Contact Assembly Member '
        res['full_name'] = res['full_name'].replace(junk, '')

        # convert party
        if res['party'] == 'Democrat':
            res['party'] = 'Democratic'
        # strip leading zero
        res['district'] = str(int(res['district']))

        # Add a source for the url.
        leg = Legislator(term, chamber, **res)
        leg.update(**res)
        return leg
Example #55
0
    def _scrape_upper_chamber(self, term):
        """Scrape Minnesota senators for ``term``.

        Preliminary per-member data (phone, url, photo, email) comes from
        the member index page; the bulk comes from the CSV roster. The
        two are joined on the member's display name.
        """
        index_url = 'http://www.senate.mn/members/index.php'
        doc = lxml.html.fromstring(self.get(index_url).text)
        doc.make_links_absolute(index_url)

        leg_data = defaultdict(dict)

        # get all the tds in a certain div
        tds = doc.xpath(
            '//div[@id="hide_show_alpha_all"]//td[@style="vertical-align:top;"]'
        )
        for td in tds:
            # each td has 2 <a>s- site & email
            main_link, email = td.xpath('.//a')
            # get name
            name = main_link.text_content().split(' (')[0]
            leg = leg_data[name]
            # First phone-shaped string in the cell (relies on Python 2
            # ``filter`` returning a list).
            leg['office_phone'] = filter(
                lambda string: re.match(r'\d{3}-\d{3}-\d{4}', string),
                td.xpath('.//p/text()'))[0].strip()
            leg['url'] = main_link.get('href')
            leg['photo_url'] = td.xpath('./preceding-sibling::td//img/@src')[0]
            if 'mailto:' in email.get('href'):
                leg['email'] = email.get('href').replace('mailto:', '')

        self.info('collected preliminary data on %s legislators',
                  len(leg_data))
        assert leg_data

        # use CSV for most of data
        csv_url = 'http://www.senate.mn/members/member_list_ascii.php?ls='
        csvfile = self.get(csv_url).text

        for row in csv.DictReader(StringIO(csvfile)):
            if not row['First Name']:
                continue
            name = '%s %s' % (row['First Name'], row['Last Name'])
            party = self._parties[row['Party']]
            # Touch the defaultdict so members missing from the index
            # page still get an (empty) entry.
            leg_data[name]
            if 'email' in leg_data[name]:
                email = leg_data[name].pop('email')
            else:
                email = None
            leg = Legislator(term,
                             'upper',
                             row['District'].lstrip('0'),
                             name,
                             party=party,
                             first_name=row['First Name'],
                             last_name=row['Last Name'],
                             **leg_data[name])
            row["Zipcode"] = row["Zipcode"].strip()

            # Accommodate for multiple address column naming conventions.
            address1_fields = [row.get('Address'), row.get('Office Building')]
            address2_fields = [row.get('Address2'), row.get('Office Address')]
            row['Address'] = next(
                (a for a in address1_fields if a is not None), False)
            row['Address2'] = next(
                (a for a in address2_fields if a is not None), False)

            # BUG FIX: the original tested a bare generator expression,
            # which is always truthy, so every member was filed under a
            # capitol office. Use any() -- guarding against Address2
            # being False -- to detect the two capitol street addresses.
            if row['Address2'] and any(
                    a in row['Address2'] for a in
                    ['95 University Avenue W',
                     '100 Rev. Dr. Martin Luther King']):
                leg.add_office('capitol', 'Capitol Office',
                    address='{Room} {Address}\n{Address2}\n{City}, {State} '\
                        '{Zipcode}'.format(Room=row['Rm. Number'], **row),
                    email=email, phone=leg.get('office_phone'))
            elif row['Address2']:
                leg.add_office(
                    'district',
                    'District Office',
                    address='{Address}\n{Address2}\n{City}, {State} {Zipcode}'.
                    format(**row),
                    email=email)
            else:
                leg.add_office(
                    'district',
                    'District Office',
                    address='{Address}\n{City}, {State} {Zipcode}'.format(
                        **row),
                    email=email)

            leg.add_source(csv_url)
            leg.add_source(index_url)

            self.save_legislator(leg)
Example #56
0
    def scrape(self, chamber, term):
        """Scrape Iowa legislators for *chamber* in *term*.

        Walks the chamber's member index, then each legislator's detail
        page for contact information and committee memberships, and saves
        one Legislator object per member.
        """
        self.validate_term(term, latest_only=True)

        chamber_name = 'senate' if chamber == 'upper' else 'house'

        url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)
        table = page.xpath('//table[@class="legis"]')[0]
        for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
            name = link.text.strip()
            district = link.xpath("string(../../td[2])")
            party = link.xpath("string(../../td[3])")
            email = link.xpath("string(../../td[5])")

            # Normalize the site's label to the canonical party name.
            if party == 'Democrat':
                party = 'Democratic'

            # Raw string: '\d' in a plain literal is an invalid escape.
            pid = re.search(r"PID=(\d+)", link.attrib['href']).group(1)
            photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                         "?GA=84&PID=%s" % pid)

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email, photo_url=photo_url, url=url)
            leg.add_source(url)

            leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

            # Metadata key -> id of the <div> holding that value on the
            # legislator's detail page.
            office_data = {
                "email": "ctl00_cphMainContent_divEmailLegis",
                "home_phone": "ctl00_cphMainContent_divPhoneHome",
                "home_addr": "ctl00_cphMainContent_divAddrHome",
                "office_phone": "ctl00_cphMainContent_divPhoneCapitol",
            }
            metainf = {}

            for attr, div_id in office_data.items():
                info = leg_page.xpath("//div[@id='%s']" % div_id)
                if len(info) != 1:
                    # Div missing (or unexpectedly duplicated): skip field.
                    continue

                # Each div holds two spans: a label and the actual value.
                _, data = [x.text_content() for x in info[0].xpath("./span")]
                data = data.strip()
                if data == "":
                    continue

                metainf[attr] = data

            if "home_phone" in metainf or "home_addr" in metainf:
                home_args = {}
                if "home_phone" in metainf:
                    home_args['phone'] = metainf['home_phone']
                if "home_addr" in metainf:
                    home_args['address'] = metainf['home_addr']
                leg.add_office('district',
                               'Home Office',
                               **home_args)

            if "email" in metainf or "office_phone" in metainf:
                cap_args = {}

                if "email" in metainf:
                    cap_args['email'] = metainf['email']
                if "office_phone" in metainf:
                    cap_args['phone'] = metainf['office_phone']

                leg.add_office('capitol',
                               'Capitol Office',
                               **cap_args)

            # Committee links read like "Judiciary (Chair)": a trailing
            # parenthetical is the member's position, defaulting to 'member'.
            comm_path = "//a[contains(@href, 'committee')]"
            for comm_link in leg_page.xpath(comm_path):
                comm = comm_link.text.strip()

                match = re.search(r'\((.+)\)$', comm)
                if match:
                    comm = re.sub(r'\((.+)\)$', '', comm).strip()
                    mtype = match.group(1).lower()
                else:
                    mtype = 'member'

                if comm.endswith('Appropriations Subcommittee'):
                    # File subcommittees under the parent Appropriations
                    # committee instead of as standalone committees.
                    sub = re.match(r'^(.+) Appropriations Subcommittee$',
                                   comm).group(1)
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee='Appropriations',
                                 subcommittee=sub,
                                 position=mtype)
                else:
                    leg.add_role('committee member', term, chamber=chamber,
                                 committee=comm,
                                 position=mtype)

            self.save_legislator(leg)
Example #57
0
    def _scrape_lower_chamber(self, term):
        """Scrape Minnesota House members from the alphabetical listing.

        Parses each row's "Name (##X PARTY)" header text for name,
        district, and party, then extracts address, phone, and e-mail
        from the adjacent text nodes.
        """
        url = 'http://www.house.leg.state.mn.us/members/hmem.asp'

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page, '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

        # Tracks whether the inconsistent-markup e-mail workaround fired,
        # so we can warn once it becomes obsolete.
        need_special_email_case = False

        for legislator_node in legislator_nodes:
            photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')

            info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

            name_text = self.get_node(info_nodes[0], './b/text()')

            # Name is everything up to the opening paren of "(##X PARTY)".
            name_match = re.search(r'^.+\(', name_text)
            name = name_match.group(0)
            name = name.replace('(', '').strip()

            district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
            district_text = district_match.group(0)
            district = district_text.replace('(', '').lstrip('0').strip()

            party_match = re.search(r'[A-Z]+\)$', name_text)
            party_text = party_match.group(0)
            party_text = party_text.replace(')', '').strip()
            party = self._parties[party_text]

            info_texts = self.get_nodes(
                legislator_node,
                './td[2]/p/text()[normalize-space() and preceding-sibling'
                '::br]')
            address = '\n'.join((info_texts[0], info_texts[1]))

            # Reset per member: previously a failed validation left these
            # unbound (NameError on the first row) or silently reused the
            # previous legislator's phone/e-mail.
            phone = None
            email = None

            phone_text = info_texts[2]
            if validate_phone_number(phone_text):
                phone = phone_text

            # E-mail markup is screwed-up and inconsistent.
            try:
                email_text = info_nodes[1].text
            except IndexError:
                # Primarily for Dan Fabian: the address is a bare text node
                # rather than an anchor.  (The original code forgot to
                # assign email_text here, so this fallback never worked.)
                email_text = info_texts[3]
                need_special_email_case = True

            email_text = email_text.replace('Email: ', '').strip()
            if validate_email_address(email_text):
                email = email_text

            legislator = Legislator(
                term=term,
                chamber='lower',
                district=district,
                full_name=name,
                party=party,
                email=email,
                photo_url=photo_url,
            )
            legislator.add_source(url)

            legislator.add_office(
                type='capitol',
                name="Capitol Office",
                address=address,
                phone=phone,
                email=email,
            )

            self.save_legislator(legislator)

        if not need_special_email_case:
            self.logger.warning('Special e-mail handling no longer required.')
Example #58
0
    def scrape_legislators(self, term, chamber, leg_page, member_url, main_url,
                           member):
        """Extract a single legislator's details from an already-fetched
        member page (*leg_page*) and save the resulting Legislator.

        *member* is the index-page element for this legislator; a non-empty
        tail annotation (e.g. a resignation note) means the entry should be
        skipped.
        """
        # Guard clause first, before any parsing work is wasted.
        if member.tail:
            logger.info("Skipping legislator because: %s" % (member.tail))
            return

        full_name = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[1]/td/h2'
        )[0].text
        # Token 0 is a title ("Sen."/"Rep."), so the name starts at index 1.
        name_parts = full_name.split()
        if len(name_parts) == 3:
            first_name = name_parts[1]
            middle_name = ''
            last_name = name_parts[2]
            full_name = first_name + ' ' + last_name
        else:
            first_name = name_parts[1]
            middle_name = name_parts[2]
            last_name = name_parts[3]
            full_name = first_name + ' ' + middle_name + ' ' + last_name
        district = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[5]/td[2]'
        )[0].text
        party = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[6]/td[2]'
        )[0].text.strip()
        full_address = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[2]/td[2]'
        )[0].text
        phone = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[3]/td[2]'
        )[0].text
        email = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[4]/td[2]/a'
        )[0].text

        if chamber == 'lower':
            photo_url = leg_page.xpath(
                '//img[contains(@src, "representatives")]')[0].get('src')
        else:
            # Senators sometimes have no photo; fall through to the empty
            # list, which the truthiness check below treats as "no photo".
            photo_url = leg_page.xpath('//img[contains(@src, "senators")]')
            if len(photo_url) > 0:
                photo_url = photo_url[0].get('src')

        # Normalize the site's label to the canonical party name.
        if party == 'Democrat':
            party = 'Democratic'

        kwargs = {"url": member_url}

        if photo_url:
            kwargs['photo_url'] = photo_url

        leg = Legislator(term, chamber, district, full_name, first_name,
                         last_name, middle_name, party, **kwargs)
        leg.add_office('district',
                       'District Office',
                       address=full_address,
                       phone=phone,
                       email=email)

        leg.add_source(member_url)
        leg.add_source(main_url)
        self.save_legislator(leg)