Example #1
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican'
        }

        #testing for chamber
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name + '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        with self.urlopen(chamber_url) as page:
            page = lxml.html.fromstring(page)

            for row in page.xpath("//tr")[1:]:
                partyInit = row.xpath('td[2]')[0].text.split()[0]
                party = parties[partyInit]
                district = row.xpath('td[4]/a')[0].text.split()[1]
                phone = row.xpath('td[6]')[0].text
                #special case for Karen D. Camper
                if phone is None:
                    phone = row.xpath('td[6]/div')[0].text
                phone = '615-' + phone.split()[0]
                email = row.xpath('td[7]/a')[0].text
                member_url = (root_url + url_chamber_name + '/members/' +
                              abbr + district + '.html')
                member_photo_url = (root_url + url_chamber_name +
                                    '/members/images/' + abbr + district +
                                    '.jpg')

                with self.urlopen(member_url) as member_page:
                    member_page = lxml.html.fromstring(member_page)
                    name = member_page.xpath(
                        '//div[@id="membertitle"]/h2')[0].text
                    # Strip the title prefix ('Speaker ', 'Lt. Governor ',
                    # 'Rep. ', or 'Senator ') from the name.
                    if 'Speaker' in name:
                        full_name = name[len('Speaker '):]
                    elif 'Lt.' in name:
                        full_name = name[len('Lt. Governor '):]
                    elif abbr == 'h':
                        full_name = name[len('Rep. '):]
                    else:
                        full_name = name[len('Senator '):]

                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     full_name,
                                     party=party,
                                     email=email,
                                     phone=phone,
                                     url=member_url,
                                     photo_url=member_photo_url)
                    leg.add_source(chamber_url)
                    leg.add_source(member_url)
                    self.save_legislator(leg)
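
The methods in these examples live on scraper classes and lean on framework plumbing that isn't shown: Legislator, self.urlopen/self.get, self.metadata, and self.save_legislator. A minimal sketch of that surrounding context, assuming the billy (Open States) scraping framework these helpers appear to belong to; the class name and jurisdiction value are illustrative:

    # Sketch only: imports and class layout inferred from the method bodies,
    # not taken from the original module.
    import lxml.html
    from billy.scrape.legislators import LegislatorScraper, Legislator

    class TNLegislatorScraper(LegislatorScraper):
        jurisdiction = 'tn'  # assumed attribute name and value

        def scrape(self, chamber, term):
            # method body as in Example #1
            pass

The framework invokes scrape() once per chamber/term pair, and save_legislator() validates and serializes each Legislator record.
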
Example #2
    def scrape(self, chamber, term):
        """
        Scrapes legislators for the current term only
        """
        self.validate_term(term, latest_only=True)
        url = _BASE_URL % _CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)
        base_table = html.xpath('body/table/tr/td[2]/table[2]')
        district = None # keep track of district for substitutes
        for row in base_table[0].xpath('tr'):
            img_url = row.xpath('string(.//img/@src)')
            contact_form, additional_info_url = row.xpath('.//a/@href')
            if "Substitute" in row.text_content():
                # it seems like the sub always follows the person who he/she
                # is filling in for.
                # most sub info is provided at the additional info url
                self.scrape_sub(chamber, term, district, additional_info_url)
                continue
            else:
                full_name = " ".join(row[1][0].text_content().replace(u'\xa0', ' ').split())
                party = _PARTY[row[1][0].tail.strip()]

            pieces = [ x.strip() for x in row.itertext() if x ][6:]

            # The parsed HTML will be something like:
            # ['District 4', '2', 'nd', 'term', address, phone(s), profession, committees]
            # Sometimes there's a leadership title before all that
            if 'District ' in pieces[1]:
                pieces.pop(0)
            assert pieces[0].startswith('District '), "Improper district found: {}".format(pieces[0])
            assert pieces[3] == 'term', "Improper term found: {}".format(pieces[3])

            district = pieces[0]
            district = district.replace('District', '').strip()
            pieces = pieces[4:]
            if pieces[0].startswith(u'(Served '):
                pieces.pop(0)

            address = pieces.pop(0).strip()
            assert re.match(r'.*\d{5}', address), "Address potentially invalid: {}".format(address)

            phone = None
            fax = None
            for line in pieces:
                if line.lower().startswith('home '):
                    phone = line[len('home '):]
                elif not phone and line.lower().startswith('bus '):
                    phone = line[len('bus '):]
                if line.lower().startswith('fax '):
                    fax = line[len('fax '):]

                # After committees begin, no more contact information exists
                if line == "Committees:":
                    break

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             party=party)

            leg.add_office('district',
                           'District Office',
                       fax=fax,
                       phone=phone)

            leg.add_source(url)
            leg['photo_url'] = img_url
            leg['contact_form'] = contact_form
            leg['url'] = additional_info_url

            self.save_legislator(leg)
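
Example #2 reads several module-level constants that aren't shown. Illustrative definitions consistent with how the snippet uses them; the URL shape and party keys are assumptions, not taken from the original module:

    _BASE_URL = 'http://legislature.example.gov/%s/membership.cfm'  # hypothetical
    _CHAMBERS = {'upper': 'Senate', 'lower': 'House'}
    _PARTY = {'(R)': 'Republican', '(D)': 'Democratic'}
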
Example #3
    def scrape_lower_chamber(self, term):
        # E-mail contact is now hidden behind webforms. Sadness.

        party_map = {
            'PNP': 'Partido Nuevo Progresista',
            'PPD': u'Partido Popular Democr\xe1tico',
            'PIP': u'Partido Independentista Puertorrique\u00F1o',
        }

        url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara.aspx'

        page = self.lxmlize(url)

        member_nodes = self.get_nodes(
            page, '//div[@class="info-block"][1]//a[@class="opener"]')

        if member_nodes is not None:
            for member_node in member_nodes:
                # Initialize default values for legislator attributes.
                name = None
                district = None
                address = None
                party = None
                photo_url = None
                phone = None
                fax = None

                photo_url = self.get_node(
                    member_node, './/span[@class="identity"]/img/@src')

                # Node reference for convenience.
                info_node = self.get_node(member_node,
                                          './/span[@class="info"]')

                name_node = self.get_node(info_node, './/span[@class="name"]')
                # Strip titles from legislator name.
                if name_node is not None:
                    name_text = name_node.text.strip()
                    name_text = re.sub(r'^Hon\.[\s]*', '', name_text)
                    name_text = re.sub(r' - .*$', '', name_text)
                    name = ' '.join(name_text.split())

                party_node = self.get_node(info_node,
                                           './/span[@class="party"]/span')
                if party_node is not None:
                    party_text = party_node.text.strip()
                    party = party_map[party_text]

                district_node = self.get_node(info_node,
                                              './/span[@class="district"]')
                if district_node is not None:
                    district_text = district_node.text.strip()

                    try:
                        district_number = re.search(r'0?(\d{1,2})',
                                                    district_text).group(1)
                        district = re.sub(r'^Distrito[\s]*', '',
                                          district_text).strip()
                    except AttributeError:
                        if "Distrito" not in district_text:
                            district = 'At-Large'
                        else:
                            warning = u'{} missing district number.'
                            self.logger.warning(warning.format(name))

                address_node = self.get_node(info_node,
                                             './/span[@class="address"]')
                if address_node is not None:
                    address_text = address_node.text
                    if address_text and not address_text.isspace():
                        address = address_text.strip()

                # Only grabs the first validated phone number found.
                # Typically, representatives have multiple phone numbers.
                phone_nodes = self.get_nodes(
                    member_node,
                    './/span[@class="two-columns"]//span[@class="data-type"'
                    ' and contains(text(), "Tel:")]')
                if phone_nodes is not None:
                    has_valid_phone = False

                    for phone_node in phone_nodes:
                        # Don't keep searching phone numbers if a good
                        # one is found.
                        if has_valid_phone:
                            break

                        phone_text = phone_node.text
                        phone_text = re.sub(r'^Tel:[\s]*', '', phone_text)\
                            .strip()
                        if self.validate_phone_number(phone_text):
                            phone = phone_text
                            has_valid_phone = True

                fax_node = self.get_node(
                    member_node,
                    './/span[@class="two-columns"]//span[@class="data-type"'
                    ' and contains(text(), "Fax:")]')
                if fax_node is not None:
                    fax_text = fax_node.text
                    fax_text = re.sub(r'^Fax:[\s]*', '', fax_text).strip()
                    if self.validate_phone_number(fax_text):
                        fax = fax_text

                legislator = Legislator(term=term,
                                        chamber='lower',
                                        district=district,
                                        full_name=name,
                                        party=party,
                                        photo_url=photo_url)

                legislator.add_source(url)
                legislator.add_office(
                    type='capitol',
                    name='Oficina del Capitolio',
                    address=address,
                    phone=phone,
                    fax=fax,
                )

                self.save_legislator(legislator)
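
The validate_phone_number helper used above isn't shown. A minimal sketch of what such a validator might look like, assuming NANP-style numbers such as "(787) 721-6040" or "787-721-6040"; the exact rules in the original are unknown:

    import re

    _PHONE_RE = re.compile(r'^\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$')

    def validate_phone_number(self, phone_number):
        # Accept common North American formats; reject everything else.
        return bool(_PHONE_RE.match(phone_number.strip()))
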
Example #4
    def fetch_member(self, url, name, term, chamber):
        photo_url = ''

        lis_id = self._get_lis_id(chamber, url)

        if chamber == 'lower':
            base_url = 'http://memdata.virginiageneralassembly.gov'
            profile_url = base_url + '/images/display_image/{}'
            photo_url = profile_url.format(lis_id)
            #xpath_query = './/img/@src'
        elif chamber == 'upper':
            base_url = 'http://apps.senate.virginia.gov'
            profile_url = base_url + '/Senator/memberpage.php?id={}'
            xpath_query = './/img[@class="profile_pic"]/@src'

            # Retrieve profile photo.
            profile_page = self.lxmlize(profile_url.format(lis_id))
            photo_url = self.get_node(profile_page, xpath_query)

        # Detect whether URL points to a blank base location.
        blank_urls = (
            'http://memdata.virginiageneralassembly.gov/images/display_'
            'image/',
            'http://virginiageneralassembly.gov/house/members/photos/',
        )

        if photo_url in blank_urls:
            photo_url = ''

        if name in CHAMBER_MOVES and chamber != CHAMBER_MOVES[name]:
            return

        if "vacated" in name.lower():
            self.logger.warning(
                "Seat seems to have been vacated: '{}'".format(name))
            return

        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            name = name.rsplit('-')[0]

            if action == 'Resigned':
                pass  # TODO: set end date
            elif action == 'Member':
                pass  # TODO: set start date

        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(party_district_line).groups()

        # Scrub status from name.
        name = re.sub(r'(- Elect)$', '', name).strip()

        leg = Legislator(
            term=term,
            chamber=chamber,
            district=district,
            full_name=name.strip(),
            party=party_map[party],
            url=url,
            photo_url=photo_url,
        )
        leg.add_source(url)

        for ul in doc.xpath('//ul[@class="linkNon" and normalize-space()]'):
            address = []
            phone = None
            email = None
            for li in ul.getchildren():
                text = li.text_content()
                if re.match(r'\(\d{3}\)', text):
                    phone = text
                elif text.startswith('email:'):
                    email = text[len('email:'):].strip()
                else:
                    address.append(text)

            # Classify the office after all lines are collected; the check
            # has to look inside the joined address text, not at list items.
            office_type = ('capitol'
                           if 'Capitol Square' in '\n'.join(address)
                           else 'district')
            office_name = ('Capitol Office'
                           if office_type == 'capitol' else 'District Office')

            leg.add_office(office_type,
                           office_name,
                           address='\n'.join(address),
                           phone=phone,
                           email=email)

        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member',
                         term=term,
                         chamber=chamber,
                         committee=com)

        self.save_legislator(leg)
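
CHAMBER_MOVES and self._get_lis_id come from elsewhere in the module. A plausible shape for the former, with a purely illustrative entry:

    # Legislators who switched chambers mid-term; fetch_member skips them
    # when called for their old chamber.
    CHAMBER_MOVES = {
        'Jane Q. Example': 'upper',  # hypothetical entry, not real data
    }
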
Example #5
    def scrape_reps(self, chamber, term_name):
        url = 'http://www.maine.gov/legis/house/dist_mem.htm'
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # There are 151 districts
        for district in xrange(1, 152):
            if (district % 10) == 0:
                path = '/html/body/p[%s]/a[3]' % (district + 4)
            else:
                path = '/html/body/p[%s]/a[2]' % (district + 4)

            try:
                link = page.xpath(path)[0]
            except IndexError:
                # If the district % 10 == 0 query doesn't
                # produce a link, retry the second link. Horrible.
                path = '/html/body/p[%s]/a[2]' % (district + 4)
                link = page.xpath(path)[0]

            leg_url = link.get('href')
            name = link.text_content()

            if len(name) == 0:
                return
            if name.split()[0] == 'District':
                return

            mark = name.find('(')
            party = name[mark + 1]
            district_name = name[mark + 3:-1]
            name = name[15:mark]

            # vacant
            if party == "V":
                continue
            else:
                party = _party_map[party]

            leg = Legislator(term_name,
                             chamber,
                             str(district),
                             name,
                             party=party,
                             url=leg_url,
                             district_name=district_name)
            leg.add_source(url)
            leg.add_source(leg_url)

            # Get the photo url.
            html = self.urlopen(leg_url)
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)

            # Get the default (B&W) photo url.
            img = doc.xpath('//img')[0]
            photo_url = img.get('src')
            if photo_url:
                leg['photo_url'] = photo_url

            # Try to get color photo from the GOP website.
            if party == 'Republican':
                xpath = '//a[contains(@href, "house_gop")]/@href'
                party_website_url = doc.xpath(xpath)[0]
                party_website_html = self.urlopen(party_website_url)
                if party_website_html.response.status_code == 200:
                    party_website = lxml.html.fromstring(party_website_html)
                    photo_url = party_website.xpath('//img/@src')[1]

            # Try to get color photo from the dems' website.
            elif party == 'Democratic':
                xpath = '//a[contains(@href, "housedems")]/@href'

                els = doc.xpath(xpath)
                if els:
                    party_website_url = els[0]
                    try:
                        party_website_html = self.urlopen(party_website_url)
                    except scrapelib.HTTPError:
                        # Sometimes the page doesn't exist.
                        pass
                    else:
                        if party_website_html.response.status_code == 200:
                            party_website = lxml.html.fromstring(
                                party_website_html)
                            photo_url = party_website.xpath('//img/@src')[1]

            # Use the color photo when one was found.
            if photo_url:
                leg['photo_url'] = photo_url

            self.scrape_lower_offices(leg, page, leg_url)
            self.save_legislator(leg)
Example #6
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=False)
        root_url = 'http://www.capitol.tn.gov/'
        parties = {
            'D': 'Democratic',
            'R': 'Republican',
            'CCR': 'Carter County Republican',
            'I': 'Independent'
        }

        #testing for chamber
        if chamber == 'upper':
            url_chamber_name = 'senate'
            abbr = 's'
        else:
            url_chamber_name = 'house'
            abbr = 'h'
        if term != self.metadata["terms"][-1]["sessions"][0]:
            chamber_url = root_url + url_chamber_name
            chamber_url += '/archives/' + term + 'GA/Members/index.html'
        else:
            chamber_url = root_url + url_chamber_name + '/members/'

        page = self.get(chamber_url).text
        page = lxml.html.fromstring(page)

        for row in page.xpath("//tr"):

            # Skip any header row.
            if set(child.tag for child in row) == set(['th']):
                continue

            vacancy_check = row.xpath('./td/text()')[1]
            if 'Vacant' in vacancy_check:
                self.logger.warning("Vacant Seat")
                continue

            partyInit = row.xpath('td[3]')[0].text.split()[0]
            party = parties[partyInit]
            district = row.xpath('td[5]/a')[0].text.split()[1]
            address = row.xpath('td[6]')[0].text_content()
            # 301 6th Avenue North Suite
            address = address.replace(
                'LP', 'Legislative Plaza\nNashville, TN 37243')
            address = address.replace(
                'WMB', 'War Memorial Building\nNashville, TN 37243')
            address = '301 6th Avenue North\nSuite ' + address
            phone = [
                x.strip() for x in row.xpath('td[7]//text()') if x.strip()
            ][0]

            email = HTMLParser.HTMLParser().unescape(
                row.xpath('td[1]/a/@href')[0][len("mailto:"):])
            member_url = (root_url + url_chamber_name + '/members/' + abbr +
                          district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district + '.jpg')

            try:
                member_page = self.get(member_url, follow_redirects=False).text
            except TypeError:
                member_page = self.get(member_url).text
            member_page = lxml.html.fromstring(member_page)
            try:
                name = member_page.xpath('body/div/div/h1/text()')[0]
            except IndexError:
                name = member_page.xpath(
                    '//div[@id="membertitle"]/h2/text()')[0]

            # Strip the title prefix from the name.
            if 'Speaker' in name:
                full_name = name[len('Speaker '):]
            elif 'Lt.' in name:
                full_name = name[len('Lt. Governor '):]
            elif abbr == 'h':
                full_name = name[len("Representative "):]
            else:
                full_name = name[len('Senator '):]

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name.strip(),
                             party=party,
                             email=email,
                             url=member_url,
                             photo_url=member_photo_url)
            leg.add_source(chamber_url)
            leg.add_source(member_url)

            # TODO: add district address from this page

            leg.add_office('capitol',
                           'Nashville Address',
                           address=address,
                           phone=phone)

            self.save_legislator(leg)
Example #7
    def scrape(self, chamber, term):
        if chamber == 'upper':
            index_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
        else:
            index_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'
        doc = self.lxmlize(index_url)

        # Email addresses are listed on a separate page.
        email_list_url = 'http://app.leg.wa.gov/memberemail/Default.aspx'
        email_doc = self.lxmlize(email_list_url)

        for member in doc.xpath('//div[@id="allMembers"]/div[@class="memberInformation"]'):
            (photo_url, ) = member.xpath('.//a[text()="Print Quality Photo"]/@href')

            (title_name_party, ) = member.xpath('.//span[@class="memberName"]/text()')
            (name, party) = re.search(r'^(?:Senator|Representative)\s(.+)\s\(([RD])\)$', title_name_party).groups()
            if party == 'R':
                party = "Republican"
            elif party == 'D':
                party = "Democratic"

            (district_name, _district_name, ) = member.xpath('.//a[contains(text(), " Legislative District")]/text()')
            assert district_name == _district_name
            district_num = re.search(r'(\d{1,2})\w{2} Legislative District', district_name).group(1)

            leg = Legislator(
                full_name=name,
                term=term,
                chamber=chamber,
                district=district_num,
                party=party,
                photo_url=photo_url
            )
            leg['url'] = member.xpath('.//a[contains(text(), "Home Page")]/@href')[0]

            capitol_office = member.xpath('.//div[@class="memberColumnTitle" and text()=" Olympia Office"]/parent::div[1]/text()')
            capitol_office = [l.strip() for l in capitol_office if l.strip()]

            capitol_fax = None
            capitol_phone = None
            capitol_address = None

            # Can't capture any information anyway if office data is empty,
            # so we can skip if that's the case.
            if capitol_office:
                # Retrieve capitol office fax number.
                if capitol_office[-1].startswith('Fax: '):
                    capitol_fax = capitol_office.pop().replace('Fax: ', "")

                # Retrieve capitol office phone number.
                capitol_phone = capitol_office.pop()

                # Retrieve capitol office address.
                capitol_address = '\n'.join(capitol_office)

            # Retrieve the member's position from the email link. We need it to find the member's email address.
            # These positions are enough to discriminate the chamber too (0 = upper, 1,2 = lower)
            email_link_url = member.xpath('.//a[contains(@href, "memberEmail")]')[0].get('href')
            position = re.search(r'/([0-9]+)$', email_link_url).group(1)

            # Need to get the email from the email page by matching with the member's district and position
            email = self.get_node(
                email_doc,
                './/tr/td/a[contains(@href, "memberEmail/{}/{}")]/parent::td/'
                'following-sibling::td[1]/text()'.format(
                    district_num,
                    position))

            leg.add_office(
                'capitol',
                'Capitol Office',
                address=capitol_address,
                phone=capitol_phone,
                email=email,
                fax=capitol_fax
            )

            _has_district_office = member.xpath('.//div[@class="memberColumnTitle" and text()=" District Office"]')
            if _has_district_office:
                # Out of both chambers, only one member has multiple district offices, so ignore that
                # Also ignore the few members who have separate mailing addresses
                district_office = member.xpath('.//div[@class="memberColumnTitle" and text()=" District Office"]/parent::div[1]/text()')
                district_office = [l.strip() for l in district_office if l.strip()]
                _end_of_first_address = district_office.index([l for l in district_office if re.search(r'\,\s*WA\s*\d{5}', l)][0])
                district_address = '\n'.join(district_office[0:(_end_of_first_address + 1)])
                district_phone = None
                try:
                    district_phone = district_office[(_end_of_first_address + 1)]
                    assert re.match(r'\(\d{3}\) \d{3} \- \d{4}', district_phone)
                except IndexError:
                    pass
                except AssertionError:
                    # Discard values that don't look like a phone number.
                    district_phone = None

                leg.add_office(
                    'district',
                    'District Office',
                    address=district_address,
                    phone=district_phone
                )

            leg.add_source(index_url)

            self.save_legislator(leg)
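
The position lookup assumes the email link's href ends in "memberEmail/<district>/<position>". A quick check of the extraction regex against an example URL of that assumed shape:

    >>> import re
    >>> re.search(r'/([0-9]+)$', 'http://app.leg.wa.gov/memberEmail/13/2').group(1)
    '2'
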
Example #8
    def scrape(self, chamber, term):
        biennium = "%s-%s" % (term[0:4], term[7:9])

        url = ("http://wslwebservices.leg.wa.gov/SponsorService.asmx/"
               "GetSponsors?biennium=%s" % biennium)

        # these pages are useful for checking if a leg is still in office
        if chamber == 'upper':
            cur_member_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
        else:
            cur_member_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'

        cur_members = self.get(cur_member_url).text
        cur_members_doc = lxml.html.fromstring(cur_members)
        cur_members_doc.make_links_absolute(cur_member_url)

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)

        for member in xpath(page, "//wa:Member"):

            mchamber = xpath(member, "string(wa:Agency)")
            mchamber = {'House': 'lower', 'Senate': 'upper'}[mchamber]

            if mchamber != chamber:
                continue

            name = xpath(member, "string(wa:Name)").strip()
            if name == "":
                continue

            # if the legislator isn't in the listing, skip them
            if name not in cur_members:
                self.warning('%s is no longer in office' % name)
                continue
            else:
                leg_url, = set(cur_members_doc.xpath(
                    '//span[contains(text(), "%s")]/../..//'
                    'a[text()="Home Page"]/@href' % (
                        name
                    )))

            party = xpath(member, "string(wa:Party)")
            party = {'R': 'Republican', 'D': 'Democratic'}.get(
                party, party)

            district = xpath(member, "string(wa:District)")
            if district == '0':
                # Skip phony district 0.
                continue

            email = xpath(member, "string(wa:Email)")
            phone = xpath(member, "string(wa:Phone)")

            last = xpath(member, "string(wa:LastName)")
            last = last.lower().replace(' ', '')

            scraped_offices = []
            photo_url = ""

            try:
                leg_page = self.get(leg_url).text
                leg_page = lxml.html.fromstring(leg_page)
                leg_page.make_links_absolute(leg_url)

                photo_link = leg_page.xpath(
                    "//a[contains(@href, 'publishingimages')]")
                if photo_link:
                    photo_url = photo_link[0].attrib['href']
                offices = leg_page.xpath("//table[@cellspacing='0']/tr/td/b[contains(text(), 'Office')]")
                for office in offices:
                    office_block = office.getparent()
                    office_name = office.text_content().strip().rstrip(":")
                    address_lines = [x.tail for x in office_block.xpath(".//br")]
                    address_lines = [a for a in address_lines if a is not None]
                    address_lines.pop()  # trailing line isn't part of the address
                    phone = address_lines.pop()
                    address = "\n".join(address_lines)
                    obj = {
                        "name": office_name,
                        "phone": phone
                    }
                    if address.strip() != '':
                        obj['address'] = address

                    scraped_offices.append(obj)

            except (scrapelib.HTTPError,
                    requests.exceptions.ConnectionError):
                # Sometimes the API and website are out of sync
                # with respect to legislator resignations/appointments
                pass

            leg = Legislator(term, chamber, district,
                             name, '', '', '', party,
                             photo_url=photo_url, url=leg_url)
            leg.add_source(leg_url)

            for office in scraped_offices:
                typ = 'district' if 'District' in office['name'] else 'capitol'
                leg.add_office(typ, office.pop('name'), **office)

            self.save_legislator(leg)
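
Example #8 parses the SOAP response with a module-level xpath helper that binds the wa: namespace prefix. A minimal sketch, assuming the namespace URI advertised by the WSLWebServices endpoint:

    NS = {'wa': 'http://WSLWebServices.leg.wa.gov/'}  # assumed namespace URI

    def xpath(elem, path):
        # Evaluate path with the wa: prefix bound to the service namespace.
        return elem.xpath(path, namespaces=NS)
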
Example #9
    def scrape(self, chamber, term):
        term_slug = term[:-2]
        url = MEMBER_LIST_URL[chamber] % term_slug

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name, _, _, district, party = row.xpath('td')
            district = district.text
            party = {
                'D': 'Democratic',
                'R': 'Republican',
                'I': 'Independent'
            }[party.text]
            leg_url = name.xpath('a/@href')[0]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                name = name.strip('*')
                continue

            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             url=leg_url)
            leg.add_source(url)

            hotgarbage = ('Senate Biography Information for the 98th General '
                          'Assembly is not currently available.')
            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning('No legislator bio available for ' + name)
                self.save_legislator(leg)
                continue

            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]
            photo_url_parsed = urlparse(photo_url)
            encoded_path = quote(photo_url_parsed.path)
            photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
            leg.update(photo_url=photo_url)
            leg.add_source(leg_url)

            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                leg['email'] = email[0].tail

            # function for turning an IL contact info table to office details
            def _table_to_office(table, office_type, office_name):
                addr = ''
                phone = ''
                fax = None
                for row in table.xpath('tr'):
                    row = row.text_content().strip()
                    # skip rows that aren't part of address
                    if 'Office:' in row or row == 'Cook County':
                        continue
                    # fax number row ends with FAX
                    elif 'FAX' in row:
                        fax = row.replace(' FAX', '')
                    # phone number starts with ( [make it more specific?]
                    elif row.startswith('('):
                        phone = row
                    # everything else is an address
                    else:
                        addr += (row + '\n')
                if addr.strip() != ',':
                    leg.add_office(office_type,
                                   office_name,
                                   address=addr.strip(),
                                   phone=phone,
                                   fax=fax)

            # extract both offices from tables; ancestor tables also match
            # the contains(string(), ...) test, so the [3] index digs past
            # the outer layout tables to the one holding the office details
            table = leg_doc.xpath(
                '//table[contains(string(), "Springfield Office")]')
            if table:
                _table_to_office(table[3], 'capitol', 'Springfield Office')
            table = leg_doc.xpath(
                '//table[contains(string(), "District Office")]')
            if table:
                _table_to_office(table[3], 'district', 'District Office')

            self.save_legislator(leg)
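
MEMBER_LIST_URL maps each chamber to a per-term member listing on ilga.gov, formatted with term_slug. Illustrative values; the exact URL shapes are assumptions:

    MEMBER_LIST_URL = {
        'upper': 'http://ilga.gov/senate/default.asp?GA=%s',  # hypothetical
        'lower': 'http://ilga.gov/house/default.asp?GA=%s',  # hypothetical
    }
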
Example #10
    def scrape_lower_chamber(self, term):
        url = "http://www.okhouse.gov/Members/Default.aspx"

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page,
            '//table[@id="ctl00_ContentPlaceHolder1_RadGrid1_ctl00"]/tbody/tr')

        for legislator_node in legislator_nodes:
            name_node = self.get_node(
                legislator_node,
                './/td[1]/a')

            if name_node is not None:
                name_text = name_node.text.strip()

                last_name, delimiter, first_name = name_text.partition(',')

                # partition() returns empty strings, never None, so test the
                # delimiter to see whether a comma was actually found.
                if delimiter:
                    first_name = first_name.strip()
                    last_name = last_name.strip()
                    name = ' '.join([first_name, last_name])
                else:
                    raise ValueError('Unable to parse name: {}'.format(
                        name_text))

                if name.startswith('House District'):
                    continue

            district_node = self.get_node(
                legislator_node,
                './/td[3]')

            if district_node is not None:
                district = district_node.text.strip()

            party_node = self.get_node(
                legislator_node,
                './/td[4]')

            if party_node is not None:
                party_text = party_node.text.strip()

            party = self._parties[party_text]

            legislator_url = 'http://www.okhouse.gov/District.aspx?District=' + district

            legislator_page = self.lxmlize(legislator_url)

            photo_url = self.get_node(
                legislator_page,
                '//a[@id="ctl00_ContentPlaceHolder1_imgHiRes"]/@href')

            legislator = Legislator(
                _scraped_name=name_text,
                full_name=name,
                term=term,
                chamber='lower',
                district=district,
                party=party,
                photo_url=photo_url,
                url=legislator_url
            )

            legislator.add_source(url)
            legislator.add_source(legislator_url)

            # Scrape offices.
            self.scrape_lower_offices(legislator_page, legislator)

            self.save_legislator(legislator)
Example #11
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.get(url)
            root = lxml.etree.fromstring(details_page.content)
            party = root.xpath('string(//PARTY)')

            district = root.xpath('string(//DISTRICT)')

            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')

            home_address = root.xpath('string(//H_ADDRESS)')
            home_address2 = root.xpath('string(//H_ADDRESS2)')
            home_city = root.xpath('string(//H_CITY)')
            home_zip = root.xpath('string(//H_ZIP)')

            home_address_total = "%s\n%s\n%s\n%s" % (
                home_address,
                home_address2,
                home_city,
                home_zip
            )

            bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
            cap_room = root.xpath('string(//CAP_ROOM)')

            if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
                assert not party, "Remove special-casing for this Democrat without a listed party: {}".format(leg_name)
                party = 'Democratic'
            elif leg_name in ('James W. Mathis', 'John Glen Corley'):
                assert not party, "Remove special-casing for this Republican without a listed party: {}".format(leg_name)
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise AssertionError(
                    "A member with no identifiable party was found: {}".format(leg_name))

            leg = Legislator(term, chamber, district, leg_name, party=party, role=role,
                             org_info=org_info, url=url, photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            if email_name != "":
                if "@" in email_name:
                    email = email_name
                else:
                    email = '%s@%s.ms.gov' % (email_name,
                                              {"upper": "senate", "lower": "house"}[chamber])
                kwargs['email'] = email

            if capital_phone != "":
                kwargs['phone'] = capital_phone

            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol', 'Capitol Office', **kwargs)

            kwargs = {}
            if home_phone != "":
                kwargs['phone'] = home_phone

            if home_address_total != "":
                kwargs['address'] = home_address_total

            if kwargs != {}:
                leg.add_office('district', 'District Office', **kwargs)

            self.save_legislator(leg)
        except scrapelib.HTTPError as e:
            self.warning(str(e))
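
CAP_ADDRESS is a module-level constant holding the Capitol mailing address that gets appended after the room number. A plausible definition; the exact wording is an assumption:

    CAP_ADDRESS = "P. O. Box 1018\nJackson, MS 39215"  # assumed address text
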
Example #12
    def scrape_session(self, term, chambers, session):
        session = self.metadata['session_details'][session]
        sid = session['_guid']
        members = self.sservice.GetMembersBySession(sid)['MemberListing']
        for member in members:
            guid = member['Id']
            # print member['Name']
            nick_name, first_name, middle_name, last_name = (
                member['Name'][x]
                for x in ['Nickname', 'First', 'Middle', 'Last'])
            chamber, district = (member['District'][x]
                                 for x in ['Type', 'Number'])

            party = member['Party']
            if party == 'Democrat':
                party = 'Democratic'

            # print first_name, middle_name, last_name, party
            # print chamber, district
            first_name = nick_name if nick_name else first_name
            # XXX: Due to the upstream handling...

            # if middle_name:
            #     name = "%s %s %s" % (first_name, middle_name, last_name)
            # else:
            # blocked out due to GA putting middle_name in first_name ...
            name = "%s %s" % (first_name, last_name)

            chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

            if party.strip() == '':
                party = 'other'

            legislator = Legislator(
                term,
                chamber,
                str(district),
                name,
                party=party,
                #                last_name=last_name,
                #                first_name=first_name,
                _guid=guid)
            #            if middle_name:
            #                legislator['middle_name'] = middle_name

            #           Sadly, upstream isn't good about keeping first names first only,
            #           so I'm blocking this out.

            ainfo = [
                member['DistrictAddress'][x]
                for x in ['Street', 'City', 'State', 'Zip']
            ]
            if None not in ainfo:
                # XXX: Debug this nonsense.
                ainfo = [x.strip() for x in ainfo]
                address = " ".join(ainfo)
                email = member['DistrictAddress']['Email']
                legislator.add_office('district',
                                      'District Address',
                                      address=address,
                                      email=email)

            legislator.add_source(self.ssource)
            self.save_legislator(legislator)
Example #13
    def scrape(self, term, chambers):
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        data = self.urlopen(leg_url)
        page = open_csv(data)

        for row in page:
            chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]
            if chamber not in chambers:
                continue

            district = row['dist'].lstrip('0')

            name = row['first name']
            mid = row['middle initial'].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row['last name']
            suffix = row['suffix'].strip()
            if suffix:
                name += " %s" % suffix

            party = row['party']
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             first_name=row['first name'],
                             last_name=row['last name'],
                             middle_name=row['middle initial'],
                             suffixes=row['suffix'],
                             party=party,
                             email=row['email'],
                             url=row['URL'],
                             office_phone=row['capitol phone'])

            office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
                row['capitol street address'], row['room number'])
            leg.add_office('capitol',
                           'Capitol Office',
                           address=office_address,
                           phone=row['capitol phone'])
            # skipping home address for now
            leg.add_source(leg_url)

            for comm in row['committee member1'].split(';'):
                if comm:
                    if ' (' in comm:
                        comm, role = comm.split(' (')
                        role = role.strip(')').lower()
                    else:
                        role = 'member'
                    leg.add_role('committee member',
                                 term,
                                 chamber='joint',
                                 committee=comm.strip(),
                                 position=role)

            self.save_legislator(leg)
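
open_csv wraps the raw FTP payload in a DictReader keyed by the file's header row ('office code', 'dist', and so on). A minimal sketch, assuming a plain comma-delimited file (Python 2, to match the snippets):

    import csv
    from StringIO import StringIO

    def open_csv(data):
        # data is the response body fetched from the ftp:// URL above.
        return csv.DictReader(StringIO(data))
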
Example #14
    def scrape(self, chamber, term):
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator'
            source_url = 'http://www.rilin.state.ri.us/senators/default.aspx'
            source_url_title_replacement = rep_type
            contact_url = 'http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp'
        elif chamber == 'lower':
            url = (
                'http://webserver.rilin.state.ri.us/Documents/Representatives.xls'
            )
            rep_type = 'Representative'
            source_url = 'http://www.rilin.state.ri.us/representatives/default.aspx'
            source_url_title_replacement = 'Rep. '
            contact_url = 'http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp'

        self.urlretrieve(url, 'ri_leg.xls')

        wb = xlrd.open_workbook('ri_leg.xls')
        sh = wb.sheet_by_index(0)

        # This isn't perfect but it's cheap and better than using the
        # XLS doc as the source URL for all legislators.
        # 374: RI: legislator url
        leg_source_url_map = {}
        leg_page = self.lxmlize(source_url)

        for link in leg_page.xpath('//td[@class="ms-vb2"]'):
            leg_name = link.text_content().replace(
                source_url_title_replacement, '')
            leg_url = link.xpath("..//a")[0].attrib['href']
            leg_source_url_map[leg_name] = leg_url

        for rownum in xrange(1, sh.nrows):
            d = {}
            for field, col_num in excel_mapping.iteritems():
                d[field] = sh.cell(rownum, col_num).value

            if d['full_name'].upper() == "VACANT":
                self.warning("District {}'s seat is vacant".format(
                    int(d['district'])))
                continue

            slug = re.match(
                "(?P<class>sen|rep)-(?P<slug>.*)@(rilin\.state\.ri\.us|rilegislature\.gov)",
                d['email'])

            if 'asp' in d['email']:
                d['email'] = None

            if d['email'] is not None and slug is not None:
                info = slug.groupdict()
                info['chamber'] = "senators" if info[
                    'class'] == 'sen' else "representatives"

                url = ("http://www.rilin.state.ri.us/{chamber}/"
                       "{slug}/Pages/Biography.aspx".format(**info))

            dist = str(int(d['district']))
            district_name = dist

            assert d['full_name'].startswith(rep_type), "Improper name found"
            full_name = re.sub(r"^{}(?=\s?[A-Z].*$)".format(rep_type), '',
                               d['full_name']).strip()
            translate = {
                "Democrat": "Democratic",
                "Republican": "Republican",
                "Independent": "Independent"
            }

            homepage_url = None
            url_names = lxml.html.fromstring(self.get(source_url).text)
            url_names = url_names.xpath('//td[@class="ms-vb2"]/a/@href')
            modified_name = re.sub(r'[^\w\s]', '', full_name)
            modified_name = modified_name.replace(' ', '').lower()

            for el in url_names:
                if 'default.aspx' in el:
                    el = el.replace('default.aspx', '')
                if el.endswith('/'):
                    el = el[:-1]
                el = el.lower()
                url_name_array = el.split('/')
                if url_name_array[-1] in modified_name:
                    #remove '/default.aspx' and add last name
                    homepage_url = source_url[:-12] + url_name_array[-1]

            kwargs = {
                "town_represented": d['town_represented'],
            }

            contact = self.lxmlize(contact_url)
            contact_phone = contact.xpath(
                '//tr[@valign="TOP"]//td[@class="bodyCopy"]/text() | //td[@class="bodyCopy"]//center/text()'
            )

            phone = None
            for el in contact_phone:
                if len(el) <= 2 and dist == el:
                    number = contact_phone.index(el)
                    phone = contact_phone[number + 2]
                    phone = phone.strip()

            email = None
            if d['email'] is not None:
                email = d['email']

            if homepage_url is not None:
                kwargs['url'] = homepage_url

            if d['address'] == '':
                d['address'] = 'No Address Found'

            leg = Legislator(term, chamber, district_name, full_name, '', '',
                             '', translate[d['party']], **kwargs)

            leg.add_office('district',
                           'District Office',
                           address=d['address'],
                           phone=phone,
                           email=email)
            leg.add_source(source_url)
            leg.add_source(contact_url)
            if homepage_url:
                leg.add_source(homepage_url)
            self.save_legislator(leg)
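
excel_mapping pairs field names with zero-based spreadsheet column indices for xlrd. An illustrative definition consistent with the fields read above; the column positions are assumptions:

    excel_mapping = {
        'district': 0,  # hypothetical column order
        'town_represented': 1,
        'full_name': 2,
        'party': 3,
        'address': 4,
        'email': 5,
    }
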
Example #15
    def scrape(self, chamber, term):
        urls = {
            'lower':
            "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
            'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html"
        }
        detail_re = re.compile(
            r'\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)'
        )

        with self.urlopen(urls[chamber]) as html:
            doc = lxml.html.fromstring(html)

            # rest of data on this page is <li>s that have anchor tags
            for a in doc.xpath('//li/a'):
                link = a.get('href')
                # tags don't close so we get the <li> and <a> content and diff them
                name_text = a.text_content()
                detail_text = a.getparent().text_content().replace(
                    name_text, '')

                # ignore if it is not a valid link
                if link:
                    # handle names
                    names = name_text.split(',')
                    last_name = names[0]
                    first_name = names[1].strip()
                    # TODO: try to trim first name to remove middle initial
                    if len(names) > 2:
                        suffixes = names[2]
                    else:
                        suffixes = ''

                    # handle details
                    details = detail_text.strip()
                    party, district = detail_re.match(details).groups()
                    party = PARTY_DICT[party]

                    leg_url = BASE_URL + link

                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     ' '.join((first_name, last_name)),
                                     first_name,
                                     last_name,
                                     party=party,
                                     suffixes=suffixes,
                                     url=leg_url)
                    leg.add_source(url=leg_url)

                    with self.urlopen(leg_url) as leg_html:
                        leg_doc = lxml.html.fromstring(leg_html)
                        img_src = leg_doc.xpath('//img[@align="left"]/@src')
                        if img_src:
                            leg['photo_url'] = BASE_URL + img_src[0]

                        # address extraction
                        # this is pretty terrible, we get address in a format that looks
                        # like:
                        #   James Senate Office Building, Room 322
                        #   11 Bladen St., Annapolis, MD 21401
                        #   (410) 841-3565, (301) 858-3565; 1-800-492-7122, ext. 3565 (toll free)
                        #   e-mail: [email protected]
                        #   fax: (410) 841-3552, (301) 858-3552
                        #
                        #   Western Maryland Railway Station, 13 Canal St., Room 304, Cumberland, MD 21502
                        #   (301) 722-4780; 1-866-430-9553 (toll free)
                        #   e-mail: [email protected]
                        #   fax: (301) 722-4790
                        # usually first ul, sometimes first p
                        try:
                            addr_lines = leg_doc.xpath(
                                '//ul')[0].text_content().strip().splitlines()
                        except IndexError:
                            addr_lines = leg_doc.xpath(
                                '//p')[0].text_content().strip().splitlines()
                        addr_pieces = {
                            'capitol': defaultdict(str),
                            'district': defaultdict(str)
                        }
                        addr_type = 'capitol'
                        for line in addr_lines:
                            if '(410)' in line or '(301)' in line:
                                addr_pieces[addr_type]['phone'] = line
                            elif 'toll free' in line:
                                pass  # skip stand alone 1-800 numbers
                            elif 'e-mail' in line:
                                addr_pieces[addr_type]['email'] = line.replace(
                                    'e-mail: ', '')
                            elif 'fax' in line:
                                addr_pieces[addr_type]['fax'] = line.replace(
                                    'fax: ', '')
                            elif line == '':
                                addr_type = 'district'
                            else:
                                addr_pieces[addr_type][
                                    'address'] += '{0}\n'.format(line)
                        if addr_pieces['capitol']:
                            leg.add_office('capitol', 'Capitol Office',
                                           **addr_pieces['capitol'])
                            leg['email'] = (addr_pieces['capitol']['email']
                                            or addr_pieces['district']['email']
                                            or None)
                        if addr_pieces['district']:
                            leg.add_office('district', 'District Office',
                                           **addr_pieces['district'])

                    self.save_legislator(leg)
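
Example #15 relies on two module-level constants. Definitions consistent with their use; BASE_URL is prepended to relative links on msa.md.gov pages, and PARTY_DICT's keys match the regex capture group:

    BASE_URL = 'http://www.msa.md.gov'  # assumed host for the relative links
    PARTY_DICT = {'R': 'Republican', 'D': 'Democratic'}
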
Example #16
    def scrape(self, chamber, term):

        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break
        else:
            raise ValueError("Term '{}' not found in metadata".format(term))

        # Scrape committees, which also produces a name dictionary that can
        # be used for fuzzy matching between the committee page names and the
        # all-caps csv names. The loop just exhausts the generator; name_dict
        # retains the last value yielded.
        for name_dict, _ in scrape_committees(year, chamber):
            pass

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = [
            'last_name', 'first_name', 'party', 'district', 'address', 'city',
            'state', 'zip'
        ]
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        for entry in csv_parser:
            if not entry:
                continue

            # City.
            entry['city'] = entry['city'].title()

            # Address.
            entry['address'] = entry['address'].title()

            # District.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party. The key is dropped from the entry because party is
            # passed to Legislator directly.
            party_letter = entry['party']
            party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
            del entry['party']

            # Get full name properly capped.
            _fullname = '%s %s' % (entry['first_name'].capitalize(),
                                   entry['last_name'].capitalize())

            city_lower = entry['city'].lower()
            fullname = difflib.get_close_matches(_fullname,
                                                 name_dict[city_lower],
                                                 cutoff=0.5)

            # If there are no close matches with the committee page,
            # use the title-capped first and last name.
            if len(fullname) < 1:
                fullname = _fullname
                # msg = 'No matches found for "%s" with "%s" from %r'
                # self.debug(msg % (_fullname, fullname,
                #                   name_dict[city_lower]))
            else:
                fullname = fullname[0]
                # if _fullname != fullname:
                #     msg = 'matched "%s" with "%s" from %r'
                #     self.debug(msg % (_fullname, fullname,
                #                       name_dict[city_lower]))

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]
            deets = self._scrape_details(detail_url)

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            self.save_legislator(legislator)
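
difflib.get_close_matches drives the fuzzy matching above: candidates come back ordered best-first, so element [0] is the closest committee-page name, and an empty list means nothing cleared the 0.5 cutoff. Standalone, with made-up names:

    import difflib

    candidates = ['Llew Jones', 'Verdell Jackson', 'Dave Lewis']
    matches = difflib.get_close_matches('Lew Jones', candidates, cutoff=0.5)
    # matches == ['Llew Jones']; [] would mean no candidate scored >= 0.5
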
Beispiel #17
0
    def scrape_reps(self, chamber, term):
        # There are 99 House districts
        for district in xrange(1, 100):
            rep_url = ('http://www.house.state.oh.us/components/'
                       'com_displaymembers/page.php?district=%d' % district)

            with self.urlopen(rep_url) as page:
                page = lxml.html.fromstring(page)

                ranges = []
                cur = []
                info = page.xpath('//td[@class="info"]/*')
                for r in info:
                    if r.tag == 'strong':
                        ranges.append(cur)
                        cur = []
                    else:
                        cur.append(r)
                ranges.append(cur)

                block = ranges[4][:-1]

                address = ", ".join(
                    [ x.tail.strip() for x in block ])

                phone = page.xpath(
                    "//strong[contains(text(), 'Phone')]")[0].tail

                fax = page.xpath(
                    "//strong[contains(text(), 'Fax')]")[0].tail

                for el in page.xpath('//table[@class="page"]'):
                    rep_link = el.xpath('tr/td/title')[0]
                    full_name = rep_link.text
                    party = full_name[-2]
                    full_name = full_name[0:-3]

                    # 'Vacant Position' loses its last three characters to
                    # the party slice above.
                    if full_name == 'Vacant Posit':
                        continue

                    if party == "D":
                        party = "Democratic"
                    elif party == "R":
                        party = "Republican"
                    leg = Legislator(term, chamber, str(district),
                                     full_name, party=party, url=rep_url)
                    leg.add_office('capitol',
                                   'Capitol Office',
                                   address=address,
                                   phone=phone,
                                   fax=fax)  # Yet, no email.

                    committees = page.xpath("//table[@class='billLinks']")[0]
                    for committee in committees.xpath(".//tr"):
                        td = committee.xpath(".//td")
                        if len(td) != 2:
                            break

                        name, role = td
                        name, role = name.text_content(), role.text_content()
                        name, role = name.strip(), role.strip()
                        if name[0] == "|":
                            continue

                        chmbr = chamber
                        if "joint" in name.lower():
                            chmbr = "joint"

                        if name in JOINT_COMMITTEE_OVERRIDE:
                            chmbr = "joint"

                        leg.add_role('committee member',
                            term=term,
                            chamber=chmbr,
                            committee=name,
                            position=role
                        )

                    leg.add_source(rep_url)
                    self.save_legislator(leg)
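
The ranges/cur loop above is a general "split a flat sibling list on delimiter tags" pattern; here <strong> headers delimit the info blocks. A sketch of the same logic over plain values:

    def split_on(items, is_delim):
        groups, cur = [], []
        for item in items:
            if is_delim(item):
                groups.append(cur)
                cur = []
            else:
                cur.append(item)
        groups.append(cur)
        return groups

    # split_on(['a', '|', 'b', 'c', '|', 'd'], lambda x: x == '|')
    # -> [['a'], ['b', 'c'], ['d']]
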
Beispiel #18
0
    def scrape(self, chamber, term):
        # What Vermont claims are Word and Excel files are actually
        # just HTML tables
        # What Vermont claims is a CSV file is actually one row of comma
        # separated values followed by a ColdFusion error.
        url = ("http://www.leg.state.vt.us/legdir/"
               "memberdata.cfm/memberdata.doc?FileType=W")

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for tr in page.xpath("//tr")[1:]:
                row_chamber = tr.xpath("string(td[4])")
                if row_chamber == 'S' and chamber == 'lower':
                    continue
                elif row_chamber == 'H' and chamber == 'upper':
                    continue

                district = tr.xpath("string(td[7])")
                district = district.replace('District', '').strip()

                first_name = tr.xpath("string(td[8])")
                middle_name = tr.xpath("string(td[9])")
                last_name = tr.xpath("string(td[10])")

                if first_name.endswith(" %s." % middle_name):
                    first_name = first_name.split(" %s." % middle_name)[0]

                if middle_name:
                    full_name = "%s %s. %s" % (first_name, middle_name,
                                               last_name)
                else:
                    full_name = "%s %s" % (first_name, last_name)

                email = tr.xpath("string(td[11])")

                party = tr.xpath("string(td[6])")
                party = re.sub(r'Democrat\b', 'Democratic', party)
                parties = party.split('/')
                if 'Republican' in parties:
                    if 'Democratic' in parties:
                        pass
                    else:
                        party = 'Republican'
                        parties.remove('Republican')
                elif 'Democratic' in parties:
                    party = 'Democratic'
                    parties.remove('Democratic')
                else:
                    party = parties.pop(0)

                leg = Legislator(
                    term,
                    chamber,
                    district,
                    full_name,
                    first_name=first_name,
                    middle_name=middle_name,
                    last_name=last_name,
                    party=party,
                    email=email,
                    # closest thing we have to a page for legislators, not ideal
                    url='http://www.leg.state.vt.us/legdir/LegDirMain.cfm')
                leg['roles'][0]['other_parties'] = parties
                leg.add_source(url)

                # 12-16: MailingAddress: 1,2,City,State,ZIP
                mail = '%s\n%s\n%s, %s %s' % (
                    tr.xpath('string(td[12])'), tr.xpath('string(td[13])'),
                    tr.xpath('string(td[14])'), tr.xpath('string(td[15])'),
                    tr.xpath('string(td[16])'))
                leg.add_office('district', 'Mailing Address', address=mail)
                # 17-21: HomeAddress: 1,2,City,State,ZIP, Email, Phone
                home = '%s\n%s\n%s, %s %s' % (
                    tr.xpath('string(td[17])'), tr.xpath('string(td[18])'),
                    tr.xpath('string(td[19])'), tr.xpath('string(td[20])'),
                    tr.xpath('string(td[21])'))
                home_email = tr.xpath('string(td[22])') or None
                home_phone = tr.xpath('string(td[23])') or None
                leg.add_office('district',
                               'Home Address',
                               address=home,
                               email=home_email,
                               phone=home_phone)

                self.save_legislator(leg)
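
The word boundary in the Democrat substitution is what keeps the fused party strings intact: 'Democrat' and 'Democrat/Republican' are rewritten, while 'Democratic' is left alone. For instance:

    import re

    for raw in ('Democrat', 'Democrat/Republican', 'Democratic'):
        print(re.sub(r'Democrat\b', 'Democratic', raw))
    # Democratic
    # Democratic/Republican
    # Democratic
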
Beispiel #19
0
    def scrape(self, chamber, term):
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod(session)

        body = {'lower': 'H', 'upper': 'S'}[chamber]
        url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
            session_id, body)
        with self.urlopen(url) as page:
            root = html.fromstring(page)
            path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
            roster = root.xpath(path)[1:]
            for row in roster:
                position = ''
                vacated = ''
                name, district, party, email, room, phone, fax = row.xpath(
                    'td')

                if email.attrib.get('class') == 'vacantmember':
                    continue  # Skip any vacant members.

                link = name.xpath('string(a/@href)')
                link = "http://www.azleg.gov" + link
                if len(name) == 1:
                    name = name.text_content().strip()
                else:
                    position = name.tail.strip()
                    name = name[0].text_content().strip()

                district = district.text_content()
                party = party.text_content().strip()
                email = email.text_content().strip()

                if ('Vacated' in email or 'Resigned' in email
                        or 'Removed' in email):
                    # For the current session, skip legislators who have left
                    # so their information isn't overwritten. Comment out the
                    # following 'continue' when scraping historical sessions,
                    # which records the departure date instead.
                    continue
                    vacated = re.search(r'[0-9]*/[0-9]*/\d{4}', email).group()
                    email = ''

                party = self.get_party(party)
                room = room.text_content().strip()
                if chamber == 'lower':
                    address = "House of Representatives\n"
                else:
                    address = "Senate\n"
                address = address + "1700 West Washington\n Room " + room  \
                                  + "\nPhoenix, AZ 85007"

                phone = phone.text_content().strip()
                if not phone.startswith('602'):
                    phone = "602-" + phone
                fax = fax.text_content().strip()
                if not fax.startswith('602'):
                    fax = "602-" + fax
                if vacated:
                    end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     full_name=name,
                                     party=party,
                                     url=link)
                    leg['roles'][0]['end_date'] = end_date
                else:
                    leg = Legislator(term,
                                     chamber,
                                     district,
                                     full_name=name,
                                     party=party,
                                     email=email,
                                     url=link)

                leg.add_office('capitol',
                               'Capitol Office',
                               address=address,
                               phone=phone,
                               fax=fax)

                if position:
                    leg.add_role(position,
                                 term,
                                 chamber=chamber,
                                 district=district,
                                 party=party)

                leg.add_source(url)

                #Probably just get this from the committee scraper
                #self.scrape_member_page(link, session, chamber, leg)
                self.save_legislator(leg)
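
When the historical branch above is re-enabled, the departure date comes out of the email cell via the date regex and strptime. In isolation, against a made-up cell value:

    import datetime
    import re

    cell = 'Vacated 04/15/2011'
    vacated = re.search(r'[0-9]*/[0-9]*/\d{4}', cell).group()
    end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
    # end_date == datetime.datetime(2011, 4, 15, 0, 0)
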
Beispiel #20
0
    def scrape_member(self, chamber, term, member_url):
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
        full_name = root.xpath('//h1/span')[0].tail.strip()

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')

        party, district = root.xpath('//h1/span')[1].text.split('-')
        party = party.strip()
        district = clean_district(district.strip())

        if party in ('D', 'Democrat', 'Democratic'):
            party = 'Democratic'
        elif party in ('R', 'Republican'):
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term,
                         chamber,
                         district,
                         full_name,
                         party=party,
                         photo_url=photo_url,
                         url=member_url)
        leg.add_source(member_url)

        # offices

        # Attach the email to exactly one office, and make sure at least
        # one office exists to hold it.
        email_stored = not email

        for addr in root.xpath('//address/div[@class="contactGroup"]'):
            office_name = addr.xpath(
                '../preceding-sibling::h4/text()')[0].strip()
            address = addr.xpath('a')[0].text_content()
            address = re.sub('\s{2,}', '\n', address)

            # Label rows ('Phone:'/'Fax:') announce what the following row
            # holds; track that with a small state variable.
            phone = fax = expecting = None
            for phonerow in addr.xpath('./div/div'):
                phonerow = phonerow.text_content().strip()
                if phonerow == 'Phone:':
                    expecting = 'phone'
                elif phonerow == 'Fax:':
                    expecting = 'fax'
                elif expecting == 'phone':
                    phone = phonerow
                    expecting = None
                elif expecting == 'fax':
                    fax = phonerow
                    expecting = None
                else:
                    self.warning('unknown phonerow %s', phonerow)

            # all pieces collected
            if 'District' in office_name:
                otype = 'district'
            elif 'State' in office_name:
                otype = 'capitol'
            else:
                self.warning('unknown office type %s', office_name)
                continue

            if not email_stored:
                email_stored = True
                leg.add_office(otype,
                               office_name,
                               phone=phone,
                               fax=fax,
                               address=address,
                               email=email)
            else:
                leg.add_office(otype,
                               office_name,
                               phone=phone,
                               fax=fax,
                               address=address)

        if not email_stored:
            leg.add_office('capitol', 'Capitol Office', email=email)

        self.save_legislator(leg)
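
The email_stored flag enforces "attach the email to exactly one office, and create an office if there were none". A sketch of that bookkeeping, reduced to plain dicts:

    def attach_email_once(offices, email):
        stored = not email  # nothing to attach
        result = []
        for office in offices:
            if not stored:
                office = dict(office, email=email)
                stored = True
            result.append(office)
        if not stored:  # no offices at all: create one to hold the email
            result.append({'name': 'Capitol Office', 'email': email})
        return result
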
Beispiel #21
0
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.urlopen(url)
            root = lxml.etree.fromstring(details_page.bytes)
            party = root.xpath('string(//PARTY)')
            district = root.xpath('string(//DISTRICT)')
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')
            bis_phone = root.xpath('string(//B_PHONE)')
            capitol_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)')
            cap_room = root.xpath('string(//CAP_ROOM)')

            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            elif leg_name in ('Oscar Denton', 'Lataisha Jackson',
                              'John G. Faulkner'):
                party = 'Democratic'

            leg = Legislator(term,
                             chamber,
                             district,
                             leg_name,
                             party=party,
                             role=role,
                             org_info=org_info,
                             url=url,
                             photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            if email_name.strip() != "":
                email = '%s@%s.ms.gov' % (email_name, {
                    "upper": "senate",
                    "lower": "house"
                }[chamber])
                kwargs['email'] = email

            if capitol_phone != "":
                kwargs['phone'] = capitol_phone

            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol', 'Capitol Office', **kwargs)

            self.save_legislator(leg)
        except scrapelib.HTTPError, e:
            self.warning(str(e))
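
The string(...) XPath form used throughout returns a plain string, and an empty one when the element is missing, which is why every field can be tested against "" instead of checking node existence. A quick demonstration:

    import lxml.etree

    root = lxml.etree.fromstring('<MEMBER><PARTY>D</PARTY></MEMBER>')
    root.xpath('string(//PARTY)')     # 'D'
    root.xpath('string(//CAP_ROOM)')  # '' -- absent nodes yield ''
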
Beispiel #22
0
    def scrape_member(self, chamber, year, member_url):
        with self.urlopen(member_url) as member_page:
            member = {}
            member_root = lxml.html.fromstring(member_page)

            table = member_root.xpath('//body/div[2]/table')[0]
            imgtag = member_root.xpath('//body/div[2]/table//img')

            member['photo_url'] = imgtag[0].get('src')
            name_list = table.xpath('string(.//strong[1])').split(' ')
            member['full_name'] = ' '.join(name_list[1:-1]).strip()

            party = name_list[-1]
            party = re.sub(r'\(|\)', '', party)
            if party == 'R':
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'I':
                party = 'Independent'

            member['party'] = party

            boldList = [bold.text for bold in table.iterdescendants(tag='b')]

            for item in boldList:
                if item is None:
                    continue
                elif 'District' in item:
                    district = item.split(' ')[-1]
                    member['district'] = district.strip()
                else:
                    if 'additionalRoles' in member:
                        member['additionalRoles'].append(item)
                    else:
                        member['additionalRoles'] = [item]

            contact_rows = member_root.xpath(
                '//body/div[2]/div[1]/table/tr/td/table[1]/tr')

            for row in contact_rows:
                row_text = self.get_child_text(row)

                if len(row_text) > 0:
                    if row_text[0] == 'Frankfort Address(es)':
                        member['office_address'] = '\n'.join(row_text[1:])

                    if row_text[0] == 'Phone Number(s)':
                        for item in row_text:
                            # Use the first capitol annex phone
                            if item.startswith('Annex:'):
                                member['office_phone'] = item.replace(
                                    'Annex:', '').strip()
                                break

            office_info = self.scrape_office_info(member_url)

            leg = Legislator(year, chamber, member['district'],
                             member['full_name'],
                             party=member['party'],
                             photo_url=member['photo_url'],
                             url=member_url,
                             office_address=member.get('office_address'),
                             office_phone=member.get('office_phone'))
            leg.add_source(member_url)

            kwargs = {}
            if office_info['Email Address(es)'] != []:
                kwargs['email'] = office_info['Email Address(es)'][0]
                leg['email'] = office_info['Email Address(es)'][0]

            if office_info['Phone Number(s)']['Annex'] != []:
                kwargs['phone'] = office_info['Phone Number(s)']['Annex'][0]

            if office_info['Frankfort Address(es)'] != []:
                kwargs['address'] = office_info['Frankfort Address(es)'][0]

            if kwargs != {}:
                leg.add_office('capitol',
                               'Annex Office',
                               **kwargs)

            if 'additionalRoles' in member:
                for role in member['additionalRoles']:
                    leg.add_role(role, year, chamber=chamber)

            self.save_legislator(leg)
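
The additionalRoles bookkeeping above is the usual append-or-create pattern; dict.setdefault expresses it in one line (made-up role names):

    member = {}
    for item in ('Speaker Pro Tempore', 'Majority Whip'):
        member.setdefault('additionalRoles', []).append(item)
    # member == {'additionalRoles': ['Speaker Pro Tempore', 'Majority Whip']}
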
Beispiel #23
0
    def scrape_senators(self, chamber, term):
        session = ((int(term[0:4]) - 2009) / 2) + 124

        mapping = {
            'district': 1,
            'first_name': 2,
            'middle_name': 3,
            'last_name': 4,
            # 'suffix': 6,
            'party': 6,
            'resident_county': 5,
            'street_addr': 7,
            'city': 8,
            'state': 9,
            'zip_code': 10,
            'phone1': 12,
            'phone2': 13,
            'email': 11,
        }

        url = ('http://legisweb1.mainelegislature.org/wp/senate/'
               'wp-content/uploads/sites/2/2013/09/%sthSenatorsList.xlsx' %
               session)

        try:
            fn, result = self.urlretrieve(url)
        except scrapelib.HTTPError:
            url = 'http://www.maine.gov/legis/senate/%dthSenatorsList.xls'
            url = url % session
            fn, result = self.urlretrieve(url)

        wb = xlrd.open_workbook(fn)
        sh = wb.sheet_by_index(0)

        for rownum in xrange(1, sh.nrows):
            # get fields out of mapping
            d = {}
            for field, col_num in mapping.iteritems():
                try:
                    d[field] = str(sh.cell(rownum, col_num).value)
                except IndexError:
                    # This col_num doesn't exist in the sheet.
                    pass

            full_name = " ".join(
                (d['first_name'], d['middle_name'], d['last_name']))
            full_name = re.sub(r'\s+', ' ', full_name).strip()

            address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

            # For matching up legs with votes
            district_name = d['city']

            phone = d['phone1']

            district = d['district'].split('.')[0]

            leg_url = 'http://www.maine.gov/legis/senate/bio%02ds.htm' % int(
                district)

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             d['first_name'],
                             d['middle_name'],
                             d['last_name'],
                             _party_map[d['party']],
                             resident_county=d['resident_county'],
                             office_address=address,
                             office_phone=phone,
                             email=None,
                             district_name=district_name,
                             url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            html = self.urlopen(leg_url)
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            xpath = '//td[@class="XSP_MAIN_PANEL"]/descendant::img/@src'
            photo_url = doc.xpath(xpath)
            if photo_url:
                photo_url = photo_url.pop()
                leg['photo_url'] = photo_url
            else:
                photo_url = None

            office = dict(name='District Office',
                          type='district',
                          fax=None,
                          email=None,
                          address=address)

            leg['email'] = d['email']
            leg.add_office(**office)
            self.save_legislator(leg)
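
The mapping dict decouples field names from spreadsheet column positions, so a reshuffled sheet only requires editing the mapping. A sketch of the extraction loop as a reusable helper, using the same xlrd calls as above:

    def row_to_dict(sheet, rownum, mapping):
        # Pull mapped columns out of one xlrd sheet row.
        d = {}
        for field, col_num in mapping.items():
            try:
                d[field] = str(sheet.cell(rownum, col_num).value)
            except IndexError:
                pass  # this sheet revision lacks the column
        return d
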
Beispiel #24
0
    def scrape_upper_leg_page(self, term, url, who):
        page = self.lxmlize(url)
        who = page.xpath("//font[@size='4']")
        who = who[0].text_content()
        who = re.sub("\s+", " ", who)
        who, district = (x.strip() for x in who.rsplit("-", 1))
        who = who.replace("Senator", "").strip()
        district = district.replace("District", "").strip()

        infopane = page.xpath("//table[@cellpadding='3']")
        infos = [x.tail.strip() if x.tail else ""
                 for x in infopane[1].xpath(".//br")]

        keys = ["party", "email", "capitol-office",
                "district-office", "phone", "fax", "staffer"]
        nodes = [[]]
        for node in infos:
            if node == "":
                if nodes[-1] != []:
                    nodes.append([])
                continue
            nodes[-1].append(node)

        data = dict(zip(keys, nodes))

        district_office = "\n".join(data['district-office'])
        capitol_office = "\n".join(data['capitol-office'])

        rundown = infopane[1].xpath("./*")[-1]
        rundown_txt = rundown.text_content()
        parties = {
            "Republican": "Republican",
            "Democrat": "Democratic",
        }

        party = 'other'
        for slug in parties:
            if slug in rundown_txt:
                party = parties[slug]

        if party == 'other':
            raise Exception("unknown party in: %s" % rundown_txt)

        kwargs = {
            "party": party
        }

        leg = Legislator(term,
                         'upper',
                         district,
                         who,
                         **kwargs)
        leg.add_office('district',
                       'District Office',
                       address=district_office)

        leg.add_office('capitol',
                       'Capitol Office',
                       address=capitol_office)

        leg.add_source(url)

        self.save_legislator(leg)
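
Grouping the <br>-separated strings into blank-line-delimited blocks and zipping them against a fixed key list assumes the page always lists its sections in the same order (zip silently truncates on a mismatch). The grouping step on its own, with made-up values:

    def group_on_blanks(lines):
        blocks = [[]]
        for line in lines:
            if line == "":
                if blocks[-1]:
                    blocks.append([])
            else:
                blocks[-1].append(line)
        return blocks

    # group_on_blanks(['Republican', '', 'name@senate.gov', '', 'PO Box 123'])
    # -> [['Republican'], ['name@senate.gov'], ['PO Box 123']]
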
Beispiel #25
0
    def _scrape_individual_legislator_page(self,
                                           url,
                                           term,
                                           chamber,
                                           district=None):
        """Scrape a specific lower house legislators page. The function will actually
        call one of three functions as there is 2 different bio templates and a completely
        separate one for the speaker of the house.

        Example url: http://www1.legis.ga.gov/legis/2009_10/house/bios/abdulsalaamRoberta/abdulsalaamRoberta.htm
        """
        if 'speaker/index.htm' in url:
            return self._scrape_speaker_of_the_house(url, term, chamber)

        with self.lxml_context(url) as page:
            # page == None == 404
            if page is None:
                return None

            page.make_links_absolute(url)

            # first check to see if this is the 'original' template or the new one
            stylesheet_path = '//link[@rel="stylesheet"]'
            stylesheets = page.xpath(stylesheet_path)

            for style_sheet in stylesheets:
                if 'legis.ga.gov.house.factsheet.css' in style_sheet.get('href') or \
                   'legis.ga.gov.house.bio.css' in style_sheet.get('href'):
                    return self._scrape_individual_legislator_page_second_template(
                        page, term, chamber, district=district)

            path = '//table[@id="hoverTable"]/tr'
            legislator_info = page.xpath(path)

            # There is one page, "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" that has
            # malformed HTML, going to manually do that one:
            if "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" in url:
                legislator = Legislator(term,
                                        chamber,
                                        district,
                                        '"Coach" Williams',
                                        party="Democratic",
                                        url=url)
                return legislator

            # See if we got to the first row, some templates don't start with their table as 'hoverTable'
            # in this case let's just get the first table on the page as that is seeming to work well.
            if not legislator_info:
                path = '//table'
                tables = page.xpath(path)
                legislator_info = tables[0].getchildren()
            first_row = legislator_info[0]

            td_elements = first_row.getchildren()[0]
            name = td_elements[0].text_content().split('\n')[0].strip()
            party = td_elements[1].text_content().strip()[0:1].upper()
            # In some cases the party isn't in a <p>; it appears after the
            # <h2>name</h2> text and a <br /> instead.
            if party not in self.PARTY_DICT:
                elements = td_elements.text_content().split('\n')
                for ele in elements:
                    ele = ele.strip()
                    if " - " in ele:
                        party = ele[0:1]
                        break
                    elif ele.upper() == 'REPUBLICAN':
                        party = 'R'
                        break
                    elif ele.upper() == 'DEMOCRAT':
                        party = 'D'
                        break
                if party == '':
                    party = td_elements.text_content().split(
                        '\n')[1].strip()[0:1]

            if not district:
                if len(td_elements) < 3 or "District" not in td_elements[
                        2].text_content():
                    text_content = first_row[1].text_content().split('\n')
                    district = text_content[0].strip()[len("District "):]
                else:
                    district = td_elements[2].text_content().strip(
                    )[len("District "):]

            # Not every legislator has a sworn in date or facebook url, so attempt to parse
            # and just pass if it fails
            sworn_in = None
            try:
                sworn_in = td_elements[4].text_content().strip(
                )[len("Sworn in "):]
            except Exception:
                pass

            facebook_url = ''
            try:
                facebook_url = td_elements[5].get('href')
            except Exception:
                pass

            photo_url = ''
            try:
                td_elements = first_row.getchildren()[1]
                photo_url = td_elements[0].getchildren()[0].get('src') or ''
            except Exception:
                pass

            # Second row:
            second_row = legislator_info[1]
            # text_content() already strips tags, so only the newline split
            # has any effect.
            address_info = second_row.getchildren()[0].text_content().split(
                "\n")
            phone_number = address_info.pop()
            address = " ".join(address_info)

            email = ''
            try:
                text_content = second_row.text_content().split('\n')
                for content in text_content:
                    if '@' in content.strip():
                        email = content.strip()
            except IndexError:
                try:
                    email = second_row.getchildren()[1].getchildren(
                    )[0].text_content()
                except Exception:
                    pass

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    name,
                                    party=self.PARTY_DICT[party],
                                    email=email,
                                    photo_url=photo_url,
                                    facebook_url=facebook_url,
                                    address=address,
                                    sworn_in_date=sworn_in,
                                    office_phone=phone_number,
                                    url=url)
            legislator.add_source(url)
            return legislator
Beispiel #26
0
    def scrape(self, term, chambers):
        url = 'http://gencourt.state.nh.us/downloads/Members.txt'

        option_map = {}
        html = self.get(
            'http://www.gencourt.state.nh.us/house/members/memberlookup.aspx'
        ).text
        doc = lxml.html.fromstring(html)
        for opt in doc.xpath('//option'):
            option_map[opt.text] = opt.get('value')

        data = self.get(url).text
        for line in data.splitlines():
            if line.strip() == "":
                continue

            (chamber, fullname, last, first, middle, county, district_num,
             seat, party, street, street2, city, astate, zipcode, home_phone,
             office_phone, fax, email, com1, com2, com3, com4, com5, com6,
             com7) = line.split('*')

            chamber = chamber_map[chamber]

            # skip legislators from a chamber we aren't scraping
            if chamber not in chambers:
                continue

            middle = middle.strip()
            last = last.strip('"')

            if middle:
                full = '%s %s %s' % (first, middle, last)
            else:
                full = '%s %s' % (first, last)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            district = str(int(district_num))
            if county:
                district = '%s %s' % (county, district)

            # When a candidate receives enough write-in votes in the
            # other party's primary, they are listed on the ballot as
            # being a nominee of both parties (eg, 'd+r')
            # Cross-reference this list for official party affiliation:
            # http://www.gencourt.state.nh.us/House/caljourns/journals/2015/HJ_4.pdf
            if fullname == "Wall, Janet G.":
                assert party == 'd+r', "Remove special-casing for Wall"
                party = 'd'

            leg = Legislator(term,
                             chamber,
                             district,
                             full,
                             first,
                             last,
                             middle,
                             party_map[party],
                             email=email)
            leg.add_office('district',
                           'Home Address',
                           address=address,
                           phone=home_phone or None)
            leg.add_office('district',
                           'Office Address',
                           phone=office_phone or None,
                           fax=fax or None)

            if chamber == 'upper':
                leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(
                    district_num)
            elif chamber == 'lower':
                code = option_map.get('{0}, {1}'.format(last, first))
                if code:
                    leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code

            romans = r'(?i)\s([IXV]+)(?:\s|$)'
            for com in (com1, com2, com3, com4, com5, com6, com7):
                com = com.strip('"')
                if com:
                    com_name = com.title()
                    com_name = re.sub(romans, lambda m: m.group().upper(),
                                      com_name)
                    leg.add_role('committee member',
                                 term=term,
                                 chamber=chamber,
                                 committee=com_name)

            if 'url' in leg:
                leg['photo_url'] = self.get_photo(leg['url'], chamber)

            leg.add_source(url)
            self.save_legislator(leg)
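
str.title() lowercases the tail of every word, which mangles roman numerals in committee names; the re.sub with an uppercasing lambda repairs them:

    import re

    romans = r'(?i)\s([IXV]+)(?:\s|$)'
    com_name = 'PUBLIC WORKS AND HIGHWAYS III'.title()
    # -> 'Public Works And Highways Iii'
    com_name = re.sub(romans, lambda m: m.group().upper(), com_name)
    # -> 'Public Works And Highways III'
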
Beispiel #27
0
    def _parse_member(self, chamber, term, member):
        first_name = member.get('first-name')
        last_name = member.get('last-name')
        party = self.party_map[member.get('party')]

        # this is semi-safe because we validated term w/ latest_only=True
        session = self.metadata['terms'][-1]['sessions'][-1]

        # extra_fields
        extra_dict = {}
        for name, xpath in self.extra_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                extra_dict[name] = result[0]

        # address fields
        for name, xpath in self.addr_fields.iteritems():
            result = member.xpath(xpath)
            if result:
                result = result[0]
                extra_dict[name] = '%s, %s, %s %s' % (
                    result.get('street-address'), result.get('city'),
                    result.get('state'), result.get('postal-code'))

        leg = Legislator(term,
                         chamber,
                         member.get('district-number'),
                         full_name=first_name + ' ' + last_name,
                         first_name=first_name,
                         last_name=last_name,
                         middle_name=member.get('middle-initial'),
                         party=party,
                         email=member.get('e-mail'),
                         url=member.get('website'),
                         photo_url="%s/member_photo.jpg" %
                         (member.get('website')),
                         oregon_member_id=member.get('leg-member-id'))

        # add offices
        leg.add_office('capitol',
                       'Capitol Office',
                       address=extra_dict['capitol_address'],
                       phone=extra_dict['phone'])
        if 'district_address' in extra_dict or 'district_phone' in extra_dict:
            leg.add_office('district',
                           'District Office',
                           address=extra_dict.get('district_address', None),
                           phone=extra_dict.get('district_phone', None))

        # committees
        com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
        for com in member.xpath(com_xpath):
            cdict = {
                'position': com.get('title').lower(),
                'chamber': chamber,
            }
            com_name = com.get('name')
            com_class = com.get('committee-class')
            if com_class == 'sub-committee':
                cdict['committee'], cdict['subcommittee'] = \
                        com.get('name').split(' Subcommittee On ')
            else:
                cdict['committee'] = com.get('name')

            leg.add_role('committee member', term, **cdict)

        leg.add_source(self.source_url)
        return leg
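
Subcommittee names arrive as one string joined by the literal ' Subcommittee On ', so a single split yields the parent/child pair (made-up name):

    name = 'Ways and Means Subcommittee On Education'
    committee, subcommittee = name.split(' Subcommittee On ')
    # committee == 'Ways and Means'; subcommittee == 'Education'
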
Beispiel #28
0
    def scrape(self, scrape_for_term_named, chambers):
        # The links on http://www.sanjoseca.gov/index.aspx?NID=1187 may go off-
        # site, so use http://www.sanjoseca.gov/index.aspx?NID=146
        council_url = 'http://www.sanjoseca.gov/index.aspx?NID=146'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        tds = doc.xpath('//div[@id="Section1"]//td')
        assert len(tds) <= 11, \
            'expected at most 11 mayor and councilmember cells, found %d' % len(tds)

        lines = []
        for text in doc.xpath('//div[@id="Section1"]/text()'):
            text = clean_string(text)
            if re.match('^(?:\d+|San) ', text):
                lines.append(text)
        address = '\n'.join(lines)

        emails = []
        for text in doc.xpath('//div[@id="Section1"]/script/text()'):
            # PhantomJS would be sweet here.
            emails.append(''.join(
                re.search('([^"]+)"\+"(@)"\+"([^"]+)', text).groups()))

        for index, td in enumerate(tds):
            for text in td.xpath('.//text()'):
                match = tel_regex.search(text.strip())
                if match:
                    phone = '-'.join(match.groups())
                    break

            url = td.xpath('.//a[//strong]/@href')[0]
            photo_url = td.xpath('.//img/@src')[0]

            # Extract district, name, role
            text = td.xpath('.//strong/text()')[0]

            if 'District' in text:
                district = re.search('District \d+', text).group(0)
                name = re.sub(', District \d+$', '', text)
                role = None
                if 'Vice Mayor' in text:
                    name = name.replace('Vice Mayor ', '')
                    role = 'Vice Mayor'
            elif 'Mayor' in text:
                district = 'Mayor'
                name = text.replace('Mayor ', '')
                role = 'Mayor'
            else:
                self.logger.warning('Skipped: ' + text)
                continue

            # Extract councilmember's term
            for text in td.xpath('.//text()'):
                match = re.search('\s*Term Expires:\s*([\d]+)/([\d]+)/([\d]+)',
                                  text)
                if match:
                    councilmember_term_expires_year = string.atoi(
                        '20' + match.group(3))  # Built-in Y2.1K bug
                    councilmember_term_begins_year = councilmember_term_expires_year - 3

            # Skip if this legislator is not in the current term being scraped
            scrape_for_term = self.find_term_named(scrape_for_term_named)
            if not year_is_within_term(
                    councilmember_term_begins_year,
                    scrape_for_term) and not year_is_within_term(
                        councilmember_term_expires_year, scrape_for_term):
                continue

            # Extract fax and secondary phone from councilmember's page
            phone2 = None
            fax = None
            councilmember_doc = lxml.html.fromstring(self.urlopen(url))
            councilmember_doc.make_links_absolute(url)

            # @todo xpath needs to be constrained further; it matches more elements than necessary
            for text in councilmember_doc.xpath(
                    '//div[//img[@alt="Contact Us"]]//text()'
            ):  # '//div[@id="quickLinks774"]//text()'):
                if re.match('\s*Fax.*\d', text, re.I):
                    fax = '-'.join(tel_regex.search(text).groups())
                if re.match('\s*Phone.*\d', text, re.I) or re.match(
                        '\s*Ph..*\d', text, re.I) or re.match(
                            '\s*Tel..*\d', text, re.I):
                    councilmember_phone = '-'.join(
                        tel_regex.search(text).groups())
                    phone2 = councilmember_phone if councilmember_phone != phone else None

            # Assign councilmember information
            legislator = Legislator(scrape_for_term_named,
                                    'upper',
                                    district,
                                    name,
                                    email=emails[index],
                                    url=url,
                                    photo_url=photo_url,
                                    party=None)
            legislator.add_office('capitol',
                                  'Council Office',
                                  address=address,
                                  phone=phone,
                                  secondary_phone=phone2,
                                  fax=fax)

            if role:
                legislator.add_role(role, scrape_for_term_named)

            legislator.add_source(url)

            self.save_legislator(legislator)
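
The page's scripts assemble addresses as "user"+"@"+"host" to foil harvesters; the regex stitches the three quoted pieces back together. Against a made-up script fragment:

    import re

    script = 'document.write("jane.doe"+"@"+"sanjoseca.gov");'
    email = ''.join(re.search('([^"]+)"\+"(@)"\+"([^"]+)', script).groups())
    # email == 'jane.doe@sanjoseca.gov'
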
Beispiel #29
0
    def scrape_upper_chamber(self, term):
        urls = {
            'At-Large':
            'http://www.senadopr.us/Pages/SenadoresporAcumulacion.aspx',
            'I': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
            'II':
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
            'III':
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
            'IV':
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
            'V': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
            'VI':
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
            'VII':
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
            'VIII':
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx'
        }

        for district, url in urls.iteritems():
            leg_page_html = self.get(url).text
            doc = lxml.html.fromstring(leg_page_html)
            doc.make_links_absolute(url)
            rows = doc.xpath(
                '//table[@summary="Senadores 2013-2016"]/tr[not(@class="ms-viewheadertr")]'
            )

            for row in rows:
                tds = row.xpath('td')

                name = tds[0].text_content().title().replace('Hon.', '',
                                                             1).strip()
                party = tds[1].text_content()
                phone = tds[2].text_content()
                email = tds[3].text_content()

                # Guess the photo filename from the name.
                namefixed = unicode(
                    name.replace(".", ". ")
                )  # Middle-name abbreviations are sometimes run together.
                namefixed = unicodedata.normalize('NFKD', namefixed).encode(
                    'ascii', 'ignore')  # Remove the accents.
                nameparts = namefixed.split()
                if nameparts[1].endswith('.'):
                    lastname = nameparts[2]
                else:
                    lastname = nameparts[1]

                # Construct the photo url
                photo_url = 'http://www.senadopr.us/Fotos%20Senadores/sen_' + (
                    nameparts[0][0] + lastname).lower() + '.jpg'
                try:
                    picture_data = self.head(
                        photo_url)  # Checking to see if the file is there
                except scrapelib.HTTPError:  # If not, leave out the photo_url
                    photo_url = ''

                leg = Legislator(term=term,
                                 chamber='upper',
                                 district=district,
                                 full_name=name,
                                 party=party,
                                 photo_url=photo_url)
                leg.add_office('capitol',
                               'Oficina del Capitolio',
                               phone=phone,
                               email=email)
                leg.add_source(url)

                self.save_legislator(leg)
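
NFKD normalization decomposes an accented character into a base letter plus a combining mark, and the ascii/ignore encode then drops the marks; that is the entire accent-removal trick:

    import unicodedata

    name = u'Mel\xe9ndez'  # u'Meléndez'
    ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
    # ascii_name == 'Melendez'
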
Beispiel #30
0
    def scrape(self, chamber, term):
        url = self.URLs[chamber]
        page = self.lxmlize(url)

        for block in page.xpath("//div[@class='ms-rtestate-field']")[1:-1]:
            # Each legislator block.

            photo_block = block.xpath("ancestor::td/preceding-sibling::td")
            if len(photo_block) == 0:
                continue

            h2s = block.xpath(".//h2/a")
            if len(h2s) != 1:
                # We've got a Vacant person.
                print("Found a Vacant position. Skipping block.")
                continue

            h2, = h2s
            name = h2.text.strip()

            photo_block, = photo_block
            # (The <td> before ours was the photo)
            img, = photo_block.xpath("*")
            img = img.attrib['src']

            info = {}
            # Right, now let's get info out of their little profile box.
            for entry in block.xpath(".//p"):
                key = None
                for kvpair in itergraphs(entry.xpath("./*"), 'br'):
                    # OK. We either get the tail or the next element
                    # (usually an <a> tag)
                    if len(kvpair) == 1:
                        key, = kvpair
                        value = key.tail.strip() if key.tail else None
                        if value:
                            value = re.sub("\s+", " ", value).strip()
                    elif len(kvpair) == 2:
                        key, value = kvpair
                        # Broken markup sometimes bolds only the leading 'P',
                        # leaving 'arty:' as the visible key text.
                        if value.text_content().strip() == "arty:":
                            key = value
                            value = value.tail
                    elif len(kvpair) == 3:
                        k1, k2, value = kvpair
                        # As seen with a <strong><strong>Email:</strong></strong>
                        t = lambda x: x.text_content().strip()
                        assert t(k1) == "" or t(k2) == ""
                        if t(k1) != "":
                            key = k1
                        else:
                            key = k2
                    else:
                        # Never seen text + an <a> tag, perhaps this can happen.
                        raise ValueError(
                            "Too many elements. Something changed")

                    key = key.text_content().strip(" :")
                    if value is None:
                        # A page has the value in a <strong> tag. D'oh.
                        key, value = (x.strip() for x in key.rsplit(":", 1))

                    key = re.sub("\s+", " ", key).strip()
                    key = key.replace(":", "")
                    if key == "arty":
                        key = "Party"

                    info[key] = value

            info['District'] = info['District'].encode('ascii',
                                                       'ignore').strip()

            info['Party'] = info['Party'].strip(": ").replace(u"\u00a0", "")

            leg = Legislator(term=term,
                             url=h2.attrib['href'],
                             chamber=chamber,
                             full_name=name,
                             party=info['Party'],
                             district=info['District'],
                             photo_url=img)
            leg.add_source(url)

            # Same broken-markup fallback as above: 'Capitol Phone' can lose
            # its bolded leading 'C' and surface as 'apitol Phone'.
            phone = info.get('Capitol Phone', info.get('apitol Phone'))
            if hasattr(phone, 'text_content'):
                phone = phone.text_content()

            leg.add_office(type='capitol',
                           name='Capitol Office',
                           address=info['Capitol Address'],
                           phone=phone,
                           email=info['Email'].attrib['href'].replace(
                               "mailto:", ""))

            self.save_legislator(leg)
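
itergraphs is not shown in this example; from the call site it must yield, for each run of sibling elements up to a 'br' separator, the tuple of elements in that run (so kvpair has length 1, 2, or 3). A plausible stand-in with that contract — an assumption, since the real helper may differ:

    def itergraphs(elements, separator_tag):
        # Yield tuples of elements, splitting the flat list on separator_tag.
        chunk = []
        for el in elements:
            if el.tag == separator_tag:
                if chunk:
                    yield tuple(chunk)
                chunk = []
            else:
                chunk.append(el)
        if chunk:
            yield tuple(chunk)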