コード例 #1
0
    def _scrape_upper(self, roster_page, term):
        """Scrape every senator linked from the roster, then the Lt. Governor.

        Args:
            roster_page: parsed roster document; only its ``xpath`` method
                is used, to collect the member links found in the first
                table that has a caption.
            term: term identifier, passed through to ``_scrape_senator``
                and to the Lt. Governor's role.

        Raises:
            AttributeError: if a member link has no digits directly before
                ``.htm`` (the district number cannot be extracted).
            IndexError: if the Lt. Governor page has no ``memtitle`` div
                text.
        """
        def district_number(href):
            # The district is the run of digits right before ".htm".
            return int(re.search(r'\d+(?=\.htm)', href).group())

        # Visiting members in district order makes omissions easy to spot.
        senator_links = sorted(
            roster_page.xpath('(//table[caption])[1]//a/@href'),
            key=district_number)
        for senator_link in senator_links:
            self._scrape_senator(senator_link, term)

        # The Lt. Governor (President of the Senate) is not on the roster
        # and is scraped from a dedicated page.
        lt_gov_url = 'http://www.senate.state.tx.us/75r/LtGov/Ltgov.htm'
        response = self.get(lt_gov_url)
        lt_gov_page = lxml.html.fromstring(response.text)
        title_text = lt_gov_page.xpath('//div[@class="memtitle"]/text()')[0]
        lt_gov_name = title_text.replace('Lieutenant Governor', '').strip()

        lt_governor = Person(lt_gov_name)
        # Neither the official member page nor the party listings state the
        # party, so it is assumed here.
        lt_governor.add_role('Lt. Governor', term, party='Republican')
        lt_governor.add_source(lt_gov_url)
        self.save_legislator(lt_governor)
コード例 #2
0
    def scrape_member(self, chamber, term, member_url):
        """Scrape a single member page and save the resulting record.

        The page is parsed for the member's type, name, district, party,
        photo, capitol/district addresses and committee memberships.  A
        member whose type is ``'Lt. Gov.'`` is stored as a ``Person`` with a
        ``'Lt. Governor'`` role; everyone else is stored as a
        ``Legislator`` and is only saved when a district was found.

        Args:
            chamber: chamber handed to ``Legislator`` and used for the
                committee roles (except the one hard-coded joint committee).
            term: term identifier attached to every role.
            member_url: URL of the member page; also recorded as a source.

        Raises:
            IndexError: if the subtitle div, the member photo, the
                top-aligned table cell or the party text is missing.
        """
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

            sdiv = root.xpath('//div[@class="subtitle"]')[0]
            table = sdiv.getnext()

            # NOTE: every XPath below that starts with "//" is evaluated
            # from the document root, even though it is called on ``table``
            # or ``td``.  This is kept as-is on purpose.
            photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                    '_imgMember"]')[0].attrib['src']

            td = table.xpath('//td[@valign="top"]')[0]

            member_type = td.xpath('string(//div[1]/strong)').strip()

            full_name = td.xpath('string(//div[2]/strong)').strip()
            full_name = re.sub(r'\s+', ' ', full_name)

            district = td.xpath('string(//div[3])').strip()
            district = district.replace('District ', '')

            addrs = {}
            for atype, text in (('capital_address', 'Capitol address:'),
                                ('district_address', 'District address:')):
                aspan = root.xpath("//span[. = '%s']" % text)
                if not aspan:
                    addrs[atype] = None
                    continue

                label = aspan[0]
                # ``tail`` is None when no text directly follows the label
                # span; concatenating onto None used to raise TypeError, so
                # the lines are collected first and joined afterwards.
                lines = [] if label.tail is None else [label.tail]
                elem = label.getnext()
                # Each address line is the tail text of a <br> that follows
                # the label.
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        lines.append(elem.tail)
                    elem = elem.getnext()
                addrs[atype] = "\n".join(lines) if lines else None

            # Only the first character of the party text is significant.
            party = td.xpath('string(//div[4])').strip()[0]
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'

            if member_type == 'Lt. Gov.':
                leg = Person(full_name)
                leg.add_role('Lt. Governor', term, party=party, **addrs)
            else:
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name,
                                 party=party,
                                 photo_url=photo_url,
                                 **addrs)

            leg.add_source(urlescape(member_url))

            # A page without a committee section simply yields no committee
            # roles instead of aborting the scrape with an IndexError.
            comm_divs = root.xpath('//div[string() = "Committee Membership:"]'
                                   '/following-sibling::div'
                                   '[@class="rcwcontent"]')
            comm_links = comm_divs[0].xpath('*/a') if comm_divs else []

            for link in comm_links:
                raw_name = link.text
                if not raw_name:
                    # A link with no leading text cannot name a committee.
                    continue

                if '(Vice Chair)' in raw_name:
                    mtype = 'vice chair'
                elif '(Chair)' in raw_name:
                    mtype = 'chair'
                else:
                    mtype = 'member'

                name = clean_committee_name(raw_name)

                # There's no easy way to determine whether a committee
                # is joint or not using the mobile legislator directory
                # (without grabbing a whole bunch of pages, at least)
                # so for now we will hard-code the one broken case
                if (name == "Oversight of HHS Eligibility System"
                        and term == '82'):
                    comm_chamber = 'joint'
                else:
                    comm_chamber = chamber

                role = {'chamber': comm_chamber, 'position': mtype}
                if name.startswith('Appropriations-S/C on '):
                    # Appropriations subcommittees are listed under a
                    # combined name; split it into committee/subcommittee.
                    role['committee'] = 'Appropriations'
                    role['subcommittee'] = name.replace(
                        'Appropriations-S/C on ', '')
                else:
                    role['committee'] = name
                leg.add_role('committee member', term, **role)

            if member_type == 'Lt. Gov.':
                self.save_person(leg)
            elif district:
                # Members without a district are not saved.
                self.save_legislator(leg)
コード例 #3
0
ファイル: legislators.py プロジェクト: tyrocca/openstates
    def scrape_member(self, chamber, term, member_url):
        """Scrape a single member page and save the resulting record.

        The page is parsed for the member's type, name, district, party,
        photo, offices (capitol/district address and phone) and committee
        memberships.  A member whose type is ``'Lt. Gov.'`` is stored as a
        ``Person`` with a ``'Lt. Governor'`` role; everyone else is stored
        as a ``Legislator`` and is only saved when a district was found.
        If no name can be extracted a warning is logged and nothing is
        saved.

        Args:
            chamber: chamber handed to ``Legislator`` and used for the
                committee roles (except the one hard-coded joint committee).
            term: term identifier attached to every role.
            member_url: URL of the member page; also recorded as the
                legislator's ``url`` and as a source.

        Raises:
            IndexError: if the subtitle div, the member photo, the
                top-aligned table cell or the party text is missing.
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        # NOTE: every XPath below that starts with "//" is evaluated from
        # the document root, even though it is called on ``table`` or
        # ``td``.  This is kept as-is on purpose.
        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        member_type = td.xpath('string(//div[1]/strong)').strip()

        name_candidates = [re.sub(r'\s+', ' ', text).strip()
                           for text in td.xpath('//div/strong/text()')]
        if not name_candidates:
            self.warning("ERROR: CAN'T GET FULL NAME")
            return

        # The name is the last <strong> text found inside a div.
        full_name = name_candidates[-1]

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        # Only the first character of the party text is significant.
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if member_type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party)
        else:
            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             party=party,
                             photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (('capitol', 'Capitol address'),
                            ('district', 'District address')):
            aspan = root.xpath("//span[. = '%s:']" % text)
            if not aspan:
                continue

            label = aspan[0]
            # ``tail`` is None when no text directly follows the label span;
            # calling .strip() on it used to raise AttributeError.
            lines = [(label.tail or '').strip()]
            phone = None
            elem = label.getnext()
            # Each following <br> carries one line in its tail: either a
            # phone number (per ``phone_re``) or another address line.
            while elem is not None and elem.tag == 'br':
                if elem.tail:
                    if phone_re.match(elem.tail):
                        phone = elem.tail
                    else:
                        lines.append(elem.tail)
                elem = elem.getnext()
            leg.add_office(atype, text, address="\n".join(lines),
                           phone=phone)

        # add committees
        # A page without a committee section simply yields no committee
        # roles instead of aborting the scrape with an IndexError.
        comm_divs = root.xpath('//div[string() = "Committee Membership:"]'
                               '/following-sibling::div'
                               '[@class="rcwcontent"]')
        comm_links = comm_divs[0].xpath('*/a') if comm_divs else []

        for link in comm_links:
            raw_name = link.text
            if not raw_name:
                # A link with no leading text cannot name a committee.
                continue

            if '(Vice Chair)' in raw_name:
                mtype = 'vice chair'
            elif '(Chair)' in raw_name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(raw_name)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System"
                    and term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            role = {'chamber': comm_chamber, 'position': mtype}
            if name.startswith('Appropriations-S/C on '):
                # Appropriations subcommittees are listed under a combined
                # name; split it into committee/subcommittee.
                role['committee'] = 'Appropriations'
                role['subcommittee'] = name.replace(
                    'Appropriations-S/C on ', '')
            else:
                role['committee'] = name
            leg.add_role('committee member', term, **role)

        if member_type == 'Lt. Gov.':
            self.save_object(leg)
        elif district:
            # Members without a district are not saved.
            self.save_legislator(leg)