def _scrape_upper(self, roster_page, term):
    """Scrape every senator linked from the roster page, then the Lt. Governor.

    Args:
        roster_page: Parsed roster document; member links are read from the
            first table that has a caption.
        term: Term identifier forwarded to ``_scrape_senator`` and attached
            to the Lt. Governor role.
    """
    def district_number(href):
        # The district number is the run of digits directly before ".htm".
        return int(re.search(r'\d+(?=\.htm)', href).group())

    # Visit members in district order so omissions are easier to spot.
    senator_urls = sorted(
        roster_page.xpath('(//table[caption])[1]//a/@href'),
        key=district_number)
    for senator_url in senator_urls:
        self._scrape_senator(senator_url, term)

    # The Lt. Governor (President of the Senate) is handled separately from
    # the roster links, using a dedicated page.
    url = 'http://www.senate.state.tx.us/75r/LtGov/Ltgov.htm'
    response = self.get(url)
    page = lxml.html.fromstring(response.text)
    title_text = page.xpath('//div[@class="memtitle"]/text()')[0]
    name = title_text.replace('Lieutenant Governor', '').strip()

    # Neither the official member page nor the party listings state the
    # party, so it is assumed.
    party = 'Republican'

    lt_governor = Person(name)
    lt_governor.add_role('Lt. Governor', term, party=party)
    lt_governor.add_source(url)
    self.save_legislator(lt_governor)
def scrape_member(self, chamber, term, member_url):
    """Scrape one member page and save the legislator (or Lt. Governor) found.

    Args:
        chamber: Chamber given to ``Legislator`` and used for committee roles,
            except for the single hard-coded joint committee.
        term: Term identifier attached to the member and to every role.
        member_url: URL of the member page; also recorded as the source.

    A member whose page yields no district is not saved unless the page
    identifies them as 'Lt. Gov.'.

    NOTE(review): this file contains another ``scrape_member`` definition
    further down; if both live in the same scope the later one replaces this
    one -- confirm which is intended.
    """
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

    sdiv = root.xpath('//div[@class="subtitle"]')[0]
    table = sdiv.getnext()

    # These expressions start with "//", so they are evaluated against the
    # whole document even though they are called on a sub-element.
    photo_url = table.xpath(
        '//img[@id="ctl00_ContentPlaceHolder1_imgMember"]')[0].attrib['src']
    td = table.xpath('//td[@valign="top"]')[0]

    member_type = td.xpath('string(//div[1]/strong)').strip()

    full_name = td.xpath('string(//div[2]/strong)').strip()
    full_name = re.sub(r'\s+', ' ', full_name)

    district = td.xpath('string(//div[3])').strip()
    district = district.replace('District ', '')

    addrs = {}
    for atype, label in (('capital_address', 'Capitol address:'),
                         ('district_address', 'District address:')):
        addrs[atype] = None
        aspan = root.xpath("//span[. = '%s']" % label)
        if not aspan:
            continue

        # The address is the text trailing the label span plus the text
        # trailing each immediately following <br>.  lxml reports a missing
        # tail as None, so the pieces are gathered in a list rather than
        # concatenated onto a value that may be None.
        lines = []
        if aspan[0].tail is not None:
            lines.append(aspan[0].tail)
        elem = aspan[0].getnext()
        while elem is not None and elem.tag == 'br':
            if elem.tail:
                lines.append(elem.tail)
            elem = elem.getnext()
        if lines:
            addrs[atype] = "\n".join(lines)

    # Only the first character of the party cell is used; anything other
    # than D or R is passed through as that single character.
    party = td.xpath('string(//div[4])').strip()[0]
    party = {'D': 'Democratic', 'R': 'Republican'}.get(party, party)

    is_lt_gov = member_type == 'Lt. Gov.'
    if is_lt_gov:
        leg = Person(full_name)
        leg.add_role('Lt. Governor', term, party=party, **addrs)
    else:
        leg = Legislator(term, chamber, district, full_name,
                         party=party, photo_url=photo_url, **addrs)

    leg.add_source(urlescape(member_url))

    comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                          '/following-sibling::div'
                          '[@class="rcwcontent"]')[0]

    for link in comm_div.xpath('*/a'):
        name = link.text

        # Check "(Vice Chair)" first: it must win over the plain "(Chair)"
        # test for the same link text.
        if '(Vice Chair)' in name:
            mtype = 'vice chair'
        elif '(Chair)' in name:
            mtype = 'chair'
        else:
            mtype = 'member'

        name = clean_committee_name(link.text)

        # There's no easy way to determine whether a committee is joint or
        # not using the mobile legislator directory (without grabbing a
        # whole bunch of pages, at least), so for now the one broken case
        # is hard-coded.
        if (name == "Oversight of HHS Eligibility System" and
                term == '82'):
            comm_chamber = 'joint'
        else:
            comm_chamber = chamber

        if name.startswith('Appropriations-S/C on '):
            sub = name.replace('Appropriations-S/C on ', '')
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee='Appropriations',
                         subcommittee=sub,
                         position=mtype)
        else:
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee=name,
                         position=mtype)

    if is_lt_gov:
        self.save_person(leg)
    elif district:
        self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape one member page and save the legislator (or Lt. Governor) found.

    Args:
        chamber: Chamber given to ``Legislator`` and used for committee roles,
            except for the single hard-coded joint committee.
        term: Term identifier attached to the member and to every role.
        member_url: URL of the member page; recorded as both the member's
            ``url`` and its source.

    Returns early, after logging a warning, when no name can be found on the
    page.  A member whose page yields no district is not saved unless the
    page identifies them as 'Lt. Gov.'.
    """
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    sdiv = root.xpath('//div[@class="subtitle"]')[0]
    table = sdiv.getnext()

    # These expressions start with "//", so they are evaluated against the
    # whole document even though they are called on a sub-element.
    photo_url = table.xpath(
        '//img[@id="ctl00_ContentPlaceHolder1_imgMember"]')[0].attrib['src']
    td = table.xpath('//td[@valign="top"]')[0]

    member_type = td.xpath('string(//div[1]/strong)').strip()

    # The name is taken from the last <strong> text node found under a div,
    # with runs of whitespace collapsed.
    names = [re.sub(r'\s+', ' ', text).strip()
             for text in td.xpath('//div/strong/text()')]
    if not names:
        self.warning("ERROR: CAN'T GET FULL NAME")
        return
    full_name = names[-1]

    district = td.xpath('string(//div[3])').strip()
    district = district.replace('District ', '')

    # Only the first character of the party cell is used; anything other
    # than D or R is passed through as that single character.
    party = td.xpath('string(//div[4])').strip()[0]
    party = {'D': 'Democratic', 'R': 'Republican'}.get(party, party)

    is_lt_gov = member_type == 'Lt. Gov.'
    if is_lt_gov:
        leg = Person(full_name)
        leg.add_role('Lt. Governor', term, party=party)
    else:
        leg = Legislator(term, chamber, district, full_name,
                         party=party, photo_url=photo_url,
                         url=member_url)

    leg.add_source(urlescape(member_url))

    # add addresses
    for atype, text in (('capitol', 'Capitol address'),
                        ('district', 'District address')):
        aspan = root.xpath("//span[. = '%s:']" % text)
        if not aspan:
            # No label span on the page: record no office rather than an
            # empty one.
            continue

        # The first address line is the text trailing the label span; lxml
        # reports a missing tail as None, hence the fallback to ''.
        addr_lines = [(aspan[0].tail or '').strip()]
        phone = None

        # Walk the immediately following <br> elements: trailing text that
        # matches phone_re is the phone number, everything else is another
        # address line.
        elem = aspan[0].getnext()
        while elem is not None and elem.tag == 'br':
            if elem.tail:
                if phone_re.match(elem.tail):
                    phone = elem.tail
                else:
                    addr_lines.append(elem.tail)
            elem = elem.getnext()

        leg.add_office(atype, text, address="\n".join(addr_lines),
                       phone=phone)

    # add committees
    comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                          '/following-sibling::div'
                          '[@class="rcwcontent"]')[0]

    for link in comm_div.xpath('*/a'):
        name = link.text

        # Check "(Vice Chair)" first: it must win over the plain "(Chair)"
        # test for the same link text.
        if '(Vice Chair)' in name:
            mtype = 'vice chair'
        elif '(Chair)' in name:
            mtype = 'chair'
        else:
            mtype = 'member'

        name = clean_committee_name(link.text)

        # There's no easy way to determine whether a committee is joint or
        # not using the mobile legislator directory (without grabbing a
        # whole bunch of pages, at least), so for now the one broken case
        # is hard-coded.
        if (name == "Oversight of HHS Eligibility System" and
                term == '82'):
            comm_chamber = 'joint'
        else:
            comm_chamber = chamber

        if name.startswith('Appropriations-S/C on '):
            sub = name.replace('Appropriations-S/C on ', '')
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee='Appropriations',
                         subcommittee=sub,
                         position=mtype)
        else:
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee=name,
                         position=mtype)

    if is_lt_gov:
        self.save_object(leg)
    elif district:
        self.save_legislator(leg)