def scrape(self, chamber, term):
    """Scrape TN legislators for one chamber and term from capitol.tn.gov.

    Reads party, district, phone and email from the chamber roster table,
    then fetches each member's detail page for the full name.
    """
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican'}

    # testing for chamber
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'

    # Past terms live under /archives/<term>GA/; the current term under /members/.
    if term != self.metadata["terms"][-1]["sessions"][0]:
        chamber_url = (root_url + url_chamber_name +
                       '/archives/' + term + 'GA/Members/index.html')
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    with self.urlopen(chamber_url) as page:
        page = lxml.html.fromstring(page)

        # Skip the header row.
        for row in page.xpath("//tr")[1:]:
            partyInit = row.xpath('td[2]')[0].text.split()[0]
            party = parties[partyInit]
            district = row.xpath('td[4]/a')[0].text.split()[1]
            phone = row.xpath('td[6]')[0].text
            # special case for Karen D. Camper: the number is wrapped in a div
            if phone is None:  # FIX: was `== None`; identity test is correct
                phone = row.xpath('td[6]/div')[0].text
            phone = '615-' + phone.split()[0]
            email = row.xpath('td[7]/a')[0].text
            member_url = (root_url + url_chamber_name + '/members/' +
                          abbr + district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district + '.jpg')

            with self.urlopen(member_url) as member_page:
                member_page = lxml.html.fromstring(member_page)
                name = member_page.xpath(
                    '//div[@id="membertitle"]/h2')[0].text

                # Strip the honorific prefix from the page title
                # (slice widths match the observed prefixes).
                if 'Speaker' in name:
                    full_name = name[8:]
                elif 'Lt.' in name:
                    full_name = name[13:]
                elif abbr == 'h':
                    full_name = name[5:]
                else:
                    full_name = name[8:]

                leg = Legislator(term, chamber, district, full_name,
                                 party=party, email=email, phone=phone,
                                 url=member_url,
                                 photo_url=member_photo_url)
                leg.add_source(chamber_url)
                leg.add_source(member_url)
                self.save_legislator(leg)
def scrape(self, chamber, term):
    """
    Scrapes legislators for the current term only
    """
    self.validate_term(term, latest_only=True)
    url = _BASE_URL % _CHAMBERS[chamber].lower()
    index = self.get(url).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    # Member rows live in the second table of the second column.
    base_table = html.xpath('body/table/tr/td[2]/table[2]')
    district = None  # keep track of district for substitutes
    for row in base_table[0].xpath('tr'):
        img_url = row.xpath('string(.//img/@src)')
        # Exactly two links per row are assumed: contact form, then bio page.
        contact_form, additional_info_url = row.xpath('.//a/@href')

        if "Substitute" in row.text_content():
            # it seems like the sub always follows the person who he/she
            # is filling in for.
            # most sub info is provided at the additional info url
            self.scrape_sub(chamber, term, district, additional_info_url)
            continue
        else:
            full_name = " ".join(
                row[1][0].text_content().replace(u'\xa0', ' ').split())
            party = _PARTY[row[1][0].tail.strip()]

        # Drop the first six text fragments (name/party cells already handled).
        pieces = [x.strip() for x in row.itertext() if x][6:]

        # The parsed HTML will be something like:
        # ['District 4', '2', 'nd', 'term', address, phone(s), profession, committees]
        # Sometimes there's a leadership title before all that
        if 'District ' in pieces[1]:
            pieces.pop(0)
        assert pieces[0].startswith('District '), \
            "Improper district found: {}".format(pieces[0])
        assert pieces[3] == 'term', \
            "Improper term found: {}".format(pieces[3])

        district = pieces[0]
        district = district.replace('District', '').strip()

        pieces = pieces[4:]
        if pieces[0].startswith(u'(Served '):
            pieces.pop(0)

        address = pieces.pop(0).strip()
        # Sanity check: a US address should end with a 5-digit ZIP.
        assert re.match(r'.*\d{5}', address), \
            "Address potentially invalid: {}".format(address)

        phone = None
        fax = None
        for line in pieces:
            # Prefer the home phone; fall back to the business phone.
            if line.lower().startswith('home '):
                phone = line[len('home '):]
            elif not phone and line.lower().startswith('bus '):
                phone = line[len('bus '):]
            if line.lower().startswith('fax '):
                fax = line[len('fax '):]
            # After committees begin, no more contact information exists
            if line == "Committees:":
                break

        leg = Legislator(term, chamber, district, full_name, party=party)
        leg.add_office('district', 'District Office',
                       fax=fax if fax else None,
                       phone=phone if phone else None)
        leg.add_source(url)
        leg['photo_url'] = img_url
        leg['contact_form'] = contact_form
        leg['url'] = additional_info_url

        self.save_legislator(leg)
def scrape_lower_chamber(self, term):
    """Scrape Puerto Rico House members from the chamber composition page.

    Each member card is parsed for name, party, district, address, and the
    first phone/fax numbers that pass validation.
    """
    # E-mail contact is now hidden behind webforms. Sadness.

    party_map = {
        'PNP': 'Partido Nuevo Progresista',
        'PPD': u'Partido Popular Democr\xe1tico',
        'PIP': u'Partido Independentista Puertorrique\u00F1o',
    }

    url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara.aspx'

    page = self.lxmlize(url)

    member_nodes = self.get_nodes(
        page,
        '//div[@class="info-block"][1]//a[@class="opener"]')

    if member_nodes is not None:
        for member_node in member_nodes:
            # Initialize default values for legislator attributes.
            name = None
            district = None
            address = None
            party = None
            photo_url = None
            phone = None
            fax = None

            photo_url = self.get_node(
                member_node,
                './/span[@class="identity"]/img/@src')

            # Node reference for convenience.
            info_node = self.get_node(
                member_node,
                './/span[@class="info"]')

            name_node = self.get_node(
                info_node,
                './/span[@class="name"]')
            # Strip titles from legislator name.
            if name_node is not None:
                name_text = name_node.text.strip()
                name_text = re.sub(r'^Hon\.[\s]*', '', name_text)
                name_text = re.sub(r' - .*$', '', name_text)
                name = ' '.join(name_text.split())

            party_node = self.get_node(
                info_node,
                './/span[@class="party"]/span')
            if party_node is not None:
                party_text = party_node.text.strip()
                party = party_map[party_text]

            district_node = self.get_node(
                info_node,
                './/span[@class="district"]')
            if district_node is not None:
                district_text = district_node.text.strip()
                try:
                    # NOTE(review): district_number is captured but unused;
                    # the re.search only serves to raise AttributeError for
                    # non-numeric districts (e.g. at-large members).
                    district_number = re.search(
                        r'0?(\d{1,2})', district_text).group(1)
                    district = re.sub(
                        r'^Distrito[\s]*', '', district_text).strip()
                except AttributeError:
                    if "Distrito" not in district_text:
                        district = 'At-Large'
                    else:
                        warning = u'{} missing district number.'
                        self.logger.warning(warning.format(name))

            address_node = self.get_node(
                info_node,
                './/span[@class="address"]')
            if address_node is not None:
                address_text = address_node.text
                if address_text and not address_text.isspace():
                    address = address_text.strip()

            # Only grabs the first validated phone number found.
            # Typically, representatives have multiple phone numbers.
            # NOTE(review): this XPath concatenates to '...data-type"and
            # contains(...)' — no space before 'and'; lxml appears to
            # tolerate it, but confirm (the Fax XPath below has the space).
            phone_nodes = self.get_nodes(
                member_node,
                './/span[@class="two-columns"]//span[@class="data-type"'
                'and contains(text(), "Tel:")]')
            if phone_nodes is not None:
                has_valid_phone = False
                for phone_node in phone_nodes:
                    # Don't keep searching phone numbers if a good
                    # one is found.
                    if has_valid_phone:
                        break
                    phone_text = phone_node.text
                    phone_text = re.sub(r'^Tel:[\s]*', '', phone_text)\
                        .strip()
                    if self.validate_phone_number(phone_text):
                        phone = phone_text
                        has_valid_phone = True

            fax_node = self.get_node(
                member_node,
                './/span[@class="two-columns"]//span[@class="data-type"'
                ' and contains(text(), "Fax:")]')
            if fax_node is not None:
                fax_text = fax_node.text
                fax_text = re.sub(r'^Fax:[\s]*', '', fax_text).strip()
                if self.validate_phone_number(fax_text):
                    fax = fax_text

            legislator = Legislator(term=term,
                                    chamber='lower',
                                    district=district,
                                    full_name=name,
                                    party=party,
                                    photo_url=photo_url)
            legislator.add_source(url)
            legislator.add_office(
                type='capitol',
                name='Oficina del Capitolio',
                address=address,
                phone=phone,
                fax=fax,
            )

            self.save_legislator(legislator)
def fetch_member(self, url, name, term, chamber):
    """Scrape one Virginia legislator's detail page and save the record.

    Resolves a photo URL (per-chamber strategy), filters out moved/vacated
    seats, extracts party and district from the page header, and parses
    office address/phone/email lists.
    """
    photo_url = ''
    lis_id = self._get_lis_id(chamber, url)
    if chamber == 'lower':
        # House photos are addressable directly by LIS id.
        base_url = 'http://memdata.virginiageneralassembly.gov'
        profile_url = base_url + '/images/display_image/{}'
        photo_url = profile_url.format(lis_id)
        #xpath_query = './/img/@src'
    elif chamber == 'upper':
        # Senate photos must be scraped off the member profile page.
        base_url = 'http://apps.senate.virginia.gov'
        profile_url = base_url + '/Senator/memberpage.php?id={}'
        xpath_query = './/img[@class="profile_pic"]/@src'
        # Retrieve profile photo.
        profile_page = self.lxmlize(profile_url.format(lis_id))
        photo_url = self.get_node(profile_page, xpath_query)

    # Detect whether URL points to a blank base location.
    blank_urls = (
        'http://memdata.virginiageneralassembly.gov/images/display_'
        'image/',
        'http://virginiageneralassembly.gov/house/members/photos/',
    )
    if photo_url in blank_urls:
        photo_url = ''

    # Members listed under the wrong chamber (they moved) are skipped.
    if (name in CHAMBER_MOVES and (chamber != CHAMBER_MOVES[name])):
        return

    if "vacated" in name.lower():
        self.logger.warning(
            "Seat seems to have been vacated: '{}'".format(name))
        return

    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    # handle resignations, special elections
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        name = name.rsplit('-')[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    party_district_line = doc.xpath('//h3/font/text()')[0]
    party, district = party_district_re.match(party_district_line).groups()

    # Scrub status from name.
    name = re.sub(r'(- Elect)$', '', name).strip()

    leg = Legislator(
        term=term,
        chamber=chamber,
        district=district,
        full_name=name.strip(),
        party=party_map[party],
        url=url,
        photo_url=photo_url,
    )
    leg.add_source(url)

    for ul in doc.xpath('//ul[@class="linkNon" and normalize-space()]'):
        address = []
        phone = None
        email = None
        for li in ul.getchildren():
            text = li.text_content()
            if re.match(r'\(\d{3}\)', text):  # FIX: raw string for regex
                phone = text
            elif text.startswith('email:'):
                # FIX: was `text.strip('email: ')`, which strips the
                # CHARACTER SET {e,m,a,i,l,:,space} from both ends and
                # mangled addresses (e.g. 'x@y.com' -> 'x@y.co').
                # Remove only the literal prefix instead.
                email = text[len('email:'):].strip()
            else:
                address.append(text)
        # NOTE(review): this tests list membership, i.e. it expects one
        # address line to be exactly 'Capitol Square' — confirm against
        # live markup.
        office_type = ('capitol' if 'Capitol Square' in address
                       else 'district')
        name = ('Capitol Office' if office_type == 'capitol'
                else 'District Office')
        leg.add_office(office_type, name,
                       address='\n'.join(address),
                       phone=phone, email=email)

    for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
        leg.add_role('committee member', term=term,
                     chamber=chamber, committee=com)

    self.save_legislator(leg)
def scrape_reps(self, chamber, term_name):
    """Scrape Maine House members from the positional district index page.

    The index page lists 151 districts as sequential <p> elements; name,
    party, and district name are sliced out of each link's text.
    """
    url = 'http://www.maine.gov/legis/house/dist_mem.htm'
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # There are 151 districts
    for district in xrange(1, 152):
        # Every tenth paragraph has an extra leading link, shifting the
        # member link from a[2] to a[3].
        if (district % 10) == 0:
            path = '/html/body/p[%s]/a[3]' % (district + 4)
        else:
            path = '/html/body/p[%s]/a[2]' % (district + 4)
        try:
            link = page.xpath(path)[0]
        except IndexError:
            # If the the district % 10 == 0 query doesn't
            # produce a link, retry the second link. Horrible.
            path = '/html/body/p[%s]/a[2]' % (district + 4)
            link = page.xpath(path)[0]
        leg_url = link.get('href')
        name = link.text_content()
        # NOTE(review): these `return`s abort the remaining districts,
        # not just this row — presumably they mark the end of the list.
        if len(name) == 0:
            return
        if name.split()[0] == 'District':
            return

        # Text looks like '<title + name> (P-District Name)'; slice out
        # party initial, district name, and the name itself.
        mark = name.find('(')
        party = name[mark + 1]
        district_name = name[mark + 3:-1]
        name = name[15:mark]

        # vacant
        if party == "V":
            continue
        else:
            party = _party_map[party]

        leg = Legislator(term_name, chamber, str(district), name,
                         party=party, url=leg_url,
                         district_name=district_name)
        leg.add_source(url)
        leg.add_source(leg_url)

        # Get the photo url.
        html = self.urlopen(leg_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(leg_url)

        # Get the default (B&W) photo url.
        photo_url = doc.xpath('//img')[0]
        if 'src' in photo_url.attrib:
            photo_url = photo_url.attrib.pop('src')
            leg['photo_url'] = photo_url
        else:
            photo_url = None

        # Try to get color photo from the GOP website.
        # NOTE(review): in both party branches below photo_url is
        # reassigned but leg['photo_url'] is not updated afterwards —
        # confirm whether the color photo was meant to be saved.
        if party == 'Republican':
            xpath = '//a[contains(@href, "house_gop")]/@href'
            party_website_url = doc.xpath(xpath)[0]
            party_website_html = self.urlopen(party_website_url)
            if party_website_html.response.status_code == 200:
                party_website = lxml.html.fromstring(party_website_html)
                photo_url = party_website.xpath('//img/@src')[1]

        # Try to get color photo from the dems' website.
        elif party == 'Democratic':
            xpath = '//a[contains(@href, "housedems")]/@href'
            els = doc.xpath(xpath)
            if els:
                party_website_url = els[0]
                try:
                    party_website_html = self.urlopen(party_website_url)
                except scrapelib.HTTPError:
                    # Sometimes the page doesn't exist.
                    pass
                else:
                    if party_website_html.response.status_code == 200:
                        party_website = lxml.html.fromstring(
                            party_website_html)
                        photo_url = party_website.xpath('//img/@src')[1]

        self.scrape_lower_offices(leg, page, leg_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape TN legislators for one chamber and term from capitol.tn.gov.

    Reads party, district, capitol address/phone and email from the roster
    table, then fetches each member page for the full name.
    """
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican',
               'I': 'Independent'}

    #testing for chamber
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'
    # Past terms live in the archives; the current one under /members/.
    if term != self.metadata["terms"][-1]["sessions"][0]:
        chamber_url = root_url + url_chamber_name
        chamber_url += '/archives/' + term + 'GA/Members/index.html'
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    page = self.get(chamber_url).text
    page = lxml.html.fromstring(page)

    for row in page.xpath("//tr"):
        # Skip any a header row.
        if set(child.tag for child in row) == set(['th']):
            continue

        vacancy_check = row.xpath('./td/text()')[1]
        if 'Vacant' in vacancy_check:
            self.logger.warning("Vacant Seat")
            continue

        partyInit = row.xpath('td[3]')[0].text.split()[0]
        party = parties[partyInit]
        district = row.xpath('td[5]/a')[0].text.split()[1]
        address = row.xpath('td[6]')[0].text_content()
        # Expand building abbreviations into full mailing addresses.
        # 301 6th Avenue North Suite
        address = address.replace(
            'LP', 'Legislative Plaza\nNashville, TN 37243')
        address = address.replace(
            'WMB', 'War Memorial Building\nNashville, TN 37243')
        address = '301 6th Avenue North\nSuite ' + address
        phone = [
            x.strip() for x in row.xpath('td[7]//text()') if x.strip()
        ][0]
        # mailto: link holds an HTML-escaped email address.
        email = HTMLParser.HTMLParser().unescape(
            row.xpath('td[1]/a/@href')[0][len("mailto:"):])
        member_url = (root_url + url_chamber_name + '/members/' +
                      abbr + district + '.html')
        member_photo_url = (root_url + url_chamber_name +
                            '/members/images/' + abbr + district + '.jpg')

        try:
            # follow_redirects kwarg not supported by all backends;
            # fall back to a plain GET.
            member_page = self.get(member_url,
                                   follow_redirects=False).text
        except TypeError:
            member_page = self.get(member_url).text
        member_page = lxml.html.fromstring(member_page)
        try:
            name = member_page.xpath('body/div/div/h1/text()')[0]
        except IndexError:
            name = member_page.xpath(
                '//div[@id="membertitle"]/h2/text()')[0]

        # Strip the honorific prefix from the page title.
        if 'Speaker' in name:
            full_name = name[8:len(name)]
        elif 'Lt.' in name:
            full_name = name[13:len(name)]
        elif abbr == 'h':
            full_name = name[len("Representative "):len(name)]
        else:
            full_name = name[8:len(name)]

        leg = Legislator(term, chamber, district, full_name.strip(),
                         party=party, email=email, url=member_url,
                         photo_url=member_photo_url)
        leg.add_source(chamber_url)
        leg.add_source(member_url)

        # TODO: add district address from this page

        leg.add_office('capitol', 'Nashville Address',
                       address=address, phone=phone)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape WA legislators for one chamber from the member index pages.

    Party, district, photo and offices come from the index page; the email
    address is cross-referenced on a separate listing keyed by district and
    the member's position number.
    """
    if chamber == 'upper':
        index_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
    else:
        index_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'
    doc = self.lxmlize(index_url)

    # Email addresses are listed on a separate page.
    email_list_url = 'http://app.leg.wa.gov/memberemail/Default.aspx'
    email_doc = self.lxmlize(email_list_url)

    for member in doc.xpath(
            '//div[@id="allMembers"]/div[@class="memberInformation"]'):
        (photo_url, ) = member.xpath(
            './/a[text()="Print Quality Photo"]/@href')

        (title_name_party, ) = member.xpath(
            './/span[@class="memberName"]/text()')
        (name, party) = re.search(
            r'^(?:Senator|Representative)\s(.+)\s\(([RD])\)$',
            title_name_party).groups()
        if party == 'R':
            party = "Republican"
        elif party == 'D':
            party = "Democratic"

        # The district link appears twice; assert both copies agree.
        (district_name, _district_name, ) = member.xpath(
            './/a[contains(text(), " Legislative District")]/text()')
        assert district_name == _district_name
        district_num = re.search(
            r'(\d{1,2})\w{2} Legislative District',
            district_name).group(1)

        leg = Legislator(
            full_name=name,
            term=term,
            chamber=chamber,
            district=district_num,
            party=party,
            photo_url=photo_url
        )
        leg['url'] = member.xpath(
            './/a[contains(text(), "Home Page")]/@href')[0]

        capitol_office = member.xpath(
            './/div[@class="memberColumnTitle" and'
            ' text()=" Olympia Office"]/parent::div[1]/text()')
        capitol_office = [l.strip() for l in capitol_office if l.strip()]

        capitol_fax = None
        capitol_phone = None
        capitol_address = None

        # Can't capture any information anyway if office data is empty,
        # so we can skip if that's the case.
        if capitol_office:
            # Retrieve capitol office fax number.
            if capitol_office[-1].startswith('Fax: '):
                capitol_fax = capitol_office.pop().replace('Fax: ', "")

            # Retrieve capitol office phone number.
            capitol_phone = capitol_office.pop()

            # Retrieve capitol office address.
            capitol_address = '\n'.join(capitol_office)

        # Retrieve the member's position from the email link.
        # We need it to find the member's email address.
        # These positions are enough to discriminate the chamber too
        # (0 = upper, 1,2 = lower)
        email_link_url = member.xpath(
            './/a[contains(@href, "memberEmail")]')[0].get('href')
        # FIX: pattern was r'/([[0-9]+)$' — the stray '[' put a literal
        # bracket inside the character class.
        position = re.search(r'/([0-9]+)$', email_link_url).group(1)

        # Need to get the email from the email page by matching with the
        # member's district and position
        email = self.get_node(
            email_doc,
            './/tr/td/a[contains(@href, "memberEmail/{}/{}")]/parent::td/'
            'following-sibling::td[1]/text()'.format(
                district_num, position))

        leg.add_office(
            'capitol',
            'Capitol Office',
            address=capitol_address,
            phone=capitol_phone,
            email=email,
            fax=capitol_fax
        )

        _has_district_office = member.xpath(
            './/div[@class="memberColumnTitle" and'
            ' text()=" District Office"]')
        if _has_district_office:
            # Out of both chambers, only one member has multiple district
            # offices, so ignore that
            # Also ignore the few members who have separate mailing addresses
            district_office = member.xpath(
                './/div[@class="memberColumnTitle" and'
                ' text()=" District Office"]/parent::div[1]/text()')
            district_office = [
                l.strip() for l in district_office if l.strip()]
            _end_of_first_address = district_office.index(
                [l for l in district_office
                 if re.search(r'\,\s*WA\s*\d{5}', l)][0])
            district_address = '\n'.join(
                district_office[0:(_end_of_first_address + 1)])
            # FIX: initialize so a missing phone line (IndexError below)
            # no longer raises NameError at add_office().
            district_phone = None
            try:
                district_phone = district_office[
                    (_end_of_first_address + 1)]
                assert re.match(r'\(\d{3}\) \d{3} \- \d{4}',
                                district_phone)
            except IndexError:
                pass
            except AssertionError:
                # Preserves original behavior: a malformed phone line is
                # still passed through as-is.
                pass
            leg.add_office(
                'district',
                'District Office',
                address=district_address,
                phone=district_phone
            )

        leg.add_source(index_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape WA legislators from the SponsorService XML API.

    The chamber index page is used both to skip members no longer in
    office and to find each member's home-page URL, which is then scraped
    for photo and office details.
    """
    biennium = "%s-%s" % (term[0:4], term[7:9])
    url = ("http://wslwebservices.leg.wa.gov/SponsorService.asmx/"
           "GetSponsors?biennium=%s" % biennium)

    # these pages are useful for checking if a leg is still in office
    if chamber == 'upper':
        cur_member_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
    else:
        cur_member_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'

    cur_members = self.get(cur_member_url).text
    cur_members_doc = lxml.html.fromstring(cur_members)
    cur_members_doc.make_links_absolute(cur_member_url)

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)

    for member in xpath(page, "//wa:Member"):
        mchamber = xpath(member, "string(wa:Agency)")
        mchamber = {'House': 'lower', 'Senate': 'upper'}[mchamber]
        if mchamber != chamber:
            continue

        name = xpath(member, "string(wa:Name)").strip()
        if name == "":
            continue

        # if the legislator isn't in the listing, skip them
        # (substring match against the raw index HTML).
        if name not in cur_members:
            self.warning('%s is no longer in office' % name)
            continue
        else:
            # The set() collapses duplicate links; unpacking asserts
            # exactly one distinct home page exists for this name.
            leg_url, = set(cur_members_doc.xpath(
                '//span[contains(text(), "%s")]/../..//'
                'a[text()="Home Page"]/@href' % (
                    name
                )))

        party = xpath(member, "string(wa:Party)")
        party = {'R': 'Republican', 'D': 'Democratic'}.get(
            party, party)

        district = xpath(member, "string(wa:District)")
        if district == '0':
            # Skip phony district 0.
            continue

        email = xpath(member, "string(wa:Email)")
        # NOTE(review): this API phone is shadowed by the office-parsing
        # loop below and is never passed to Legislator — confirm intent.
        phone = xpath(member, "string(wa:Phone)")

        last = xpath(member, "string(wa:LastName)")
        last = last.lower().replace(' ', '')

        scraped_offices = []
        photo_url = ""

        try:
            leg_page = self.get(leg_url).text
            leg_page = lxml.html.fromstring(leg_page)
            leg_page.make_links_absolute(leg_url)

            photo_link = leg_page.xpath(
                "//a[contains(@href, 'publishingimages')]")
            if photo_link:
                photo_url = photo_link[0].attrib['href']

            offices = leg_page.xpath(
                "//table[@cellspacing='0']/tr/td/b"
                "[contains(text(), 'Office')]")
            for office in offices:
                office_block = office.getparent()
                office_name = office.text_content().strip().rstrip(":")
                # Address lines are the tails of the <br> elements;
                # the last line is dropped, the one before it is the phone.
                address_lines = [x.tail for x in office_block.xpath(".//br")]
                address_lines = filter(lambda a: a is not None,
                                       address_lines)
                _ = address_lines.pop(len(address_lines) - 1)
                phone = address_lines.pop(len(address_lines) - 1)
                address = "\n".join(address_lines)

                obj = {
                    "name": office_name,
                    "phone": phone
                }

                if address.strip() != '':
                    obj['address'] = address

                scraped_offices.append(obj)
        except scrapelib.HTTPError:
            # Sometimes the API and website are out of sync
            # with respect to legislator resignations/appointments
            pass
        except requests.exceptions.ConnectionError:
            # Sometimes the API and website are out of sync
            # with respect to legislator resignations/appointments
            pass

        leg = Legislator(term, chamber, district, name,
                         '', '', '', party,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(leg_url)

        for office in scraped_offices:
            typ = 'district' if 'District' in office['name'] else 'capitol'
            leg.add_office(typ, office.pop('name'), **office)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape IL legislators for one chamber/term from the member list.

    Each member's bio page provides photo, email, and up to two office
    contact tables (Springfield and District).
    """
    term_slug = term[:-2]
    url = MEMBER_LIST_URL[chamber] % term_slug

    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # The member roster is the fifth table; first two rows are headers.
    for row in doc.xpath('//table')[4].xpath('tr')[2:]:
        name, _, _, district, party = row.xpath('td')
        district = district.text
        party = {
            'D': 'Democratic',
            'R': 'Republican',
            'I': 'Independent'
        }[party.text]
        leg_url = name.xpath('a/@href')[0]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            name = name.strip('*')
            continue

        leg_html = self.urlopen(leg_url)
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        leg = Legislator(term, chamber, district, name,
                         party=party, url=leg_url)
        leg.add_source(url)

        hotgarbage = ('Senate Biography Information for the 98th General '
                      'Assembly is not currently available.')

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning('No legislator bio available for ' + name)
            self.save_legislator(leg)
            continue

        # Percent-encode the photo path so the stored URL is valid.
        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        photo_url_parsed = urlparse(photo_url)
        encoded_path = quote(photo_url_parsed.path)
        photo_url = photo_url_parsed._replace(path=encoded_path).geturl()

        leg.update(photo_url=photo_url)
        leg.add_source(leg_url)

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            leg['email'] = email[0].tail

        # function for turning an IL contact info table to office details
        # (closure: mutates the enclosing `leg`).
        def _table_to_office(table, office_type, office_name):
            addr = ''
            phone = ''
            fax = None
            for row in table.xpath('tr'):
                row = row.text_content().strip()
                # skip rows that aren't part of address
                if 'Office:' in row or row == 'Cook County':
                    continue
                # fax number row ends with FAX
                elif 'FAX' in row:
                    fax = row.replace(' FAX', '')
                # phone number starts with ( [make it more specific?]
                elif row.startswith('('):
                    phone = row
                # everything else is an address
                else:
                    addr += (row + '\n')
            if addr.strip() != ',':
                leg.add_office(office_type, office_name,
                               address=addr.strip(),
                               phone=phone, fax=fax)

        # extract both offices from tables
        # NOTE(review): table[3] picks the 4th matching table — presumably
        # the page nests matches; verify against live markup.
        table = leg_doc.xpath(
            '//table[contains(string(), "Springfield Office")]')
        if table:
            _table_to_office(table[3], 'capitol', 'Springfield Office')

        table = leg_doc.xpath(
            '//table[contains(string(), "District Office")]')
        if table:
            _table_to_office(table[3], 'district', 'District Office')

        self.save_legislator(leg)
def scrape_lower_chamber(self, term):
    """Scrape Oklahoma House members from the okhouse.gov member grid.

    Names arrive as 'Last, First'; the per-district page supplies the
    photo and office details.
    """
    url = "http://www.okhouse.gov/Members/Default.aspx"

    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//table[@id="ctl00_ContentPlaceHolder1_RadGrid1_ctl00"]/tbody/tr')

    for legislator_node in legislator_nodes:
        name_node = self.get_node(
            legislator_node,
            './/td[1]/a')

        if name_node is not None:
            name_text = name_node.text.strip()

            # NOTE(review): str.partition never returns None, so this
            # check is always true and the ValueError branch is
            # unreachable; a no-comma name slips through with an odd
            # leading space (and is then caught by the 'House District'
            # skip for placeholder rows). Confirm before tightening.
            last_name, delimiter, first_name = name_text.partition(',')

            if last_name is not None and first_name is not None:
                first_name = first_name.strip()
                last_name = last_name.strip()
                name = ' '.join([first_name, last_name])
            else:
                raise ValueError('Unable to parse name: {}'.format(
                    name_text))

            # Placeholder rows for vacant seats are titled by district.
            if name.startswith('House District'):
                continue

        district_node = self.get_node(
            legislator_node,
            './/td[3]')

        if district_node is not None:
            district = district_node.text.strip()

        party_node = self.get_node(
            legislator_node,
            './/td[4]')

        if party_node is not None:
            party_text = party_node.text.strip()
            party = self._parties[party_text]

        legislator_url = 'http://www.okhouse.gov/District.aspx?District=' + district
        legislator_page = self.lxmlize(legislator_url)

        photo_url = self.get_node(
            legislator_page,
            '//a[@id="ctl00_ContentPlaceHolder1_imgHiRes"]/@href')

        legislator = Legislator(
            _scraped_name=name_text,
            full_name=name,
            term=term,
            chamber='lower',
            district=district,
            party=party,
            photo_url=photo_url,
            url=legislator_url
        )

        legislator.add_source(url)
        legislator.add_source(legislator_url)

        # Scrape offices.
        self.scrape_lower_offices(legislator_page, legislator)

        self.save_legislator(legislator)
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Scrape one MS legislator's XML detail record and save it.

    leg_link is the member's XML filename; a falsy link is only legal for
    a vacancy. HTTP errors are downgraded to warnings.
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.get(url)
        root = lxml.etree.fromstring(details_page.content)
        party = root.xpath('string(//PARTY)')

        district = root.xpath('string(//DISTRICT)')

        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

        home_phone = root.xpath('string(//H_PHONE)')

        home_address = root.xpath('string(//H_ADDRESS)')
        home_address2 = root.xpath('string(//H_ADDRESS2)')
        home_city = root.xpath('string(//H_CITY)')
        home_zip = root.xpath('string(//H_ZIP)')

        home_address_total = "%s\n%s\n%s\n%s" % (
            home_address,
            home_address2,
            home_city,
            home_zip
        )

        # NOTE(review): bis_phone/other_phone are collected but unused.
        bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
        cap_room = root.xpath('string(//CAP_ROOM)')

        # Special-case members whose feed omits a party; the asserts
        # force cleanup of these entries once the feed is fixed.
        if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
            assert not party, ("Remove special-casing for this Democrat "
                               "without a listed party: {}").format(leg_name)
            party = 'Democratic'
        elif leg_name in ('James W. Mathis', 'John Glen Corley'):
            assert not party, ("Remove special-casing for this Republican "
                               "without a listed party: {}").format(leg_name)
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            raise AssertionError(
                "A member with no identifiable party was found: {}".format(
                    leg_name))
        leg = Legislator(term, chamber, district, leg_name,
                         party=party,
                         role=role,
                         org_info=org_info,
                         url=url,
                         photo_url=photo)
        leg.add_source(url)

        # Capitol office: bare usernames get the chamber's mail domain.
        kwargs = {}
        if email_name != "":
            if "@" in email_name:
                email = email_name
            else:
                email = '%s@%s.ms.gov' % (
                    email_name,
                    {"upper": "senate", "lower": "house"}[chamber])
            kwargs['email'] = email
        if capital_phone != "":
            kwargs['phone'] = capital_phone
        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS
        leg.add_office('capitol', 'Capitol Office', **kwargs)

        # District (home) office, only if any contact info exists.
        kwargs = {}
        if home_phone != "":
            kwargs['phone'] = home_phone
        if home_address_total != "":
            kwargs['address'] = home_address_total
        if kwargs != {}:
            leg.add_office('district', 'District Office', **kwargs)

        self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        self.warning(str(e))
def scrape_session(self, term, chambers, session):
    """Scrape GA legislators for one session via the members SOAP service.

    `session` is a key into metadata['session_details']; members are
    fetched by the session GUID.
    """
    session = self.metadata['session_details'][session]
    sid = session['_guid']
    members = self.sservice.GetMembersBySession(sid)['MemberListing']
    for member in members:
        guid = member['Id']
        # print member['Name']
        nick_name, first_name, middle_name, last_name = (
            member['Name'][x] for x in
            ['Nickname', 'First', 'Middle', 'Last'])
        chamber, district = (
            member['District'][x] for x in ['Type', 'Number'])

        party = member['Party']
        if party == 'Democrat':
            party = 'Democratic'

        # print first_name, middle_name, last_name, party
        # print chamber, district

        # Prefer the nickname when one is provided.
        first_name = nick_name if nick_name else first_name
        # XXX: Due to the upstream handling...
        # if middle_name:
        #     name = "%s %s %s" % (first_name, middle_name, last_name)
        # else:
        # blocked out due to GA putting middle_name in first_name ...
        name = "%s %s" % (first_name, last_name)

        chamber = {
            "House": 'lower',
            "Senate": 'upper'
        }[chamber]

        if party.strip() == '':
            party = 'other'

        legislator = Legislator(
            term,
            chamber,
            str(district),
            name,
            party=party,
            # last_name=last_name,
            # first_name=first_name,
            _guid=guid)
        # if middle_name:
        #     legislator['middle_name'] = middle_name
        # Sadly, upstream isn't good about keeping first names first only,
        # so I'm blocking this out.

        # Only record the district address when every field is present.
        ainfo = [
            member['DistrictAddress'][x] for x in
            ['Street', 'City', 'State', 'Zip']
        ]
        if not None in ainfo:
            # XXX: Debug this nonsense.
            ainfo = [x.strip() for x in ainfo]
            address = " ".join(ainfo)
            email = member['DistrictAddress']['Email']
            legislator.add_office('district',
                                  'District Address',
                                  address=address,
                                  email=email)

        legislator.add_source(self.ssource)
        self.save_legislator(legislator)
def scrape(self, term, chambers):
    """Scrape CT legislators from the official legislator CSV export.

    Every record carries its chamber code; records outside the requested
    chambers are skipped. Capitol office and joint-committee roles are
    attached to each legislator.
    """
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    data = self.urlopen(leg_url)
    page = open_csv(data)

    office_code_map = {'H': 'lower', 'S': 'upper'}

    for record in page:
        chamber = office_code_map[record['office code']]
        if chamber not in chambers:
            continue

        district = record['dist'].lstrip('0')

        # Assemble the display name from its parts, keeping only the
        # optional pieces that are non-empty.
        name_parts = [record['first name']]
        middle = record['middle initial'].strip()
        if middle:
            name_parts.append(middle)
        name_parts.append(record['last name'])
        suffix = record['suffix'].strip()
        if suffix:
            name_parts.append(suffix)
        name = ' '.join(name_parts)

        party = record['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Legislator(term, chamber, district, name,
                         first_name=record['first name'],
                         last_name=record['last name'],
                         middle_name=record['middle initial'],
                         suffixes=record['suffix'],
                         party=party,
                         email=record['email'],
                         url=record['URL'],
                         office_phone=record['capitol phone'])

        office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
            record['capitol street address'], record['room number'])
        leg.add_office('capitol', 'Capitol Office',
                       address=office_address,
                       phone=record['capitol phone'])
        # skipping home address for now
        leg.add_source(leg_url)

        # Committee entries look like 'Name' or 'Name (role)'.
        for entry in record['committee member1'].split(';'):
            if not entry:
                continue
            if ' (' in entry:
                entry, role = entry.split(' (')
                role = role.strip(')').lower()
            else:
                role = 'member'
            leg.add_role('committee member', term,
                         chamber='joint',
                         committee=entry.strip(),
                         position=role)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Rhode Island legislators for one chamber.

    Member data comes from a published XLS roster; homepage URLs are
    matched against the chamber index page and phone numbers against the
    per-district contact listing.

    Fixes vs. the previous revision:
    - ``d['address'] is ''`` (identity test, effectively never true for
      xlrd cell values) is now an equality test so the fallback applies.
    - office label typo 'Dictrict Office' -> 'District Office'.
    - guard against e-mail addresses that do not match the expected
      slug pattern (previously raised AttributeError on ``slug.groupdict``).
    - removed no-op ``.strip('')`` calls (stripping the empty character
      set strips nothing).
    """
    if chamber == 'upper':
        url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
        rep_type = 'Senator'
        source_url = 'http://www.rilin.state.ri.us/senators/default.aspx'
        source_url_title_replacement = rep_type
        contact_url = 'http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp'
    elif chamber == 'lower':
        url = (
            'http://webserver.rilin.state.ri.us/Documents/Representatives.xls'
        )
        rep_type = 'Representative'
        source_url = 'http://www.rilin.state.ri.us/representatives/default.aspx'
        source_url_title_replacement = 'Rep. '
        contact_url = 'http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp'

    self.urlretrieve(url, 'ri_leg.xls')
    wb = xlrd.open_workbook('ri_leg.xls')
    sh = wb.sheet_by_index(0)

    # This isn't perfect but it's cheap and better than using the
    # XLS doc as the source URL for all legislators.
    # 374: RI: legislator url
    # NOTE(review): this map is built but not read below — verify intent.
    leg_source_url_map = {}
    leg_page = self.lxmlize(source_url)
    for link in leg_page.xpath('//td[@class="ms-vb2"]'):
        leg_name = link.text_content().replace(
            source_url_title_replacement, '')
        leg_url = link.xpath("..//a")[0].attrib['href']
        leg_source_url_map[leg_name] = leg_url

    for rownum in xrange(1, sh.nrows):
        d = {}
        for field, col_num in excel_mapping.iteritems():
            d[field] = sh.cell(rownum, col_num).value
        if d['full_name'].upper() == "VACANT":
            self.warning("District {}'s seat is vacant".format(
                int(d['district'])))
            continue

        # E-mail cells follow "sen-slug@..." / "rep-slug@..."; cells that
        # are actually contact-form .asp links carry no usable address.
        slug = re.match(
            "(?P<class>sen|rep)-(?P<slug>.*)@(rilin\.state\.ri\.us|rilegislature\.gov)",
            d['email'])
        if 'asp' in d['email']:
            d['email'] = None
        # BUGFIX: also require a pattern match before using groupdict().
        if d['email'] is not None and slug is not None:
            info = slug.groupdict()
            info['chamber'] = "senators" if info[
                'class'] == 'sen' else "representatives"
            url = ("http://www.rilin.state.ri.us/{chamber}/"
                   "{slug}/Pages/Biography.aspx".format(**info))

        dist = str(int(d['district']))
        district_name = dist

        assert d['full_name'].startswith(rep_type), "Improper name found"
        full_name = re.sub(r"^{}(?=\s?[A-Z].*$)".format(rep_type), '',
                           d['full_name']).strip()
        translate = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }

        # Find the member's homepage by matching the last path segment of
        # each chamber-index link against the squashed full name.
        homepage_url = None
        url_names = lxml.html.fromstring(self.get(source_url).text)
        url_names = url_names.xpath('//td[@class="ms-vb2"]/a/@href')
        modified_name = re.sub(r'[^\w\s]', '', full_name)
        modified_name = modified_name.replace(' ', '').lower()
        for el in url_names:
            if 'default.aspx' in el:
                el = el.replace('default.aspx', '')
            if el[-1] == '/':
                el = el[:-1]
            el = el.lower()
            url_name_array = el.split('/')
            if url_name_array[-1] in modified_name:
                # remove '/default.aspx' and add last name
                homepage_url = source_url[:-12] + url_name_array[-1]

        kwargs = {
            "town_represented": d['town_represented'],
        }

        # The contact page lists district numbers; the phone number sits
        # two text cells after the matching district cell.
        contact = self.lxmlize(contact_url)
        contact_phone = contact.xpath(
            '//tr[@valign="TOP"]//td[@class="bodyCopy"]/text() | '
            '//td[@class="bodyCopy"]//center/text()')
        phone = None
        for el in contact_phone:
            if len(el) <= 2 and dist == el:
                number = contact_phone.index(el)
                phone = contact_phone[number + 2]
                phone = phone.strip()

        email = None
        if d['email'] is not None:
            email = d['email']
        if homepage_url is not None:
            kwargs['url'] = homepage_url
        # BUGFIX: was `d['address'] is ''`, so the fallback never applied.
        if d['address'] == '':
            d['address'] = 'No Address Found'

        leg = Legislator(term, chamber, district_name, full_name,
                         '', '', '', translate[d['party']], **kwargs)
        # BUGFIX: label was misspelled 'Dictrict Office'.
        leg.add_office('district', 'District Office',
                       address=d['address'], phone=phone, email=email)
        leg.add_source(source_url)
        leg.add_source(contact_url)
        if homepage_url:
            leg.add_source(homepage_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Maryland legislators from the MD Manual index pages.

    Fixes vs. the previous revision:
    - the phone-line detector checked for '(401)'; Maryland numbers use
      area codes 410 and 301 (see the sample data in the comment below),
      so capitol phone lines starting with '(410)' were silently dropped.
    - e-mail lines read 'e-mail: ...' but the code stripped 'email: '
      (a no-op replace), leaving the prefix in the stored address.
    """
    urls = {
        'lower': "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
        'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html"
    }
    detail_re = re.compile(
        '\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)'
    )
    with self.urlopen(urls[chamber]) as html:
        doc = lxml.html.fromstring(html)
        # rest of data on this page is <li>s that have anchor tags
        for a in doc.xpath('//li/a'):
            link = a.get('href')
            # tags don't close so we get the <li> and <a> content and
            # diff them
            name_text = a.text_content()
            detail_text = a.getparent().text_content().replace(
                name_text, '')
            # ignore if it is not a valid link
            if link:
                # handle names: "Last, First[, Suffix]"
                names = name_text.split(',')
                last_name = names[0]
                first_name = names[1].strip()
                # TODO: try to trim first name to remove middle initial
                if len(names) > 2:
                    suffixes = names[2]
                else:
                    suffixes = ''
                # handle details
                details = detail_text.strip()
                party, district = detail_re.match(details).groups()
                party = PARTY_DICT[party]
                leg_url = BASE_URL + link
                leg = Legislator(term, chamber, district,
                                 ' '.join((first_name, last_name)),
                                 first_name, last_name,
                                 party=party, suffixes=suffixes,
                                 url=leg_url)
                leg.add_source(url=leg_url)
                with self.urlopen(leg_url) as leg_html:
                    leg_doc = lxml.html.fromstring(leg_html)
                    img_src = leg_doc.xpath('//img[@align="left"]/@src')
                    if img_src:
                        leg['photo_url'] = BASE_URL + img_src[0]
                    # address extraction
                    # this is pretty terrible, we get address in a format
                    # that looks like:
                    #   James Senate Office Building, Room 322
                    #   11 Bladen St., Annapolis, MD 21401
                    #   (410) 841-3565, (301) 858-3565; 1-800-492-7122,
                    #       ext. 3565 (toll free)
                    #   e-mail: [email protected]
                    #   fax: (410) 841-3552, (301) 858-3552
                    #
                    #   Western Maryland Railway Station, 13 Canal St.,
                    #       Room 304, Cumberland, MD 21502
                    #   (301) 722-4780; 1-866-430-9553 (toll free)
                    #   e-mail: [email protected]
                    #   fax: (301) 722-4790
                    # usually first ul, sometimes first p
                    try:
                        addr_lines = leg_doc.xpath(
                            '//ul')[0].text_content().strip().splitlines()
                    except IndexError:
                        addr_lines = leg_doc.xpath(
                            '//p')[0].text_content().strip().splitlines()
                    addr_pieces = {
                        'capitol': defaultdict(str),
                        'district': defaultdict(str)
                    }
                    # A blank line separates the capitol address block
                    # from the district one.
                    addr_type = 'capitol'
                    for line in addr_lines:
                        # BUGFIX: was '(401)'; MD area codes are 410/301.
                        if '(410)' in line or '(301)' in line:
                            addr_pieces[addr_type]['phone'] = line
                        elif 'toll free' in line:
                            pass  # skip stand alone 1-800 numbers
                        elif 'e-mail' in line:
                            # BUGFIX: lines read 'e-mail: ...'; the old
                            # replace('email: ', '') never matched.
                            addr_pieces[addr_type]['email'] = line.replace(
                                'e-mail: ', '')
                        elif 'fax' in line:
                            addr_pieces[addr_type]['fax'] = line.replace(
                                'fax: ', '')
                        elif line == '':
                            addr_type = 'district'
                        else:
                            addr_pieces[addr_type][
                                'address'] += '{0}\n'.format(line)
                    if addr_pieces['capitol']:
                        leg.add_office('capitol', 'Capitol Office',
                                       **addr_pieces['capitol'])
                        leg['email'] = (addr_pieces['capitol']['email'] or
                                        addr_pieces['district']['email'] or
                                        None)
                    if addr_pieces['district']:
                        leg.add_office('district', 'District Office',
                                       **addr_pieces['district'])
                self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Montana legislators for one chamber/term from the session
    member CSV, fuzzy-matching the all-caps CSV names against the
    committee-page names to recover properly cased full names.
    """
    # Find the start year and session number for this term.
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break
    # Scrape committees. Also produce a name dictionary that can be
    # used for fuzzy matching between the committee page names and the
    # all-caps csv names.
    for name_dict, _ in scrape_committees(year, chamber):
        pass
    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')
    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()
    fieldnames = [
        'last_name', 'first_name', 'party', 'district', 'address', 'city',
        'state', 'zip'
    ]
    csv_parser = csv.DictReader(data, fieldnames)
    district_leg_urls = self._district_legislator_dict()
    for entry in csv_parser:
        if not entry:
            continue
        # City.
        entry['city'] = entry['city'].title()
        # Address.
        entry['address'] = entry['address'].title()
        # District — assumes a two-token value like "HD 3" / "SD 12"
        # (chamber prefix, then number) — TODO confirm against the CSV.
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']
        # Party.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        # NOTE(review): this store is immediately deleted below; only the
        # local `party` is used afterwards.
        entry['party'] = party
        del entry['party']
        # Get full name properly capped.
        _fullname = '%s %s' % (entry['first_name'].capitalize(),
                               entry['last_name'].capitalize())
        city_lower = entry['city'].lower()
        fullname = difflib.get_close_matches(_fullname,
                                             name_dict[city_lower],
                                             cutoff=0.5)
        # If there are no close matches with the committee page,
        # use the title-capped first and last name.
        if len(fullname) < 1:
            fullname = _fullname
        else:
            fullname = fullname[0]
        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]
        deets = self._scrape_details(detail_url)
        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']
        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url
        self.save_legislator(legislator)
def scrape_reps(self, chamber, term):
    """Scrape Ohio House members, one page per district.

    Each district page carries contact info in <td class="info"> blocks
    (split on <strong> headers) and member/committee tables.
    """
    # There are 99 House districts
    for district in xrange(1, 100):
        rep_url = ('http://www.house.state.oh.us/components/'
                   'com_displaymembers/page.php?district=%d' % district)
        with self.urlopen(rep_url) as page:
            page = lxml.html.fromstring(page)
            # Partition the info cell's children into runs separated by
            # <strong> headers; each run is one section of contact data.
            ranges = []
            cur = []
            info = page.xpath('//td[@class="info"]/*')
            for r in info:
                if r.tag == 'strong':
                    ranges.append(cur)
                    cur = []
                else:
                    cur.append(r)
            ranges.append(cur)
            # Section 5 (index 4) holds the address lines; the last
            # element is dropped.
            block = ranges[4][:-1]
            address = ", ".join([x.tail.strip() for x in block])
            phone = page.xpath(
                "//strong[contains(text(), 'Phone')]")[0].tail
            fax = page.xpath(
                "//strong[contains(text(), 'Fax')]")[0].tail
            for el in page.xpath('//table[@class="page"]'):
                rep_link = el.xpath('tr/td/title')[0]
                full_name = rep_link.text
                # Name text ends with the party letter, e.g. "... (R)".
                party = full_name[-2]
                full_name = full_name[0:-3]
                if full_name == 'Vacant Posit':
                    continue
                if party == "D":
                    party = "Democratic"
                elif party == "R":
                    party = "Republican"
                leg = Legislator(term, chamber, str(district), full_name,
                                 party=party, url=rep_url)
                leg.add_office('capitol', 'Capitol Office',
                               address=address, phone=phone, fax=fax)
                # Yet, no email.
                committees = page.xpath("//table[@class='billLinks']")[0]
                for committee in committees.xpath(".//tr"):
                    td = committee.xpath(".//td")
                    # Rows without exactly (name, role) cells end the list.
                    if len(td) != 2:
                        break
                    name, role = td
                    name, role = name.text_content(), role.text_content()
                    name, role = name.strip(), role.strip()
                    if name[0] == "|":
                        continue
                    chmbr = chamber
                    if "joint" in name.lower():
                        chmbr = "joint"
                    if name in JOINT_COMMITTEE_OVERRIDE:
                        chmbr = "joint"
                    leg.add_role('committee member',
                                 term=term,
                                 chamber=chmbr,
                                 committee=name,
                                 position=role)
                leg.add_source(rep_url)
                self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Vermont legislators from the member-data download.

    What Vermont claims are Word and Excel files are actually just HTML
    tables, and what it claims is a CSV file is actually one row of
    comma-separated values followed by a ColdFusion error — so the
    "Word" document is fetched and parsed as an HTML table.
    """
    url = ("http://www.leg.state.vt.us/legdir/"
           "memberdata.cfm/memberdata.doc?FileType=W")
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        for tr in page.xpath("//tr")[1:]:
            # Column 4 is the chamber letter; skip the other chamber.
            row_chamber = tr.xpath("string(td[4])")
            if row_chamber == 'S' and chamber == 'lower':
                continue
            elif row_chamber == 'H' and chamber == 'upper':
                continue
            district = tr.xpath("string(td[7])")
            district = district.replace('District', '').strip()
            first_name = tr.xpath("string(td[8])")
            middle_name = tr.xpath("string(td[9])")
            last_name = tr.xpath("string(td[10])")
            # Some first-name cells repeat the middle initial; drop it.
            if first_name.endswith(" %s." % middle_name):
                first_name = first_name.split(" %s." % middle_name)[0]
            if middle_name:
                full_name = "%s %s. %s" % (first_name, middle_name,
                                           last_name)
            else:
                full_name = "%s %s" % (first_name, last_name)
            email = tr.xpath("string(td[11])")
            party = tr.xpath("string(td[6])")
            party = re.sub(r'Democrat\b', 'Democratic', party)
            parties = party.split('/')
            # Multi-party members: pick a primary party; the remainder
            # is stored as other_parties on the role below.
            # NOTE(review): when both Democratic and Republican appear,
            # `party` keeps the joined multi-party string — confirm this
            # is the intended behavior.
            if 'Republican' in parties:
                if 'Democratic' in parties:
                    pass
                else:
                    party = 'Republican'
                    parties.remove('Republican')
            elif 'Democratic' in parties:
                party = 'Democratic'
                parties.remove('Democratic')
            else:
                party = parties.pop(0)
            leg = Legislator(
                term, chamber, district, full_name,
                first_name=first_name,
                middle_name=middle_name,
                last_name=last_name,
                party=party,
                email=email,
                # closest thing we have to a page for legislators, not ideal
                url='http://www.leg.state.vt.us/legdir/LegDirMain.cfm')
            leg['roles'][0]['other_parties'] = parties
            leg.add_source(url)
            # 12-16: MailingAddress: 1,2,City,State,ZIP
            mail = '%s\n%s\n%s, %s %s' % (
                tr.xpath('string(td[12])'),
                tr.xpath('string(td[13])'),
                tr.xpath('string(td[14])'),
                tr.xpath('string(td[15])'),
                tr.xpath('string(td[16])'))
            leg.add_office('district', 'Mailing Address', address=mail)
            # 17-21: HomeAddress: 1,2,City,State,ZIP, Email, Phone
            home = '%s\n%s\n%s, %s %s' % (
                tr.xpath('string(td[17])'),
                tr.xpath('string(td[18])'),
                tr.xpath('string(td[19])'),
                tr.xpath('string(td[20])'),
                tr.xpath('string(td[21])'))
            home_email = tr.xpath('string(td[22])') or None
            home_phone = tr.xpath('string(td[23])') or None
            leg.add_office('district', 'Home Address',
                           address=home,
                           email=home_email,
                           phone=home_phone)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Arizona legislators from the member roster for the
    session matching `term`.
    """
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod(session)
    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
        session_id, body)
    with self.urlopen(url) as page:
        root = html.fromstring(page)
        path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            vacated = ''
            name, district, party, email, room, phone, fax = row.xpath(
                'td')
            if email.attrib.get('class') == 'vacantmember':
                continue  # Skip any vacant members.
            link = name.xpath('string(a/@href)')
            link = "http://www.azleg.gov" + link
            # A second child in the name cell means a leadership title
            # follows the linked name.
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                position = name.tail.strip()
                name = name[0].text_content().strip()
            district = district.text_content()
            party = party.text_content().strip()
            email = email.text_content().strip()
            if ('Vacated' in email or 'Resigned' in email
                    or 'Removed' in email):
                # comment out the following 'continue' for historical
                # legislative sessions
                # for the current session, if a legislator has left we will
                # skip him/her to keep from overwriting their information
                continue
                # NOTE: unreachable while the continue above is active.
                vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
                email = ''
            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = address + "1700 West Washington\n Room " + room \
                + "\nPhoenix, AZ 85007"
            # Bare extensions lack the 602 area code; prepend it.
            phone = phone.text_content().strip()
            if not phone.startswith('602'):
                phone = "602-" + phone
            fax = fax.text_content().strip()
            if not fax.startswith('602'):
                fax = "602-" + fax
            if vacated:
                end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                leg = Legislator(term, chamber, district,
                                 full_name=name, party=party, url=link)
                leg['roles'][0]['end_date'] = end_date
            else:
                leg = Legislator(term, chamber, district,
                                 full_name=name, party=party,
                                 email=email, url=link)
            leg.add_office('capitol', 'Capitol Office',
                           address=address, phone=phone, fax=fax)
            if position:
                leg.add_role(position, term, chamber=chamber,
                             district=district, party=party)
            leg.add_source(url)
            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape one Massachusetts member detail page and save a Legislator.

    Fixes vs. the previous revision:
    - the state variable tracking the expected next phone row was named
      ``next``, shadowing the builtin; renamed to ``expect``.
    - an office heading containing neither 'District' nor 'State' left
      ``otype`` unbound and raised NameError; it now defaults to
      'capitol' with a warning.
    """
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)
    photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
    full_name = root.xpath('//h1/span')[0].tail.strip()
    email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
    email = email.replace('mailto:', '')
    # The second h1 span reads "<party>-<district>".
    party, district = root.xpath('//h1/span')[1].text.split('-')
    party = party.strip()
    district = clean_district(district.strip())
    if party in ('D', 'Democrat', 'Democratic'):
        party = 'Democratic'
    elif party in ('R', 'Republican'):
        party = 'Republican'
    else:
        party = 'Other'
    leg = Legislator(term, chamber, district, full_name, party=party,
                     photo_url=photo_url, url=member_url)
    leg.add_source(member_url)
    # offices
    # this bool is so we only attach the email to one office
    # and we make sure to create at least one office
    email_stored = True
    if email:
        email_stored = False
    for addr in root.xpath('//address/div[@class="contactGroup"]'):
        office_name = addr.xpath(
            '../preceding-sibling::h4/text()')[0].strip()
        address = addr.xpath('a')[0].text_content()
        address = re.sub('\s{2,}', '\n', address)
        # Phone/fax values appear on the row after their label; `expect`
        # remembers which label we just saw.
        phone = fax = expect = None
        for phonerow in addr.xpath('./div/div'):
            phonerow = phonerow.text_content().strip()
            if phonerow == 'Phone:':
                expect = 'phone'
            elif phonerow == 'Fax:':
                expect = 'fax'
            elif expect == 'phone':
                phone = phonerow
                expect = None
            elif expect == 'fax':
                fax = phonerow
                expect = None
            else:
                self.warning('unknown phonerow %s', phonerow)
        # all pieces collected
        if 'District' in office_name:
            otype = 'district'
        elif 'State' in office_name:
            otype = 'capitol'
        else:
            # Previously raised NameError on `otype` for unknown headings.
            self.warning('unknown office heading %s, assuming capitol',
                         office_name)
            otype = 'capitol'
        if not email_stored:
            email_stored = True
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address=address, email=email)
        else:
            leg.add_office(otype, office_name, phone=phone, fax=fax,
                           address=address)
    # Guarantee the e-mail lands somewhere even with no address blocks.
    if not email_stored:
        leg.add_office('capitol', 'Capitol Office', email=email)
    self.save_legislator(leg)
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Fetch one Mississippi member XML document and save a Legislator.

    `leg_link` is the member-document path under the billstatus members
    directory; a missing link is tolerated only for vacant seats.
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.urlopen(url)
        root = lxml.etree.fromstring(details_page.bytes)
        party = root.xpath('string(//PARTY)')
        district = root.xpath('string(//DISTRICT)')
        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))
        home_phone = root.xpath('string(//H_PHONE)')
        bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)')
        cap_room = root.xpath('string(//CAP_ROOM)')
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        elif leg_name in ('Oscar Denton', 'Lataisha Jackson',
                          'John G. Faulkner'):
            # Hard-coded party for members whose feed omits it.
            party = 'Democratic'
        leg = Legislator(term, chamber, district, leg_name,
                         party=party,
                         role=role,
                         org_info=org_info,
                         url=url,
                         photo_url=photo)
        leg.add_source(url)
        kwargs = {}
        # The feed stores only the local part of the e-mail address;
        # the domain depends on the chamber.
        if email_name.strip() != "":
            email = '%s@%s.ms.gov' % (email_name, {
                "upper": "senate",
                "lower": "house"
            }[chamber])
            kwargs['email'] = email
        if capital_phone != "":
            kwargs['phone'] = capital_phone
        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS
        leg.add_office('capitol', 'Capitol Office', **kwargs)
        self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        self.warning(str(e))
def scrape_member(self, chamber, year, member_url):
    """Scrape one Kentucky member page and save a Legislator.

    NOTE(review): 'office_address'/'office_phone' are only set when the
    matching contact rows exist — a page without them would raise
    KeyError at the Legislator() call below; confirm all pages have them.
    """
    with self.urlopen(member_url) as member_page:
        member = {}
        member_root = lxml.html.fromstring(member_page)
        table = member_root.xpath('//body/div[2]/table')[0]
        imgtag = member_root.xpath('//body/div[2]/table//img')
        member['photo_url'] = imgtag[0].get('src')
        # First <strong> holds "<title> <names...> (<party letter>)".
        name_list = table.xpath('string(.//strong[1])').split(' ')
        member['full_name'] = ' '.join(name_list[1:-1]).strip()
        party = name_list[-1]
        party = re.sub(r'\(|\)', '', party)
        if party == 'R':
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'
        elif party == 'I':
            party = 'Independent'
        member['party'] = party
        # <b> tags carry the district plus any extra leadership roles.
        boldList = [bold.text for bold in table.iterdescendants(tag='b')]
        for item in boldList:
            if item == None:
                continue
            elif 'District' in item:
                district = item.split(' ')[-1]
                member['district'] = district.strip()
            else:
                if 'additionalRoles' in member:
                    member['additionalRoles'].append(item)
                else:
                    member['additionalRoles'] = [item]
        contact_rows = member_root.xpath(
            '//body/div[2]/div[1]/table/tr/td/table[1]/tr')
        for row in contact_rows:
            row_text = self.get_child_text(row)
            if len(row_text) > 0:
                if row_text[0] == 'Frankfort Address(es)':
                    member['office_address'] = '\n'.join(row_text[1:])
                if row_text[0] == 'Phone Number(s)':
                    for item in row_text:
                        # Use the first capitol annex phone
                        if item.startswith('Annex:'):
                            member['office_phone'] = item.replace(
                                'Annex:', '').strip()
                            break
        office_info = self.scrape_office_info(member_url)
        leg = Legislator(year, chamber, member['district'],
                         member['full_name'],
                         party=member['party'],
                         photo_url=member['photo_url'],
                         url=member_url,
                         office_address=member['office_address'],
                         office_phone=member['office_phone'])
        leg.add_source(member_url)
        # Build the Annex office from whichever details were found.
        kwargs = {}
        if office_info['Email Address(es)'] != []:
            kwargs['email'] = office_info['Email Address(es)'][0]
            leg['email'] = office_info['Email Address(es)'][0]
        if office_info['Phone Number(s)']['Annex'] != []:
            kwargs['phone'] = office_info['Phone Number(s)']['Annex'][0]
        if office_info['Frankfort Address(es)'] != []:
            kwargs['address'] = office_info['Frankfort Address(es)'][0]
        if kwargs != {}:
            leg.add_office('capitol', 'Annex Office', **kwargs)
        if 'additionalRoles' in member:
            for role in member['additionalRoles']:
                leg.add_role(role, year, chamber=chamber)
        self.save_legislator(leg)
def scrape_senators(self, chamber, term):
    """Scrape Maine senators from the Senate roster spreadsheet."""
    # The 124th Legislature started in 2009; terms are two years.
    session = ((int(term[0:4]) - 2009) / 2) + 124
    # Spreadsheet column index for each field.
    mapping = {
        'district': 1,
        'first_name': 2,
        'middle_name': 3,
        'last_name': 4,
        # 'suffix': 6,
        'party': 6,
        'resident_county': 5,
        'street_addr': 7,
        'city': 8,
        'state': 9,
        'zip_code': 10,
        'phone1': 12,
        'phone2': 13,
        'email': 11,
    }
    url = ('http://legisweb1.mainelegislature.org/wp/senate/'
           'wp-content/uploads/sites/2/2013/09/%sthSenatorsList.xlsx' %
           session)
    try:
        fn, result = self.urlretrieve(url)
    except scrapelib.HTTPError:
        # Fall back to the older spreadsheet location.
        url = 'http://www.maine.gov/legis/senate/%dthSenatorsList.xls'
        url = url % session
        fn, result = self.urlretrieve(url)
    wb = xlrd.open_workbook(fn)
    sh = wb.sheet_by_index(0)
    for rownum in xrange(1, sh.nrows):
        # get fields out of mapping
        d = {}
        for field, col_num in mapping.iteritems():
            try:
                d[field] = str(sh.cell(rownum, col_num).value)
            except IndexError:
                # This col_num doesn't exist in the sheet.
                pass
        full_name = " ".join(
            (d['first_name'], d['middle_name'], d['last_name']))
        full_name = re.sub(r'\s+', ' ', full_name).strip()
        address = "{street_addr}\n{city}, ME {zip_code}".format(**d)
        # For matching up legs with votes
        district_name = d['city']
        phone = d['phone1']
        # District cell may be a float like "3.0"; keep the integer part.
        district = d['district'].split('.')[0]
        leg_url = 'http://www.maine.gov/legis/senate/bio%02ds.htm' % int(
            district)
        leg = Legislator(term, chamber, district, full_name,
                         d['first_name'], d['middle_name'],
                         d['last_name'], _party_map[d['party']],
                         resident_county=d['resident_county'],
                         office_address=address,
                         office_phone=phone,
                         email=None,
                         district_name=district_name,
                         url=leg_url)
        leg.add_source(url)
        leg.add_source(leg_url)
        html = self.urlopen(leg_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(leg_url)
        xpath = '//td[@class="XSP_MAIN_PANEL"]/descendant::img/@src'
        photo_url = doc.xpath(xpath)
        if photo_url:
            photo_url = photo_url.pop()
            leg['photo_url'] = photo_url
        else:
            photo_url = None
        # NOTE(review): ''.join(address) is a no-op on a string; the
        # office stores the same address built above.
        office = dict(name='District Office',
                      type='district',
                      fax=None,
                      email=None,
                      address=''.join(address))
        leg['email'] = d['email']
        leg.add_office(**office)
        self.save_legislator(leg)
def scrape_upper_leg_page(self, term, url, who):
    """Scrape one upper-chamber member page and save a Legislator.

    The page header reads "Senator <name> - District <n>"; the info pane
    lists party/e-mail/office blocks separated by blank <br> runs.

    Fixes vs. the previous revision:
    - removed the unused ``info = page.xpath(...)`` result.
    - the bare ``raise Exception`` now carries a diagnostic message
      (still an Exception, so existing handlers are unaffected).
    """
    page = self.lxmlize(url)
    # Header: "Senator <name> - District <n>".
    who = page.xpath("//font[@size='4']")
    who = who[0].text_content()
    who = re.sub("\s+", " ", who)
    who, district = (x.strip() for x in who.rsplit("-", 1))
    who = who.replace("Senator", "").strip()
    district = district.replace("District", "").strip()
    infopane = page.xpath("//table[@cellpadding='3']")
    infos = [x.tail.strip() if x.tail else ""
             for x in infopane[1].xpath(".//br")]
    keys = ["party", "email", "capitol-office", "district-office",
            "phone", "fax", "staffer"]
    # Group consecutive non-empty lines into blocks; blank lines split.
    nodes = [[]]
    for node in infos:
        if node == "":
            if nodes[-1] != []:
                nodes.append([])
            continue
        nodes[-1].append(node)
    data = dict(zip(keys, nodes))
    district_office = "\n".join(data['district-office'])
    capitol_office = "\n".join(data['capitol-office'])
    # The final element of the info pane mentions the party somewhere
    # in its text.
    rundown = infopane[1].xpath("./*")[-1]
    rundown_txt = rundown.text_content()
    parties = {
        "Republican": "Republican",
        "Democrat": "Democratic",
    }
    party = 'other'
    for slug in parties:
        if slug in rundown_txt:
            party = parties[slug]
    if party == 'other':
        raise Exception(
            "could not determine party for %s (%s)" % (who, url))
    kwargs = {
        "party": party
    }
    leg = Legislator(term, 'upper', district, who, **kwargs)
    leg.add_office('district', 'District Office', address=district_office)
    leg.add_office('capitol', 'Capitol Office', address=capitol_office)
    leg.add_source(url)
    self.save_legislator(leg)
def _scrape_individual_legislator_page(self, url, term, chamber,
                                       district=None):
    """Scrape a specific lower house legislators page. The function will
    actually call one of three functions as there is 2 different bio
    templates and a completely separate one for the speaker of the house.

    Example url:
    http://www1.legis.ga.gov/legis/2009_10/house/bios/abdulsalaamRoberta/abdulsalaamRoberta.htm
    """
    if 'speaker/index.htm' in url:
        return self._scrape_speaker_of_the_house(url, term, chamber)
    with self.lxml_context(url) as page:
        # page == None == 404
        if page is None:
            return None
        page.make_links_absolute(url)
        # first check to see if this is the 'original' template or the
        # new one
        stylesheet_path = '//link[@rel="stylesheet"]'
        stylesheets = page.xpath(stylesheet_path)
        for style_sheet in stylesheets:
            if 'legis.ga.gov.house.factsheet.css' in style_sheet.get('href') or \
               'legis.ga.gov.house.bio.css' in style_sheet.get('href'):
                return self._scrape_individual_legislator_page_second_template(
                    page, term, chamber, district=district)
        path = '//table[@id="hoverTable"]/tr'
        legislator_info = page.xpath(path)
        # There is one page,
        # "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm"
        # that has malformed HTML, going to manually do that one:
        if "www1.legis.ga.gov/legis/2011_12/house/bios/williamsCoach.htm" in url:
            legislator = Legislator(term, chamber, district,
                                    '"Coach" Williams',
                                    party="Democratic", url=url)
            return legislator
        # See if we got to the first row, some templates don't start with
        # their table as 'hoverTable' in this case let's just get the
        # first table on the page as that is seeming to work well.
        if not legislator_info:
            path = '//table'
            tables = page.xpath(path)
            legislator_info = tables[0].getchildren()
        first_row = legislator_info[0]
        td_elements = first_row.getchildren()[0]
        name = td_elements[0].text_content().split('\n')[0].strip()
        party = td_elements[1].text_content().strip()[0:1].upper()
        # There was some cases where the party wasn't in a <p> it was
        # after the <h2>name</h2> foo <br />, seriously wtf
        if party not in self.PARTY_DICT:
            elements = td_elements.text_content().split('\n')
            for ele in elements:
                ele = ele.strip()
                if " - " in ele:
                    party = ele[0:1]
                    break
                elif ele.upper() == 'REPUBLICAN':
                    party = 'R'
                    break
                elif ele.upper() == 'DEMOCRAT':
                    party = 'D'
                    break
            # Last resort: first character of the second text line.
            if party == '':
                party = td_elements.text_content().split(
                    '\n')[1].strip()[0:1]
        if not district:
            if len(td_elements) < 3 or "District" not in td_elements[
                    2].text_content():
                text_content = first_row[1].text_content().split('\n')
                district = text_content[0].strip()[len("District "):]
            else:
                district = td_elements[2].text_content().strip(
                )[len("District "):]
        # Not every legislator has a sworn in date or facebook url, so
        # attempt to parse and just pass if it fails
        sworn_in = None
        try:
            sworn_in = td_elements[4].text_content().strip(
            )[len("Sworn in "):]
        except:
            pass
        facebook_url = ''
        try:
            facebook_url = td_elements[5].get('href')
        except:
            pass
        photo_url = ''
        try:
            td_elements = first_row.getchildren()[1]
            photo_url = td_elements[0].getchildren()[0].get('src') or ''
        except:
            pass
        # Second row: address block, then phone as the final line.
        second_row = legislator_info[1]
        address_info = second_row.getchildren()[0].text_content().split(
            "<br />")[0].split("\n")
        phone_number = address_info.pop()
        address = " ".join(address_info)
        email = ''
        try:
            text_content = second_row.text_content().split('\n')
            for content in text_content:
                if '@' in content.strip():
                    email = content.strip()
        except IndexError:
            try:
                email = second_row.getchildren()[1].getchildren(
                )[0].text_content()
            except:
                pass
        legislator = Legislator(term, chamber, district, name,
                                party=self.PARTY_DICT[party],
                                email=email,
                                photo_url=photo_url,
                                facebook_url=facebook_url,
                                address=address,
                                sworn_in_date=sworn_in,
                                office_phone=phone_number,
                                url=url)
        legislator.add_source(url)
        return legislator
def scrape(self, term, chambers):
    """Scrape New Hampshire legislators from the '*'-delimited members
    download, using the House member-lookup page to recover member page
    URLs for representatives.
    """
    url = 'http://gencourt.state.nh.us/downloads/Members.txt'
    # Map "Last, First" -> member code from the lookup page's options.
    option_map = {}
    html = self.get(
        'http://www.gencourt.state.nh.us/house/members/memberlookup.aspx'
    ).text
    doc = lxml.html.fromstring(html)
    for opt in doc.xpath('//option'):
        option_map[opt.text] = opt.get('value')
    data = self.get(url).text
    for line in data.splitlines():
        if line.strip() == "":
            continue
        (chamber, fullname, last, first, middle, county, district_num,
         seat, party, street, street2, city, astate, zipcode,
         home_phone, office_phone, fax, email, com1, com2, com3, com4,
         com5, com6, com7) = line.split('*')
        chamber = chamber_map[chamber]
        # skip legislators from a chamber we aren't scraping
        if chamber not in chambers:
            continue
        middle = middle.strip()
        last = last.strip('"')
        if middle:
            full = '%s %s %s' % (first, middle, last)
        else:
            full = '%s %s' % (first, last)
        address = street
        if street2:
            address += (' ' + street2)
        address += '\n%s, %s %s' % (city, astate, zipcode)
        district = str(int(district_num))
        if county:
            district = '%s %s' % (county, district)
        # When a candidate receives enough write-in votes in the
        # other party's primary, they are listed on the ballot as
        # being a nominee of both parties (eg, 'd+r')
        # Cross-reference this list for official party affiliation:
        # http://www.gencourt.state.nh.us/House/caljourns/journals/2015/HJ_4.pdf
        if fullname == "Wall, Janet G.":
            assert party == 'd+r', "Remove special-casing for Wall"
            party = 'd'
        leg = Legislator(term, chamber, district, full, first, last,
                         middle, party_map[party], email=email)
        leg.add_office('district', 'Home Address',
                       address=address,
                       phone=home_phone or None)
        leg.add_office('district', 'Office Address',
                       phone=office_phone or None,
                       fax=fax or None)
        if chamber == 'upper':
            leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(
                district_num)
        elif chamber == 'lower':
            code = option_map.get('{0}, {1}'.format(last, first))
            if code:
                leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code
        # Re-uppercase roman numerals that title() lower-cased.
        romans = r'(?i)\s([IXV]+)(?:\s|$)'
        for com in (com1, com2, com3, com4, com5, com6, com7):
            com = com.strip('"')
            if com:
                com_name = com.title()
                com_name = re.sub(romans,
                                  lambda m: m.group().upper(),
                                  com_name)
                leg.add_role('committee member', term=term,
                             chamber=chamber, committee=com_name)
        if 'url' in leg:
            leg['photo_url'] = self.get_photo(leg['url'], chamber)
        leg.add_source(url)
        self.save_legislator(leg)
def _parse_member(self, chamber, term, member):
    """Build and return a Legislator from one member element of the
    Oregon legislator XML feed (caller is responsible for saving it).
    """
    first_name = member.get('first-name')
    last_name = member.get('last-name')
    party = self.party_map[member.get('party')]
    # this is semi-safe because we validated term w/ latest_only=True
    session = self.metadata['terms'][-1]['sessions'][-1]
    # extra_fields
    extra_dict = {}
    for name, xpath in self.extra_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            extra_dict[name] = result[0]
    # address fields
    for name, xpath in self.addr_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            result = result[0]
            extra_dict[name] = '%s, %s, %s %s' % (
                result.get('street-address'), result.get('city'),
                result.get('state'), result.get('postal-code'))
    leg = Legislator(term, chamber, member.get('district-number'),
                     full_name=first_name + ' ' + last_name,
                     first_name=first_name,
                     last_name=last_name,
                     middle_name=member.get('middle-initial'),
                     party=party,
                     email=member.get('e-mail'),
                     url=member.get('website'),
                     photo_url="%s/member_photo.jpg" %
                     (member.get('website')),
                     oregon_member_id=member.get('leg-member-id'))
    # add offices
    # NOTE(review): assumes 'capitol_address' and 'phone' were always
    # matched above — a feed entry missing either would raise KeyError
    # here; confirm the feed guarantees them.
    leg.add_office('capitol', 'Capitol Office',
                   address=extra_dict['capitol_address'],
                   phone=extra_dict['phone'])
    if 'district_address' in extra_dict or 'district_phone' in extra_dict:
        leg.add_office('district', 'District Office',
                       address=extra_dict.get('district_address', None),
                       phone=extra_dict.get('district_phone', None))
    # committees
    com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
    for com in member.xpath(com_xpath):
        cdict = {
            'position': com.get('title').lower(),
            'chamber': chamber,
        }
        # NOTE(review): com_name is assigned but unused below.
        com_name = com.get('name')
        com_class = com.get('committee-class')
        if com_class == 'sub-committee':
            cdict['committee'], cdict['subcommittee'] = \
                com.get('name').split(' Subcommittee On ')
        else:
            cdict['committee'] = com.get('name')
        leg.add_role('committee member', term, **cdict)
    leg.add_source(self.source_url)
    return leg
def scrape(self, chamber, term):
    """Scrape San Jose's mayor and councilmembers from the city site.

    Collects the shared council address and per-member emails from the
    index page, then visits each member's page for fax/secondary phone.
    Members whose terms do not overlap the requested term are skipped.
    """
    # The links on http://www.sanjoseca.gov/index.aspx?NID=1187 may go off-
    # site, so use http://www.sanjoseca.gov/index.aspx?NID=146
    council_url = 'http://www.sanjoseca.gov/index.aspx?NID=146'
    doc = lxml.html.fromstring(self.urlopen(council_url))
    doc.make_links_absolute(council_url)
    tds = doc.xpath('//div[@id="Section1"]//td')
    assert len(
        tds
    ) <= 11, 'expected 11 unique mayor and councilmember URLs, found %d' % len(
        tds)
    # The council's shared mailing address: street-number or "San ..." lines.
    lines = []
    for text in doc.xpath('//div[@id="Section1"]/text()'):
        text = clean_string(text)
        if re.match('^(?:\d+|San) ', text):
            lines.append(text)
    address = '\n'.join(lines)
    # Emails are obfuscated in inline JS as "user"+"@"+"host"; reassemble.
    emails = []
    for text in doc.xpath('//div[@id="Section1"]/script/text()'):
        # PhantomJS would be sweet here.
        emails.append(''.join(
            re.search('([^"]+)"\+"(@)"\+"([^"]+)', text).groups()))
    for index, td in enumerate(tds):
        # First phone-looking text in the cell wins.
        # NOTE(review): if no cell text matches tel_regex, `phone` keeps its
        # value from a previous iteration (or is unbound on the first) — confirm.
        for text in td.xpath('.//text()'):
            match = tel_regex.search(text.strip())
            if match:
                phone = '-'.join(match.groups())
                break
        url = td.xpath('.//a[//strong]/@href')[0]
        photo_url = td.xpath('.//img/@src')[0]
        # Extract district, name, role
        text = td.xpath('.//strong/text()')[0]
        if 'District' in text:
            district = re.search('District \d+', text).group(0)
            name = re.sub(', District \d+$', '', text)
            role = None
            if 'Vice Mayor' in text:
                name = name.replace('Vice Mayor ', '')
                role = 'Vice Mayor'
        elif 'Mayor' in text:
            district = 'Mayor'
            name = text.replace('Mayor ', '')
            role = 'Mayor'
        else:
            # NOTE(review): this logs a skip but does not `continue`, so
            # `district`/`name`/`role` carry over from the prior cell — confirm
            # whether a `continue` is intended here.
            self.logger.warning('Skipped: ' + text)
        # Extract councilmember's term
        # NOTE(review): if no "Term Expires" text is found, the two year
        # variables below retain stale values (or are unbound) — confirm.
        for text in td.xpath('.//text()'):
            match = re.search('\s*Term Expires:\s*([\d]+)/([\d]+)/([\d]+)',
                              text)
            if match:
                # string.atoi is Python-2-only; prefixing '20' assumes the
                # two-digit year is in this century.
                councilmember_term_expires_year = string.atoi(
                    '20' + match.group(3))  # Built-in Y2.1K bug
                # Council terms are assumed to be 4 years (expiry - 3 start).
                councilmember_term_begins_year = councilmember_term_expires_year - 3
        # Skip if this legislator is not in the current term being scraped
        scrape_for_term = self.find_term_named(scrape_for_term_named)
        if not year_is_within_term(
                councilmember_term_begins_year,
                scrape_for_term) and not year_is_within_term(
                    councilmember_term_expires_year, scrape_for_term):
            continue
        # Extract fax and secondary phone from councilmember's page
        phone2 = None
        fax = None
        councilmember_doc = lxml.html.fromstring(self.urlopen(url))
        councilmember_doc.make_links_absolute(url)
        # @todo xpath needs to be constrained further; it matches more elements than necessary
        for text in councilmember_doc.xpath(
                '//div[//img[@alt="Contact Us"]]//text()'
        ):  # '//div[@id="quickLinks774"]//text()'):
            if re.match('\s*Fax.*\d', text, re.I):
                fax = '-'.join(tel_regex.search(text).groups())
            if re.match('\s*Phone.*\d', text, re.I) or re.match(
                    '\s*Ph..*\d', text, re.I) or re.match(
                        '\s*Tel..*\d', text, re.I):
                councilmember_phone = '-'.join(
                    tel_regex.search(text).groups())
                # Only record as secondary if it differs from the index phone.
                phone2 = councilmember_phone if councilmember_phone != phone else None
        # Assign councilmember information
        legislator = Legislator(scrape_for_term_named,
                                'upper',
                                district,
                                name,
                                email=emails[index],
                                url=url,
                                photo_url=photo_url,
                                party=None)
        legislator.add_office('capitol',
                              'Council Office',
                              address=address,
                              phone=phone,
                              secondary_phone=phone2,
                              fax=fax)
        if role:
            legislator.add_role(role, scrape_for_term_named)
        legislator.add_source(url)
        self.save_legislator(legislator)
def scrape_upper_chamber(self, term):
    """Scrape Puerto Rico senators (at-large plus the eight districts).

    Each district page lists senators in a SharePoint table; a photo URL
    is guessed from the senator's name and verified with a HEAD request.
    """
    district_pages = {
        'At-Large':
        'http://www.senadopr.us/Pages/SenadoresporAcumulacion.aspx',
        'I': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
        'II': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
        'III': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
        'IV': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
        'V': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
        'VI': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
        'VII': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
        'VIII': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx',
    }
    for district, page_url in district_pages.iteritems():
        doc = lxml.html.fromstring(self.get(page_url).text)
        doc.make_links_absolute(page_url)
        # Keep only data rows; drop the SharePoint header row.
        member_rows = doc.xpath(
            '//table[@summary="Senadores 2013-2016"]'
            '/tr[not(@class="ms-viewheadertr")]')
        for member_row in member_rows:
            cells = member_row.xpath('td')
            full_name = cells[0].text_content().title().replace(
                'Hon.', '', 1).strip()
            party = cells[1].text_content()
            phone = cells[2].text_content()
            email = cells[3].text_content()

            # Guess the photo file name: first initial + last name,
            # lowercased, accents stripped.  Middle-name abbreviations
            # are sometimes weird, so pad the dots with spaces first.
            ascii_name = unicodedata.normalize(
                'NFKD', unicode(full_name.replace(".", ". "))).encode(
                    'ascii', 'ignore')
            parts = ascii_name.split()
            # Skip an abbreviated middle initial when picking the surname.
            surname = parts[2] if parts[1].endswith('.') else parts[1]
            photo_url = ('http://www.senadopr.us/Fotos%20Senadores/sen_' +
                         (parts[0][0] + surname).lower() + '.jpg')
            # Verify the guessed photo actually exists; blank it otherwise.
            try:
                self.head(photo_url)
            except scrapelib.HTTPError:
                photo_url = ''

            senator = Legislator(term=term,
                                 chamber='upper',
                                 district=district,
                                 full_name=full_name,
                                 party=party,
                                 photo_url=photo_url)
            senator.add_office('capitol', 'Oficina del Capitolio',
                               phone=phone, email=email)
            senator.add_source(page_url)
            self.save_legislator(senator)
def scrape(self, chamber, term):
    """Scrape legislators for one chamber from its member-listing page.

    Each legislator lives in an ``ms-rtestate-field`` div, with the
    photo in the preceding table cell and a key/value profile box in
    ``<p>`` tags.  The page's markup frequently bolds the first letter
    separately (yielding keys like "arty"/"apitol Phone"), which the
    parsing below works around.
    """
    url = self.URLs[chamber]
    page = self.lxmlize(url)
    for block in page.xpath("//div[@class='ms-rtestate-field']")[1:-1]:
        # Each legislator block.
        photo_block = block.xpath("ancestor::td/preceding-sibling::td")
        if len(photo_block) == 0:
            continue
        h2s = block.xpath(".//h2/a")
        if len(h2s) != 1:
            # We've got a Vacant person.
            print("Found a Vacant position. Skipping block.")
            continue
        h2, = h2s
        name = h2.text.strip()
        photo_block, = photo_block
        # (The <td> before ours was the photo)
        img, = photo_block.xpath("*")
        img = img.attrib['src']
        info = {}
        # Right, now let's get info out of their little profile box.
        for entry in block.xpath(".//p"):
            key = None
            # itergraphs splits the paragraph's children on <br> tags,
            # giving one "key[: value]" group per line.
            for kvpair in itergraphs(entry.xpath("./*"), 'br'):
                # OK. We either get the tail or the next element
                # (usually an <a> tag)
                if len(kvpair) == 1:
                    # Lone element: its tail text (if any) is the value.
                    key, = kvpair
                    value = key.tail.strip() if key.tail else None
                    if value:
                        value = re.sub("\s+", " ", value).strip()
                elif len(kvpair) == 2:
                    key, value = kvpair
                    # "arty:" appears when the leading "P" of "Party:" is
                    # wrapped in its own element; shift key/value over.
                    if value.text_content().strip() == "arty:":
                        key = value
                        value = value.tail
                elif len(kvpair) == 3:
                    k1, k2, value = kvpair
                    # As seen with a <stong><strong>Email:</strong></strong>
                    t = lambda x: x.text_content().strip()
                    assert t(k1) == "" or t(k2) == ""
                    if t(k1) != "":
                        key = k1
                    else:
                        key = k2
                else:
                    # Never seen text + an <a> tag, perhaps this can happen.
                    raise ValueError(
                        "Too many elements. Something changed")
                key = key.text_content().strip(" :")
                if value is None:
                    # A page has the value in a <strong> tag. D'oh.
                    key, value = (x.strip() for x in key.rsplit(":", 1))
                key = re.sub("\s+", " ", key).strip()
                key = key.replace(":", "")
                if key == "arty":
                    key = "Party"
                info[key] = value
        info['District'] = info['District'].encode('ascii',
                                                   'ignore').strip()
        # Strip stray punctuation and non-breaking spaces from the party.
        info['Party'] = info['Party'].strip(": ").replace(u"\u00a0", "")
        leg = Legislator(term=term,
                         url=h2.attrib['href'],
                         chamber=chamber,
                         full_name=name,
                         party=info['Party'],
                         district=info['District'],
                         photo_url=img)
        leg.add_source(url)
        # 'apitol Phone' covers the split-first-letter markup quirk.
        phone = info.get('Capitol Phone', info.get('apitol Phone'))
        # The phone may be an element (e.g. a link) rather than text.
        if hasattr(phone, 'text_content'):
            phone = phone.text_content()
        leg.add_office(type='capitol',
                       name='Capitol Office',
                       address=info['Capitol Address'],
                       phone=phone,
                       email=info['Email'].attrib['href'].replace(
                           "mailto:", ""))
        self.save_legislator(leg)