def fetch_member(self, url, name, term, chamber):
    """Scrape one Virginia member page and save the Legislator.

    url: member path, joined onto the leg6 host below.
    name: display name, possibly suffixed "-Resigned" / "-Member" with a date.
    term, chamber: passed straight through to the Legislator record.
    """
    party_map = {"R": "Republican", "D": "Democratic", "I": "Independent"}
    party_district_re = re.compile(
        r"\((R|D|I)\) - (?:House|Senate) District\s+(\d+)")

    url = "http://leg6.state.va.us" + url

    # handle resignations, special elections
    match = re.search(r"-(Resigned|Member) (\d{1,2}/\d{1,2})?", name)
    if match:
        action, date = match.groups()
        # BUG FIX: rsplit("-") with no maxsplit splits on EVERY hyphen, so
        # [0] truncated hyphenated surnames; split on the last hyphen only.
        name = name.rsplit("-", 1)[0]
        if action == "Resigned":
            pass  # TODO: set end date
        elif action == "Member":
            pass  # TODO: set start date

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # Header line looks like "(R) - House District 42".
        party_district_line = doc.xpath("//h3/font/text()")[0]
        party, district = party_district_re.match(party_district_line).groups()
        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party])
        leg.add_source(url)

        # First linkSect list on the page holds committee assignments.
        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role("committee member", term=term, chamber=chamber,
                         committee=com)

    self.save_legislator(leg)
def test_legislator():
    """Exercise Legislator construction, add_role, and add_office."""
    l = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    # Constructor must emit the full person document, including one
    # implicit 'member' role for the given term/chamber/district.
    assert_equal(l, {'_type': 'person', 'full_name': 'Adam Smith',
                     'first_name': 'Adam', 'last_name': 'Smith',
                     'middle_name': '', 'suffixes': '', 'roles': [
                         {'chamber': 'upper', 'term': 'T1',
                          'role': 'member', 'start_date': None,
                          'end_date': None, 'district': '1',
                          'party': ''}],
                     'offices': [],
                     'sources': []})

    # add_role appends after the implicit member role (index 1).
    l.add_role('committee member', 'T1', committee='Some Committee',
               position='chairman')
    assert_equal(l['roles'][1], {'role': 'committee member', 'term': 'T1',
                                 'start_date': None, 'end_date': None,
                                 'committee': 'Some Committee',
                                 'position': 'chairman'})

    # add_office takes positional (type, name, address, phone, fax, email).
    l.add_office('capitol', 'Statehouse Office', '123 Main St',
                 '123-456-7890', '123-555-5555', '*****@*****.**')
    assert_equal(l['offices'], [{'type': 'capitol',
                                 'name': 'Statehouse Office',
                                 'address': '123 Main St',
                                 'phone': '123-456-7890',
                                 'fax': '123-555-5555',
                                 'email': '*****@*****.**'}])
def scrape_senators(self, chamber, term):
    """Scrape Ohio senators from the senate directory page."""
    url = 'http://www.ohiosenate.gov/directory.html'
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for el in page.xpath('//table[@class="fullWidth"]/tr/td'):
            # Second senatorLN anchor in the cell links to the member page.
            sen_link = el.xpath('a[@class="senatorLN"]')[1]
            sen_url = sen_link.get('href')

            full_name = sen_link.text
            full_name = full_name[0:-2]  # drop the trailing two characters
            if full_name == 'To Be Announced':
                continue

            district = el.xpath('string(h3)').split()[1]
            party = el.xpath('string(a[@class="senatorLN"]/span)')
            if party == "D":
                party = "Democratic"
            elif party == "R":
                party = "Republican"

            office_phone = el.xpath("b[text() = 'Phone']")[0].tail
            office_phone = office_phone.strip(' :')

            office = ", ".join([x.strip() for x in
                                el.xpath("./text()")[2:-1]])
            photo_url = el.xpath("a/img")[0].attrib['src']
            email = el.xpath('.//span[@class="tan"]/text()')[1]

            # BUG FIX: the scraped email was previously discarded
            # (email="" was hard-coded); pass it through instead.
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url, url=sen_url,
                             email=email)

            committees = self.scrape_senate_committees(sen_url)

            leg.add_office('capitol', 'Capitol Office',
                           address=office, phone=office_phone)
            leg.add_source(url)
            leg.add_source(sen_url)

            for committee in committees:
                chmbr = chamber
                # Joint committees are filed under the 'joint' chamber.
                if "joint" in committee['committee'].lower():
                    chmbr = "joint"
                leg.add_role('committee member', term=term,
                             chamber=chmbr,
                             committee=committee['committee'],
                             position=committee['title'])

            self.save_legislator(leg)
def scrape(self, chamber, session):
    """Build and save a Legislator record for every district listed."""
    listing_url = self.get_district_list(chamber, session)
    directory = self.scrape_directory(listing_url, chamber, session)

    # Directory maps district -> member page URL.
    for district, member_url in directory.items():
        info = self.process_person(member_url)

        legislator = Legislator(session, chamber, district, info['name'],
                                party=info['party'],
                                # some additional things the website provides:
                                occupation=info['occupation'],
                                photo_url=info['photo_url'])
        legislator.add_source(member_url)

        for committee in info.get('ctty', []):
            legislator.add_role('committee member', term=session,
                                chamber=chamber, committee=committee,
                                position="member")

        self.save_legislator(legislator)
def fetch_member(self, url, name, term, chamber):
    """Scrape one Virginia member page and save the Legislator.

    name may carry a "-Resigned"/"-Member" suffix with a date for seats
    affected by resignations or special elections.
    """
    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    # handle resignations, special elections
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        # BUG FIX: rsplit('-') with no maxsplit splits on EVERY hyphen, so
        # [0] truncated hyphenated surnames; split on the last hyphen only.
        name = name.rsplit('-', 1)[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # Header line looks like "(R) - House District 42".
        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(party_district_line).groups()
        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party], url=url)
        leg.add_source(url)

        # First linkSect list on the page holds committee assignments.
        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member', term=term, chamber=chamber,
                         committee=com)

    self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape Colorado legislators from the per-district directory."""
    url = self.get_district_list(chamber, session)
    people_pages = self.scrape_directory(url, chamber, session)

    # people_pages maps district -> member page URL.
    for person in people_pages:
        district = person
        p_url = people_pages[district]
        metainf = self.process_person(p_url)

        p = Legislator(session, chamber, district, metainf['name'],
                       party=metainf['party'],
                       # some additional things the website provides:
                       occupation=metainf['occupation'],
                       photo_url=metainf['photo_url'],
                       url=metainf['homepage'])

        # Email and phone are optional on member pages.
        if "email" in metainf:
            p['email'] = metainf['email']
        if "number" in metainf:
            # All members share the capitol street address.
            p.add_office('capitol', 'Capitol Office',
                         phone=metainf['number'],
                         address='200 E. Colfax\nDenver, CO 80203')

        p.add_source(p_url)

        if 'ctty' in metainf:
            for ctty in metainf['ctty']:
                p.add_role('committee member', term=session,
                           chamber=chamber,
                           committee=clean_committee(ctty),
                           position="member")

        self.save_legislator(p)
def scrape(self, term, chambers):
    """Parse NH's asterisk-delimited member export, augmented with the
    House member-lookup page to build member URLs."""
    url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

    # Map "Last, First" -> member code used in House member URLs.
    option_map = {}
    html = self.urlopen('http://www.gencourt.state.nh.us/house/members/memberlookup.aspx')
    doc = lxml.html.fromstring(html)
    for opt in doc.xpath('//option'):
        option_map[opt.text] = opt.get('value')

    with self.urlopen(url) as data:
        for line in data.splitlines():
            if line.strip() == "":
                continue

            # Fixed field order of the asterisk-delimited export; the
            # last two fields are unused.
            (chamber, fullname, last, first, middle, county, district_num,
             seat, party, street, street2, city, astate, zipcode,
             home_phone, office_phone, fax, email, com1, com2, com3,
             com4, com5, _, _) = line.split('*')

            chamber = chamber_map[chamber]

            # skip legislators from a chamber we aren't scraping
            if chamber not in chambers:
                continue

            if middle:
                full = '%s %s %s' % (first, middle, last)
            else:
                full = '%s %s' % (first, last)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            # House districts are prefixed with the county name.
            district = str(int(district_num))
            if county:
                district = '%s %s' % (county, district)

            leg = Legislator(term, chamber, district, full, first, last,
                             middle, party_map[party], email=email)
            leg.add_office('district', 'Home Address',
                           address=address, phone=home_phone or None)
            leg.add_office('district', 'Office Address',
                           phone=office_phone or None, fax=fax or None)

            if chamber == 'upper':
                leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num)
            elif chamber == 'lower':
                code = option_map.get('{0}, {1}'.format(last, first))
                if code:
                    leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code

            # Up to five committee assignments per member.
            for com in (com1, com2, com3, com4, com5):
                if com:
                    leg.add_role('committee member', term=term,
                                 chamber=chamber, committee=com)

            leg.add_source(url)
            self.save_legislator(leg)
def scrape_senators(self, chamber, session, term):
    """Scrape Missouri senators: roster table, then per-member detail and
    address pages for photo, committees, phone, address and email."""
    url = self.senator_url % (session[2:])
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        table = page.xpath('//*[@id="mainContent"]/table//table/tr')
        rowcount = 0
        for tr in table:
            rowcount += 1
            # the first two rows are headers, skip:
            # NOTE(review): the condition below only skips rowcount == 1;
            # confirm whether a second header row exists.
            if rowcount < 2:
                continue
            tds = tr.xpath('td')
            full_name = tds[0].xpath('div/a')[0].text_content().strip()
            # Cell text is like "R-15": party letter, hyphen, district.
            party_and_district = tds[1].xpath('div')[0].text_content().strip().split('-')
            if party_and_district[0] == 'D':
                party = 'Democratic'
            elif party_and_district[0] == 'R':
                party = 'Republican'
            # e.g. "r15" — key used by the address-page URL scheme.
            senator_key = "%s%s" % (party_and_district[0].lower(), party_and_district[1])
            district = party_and_district[1]
            phone = tds[3].xpath('div')[0].text_content().strip()

            leg = Legislator(term, chamber, district, full_name,
                             '', '', '', party)
            leg.add_source(url)

            # Per-member detail page: photo and committee roles.
            url = self.senator_details_url % (session[2:], int(district))
            with self.urlopen(url) as details_page:
                leg.add_source(url)
                #Using soupparser as legislator pages are very soupy
                page = lxml.html.fromstring(details_page)
                photo_url = page.xpath('//html/body/div[2]/div/img/@src')[0]
                committees = page.xpath('//html/body/div[2]//span[@class="style3"]/a')
                for c in committees:
                    # Only links under info/comm/ are committee pages.
                    if c.attrib.get('href').find('info/comm/') == -1:
                        continue
                    parts = c.text_content().split('\n')
                    #print "committee = '%s'" % parts[0].strip()
                    subcommittee = None
                    if len(parts) > 1:
                        subcommittee = parts[1].strip().replace('- ', '').replace(', Vice-Chairman', '').replace(', Chairman', '')
                    committee = parts[0].strip().replace(', Vice-Chairman', '').replace(', Chairman', '')
                    if subcommittee:
                        leg.add_role('committee member', term,
                                     committee=committee,
                                     subcommittee=subcommittee,
                                     chamber=chamber)
                    else:
                        leg.add_role('committee member', term,
                                     committee=committee, chamber=chamber)

            # Address page: mailing address and (maybe) an email link.
            url = self.senator_address_url % (session[2:], int(senator_key[1:]))
            with self.urlopen(url) as details_page:
                leg.add_source(url)
                page = lxml.html.fromstring(details_page)
                address = page.xpath('/html/body//span[2]')[0].text_content().split('\n')
                email = page.xpath('/html/body/p/span[2]/a/@href')
                # TODO This is only true if the href doesn't contain 'mail_form'. If it does,
                # then there is only a webform. So...no email?
                # TODO a lot of these have fax numbers. Include?
                leg['office_phone'] = phone
                leg['office_address'] = "%s%s" % (address[0], address[1])
                leg['photo_url'] = photo_url
                if email and len(email) > 0 and email[0] != 'mailto:':
                    leg['email'] = email[0].split(':')[1]
                #print "em = %s" % email

            self.save_legislator(leg)
def scrape_legislator_data(self, url, chamber):
    """Scrape each legislator cell of the NM member table (BeautifulSoup),
    following the per-member detail link for district/party/committees."""
    party_fulls = {'R' : 'Republican', 'D' : 'Democrat'}
    with self.urlopen(url) as page:
        page = BeautifulSoup(page)
        for data in page.find('table', id = 'ctl00_mainCopy_DataList1')('td'):
            spans = data('span')
            if len(spans) == 0:
                self.debug('Found an empty cell in %s. Continuing' % url)
                continue
            full_name = ' '.join([span.string.strip() for span in spans])
            # First span holds "First" or "First Middle"; second holds last.
            if len(spans[0].string.strip().split()) == 2:
                first_name, middle_name = spans[0].string.strip().split()
            else:
                first_name, middle_name = spans[0].string.strip(), ''
            last_name = spans[1].string.strip()

            details_url = get_abs_url(url, data.find('a')['href'])
            with self.urlopen(details_url) as details:
                details = BeautifulSoup(details)
                district = details.find('a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                party = party_fulls[details.find('span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                # NOTE(review): term is hard-coded to '2010' here and below.
                leg = Legislator('2010', chamber, district, full_name,
                                 first_name, last_name, middle_name, party)
                leg.add_source(details_url)

                # Membership grid: first row is a header, skip it.
                comms_table = details.find('table', id = 'ctl00_mainCopy_MembershipGrid')
                for comms_raw_data in comms_table('tr')[1:]:
                    comm_data = comms_raw_data('td')
                    comm_role_type = comm_data[0].string.strip()
                    comm_name = comm_data[1]('a')[0].string.strip()
                    leg.add_role(comm_role_type, '2010', chamber = chamber,
                                 committee = comm_name)

                self.save_legislator(leg)
def scrape(self, chamber, session):
    """Save a Legislator for each member found on the chamber listing."""
    metainf = self.scrape_leg_page(get_chamber_listing_url(chamber))
    for leg in metainf:
        p = Legislator(session, chamber, leg['district'], leg['name'],
                       party=leg['party'],
                       # some additional things the website provides:
                       photo_url=leg['image'],
                       url=leg['homepage'],
                       room=leg['room'],
                       phone=leg['phone'],
                       fax=leg['fax'],
                       email=leg['email'],
                       address=leg['addr'])

        for source in leg['source']:
            p.add_source(source)

        try:
            for ctty in leg['ctty']:
                # Committees whose name starts with this prefix belong to
                # the joint chamber.
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                p.add_role('committee member', term=session,
                           chamber=ctty_chamber,
                           committee=ctty['name'],
                           position="member")
        except KeyError:
            # No 'ctty' key means no committee data was scraped.
            self.log("XXX: Warning, %s has no scraped Commities" % leg['name'])

        self.save_legislator(p)
def scrape_reps(self, chamber, session, term):
    """Scrape Missouri representatives from the ASP.net members grid,
    following each member's detail page for photo/email/committees."""
    url = (self.reps_url % (session))
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        # This is the ASP.net table container
        table_xpath = ('id("ContentPlaceHolder1_'
                       'gridMembers_DXMainTable")')
        table = page.xpath(table_xpath)[0]

        # First row is the header.
        for tr in table.xpath('tr')[1:]:
            tds = tr.xpath('td')
            leg_code = tds[0].xpath('a[1]')[0].attrib.get('href')
            last_name = tds[0].text_content().strip()
            first_name = tds[1].text_content().strip()
            full_name = '%s %s' % (first_name, last_name)
            district = str(int(tds[2].text_content().strip()))
            party = tds[3].text_content().strip()
            if party == 'Democrat':
                party = 'Democratic'
            phone = tds[4].text_content().strip()
            room = tds[5].text_content().strip()
            address = self.assumed_address_fmt % (room if room else '')

            if last_name == 'Vacant':
                # Vacant seats are saved separately with the grid data only.
                leg = Legislator(term, chamber, district,
                                 full_name=full_name,
                                 first_name=first_name,
                                 last_name=last_name,
                                 party=party,
                                 _code=leg_code,
                                 url=url)
                leg.add_office('capitol', "Capitol Office",
                               address=address, phone=phone)
                leg.add_source(url)
                self.save_vacant_legislator(leg)
            else:
                leg = Legislator(term, chamber, district,
                                 full_name=full_name,
                                 first_name=first_name,
                                 last_name=last_name,
                                 party=party,
                                 _code=leg_code,
                                 url=url)
                leg.add_office('capitol', 'Capitol Office',
                               address=address, phone=phone)

                # NOTE(review): `url` is rebound here, so later loop
                # iterations pass a details URL as the Legislator url.
                url = (self.rep_details_url % (session, district))
                leg.add_source(url)
                with self.urlopen(url) as details_page:
                    page = lxml.html.fromstring(details_page)
                    picture = page.xpath('//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
                    email = page.xpath('//*[@id="ContentPlaceHolder1_lblAddresses"]/table/tr[4]/td/a/@href')
                    terms = page.xpath('//*[@id="ContentPlaceHolder1_lblElected"]')
                    committees = page.xpath('//*[@id="ContentPlaceHolder1_lblCommittees"]/li/a')
                    for c in committees:
                        leg.add_role('committee member', term,
                                     committee=c.text_content().strip(),
                                     chamber=chamber)
                    # TODO home address?
                    if len(email) > 0 and email[0] != 'mailto:':
                        #print "Found email : %s" % email[0]
                        leg['email'] = email[0].split(':')[1]
                    if len(picture) > 0:
                        #print "Found picture : %s" % picture[0]
                        leg['photo_url'] = picture[0]
                #leg.add_source(url)
                self.save_legislator(leg)
def scrape_reps(self, chamber, session, term):
    """Scrape Missouri representatives from the ASP.net members grid,
    following each member's detail page for photo/email/committees."""
    url = (self.reps_url % (session))
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    # This is the ASP.net table container
    table_xpath = ('id("ContentPlaceHolder1_'
                   'gridMembers_DXMainTable")')
    table = page.xpath(table_xpath)[0]

    # First row is the header.
    for tr in table.xpath('tr')[1:]:
        tds = tr.xpath('td')
        leg_code = tds[0].xpath('a[1]')[0].attrib.get('href')
        last_name = tds[0].text_content().strip()
        first_name = tds[1].text_content().strip()
        full_name = '%s %s' % (first_name, last_name)
        district = str(int(tds[2].text_content().strip()))
        party = tds[3].text_content().strip()
        if party == 'Democrat':
            party = 'Democratic'
        phone = tds[4].text_content().strip()
        room = tds[5].text_content().strip()
        address = self.assumed_address_fmt % (room if room else '')

        if last_name == 'Vacant':
            # Vacant seats are saved separately with the grid data only.
            leg = Legislator(term, chamber, district,
                             full_name=full_name,
                             first_name=first_name,
                             last_name=last_name,
                             party=party,
                             _code=leg_code,
                             url=url)
            leg.add_office('capitol', "Capitol Office",
                           address=address, phone=phone)
            leg.add_source(url)
            self.save_vacant_legislator(leg)
        else:
            leg = Legislator(term, chamber, district,
                             full_name=full_name,
                             first_name=first_name,
                             last_name=last_name,
                             party=party,
                             _code=leg_code,
                             url=url)
            leg.add_office('capitol', 'Capitol Office',
                           address=address, phone=phone)

            # NOTE(review): `url` is rebound here, so later loop
            # iterations pass a details URL as the Legislator url.
            url = (self.rep_details_url % (session, district))
            leg.add_source(url)
            details_page = self.urlopen(url)
            page = lxml.html.fromstring(details_page)
            picture = page.xpath('//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            email = page.xpath('//*[@id="ContentPlaceHolder1_lblAddresses"]/table/tr[4]/td/a/@href')
            terms = page.xpath('//*[@id="ContentPlaceHolder1_lblElected"]')
            committees = page.xpath('//*[@id="ContentPlaceHolder1_lblCommittees"]/li/a')
            for c in committees:
                leg.add_role('committee member', term,
                             committee=c.text_content().strip(),
                             chamber=chamber)
            # TODO home address?
            if len(email) > 0 and email[0] != 'mailto:':
                #print "Found email : %s" % email[0]
                leg['email'] = email[0].split(':')[1]
            if len(picture) > 0:
                #print "Found picture : %s" % picture[0]
                leg['photo_url'] = picture[0]
            #leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, session):
    """Create and save a Legislator for each member on the chamber listing."""
    for member in self.scrape_leg_page(get_chamber_listing_url(chamber)):
        person = Legislator(session, chamber, member['district'],
                            member['name'],
                            party=member['party'],
                            # some additional things the website provides:
                            photo_url=member['image'],
                            url=member['homepage'],
                            email=member['email'])
        person.add_office('capitol', 'Capitol Office',
                          address=member['addr'],
                          phone=member['phone'],
                          fax=member['fax'] or None)

        for src in member['source']:
            person.add_source(src)

        try:
            prefix = 'Joint Legislative'
            for committee in member['ctty']:
                # Committees named "Joint Legislative ..." are joint-chamber.
                if committee['name'].startswith(prefix):
                    committee_chamber = "joint"
                else:
                    committee_chamber = chamber
                person.add_role('committee member', term=session,
                                chamber=committee_chamber,
                                committee=committee['name'],
                                position="member")
        except KeyError:
            # A missing 'ctty' key means no committee data was scraped.
            self.log("XXX: Warning, %s has no scraped Commities" % member['name'])

        self.save_legislator(person)
def fetch_member(self, url, name, term, chamber):
    """Scrape one Virginia member page: party/district header, office
    listings, and committee assignments; save the Legislator."""
    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    # handle resignations, special elections
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        # BUG FIX: rsplit('-') with no maxsplit splits on EVERY hyphen, so
        # [0] truncated hyphenated surnames; split on the last hyphen only.
        name = name.rsplit('-', 1)[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(
            party_district_line).groups()
        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party], url=url)
        leg.add_source(url)

        # Each linkNon list is one office: phone, email, address lines.
        for ul in doc.xpath('//ul[@class="linkNon"]'):
            address = []
            phone = None
            email = None
            for li in ul.getchildren():
                text = li.text_content()
                if re.match(r'\(\d{3}\)', text):
                    phone = text
                elif text.startswith('email:'):
                    # BUG FIX: str.strip('email: ') treats the argument as
                    # a CHARACTER SET and strips those characters from both
                    # ends, which can eat leading/trailing letters of the
                    # address itself; remove the literal prefix instead.
                    email = text[len('email:'):].strip()
                else:
                    address.append(text)
            # NOTE(review): this is list membership (an element exactly
            # equal to 'Capitol Square'), not substring containment.
            # Renamed `type` -> `office_type` to stop shadowing the builtin.
            office_type = ('capitol' if 'Capitol Square' in address
                           else 'district')
            name = ('Capitol Office' if office_type == 'capitol'
                    else 'District Office')
            leg.add_office(office_type, name, address='\n'.join(address),
                           phone=phone, email=email)

        # First linkSect list on the page holds committee assignments.
        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member', term=term, chamber=chamber,
                         committee=com)

    self.save_legislator(leg)
def scrape(self, term, chambers):
    """Build Connecticut legislators from the CGA CSV export."""
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    data = self.get(leg_url)
    page = open_csv(data)

    for row in page:
        chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]
        if chamber not in chambers:
            continue

        district = row['dist'].lstrip('0')

        # Assemble "First [Middle] Last [Suffix]".
        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Legislator(term, chamber, district, name,
                         first_name=row['first name'],
                         last_name=row['last name'],
                         middle_name=row['middle initial'],
                         suffixes=row['suffix'],
                         party=party,
                         email=row['email'].strip(),
                         url=row['URL'],
                         office_phone=row['capitol phone'])

        office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
            row['capitol street address'], row['room number'])
        leg.add_office('capitol', 'Capitol Office',
                       address=office_address,
                       phone=row['capitol phone'])
        # skipping home address for now

        leg.add_source(leg_url)

        # Entries look like "Name (role); Name; ..."; role defaults to
        # 'member' when no parenthesized role is present.
        for comm in row['committee member1'].split(';'):
            if comm:
                if ' (' in comm:
                    comm, role = comm.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                comm = comm.strip()
                if comm == '':
                    continue
                # CT committees are all joint.
                leg.add_role('committee member', term, chamber='joint',
                             committee=comm, position=role)

        self.save_legislator(leg)
def scrape_legislator(self, chamber, term, url):
    """Scrape one New Mexico legislator detail page into a Legislator."""
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # most properties are easy to pull
        # Maps Legislator field -> label-id suffix on the page; the values
        # are replaced in place with the scraped text below.
        properties = {'first_name': 'FNAME',
                      'last_name': 'LNAME',
                      'party': 'PARTY',
                      'district': 'DISTRICT',
                      'county': 'COUNTY',
                      'start_year': 'STARTYEAR',
                      'occupation': 'OCCUPATION',
                      'capitol_phone': 'OFF_PHONE',
                      'office_phone': 'WKPH'}

        for key, value in properties.iteritems():
            id = 'ctl00_mainCopy_LegisInfo_%sLabel' % value
            try:
                val = doc.get_element_by_id(id).text
            except KeyError:
                self.warning('bad legislator page %s missing %s' %
                             (url, id))
                return
            # NOTE(review): when val is empty/None the label-id suffix
            # (e.g. 'FNAME') is left as the property value.
            if val:
                properties[key] = val.strip()

        # image & email are a bit different
        properties['photo_url'] = doc.xpath('//img[@id="ctl00_mainCopy_LegisInfo_LegislatorPhoto"]/@src')[0]
        email = doc.get_element_by_id('ctl00_mainCopy_LegisInfo_lnkEmail').text
        if email:
            properties['email'] = email.strip()

        properties['url'] = url
        properties['chamber'] = chamber
        properties['term'] = term
        properties['full_name'] = '%(first_name)s %(last_name)s' % properties

        if '(D)' in properties['party']:
            properties['party'] = 'Democratic'
        elif '(R)' in properties['party']:
            properties['party'] = 'Republican'
        elif '(DTS)' in properties['party']:
            properties['party'] = 'Decline to State'
        else:
            raise Exception("unknown party encountered")

        leg = Legislator(**properties)
        leg.add_source(url)

        # committees
        # skip first header row
        for row in doc.xpath('//table[@id="ctl00_mainCopy_MembershipGrid"]/tr')[1:]:
            role, committee, note = [x.text_content() for x in row.xpath('td')]
            if 'Interim' in note:
                role = 'interim ' + role.lower()
            else:
                role = role.lower()
            leg.add_role('committee member', term, committee=committee,
                         position=role, chamber=chamber)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Build Connecticut legislators (one chamber) from the CGA CSV export."""
    office_code = {'upper': 'S', 'lower': 'H'}[chamber]

    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    data = self.urlopen(leg_url)
    page = open_csv(data)

    for row in page:
        # Rows for the other chamber are skipped.
        if office_code != row['office code']:
            continue

        district = row['dist'].lstrip('0')

        # Assemble "First [Middle] Last [Suffix]".
        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Legislator(term, chamber, district, name,
                         first_name=row['first name'],
                         last_name=row['last name'],
                         middle_name=row['middle initial'],
                         suffixes=row['suffix'],
                         party=party,
                         email=row['email'],
                         url=row['URL'],
                         office_phone=row['capitol phone'])

        office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
            row['capitol street address'], row['room number'])
        leg.add_office('capitol', 'Capitol Office',
                       address=office_address,
                       phone=row['capitol phone'])
        # skipping home address for now

        leg.add_source(leg_url)

        # Committee codes are resolved against the scraper's name table;
        # CT committees are all joint.
        for comm_code in row['committee codes'].split(';'):
            if comm_code:
                comm_name = self._committee_names[comm_code]
                leg.add_role('committee member', term, chamber='joint',
                             committee=comm_name)

        self.save_legislator(leg)
def fetch_member(self, url, name, term, chamber):
    """Scrape one Virginia member page: party/district header, office
    listings, and committee assignments; save the Legislator."""
    if name in CHAMBER_MOVES:
        if chamber != CHAMBER_MOVES[name]:
            return  # Skip bad chambers.

    if "vacated" in name.lower():
        self.logger.warning("Seat seems to have been vacated: '{}'".format(name))
        return

    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    # handle resignations, special elections
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        # BUG FIX: rsplit('-') with no maxsplit splits on EVERY hyphen, so
        # [0] truncated hyphenated surnames; split on the last hyphen only.
        name = name.rsplit('-', 1)[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    party_district_line = doc.xpath('//h3/font/text()')[0]
    party, district = party_district_re.match(party_district_line).groups()
    leg = Legislator(term, chamber, district, name.strip(),
                     party=party_map[party], url=url)
    leg.add_source(url)

    # Each linkNon list is one office: phone, email, address lines.
    for ul in doc.xpath('//ul[@class="linkNon"]'):
        address = []
        phone = None
        email = None
        for li in ul.getchildren():
            text = li.text_content()
            if re.match(r'\(\d{3}\)', text):
                phone = text
            elif text.startswith('email:'):
                # BUG FIX: str.strip('email: ') treats the argument as a
                # CHARACTER SET and strips those characters from both ends,
                # which can eat leading/trailing letters of the address
                # itself; remove the literal prefix instead.
                email = text[len('email:'):].strip()
            else:
                address.append(text)
        # NOTE(review): this is list membership (an element exactly equal
        # to 'Capitol Square'), not substring containment.
        office_type = ('capitol' if 'Capitol Square' in address
                       else 'district')
        name = ('Capitol Office' if office_type == 'capitol'
                else 'District Office')
        leg.add_office(office_type, name, address='\n'.join(address),
                       phone=phone, email=email)

    # First linkSect list on the page holds committee assignments.
    for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
        leg.add_role('committee member', term=term, chamber=chamber,
                     committee=com)

    self.save_legislator(leg)
def scrape(self, chamber, term):
    """Parse NH's asterisk-delimited member export for one chamber."""
    url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

    self.validate_term(term, latest_only=True)

    with self.urlopen(url) as data:
        for line in data.splitlines():
            # Fixed field order of the asterisk-delimited export; the
            # last two fields are unused.
            (body, fullname, last, first, middle, county, district,
             seat, party, street, street2, city, astate, zipcode,
             home_phone, office_phone, fax, email, com1, com2, com3,
             com4, com5, _, _) = line.split('*')

            # skip legislators from other chamber
            if body != chamber_name[chamber]:
                continue

            if middle:
                full = '%s %s %s' % (first, middle, last)
            else:
                full = '%s %s' % (first, last)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            # House districts are prefixed with the county name.
            district = str(int(district))
            if county:
                district = '%s %s' % (county, district)

            leg = Legislator(term, chamber, district, full, first, last,
                             middle, party_map[party], address=address,
                             home_phone=home_phone,
                             office_phone=office_phone,
                             office_fax=fax, email=email)

            # use seat as a _code if chamber is lower
            if chamber == 'lower':
                leg['_code'] = seat

            # Up to five committee assignments per member.
            for com in (com1, com2, com3, com4, com5):
                if com:
                    leg.add_role('committee member', term=term,
                                 chamber=chamber, committee=com)

            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Iowa legislators for the 2011-2012 term, including
    per-member committee assignments."""
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        chamber_name = 'senate'
    else:
        chamber_name = 'house'

    url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    table = page.xpath('//table[@class="legis"]')[0]
    for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        # Sibling table cells carry district, party and email.
        district = link.xpath("string(../../td[2])")
        party = link.xpath("string(../../td[3])")
        email = link.xpath("string(../../td[5])")

        if party == 'Democrat':
            party = 'Democratic'

        # Photo URLs are keyed by the PID query parameter.
        pid = re.search("PID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                     "?GA=84&PID=%s" % pid)

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email, photo_url=photo_url, url=url)
        leg.add_source(url)

        leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

        comm_path = "//a[contains(@href, 'committee')]"
        for comm_link in leg_page.xpath(comm_path):
            comm = comm_link.text.strip()

            # A trailing "(...)" names the member's position.
            match = re.search(r'\((.+)\)$', comm)
            if match:
                comm = re.sub(r'\((.+)\)$', '', comm).strip()
                mtype = match.group(1).lower()
            else:
                mtype = 'member'

            if comm.endswith('Appropriations Subcommittee'):
                # File as a subcommittee of Appropriations.
                sub = re.match('^(.+) Appropriations Subcommittee$',
                               comm).group(1)
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm,
                             position=mtype)

        self.save_legislator(leg)
def scrape(self, term, chambers):
    """Scrape San Jose's mayor and councilmembers from the city roster."""
    # The links on http://www.sanjoseca.gov/index.aspx?NID=1187 may go off-
    # site, so use http://www.sanjoseca.gov/index.aspx?NID=146
    council_url = 'http://www.sanjoseca.gov/index.aspx?NID=146'
    doc = lxml.html.fromstring(self.urlopen(council_url))
    doc.make_links_absolute(council_url)

    tds = doc.xpath('//div[@id="Section1"]//td')
    # NOTE(review): the message says "expected 11" but the check is <=.
    assert len(tds) <= 11, 'expected 11 unique mayor and councilmember URLs, found %d' % len(tds)

    # Shared office address: street/city lines found in Section1's text.
    lines = []
    for text in doc.xpath('//div[@id="Section1"]/text()'):
        text = clean_string(text)
        if re.match('^(?:\d+|San) ', text):
            lines.append(text)
    address = '\n'.join(lines)

    # Emails are obfuscated in inline scripts as "user"+"@"+"domain".
    emails = []
    for text in doc.xpath('//div[@id="Section1"]/script/text()'):
        # PhantomJS would be sweet here.
        emails.append(''.join(re.search('([^"]+)"\+"(@)"\+"([^"]+)', text).groups()))

    for index, td in enumerate(tds):
        # First phone-looking string in the cell wins.
        for text in td.xpath('.//text()'):
            match = tel_regex.search(text.strip())
            if match:
                phone = '-'.join(match.groups())
                break

        url = td.xpath('.//a[strong]/@href')[0]
        photo_url = td.xpath('.//img/@src')[0]

        text = td.xpath('.//strong/text()')[0]
        if 'District' in text:
            district = re.search('District \d+', text).group(0)
            name = re.sub(', District \d+$', '', text)
            role = None
            if 'Vice Mayor' in text:
                name = name.replace('Vice Mayor ', '')
                role = 'Vice Mayor'
        elif 'Mayor' in text:
            district = 'Mayor'
            name = text.replace('Mayor ', '')
            role = 'Mayor'
        else:
            # BUG FIX: previously this only logged and then fell through,
            # constructing a Legislator from the PREVIOUS iteration's
            # district/name; actually skip the unrecognized cell.
            self.logger.warning('Skipped: ' + text)
            continue

        legislator = Legislator(term, 'upper', district, name,
                                email=emails[index], url=url,
                                photo_url=photo_url, party=None)
        legislator.add_office('capitol', 'Council Office',
                              address=address, phone=phone)
        if role:
            legislator.add_role(role, term)
        legislator.add_source(url)
        self.save_legislator(legislator)
def scrape_legislator(self, chamber, term, name, url):
    """Scrape one Montana legislator page: district, party, photo, email,
    committee roles and offices; save the Legislator."""
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    # District number appears in a "dist=" link; strip leading zeros.
    dist_link = page.xpath("//a[contains(@href, 'dist=')]")[0]
    district = dist_link.xpath('string()').strip().lstrip('0')

    # Party letter trails the member-name span, e.g. "(R) ...".
    mem_span = page.xpath("//span[contains(@class, 'memname')]")[0]
    mem_tail = mem_span.tail.strip()
    party = re.match(r'\((R|D)', mem_tail).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']

    # Second mailto link on the page is the member's address.
    email = page.xpath("//a[contains(@href, 'mailto:')]"
                       )[1].attrib['href'].split('mailto:')[1]

    leg = Legislator(term, chamber, district, name, party=party,
                     photo_url=photo_url, email=email, url=url)
    leg.add_source(url)

    for link in page.xpath("//a[contains(@href, 'committee.cfm')]"):
        comm = link.xpath("string()").strip()

        committee_chamber = chamber
        # Interim committees are filed under the joint chamber.
        if 'interims' in link.attrib['href']:
            committee_chamber = 'joint'

        # "X Subcommittee" suffix splits into committee + subcommittee.
        sub_index = comm.find('Subcommittee')
        if sub_index > 0:
            sub = comm[sub_index:].strip()
            comm = comm[:sub_index].strip()

            leg.add_role('committee member', term,
                         committee=comm,
                         subcommittee=sub,
                         chamber=committee_chamber)
        else:
            leg.add_role('committee member', term,
                         committee=comm,
                         chamber=committee_chamber)

    self.scrape_offices(leg, page)

    self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape one legislator from a member <select> option whose text is
    "Name, Party, District N" and whose value is the member URL."""
    url = urlparse.urljoin(self.url, option.attrib['value'])
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)
    if district == '[N/A]':
        msg = 'No district found for %r; skipping.'
        self.logger.warning(msg, name)
        return

    leg = Legislator(term, chamber, district, name, party=party)

    # Scrape leg page.
    try:
        html = self.urlopen(url)
    except scrapelib.HTTPError as exc:
        # As of July 2014, this only happens when a page has
        # gone missing from their varnish server.
        # BUG FIX: `is 503` compared identity against an int literal,
        # which is implementation-defined outside the small-int cache;
        # compare by value. Also dropped the unused `skipped` local.
        # NOTE(review): a non-503 error falls through with `html`
        # unbound — confirm whether it should re-raise instead.
        if exc.response.status_code == 503:
            self.logger.exception(exc)
            self.logger.warning('Skipping legislator at url: %s' % url)
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)

    # Scrape committees.
    for tr in doc.xpath(
            '//div[@class="legislator-committees-container"]//table//tr'):
        committee, committee_type, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if 'member' in role.lower():
            role = 'committee member'
        elif 'chair' in role.lower():
            role = 'chair'
        # Skip the header row.
        if committee != "Committee Name":
            leg.add_role(role, term, chamber=chamber, committee=committee)

    # Scrape offices.
    dist_office, phone = doc.xpath('//address')
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r' {2,}', '', dist_office)
    phone = phone.text_content().strip()

    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)

    leg.add_office(address=dist_office, name='Capitol Office',
                   type='capitol', phone=phone)

    self.save_legislator(leg)
def _parse_member(self, chamber, term, member):
    """Build and return a Legislator from one member XML element.

    The caller is responsible for saving the returned object.
    """
    first_name = member.get('first-name')
    last_name = member.get('last-name')
    party = self.party_map[member.get('party')]

    # this is semi-safe because we validated term w/ latest_only=True
    session = self.metadata['terms'][-1]['sessions'][-1]

    # extra_fields: optional scalar values pulled via configured xpaths
    extra_dict = {}
    for name, xpath in self.extra_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            extra_dict[name] = result[0]

    # address fields: flatten a structured address element into one string
    for name, xpath in self.addr_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            result = result[0]
            extra_dict[name] = '%s, %s, %s %s' % (
                result.get('street-address'), result.get('city'),
                result.get('state'), result.get('postal-code'))

    leg = Legislator(term, chamber, member.get('district-number'),
                     full_name=first_name + ' ' + last_name,
                     first_name=first_name,
                     last_name=last_name,
                     middle_name=member.get('middle-initial'),
                     party=party,
                     email=member.get('e-mail'),
                     url=member.get('website'),
                     oregon_member_id=member.get('leg-member-id'),
                     **extra_dict)

    # committees for the current session only
    com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
    for com in member.xpath(com_xpath):
        cdict = {
            'position': com.get('title').lower(),
            'chamber': chamber,
        }
        com_name = com.get('name')  # NOTE(review): unused local
        com_class = com.get('committee-class')
        # sub-committees are named "<parent> Subcommittee On <sub>"
        if com_class == 'sub-committee':
            cdict['committee'], cdict['subcommittee'] = \
                com.get('name').split(' Subcommittee On ')
        else:
            cdict['committee'] = com.get('name')
        leg.add_role('committee member', term, **cdict)

    leg.add_source(self.source_url)
    return leg
def _parse_member(self, chamber, term, member):
    """Build and return a Legislator from one member XML element.

    Variant of the sibling _parse_member: the site URL is passed as the
    `website` keyword rather than `url`.
    """
    first_name = member.get('first-name')
    last_name = member.get('last-name')
    party = self.party_map[member.get('party')]

    # this is semi-safe because we validated term w/ latest_only=True
    session = self.metadata['terms'][-1]['sessions'][-1]

    # extra_fields: optional scalar values pulled via configured xpaths
    extra_dict = {}
    for name, xpath in self.extra_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            extra_dict[name] = result[0]

    # address fields: flatten a structured address element into one string
    for name, xpath in self.addr_fields.iteritems():
        result = member.xpath(xpath)
        if result:
            result = result[0]
            extra_dict[name] = '%s, %s, %s %s' % (
                result.get('street-address'), result.get('city'),
                result.get('state'), result.get('postal-code'))

    leg = Legislator(term, chamber, member.get('district-number'),
                     full_name=first_name + ' ' + last_name,
                     first_name=first_name,
                     last_name=last_name,
                     middle_name=member.get('middle-initial'),
                     party=party,
                     email=member.get('e-mail'),
                     website=member.get('website'),
                     oregon_member_id=member.get('leg-member-id'),
                     **extra_dict)

    # committees for the current session only
    com_xpath = 'committee-membership/session[@session-name="%s"]/committee' % session
    for com in member.xpath(com_xpath):
        cdict = {
            'position': com.get('title').lower(),
            'chamber': chamber,
        }
        com_name = com.get('name')  # NOTE(review): unused local
        com_class = com.get('committee-class')
        # sub-committees are named "<parent> Subcommittee On <sub>"
        if com_class == 'sub-committee':
            cdict['committee'], cdict['subcommittee'] = \
                com.get('name').split(' Subcommittee On ')
        else:
            cdict['committee'] = com.get('name')
        leg.add_role('committee member', term, **cdict)

    leg.add_source(self.source_url)
    return leg
def scrape(self, chamber, session):
    """Scrape every legislator on the chamber listing page and save them."""
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        try:
            chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
        except KeyError:
            # Known-bad legislator page: log loudly and skip (see note).
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(leg['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        p = Legislator(
            session, chamber, leg['district'], leg['name'],
            party=leg['party'],
            # some additional things the website provides:
            photo_url=leg['image'],
            url=leg['homepage'])
        p.add_office('capitol', 'Capitol Office', address=leg['addr'],
                     phone=leg['phone'], fax=leg['fax'] or None,
                     email=leg['email'])

        for source in leg['source']:
            p.add_source(source)

        try:
            # Joint Legislative committees are filed under 'joint'.
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                p.add_role('committee member', term=session,
                           chamber=ctty_chamber, committee=ctty['name'],
                           position="member")
        except KeyError:
            self.log("XXX: Warning, %s has no scraped Commities" %
                     leg['name'])

        self.save_legislator(p)
def scrape(self, chamber, term):
    """Scrape CT legislators for 2011-2012 from the state's CSV export."""
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    office_code = {'upper': 'S', 'lower': 'H'}[chamber]

    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = urllib2.urlopen(leg_url)
    page = csv.DictReader(page)

    for row in page:
        # rows for the other chamber are skipped
        if office_code != row['office code']:
            continue

        district = row['dist'].lstrip('0')

        # assemble "First [Middle] Last [Suffix]"
        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
            row['capitol street address'], row['room number'])

        leg = Legislator(term, chamber, district,
                         name, first_name=row['first name'],
                         last_name=row['last name'],
                         middle_name=row['middle initial'],
                         suffixes=row['suffix'],
                         party=party,
                         office_address=office_address,
                         office_phone=row['capitol phone'])
        leg.add_source(leg_url)

        # committee codes are ';'-separated abbreviations; CT committees
        # are joint, hence chamber='joint'
        for comm_code in row['committee codes'].split(';'):
            if comm_code:
                comm_name = self._committee_names[comm_code]
                leg.add_role('committee member', term,
                             chamber='joint', committee=comm_name)

        self.save_legislator(leg)
def test_legislator():
    """Legislator populates default person fields, roles, and offices."""
    leg = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')

    # Freshly constructed legislator: one implicit 'member' role, empty
    # offices/sources, blank optional name parts.
    expected_person = {
        '_type': 'person',
        'full_name': 'Adam Smith',
        'first_name': 'Adam',
        'last_name': 'Smith',
        'middle_name': '',
        'suffixes': '',
        'roles': [{
            'chamber': 'upper',
            'term': 'T1',
            'role': 'member',
            'start_date': None,
            'end_date': None,
            'district': '1',
            'party': '',
        }],
        'offices': [],
        'sources': [],
    }
    assert_equal(leg, expected_person)

    # A committee role is appended after the implicit member role.
    leg.add_role('committee member', 'T1',
                 committee='Some Committee', position='chairman')
    expected_role = {
        'role': 'committee member',
        'term': 'T1',
        'start_date': None,
        'end_date': None,
        'committee': 'Some Committee',
        'position': 'chairman',
    }
    assert_equal(leg['roles'][1], expected_role)

    # Offices keep every positional field under its named key.
    leg.add_office('capitol', 'Statehouse Office', '123 Main St',
                   '123-456-7890', '123-555-5555', '*****@*****.**')
    expected_offices = [{
        'type': 'capitol',
        'name': 'Statehouse Office',
        'address': '123 Main St',
        'phone': '123-456-7890',
        'fax': '123-555-5555',
        'email': '*****@*****.**',
    }]
    assert_equal(leg['offices'], expected_offices)
def scrape_legislator(self, chamber, term, option):
    """Scrape the legislator selected by one <option> element and save it.

    The option's value is a relative URL to the member page; its text is
    "Name, Party, District N".
    """
    url = urlparse.urljoin(self.url, option.attrib["value"])
    name, party, district = re.split(r"\s*,\s*", option.text.strip())
    name = re.sub(r"^(Sen\.|Rep\.)\s+", "", name)
    district = re.sub(r"^District\s+", "", district)

    if district == "[N/A]":
        msg = "No district found for %r; skipping."
        self.logger.warning(msg, name)
        return

    leg = Legislator(term, chamber, district, name, party=party)
    leg.add_source(self.url)

    # Scrape leg page.
    try:
        html = self.urlopen(url)
    except scrapelib.HTTPError as exc:
        # As of July 2014, this only happens when a page has
        # gone missing from their varnish server.
        # if exc.response.status_code is 503:
        self.logger.exception(exc)
        self.logger.warning("Skipping legislator at url: %s" % url)
        # (removed unused local `skipped = True`; it was dead code
        # immediately before the return)
        return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)

    # Scrape committees: each table row is a (committee, role) pair.
    for tr in doc.xpath("//table//tr"):
        committee, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if "member" in role.lower():
            role = "committee member"
        elif "chair" in role.lower():
            role = "chair"
        leg.add_role(role, term, chamber=chamber, committee=committee)

    # Scrape offices: the two <address> elements hold address and phone.
    dist_office, phone = doc.xpath("//address")
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r" {2,}", "", dist_office)
    phone = phone.text_content().strip()

    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)

    leg.add_office(address=dist_office, name="District Office",
                   type="district", phone=phone)

    self.save_legislator(leg)
def scrape(self, chamber, term):
    """Parse NH's asterisk-delimited member export and save legislators."""
    url = 'http://gencourt.state.nh.us/downloads/Members(Asterisk%20Delimited).txt'

    self.validate_term(term, latest_only=True)

    with self.urlopen(url) as data:
        for line in data.splitlines():
            # fixed column order; the final two fields are unused
            (body, fullname, last, first, middle, county, district_num,
             seat, party, street, street2, city, astate, zipcode,
             home_phone, office_phone, fax, email, com1, com2, com3,
             com4, com5, _, _) = line.split('*')

            # skip legislators from other chamber
            if body != chamber_name[chamber]:
                continue

            if middle:
                full = '%s %s %s' % (first, middle, last)
            else:
                full = '%s %s' % (first, last)

            address = street
            if street2:
                address += (' ' + street2)
            address += '\n%s, %s %s' % (city, astate, zipcode)

            # county is prepended for House-style "County N" districts
            district = str(int(district_num))
            if county:
                district = '%s %s' % (county, district)

            leg = Legislator(term, chamber, district, full, first, last,
                             middle, party_map[party], address=address,
                             home_phone=home_phone,
                             office_phone=office_phone, office_fax=fax,
                             email=email)

            if chamber == 'lower':
                # use seat as a _code if chamber is lower
                leg['_code'] = seat
            else:
                # Senate URLs are guessable
                leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(district_num)

            # up to five committee assignments per member
            for com in (com1, com2, com3, com4, com5):
                if com:
                    leg.add_role('committee member', term=term,
                                 chamber=chamber, committee=com)

            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape every legislator on the chamber listing page and save them.

    Variant of the sibling scrape(): email goes on the Legislator itself
    rather than on the capitol office.
    """
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        try:
            chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
        except KeyError:
            # Known-bad legislator page: log loudly and skip (see note).
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(leg['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        p = Legislator(
            session, chamber, leg['district'], leg['name'],
            party=leg['party'],
            # some additional things the website provides:
            photo_url=leg['image'],
            url=leg['homepage'],
            email=leg['email'])
        p.add_office('capitol', 'Capitol Office', address=leg['addr'],
                     phone=leg['phone'], fax=leg['fax'] or None)

        for source in leg['source']:
            p.add_source(source)

        try:
            # Joint Legislative committees are filed under 'joint'.
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                p.add_role('committee member', term=session,
                           chamber=ctty_chamber, committee=ctty['name'],
                           position="member")
        except KeyError:
            self.log("XXX: Warning, %s has no scraped Commities" %
                     leg['name'])

        self.save_legislator(p)
def scrape(self, chamber, term):
    """
    Scrapes legislators for the current term only
    """
    self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    # Each legislator occupies one equal-height row div.
    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]
        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

        # Name is the bolded text; collapse nbsp and whitespace runs.
        # FIX: raw string for the regex ('\s' is an invalid escape in a
        # plain string literal).
        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')
        leg_url = inner.xpath('p/a/@href')[0]

        leg = Legislator(term, chamber, district, name,
                         party=party, email=email)

        phones = get_phones(inner)
        # District office prefers the home phone, falling back to business.
        leg.add_office('district', 'District Office',
                       address=get_address(inner), fax=get_fax(inner),
                       phone=phones.get('home') or phones.get('business'))
        leg.add_office('capitol', 'Capitol Office',
                       phone=phones.get('office'))

        leg.add_source(url)
        leg['photo_url'] = img_url
        leg['url'] = leg_url

        # Committee links; the tail text after each link is the role.
        for com in inner.xpath('p/a[contains(@href, "committees")]'):
            role = com.tail.strip()
            if not role:
                role = 'member'
            leg.add_role('committee member', term=term,
                         chamber=chamber, committee=com.text,
                         position=role)

        self.save_legislator(leg)
def scrape_legislator(self, chamber, term, name, url):
    """Scrape one legislator detail page and save a Legislator record.

    (Duplicate of the sibling scrape_legislator; only formatting of the
    email extraction differs.)
    """
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    # District number is the text of the first link whose href has 'dist='.
    dist_link = page.xpath("//a[contains(@href, 'dist=')]")[0]
    district = dist_link.xpath('string()').strip().lstrip('0')

    # Party letter is in the tail text right after the member-name span.
    mem_span = page.xpath("//span[contains(@class, 'memname')]")[0]
    mem_tail = mem_span.tail.strip()
    party = re.match(r'\((R|D)', mem_tail).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']
    # Second mailto link is used as the member's email; presumably the
    # first is a site-wide address — verify against the page.
    email = page.xpath(
        "//a[contains(@href, 'mailto:')]")[1].attrib['href'].split(
        'mailto:')[1]

    leg = Legislator(term, chamber, district, name, party=party,
                     photo_url=photo_url, email=email, url=url)
    leg.add_source(url)

    # Committee assignments link to committee.cfm; 'interims' links are
    # recorded under the 'joint' chamber.
    for link in page.xpath("//a[contains(@href, 'committee.cfm')]"):
        comm = link.xpath("string()").strip()
        committee_chamber = chamber
        if 'interims' in link.attrib['href']:
            committee_chamber = 'joint'
        # Names like "X Subcommittee ..." are split into committee +
        # subcommittee parts at the word 'Subcommittee'.
        sub_index = comm.find('Subcommittee')
        if sub_index > 0:
            sub = comm[sub_index:].strip()
            comm = comm[:sub_index].strip()
            leg.add_role('committee member', term,
                         committee=comm,
                         subcommittee=sub,
                         chamber=committee_chamber)
        else:
            leg.add_role('committee member', term,
                         committee=comm,
                         chamber=committee_chamber)

    self.scrape_offices(leg, page)
    self.save_legislator(leg)
def scrape_bio(self, term, chamber, district, name, url):
    """Scrape one member bio page; returns a Legislator (caller saves)."""
    # this opens the committee section without having to do another request
    url += '&TableRow=1.5.5'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # party is in one of these
    party = doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()')
    # NOTE(review): `party` is a list of text nodes here; if neither
    # marker matches, the list itself is passed to Legislator — confirm
    # that case never occurs on real pages.
    if '(D)' in party:
        party = 'Democratic'
    elif '(R)' in party:
        party = 'Republican'

    leg = Legislator(term, chamber, district, name, party=party, url=url)

    photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
    if photo_url:
        leg['photo_url'] = photo_url[0]

    # Committee roles: header lines ("Committee Chair:" etc.) set the
    # position for the committee names that follow them.
    roles = defaultdict(lambda: {})
    position = 'member'
    for text in doc.xpath('//td[@width="584"]/descendant::font/text()'):
        text = text.strip()
        if text == 'Committee Chair:':
            position = 'chair'
        elif text == 'Committee Co-chair:':
            position = 'co-chair'
        else:
            for committee in text.splitlines():
                roles[committee].update(role='committee member',
                                        term=term,
                                        chamber=chamber,
                                        committee=committee,
                                        party=party,
                                        position=position)

    for role in roles.values():
        leg.add_role(**role)

    contact_info = self.scrape_contact_info(doc)
    leg.update(contact_info)
    return leg
def scrape_legislator(self, chamber, term, option):
    """Scrape the legislator selected by one <option> element and save it."""
    # option value is a relative URL to the member page; option text is
    # "Name, Party, District N"
    url = urlparse.urljoin(self.url, option.attrib['value'])
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)
    if district == '[N/A]':
        msg = 'No district found for %r; skipping.'
        self.logger.warning(msg, name)
        return
    leg = Legislator(term, chamber, district, name, party=party)
    leg.add_source(self.url)

    # Scrape leg page.
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)

    # Scrape committees: each table row is a (committee, role) pair.
    for tr in doc.xpath('//table//tr'):
        committee, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if 'member' in role.lower():
            role = 'committee member'
        elif 'chair' in role.lower():
            role = 'chair'
        leg.add_role(role, term, chamber=chamber, committee=committee)

    # Scrape offices: the two <address> elements hold address and phone.
    dist_office, phone = doc.xpath('//address')
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r' {2,}', '', dist_office)
    phone = phone.text_content().strip()

    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)

    leg.add_office(
        address=dist_office, name='District Office', type='district',
        phone=phone)

    self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape the legislator selected by one <option> element and save it.

    (Variant without the '[N/A]' district guard present in the sibling.)
    """
    url = urlparse.urljoin(self.url, option.attrib['value'])
    # option text is "Name, Party, District N"
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)

    leg = Legislator(term, chamber, district, name, party=party)
    leg.add_source(self.url)

    # Scrape leg page.
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)

    # Scrape committees: each table row is a (committee, role) pair.
    for tr in doc.xpath('//table//tr'):
        committee, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if 'member' in role.lower():
            role = 'committee member'
        elif 'chair' in role.lower():
            role = 'chair'
        leg.add_role(role, term, chamber=chamber, committee=committee)

    # Scrape offices: the two <address> elements hold address and phone.
    dist_office, phone = doc.xpath('//address')
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r' {2,}', '', dist_office)
    phone = phone.text_content().strip()

    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)

    leg.add_office(
        address=dist_office, name='District Office', type='district',
        phone=phone)

    self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape every legislator on the chamber listing page and save them."""
    metainf = self.scrape_leg_page(get_chamber_listing_url(chamber))
    for leg in metainf:
        p = Legislator(
            session, chamber, leg["district"], leg["name"],
            party=leg["party"],
            # some additional things the website provides:
            photo_url=leg["image"],
            url=leg["homepage"],
            email=leg["email"],
        )
        p.add_office("capitol", "Capitol Office", address=leg["addr"],
                     phone=leg["phone"], fax=leg["fax"] or None)

        for source in leg["source"]:
            p.add_source(source)

        try:
            # Joint Legislative committees are filed under 'joint'.
            for ctty in leg["ctty"]:
                flag = "Joint Legislative"
                if ctty["name"][: len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                p.add_role(
                    "committee member", term=session,
                    chamber=ctty_chamber, committee=ctty["name"],
                    position="member",
                )
        except KeyError:
            self.log("XXX: Warning, %s has no scraped Commities" %
                     leg["name"])

        self.save_legislator(p)
def fetch_member(self, url, name, term, chamber):
    """Fetch a VA member page and save a Legislator with committee roles."""
    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    url = 'http://leg6.state.va.us' + url

    # handle resignations, special elections
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        name = name.rsplit('-')[0]
        if action == 'Resigned':
            pass  # TODO: set end date
        elif action == 'Member':
            pass  # TODO: set start date

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # line looks like "(R) - House District 42"
        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(
            party_district_line).groups()

        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party])
        leg.add_source(url)

        # first linkSect list on the page holds committee assignments
        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member', term=term, chamber=chamber,
                         committee=com)

    self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape CO legislators via the district directory pages."""
    url = self.get_district_list(chamber, session)
    people_pages = self.scrape_directory(url, chamber, session)

    # people_pages maps district -> person page URL
    for person in people_pages:
        district = person
        p_url = people_pages[district]
        metainf = self.process_person(p_url)

        p = Legislator(session, chamber, district, metainf['name'],
                       party=metainf['party'],
                       # some additional things the website provides:
                       occupation=metainf['occupation'],
                       photo_url=metainf['photo_url'],
                       url=metainf['homepage'])

        # optional fields: only present when the site provides them
        if "email" in metainf:
            p['email'] = metainf['email']

        if "number" in metainf:
            p.add_office('capitol', 'Capitol Office',
                         phone=metainf['number'],
                         address='200 E. Colfax\nDenver, CO 80203')

        p.add_source(p_url)

        if 'ctty' in metainf:
            for ctty in metainf['ctty']:
                p.add_role('committee member', term=session,
                           chamber=chamber, committee=ctty,
                           position="member")

        self.save_legislator(p)
def scrape(self, chamber, term):
    """Emit two hard-coded example legislators for the given chamber."""
    self.validate_term(term)

    smith = Legislator(term, chamber, '1st', 'Bob Smith', party='Democrat')
    # The leadership title depends on which chamber is being scraped.
    if chamber == 'upper':
        smith.add_role('President of the Senate', term)
    else:
        smith.add_role('Speaker of the House', term)
    smith.add_source('http://example.com/Bob_Smith.html')

    johnson = Legislator(term, chamber, '2nd', 'Sally Johnson',
                         party='Republican')
    johnson.add_role('Minority Leader', term)
    johnson.add_source('http://example.com/Sally_Johnson.html')

    self.save_legislator(smith)
    self.save_legislator(johnson)
def scrape_legislator(self, chamber, term, url):
    """Scrape one NM legislator page and save a Legislator record.

    Bails out (returns None) when an expected page element is missing.
    """
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # most properties are easy to pull
        properties = {
            'first_name': 'FNAME',
            'last_name': 'LNAME',
            'party': 'PARTY',
            'district': 'DISTRICT',
            'county': 'COUNTY',
            'start_year': 'STARTYEAR',
            'occupation': 'OCCUPATION',
            'office_phone': 'WKPH'
        }

        for key, value in properties.iteritems():
            id_ = 'ctl00_mainCopy_LegisInfo_%sLabel' % value
            try:
                val = doc.get_element_by_id(id_).text
            except KeyError:
                # BUG FIX: the message previously interpolated the
                # builtin `id` function instead of the missing element
                # id `id_`.
                self.warning('bad legislator page %s missing %s' %
                             (url, id_))
                return
            if val:
                properties[key] = val.strip()

        # image & email are a bit different
        properties['photo_url'] = doc.xpath(
            '//img[@id="ctl00_mainCopy_LegisInfo_LegislatorPhoto"]/@src'
        )[0]
        email = doc.get_element_by_id(
            'ctl00_mainCopy_LegisInfo_lnkEmail').text
        if email:
            properties['email'] = email.strip()

        properties['url'] = url
        properties['chamber'] = chamber
        properties['term'] = term
        properties['full_name'] = (
            '%(first_name)s %(last_name)s' % properties)

        if '(D)' in properties['party']:
            properties['party'] = 'Democratic'
        elif '(R)' in properties['party']:
            properties['party'] = 'Republican'
        elif '(DTS)' in properties['party']:
            properties['party'] = 'Decline to State'
        else:
            raise Exception("unknown party encountered")

        leg = Legislator(**properties)
        leg.add_source(url)

        # committees
        # skip first header row
        for row in doc.xpath(
                '//table[@id="ctl00_mainCopy_MembershipGrid"]/tr')[1:]:
            role, committee, note = [
                x.text_content() for x in row.xpath('td')
            ]
            if 'Interim' in note:
                role = 'interim ' + role.lower()
            else:
                role = role.lower()
            leg.add_role('committee member', term,
                         committee=committee, position=role,
                         chamber=chamber)

        # Already have the photo url.
        try:
            del leg['image_url']
        except KeyError:
            pass

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape AZ legislators for the session's member roster."""
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod(session)

    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
        session_id, body)
    page = self.get(url).text

    root = html.fromstring(page)
    path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
    # first row is the table header
    roster = root.xpath(path)[1:]
    for row in roster:
        position = ''
        vacated = ''
        name, district, party, email, room, phone, fax = row.xpath('td')

        if email.attrib.get('class') == 'vacantmember':
            continue  # Skip any vacant members.

        link = name.xpath('string(a/@href)')
        link = "http://www.azleg.gov" + link
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            # an extra child element means the tail carries a leadership
            # title (e.g. Speaker) — recorded as `position` below
            position = name.tail.strip()
            name = name[0].text_content().strip()

        linkpage = self.get(link).text
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[@name='memberphoto']")
        if len(photos) != 1:
            raise Exception
        photo_url = photos[0].attrib['src']

        district = district.text_content()
        party = party.text_content().strip()
        email = email.text_content().strip()

        if ('Vacated' in email or 'Resigned' in email or
                'Removed' in email):
            # comment out the following 'continue' for historical
            # legislative sessions
            # for the current session, if a legislator has left we will
            # skip him/her to keep from overwriting their information
            continue
            # NOTE: the two statements below are intentionally
            # unreachable for current sessions (see comment above);
            # they run only when the 'continue' is commented out.
            vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
            email = ''

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == 'lower':
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = address + "1700 West Washington\n Room " + room \
            + "\nPhoenix, AZ 85007"

        # normalize phone/fax to the 602 area code
        phone = phone.text_content().strip()
        if not phone.startswith('602'):
            phone = "602-" + phone
        fax = fax.text_content().strip()
        if not fax.startswith('602'):
            fax = "602-" + fax

        if vacated:
            # departed member: record an end date on the implicit role
            # and skip the photo
            end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
            leg = Legislator(term, chamber, district, full_name=name,
                             party=party, url=link)
            leg['roles'][0]['end_date'] = end_date
        else:
            leg = Legislator(term, chamber, district, full_name=name,
                             party=party, url=link,
                             photo_url=photo_url)

        leg.add_office('capitol', 'Capitol Office', address=address,
                       phone=phone, fax=fax, email=email)

        if position:
            leg.add_role(position, term, chamber=chamber,
                         district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """
    Scrapes legislators for the current term only
    """
    self.validate_term(term, latest_only=True)
    url = _BASE_URL % _CHAMBERS[chamber].lower()
    index = self.get(url).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    # Each legislator occupies one equal-height row div.
    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]
        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

        # Name is the bolded text; collapse nbsp and whitespace runs.
        # FIX: raw strings for all regexes ('\s'/'\d' are invalid
        # escapes in plain string literals).
        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = _PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')
        leg_url = inner.xpath('p/a/@href')[0]

        # Contact pieces follow <br> tags; classify each by its prefix.
        address = home_phone = office_phone = fax = None
        for br in inner.xpath('p/br'):
            piece = br.tail or ''
            piece = piece.strip()
            if re.findall(r', \d{5}', piece):
                # looks like "City, 83702" — prefix the ZIP with 'ID'
                address = re.sub(r'(\d{5})', r'ID \1', piece).strip()
            elif piece.startswith('Home '):
                home_phone = piece[5:]
            elif piece.startswith('Bus '):
                office_phone = piece[4:]
            elif piece.startswith('FAX '):
                fax = piece[4:]
            # FIX: removed stray debug statement `print(piece)` that
            # dumped every contact fragment to stdout.

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email)
        phone = home_phone or office_phone
        leg.add_office('district', 'District Office', address=address,
                       fax=fax, phone=phone)
        leg.add_source(url)
        leg['photo_url'] = img_url
        leg['url'] = leg_url

        # Committee links; the tail text after each link is the role.
        for com in inner.xpath('p/a[contains(@href, "committees")]'):
            role = com.tail.strip()
            if not role:
                role = 'member'
            leg.add_role('committee member', term=term, chamber=chamber,
                         committee=com.text, position=role)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Iowa legislators from the legacy chamber roster pages."""
    self.validate_term(term, latest_only=True)

    session_id = self.metadata['session_details'][term]['number']

    if chamber == 'upper':
        chamber_name = 'senate'
    else:
        chamber_name = 'house'

    url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    table = page.xpath('//table[@class="legis"]')[0]
    for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        # sibling columns: district, party, email
        district = link.xpath("string(../../td[2])")
        party = link.xpath("string(../../td[3])")
        email = link.xpath("string(../../td[5])")

        if party == 'Democrat':
            party = 'Democratic'

        # photo service is keyed on GA number + person id from the href
        pid = re.search("PID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                     "?GA=%s&PID=%s" % (session_id, pid))

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email, photo_url=photo_url, url=url)
        leg.add_source(url)

        leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

        # detail-page div ids -> office field names
        office_data = {
            "email": "ctl00_cphMainContent_divEmailLegis",
            "home_phone": "ctl00_cphMainContent_divPhoneHome",
            "home_addr": "ctl00_cphMainContent_divAddrHome",
            "office_phone": "ctl00_cphMainContent_divPhoneCapitol",
        }
        metainf = {}

        for attr in office_data:
            path = office_data[attr]
            info = leg_page.xpath("//div[@id='%s']" % path)
            if len(info) != 1:
                continue
            info = info[0]

            # each div holds a (label, value) span pair
            _, data = [x.text_content() for x in info.xpath("./span")]
            data = data.strip()
            if data == "":
                continue

            metainf[attr] = data

        # district ("Home") office
        if "home_phone" in metainf or "home_addr" in metainf:
            home_args = {}
            if "home_phone" in metainf:
                home_args['phone'] = metainf['home_phone']
            if "home_addr" in metainf:
                home_args['address'] = metainf['home_addr']
            leg.add_office('district', 'Home Office', **home_args)

        # capitol office
        if "email" in metainf or "office_phone" in metainf:
            cap_args = {}
            if "email" in metainf:
                cap_args['email'] = metainf['email']
            if "office_phone" in metainf:
                cap_args['phone'] = metainf['office_phone']
            leg.add_office('capitol', 'Capitol Office', **cap_args)

        # committee links; a trailing "(role)" suffix sets the position
        comm_path = "//a[contains(@href, 'committee')]"
        for comm_link in leg_page.xpath(comm_path):
            comm = comm_link.text.strip()

            match = re.search(r'\((.+)\)$', comm)
            if match:
                comm = re.sub(r'\((.+)\)$', '', comm).strip()
                mtype = match.group(1).lower()
            else:
                mtype = 'member'

            # Appropriations subcommittees are filed as subcommittees of
            # the Appropriations committee
            if comm.endswith('Appropriations Subcommittee'):
                sub = re.match('^(.+) Appropriations Subcommittee$',
                               comm).group(1)
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm,
                             position=mtype)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape SC legislators from the statehouse member directory."""
    # CSS isn't there without this, it serves up a mobile version
    self.user_agent = 'Mozilla/5.0'

    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/member.php?chamber=H'
    else:
        url = 'http://www.scstatehouse.gov/member.php?chamber=S'
    data = self.urlopen(url)
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    # member links are the ones carrying a code= query parameter
    for a in doc.xpath('//a[contains(@href, "code=")]'):
        full_name = a.text
        leg_url = a.get('href')
        leg_html = self.urlopen(leg_url)
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        # the styled <p> holds party / district / map-link text nodes
        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px; margin: 0 0 0 0; padding: 0;"]/text()')

        if 'Republican' in party:
            party = 'Republican'
        elif 'Democrat' in party:
            party = 'Democratic'

        # District # - County - Map
        district = district.split()[1]

        photo_url = leg_doc.xpath(
            '//img[contains(@src,"/members/")]/@src')[0]

        legislator = Legislator(term, chamber, district, full_name,
                                party=party, photo_url=photo_url,
                                url=leg_url)

        # office address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left; width: 225px; margin: 10px 5px 0 20px; padding: 0;"]')[0]
            addr = addr_div.xpath(
                'p[@style="font-size: 13px; margin: 0 0 10px 0; padding: 0;"]')[0].text_content()
            phone = addr_div.xpath(
                'p[@style="font-size: 13px; margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            legislator.add_office('capitol', 'Columbia Address',
                                  address=addr, phone=phone)
        except IndexError:
            # some members have no address block on their page
            self.warning('no address for {0}'.format(full_name))

        legislator.add_source(leg_url)
        legislator.add_source(url)

        # committees (skip first link)
        for com in leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(', '):
                committee, role = com.text_content().rsplit(', ', 1)

                # known roles
                role = {'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'}[role]
            else:
                committee = com.text
                role = 'member'

            legislator.add_role('committee member', term=term,
                                chamber=chamber, committee=committee,
                                position=role)

        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Iowa legislators for *chamber* in the latest *term*.

    Reads the chamber roster table, then each member's own page for
    home/capitol office details and committee memberships.
    """
    self.validate_term(term, latest_only=True)
    # General Assembly number; used to build the member photo URL.
    session_id = self.metadata['session_details'][term]['number']

    chamber_name = 'senate' if chamber == 'upper' else 'house'

    url = "https://www.legis.iowa.gov/legislators/%s" % chamber_name
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    table = page.xpath('//table[@id="sortableTable"]')[0]

    # Compile once instead of per member; raw strings so \d is a regex
    # class, not a (deprecated) string escape.
    pid_re = re.compile(r"personID=(\d+)")
    paren_re = re.compile(r'\((.+)\)$')
    approps_re = re.compile(r'^(.+) Appropriations Subcommittee$')

    for link in table.xpath(".//a[contains(@href, 'legislator')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")
        if party == 'Democrat':
            party = 'Democratic'

        pid = pid_re.search(leg_url).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" % (session_id, pid))

        # Fix: point the record at the member's own page (leg_url was
        # previously computed but the roster url was passed instead).
        leg = Legislator(term, chamber, district, name, party=party,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(url)

        leg_page = lxml.html.fromstring(self.urlopen(leg_url))

        # Row labels on the member page -> our office-field names.
        office_data = {
            "Legislative Email:": "email",
            "Home Phone:": "home_phone",
            "Home Address:": "home_addr",
            "Capitol Phone:": "office_phone",
        }
        metainf = {}

        info_table = leg_page.xpath(
            "//div[@class='legisIndent divideVert']/table"
        )[0]
        for row in info_table.xpath(".//tr"):
            try:
                key, value = (
                    x.text_content().strip()
                    for x in row.xpath("./td")
                )
            except ValueError:
                # Not a two-cell data row; skip it.
                continue
            try:
                metainf[office_data[key]] = value
            except KeyError:
                # A label we don't collect.
                continue

        if "home_phone" in metainf or "home_addr" in metainf:
            home_args = {}
            if "home_phone" in metainf:
                home_args['phone'] = metainf['home_phone']
            if "home_addr" in metainf:
                home_args['address'] = metainf['home_addr']
            leg.add_office('district', 'Home Office', **home_args)

        if "email" in metainf or "office_phone" in metainf:
            cap_args = {}
            if "email" in metainf:
                cap_args['email'] = metainf['email']
            if "office_phone" in metainf:
                cap_args['phone'] = metainf['office_phone']
            leg.add_office('capitol', 'Capitol Office', **cap_args)

        # Committee links; a "(chair)"-style suffix carries the role.
        for comm_link in leg_page.xpath("//a[contains(@href, 'committee')]"):
            comm = comm_link.text.strip()
            match = paren_re.search(comm)
            if match:
                comm = paren_re.sub('', comm).strip()
                mtype = match.group(1).lower()
            else:
                mtype = 'member'

            # Appropriations subcommittees are filed under the parent
            # committee. (Also avoids the old AttributeError crash when
            # the name was exactly "Appropriations Subcommittee".)
            sub_match = approps_re.match(comm)
            if sub_match:
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub_match.group(1),
                             position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm, position=mtype)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Arizona legislators for *chamber* from the member roster."""
    # TODO: old AZ scraper allowed old sessions, they seem to be gone?
    self.validate_term(term, latest_only=True)

    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster/?body=' + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace('--!>', '-->')
    root = html.fromstring(page)

    # Skip the roster table's header row.
    for row in root.xpath('//table//tr')[1:]:
        position = ''
        name_td, district_td, party_td, email_td, room_td, phone_td = \
            row.xpath('td')

        # Skip any vacant members.
        if email_td.attrib.get('class') == 'vacantmember':
            continue

        link = name_td.xpath('string(a/@href)')
        if len(name_td) == 1:
            name = name_td.text_content().strip()
        else:
            # Extra markup after the anchor carries a leadership title.
            position = name_td.tail.strip()
            name = name_td[0].text_content().strip()
        if '--' in name:
            name = name.split('--')[0].strip()

        member_page = self.get(link).text.replace('--!>', '-->')
        member_root = html.fromstring(member_page)
        member_root.make_links_absolute(link)

        photos = member_root.xpath("//img[contains(@src, 'MemberPhoto')]")
        if len(photos) == 1:
            photo_url = photos[0].attrib['src']
        else:
            self.warning('no photo on ' + link)
            photo_url = ''

        district = district_td.text_content()
        party = party_td.text_content().strip()

        email = email_td.text_content().strip()
        if email.startswith('Email: '):
            email = email.replace('Email: ', '').lower() + '@azleg.gov'
        else:
            email = ''

        party = self.get_party(party)
        room = room_td.text_content().strip()

        chamber_line = ("House of Representatives\n"
                        if chamber == 'lower' else "Senate\n")
        address = (chamber_line + "1700 West Washington\n Room " + room
                   + "\nPhoenix, AZ 85007")

        phone = phone_td.text_content().strip()
        # Prepend the Phoenix area code when it isn't present already.
        if '602' not in re.findall(r'(\d+)', phone):
            phone = "602-" + phone

        leg = Legislator(term, chamber, district, full_name=name,
                         party=party, url=link, photo_url=photo_url)
        leg.add_office('capitol', 'Capitol Office', address=address,
                       phone=phone, email=email)
        if position:
            leg.add_role(position, term, chamber=chamber,
                         district=district, party=party)

        leg.add_source(url)
        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Iowa legislators for one chamber in the latest term.

    Reads the chamber roster table, then each member's page (old ASP.NET
    layout with ctl00_* div ids) for office details and committees.
    """
    self.validate_term(term, latest_only=True)
    # General Assembly number, used in the photo URL query string.
    session_id = self.metadata['session_details'][term]['number']
    if chamber == 'upper':
        chamber_name = 'senate'
    else:
        chamber_name = 'house'
    url = "https://www.legis.iowa.gov/legislators/%s" % chamber_name
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    table = page.xpath('//table[@id="sortableTable"]')[0]
    for link in table.xpath(".//a[contains(@href, 'legislator')]"):
        name = link.text.strip()
        # NOTE(review): leg_url and email are extracted but never used
        # below — presumably leftovers; confirm before removing.
        leg_url = link.get('href')
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")
        email = link.xpath("string(../../td[5])")
        if party == 'Democrat':
            party = 'Democratic'
        pid = re.search("personID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                     "?GA=%s&PID=%s" % (session_id, pid))
        leg = Legislator(term, chamber, district, name, party=party,
                        photo_url=photo_url, url=url)
        leg.add_source(url)
        leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))
        # Our field names -> div ids on the member detail page.
        office_data = {
            "email": "ctl00_cphMainContent_divEmailLegis",
            "home_phone": "ctl00_cphMainContent_divPhoneHome",
            "home_addr": "ctl00_cphMainContent_divAddrHome",
            "office_phone": "ctl00_cphMainContent_divPhoneCapitol",
        }
        metainf = {}
        for attr in office_data:
            path = office_data[attr]
            info = leg_page.xpath("//div[@id='%s']" % path)
            # Skip fields whose div is missing (or duplicated).
            if len(info) != 1:
                continue
            info = info[0]
            # Each div holds two spans: a label and the value.
            _, data = [x.text_content() for x in info.xpath("./span")]
            data = data.strip()
            if data == "":
                continue
            metainf[attr] = data
        # Only create an office record when at least one field exists.
        if "home_phone" in metainf or "home_addr" in metainf:
            home_args = {}
            if "home_phone" in metainf:
                home_args['phone'] = metainf['home_phone']
            if "home_addr" in metainf:
                home_args['address'] = metainf['home_addr']
            leg.add_office('district', 'Home Office', **home_args)
        if "email" in metainf or "office_phone" in metainf:
            cap_args = {}
            if "email" in metainf:
                cap_args['email'] = metainf['email']
            if "office_phone" in metainf:
                cap_args['phone'] = metainf['office_phone']
            leg.add_office('capitol', 'Capitol Office', **cap_args)
        # Committee links; a trailing "(...)" suffix names the role.
        comm_path = "//a[contains(@href, 'committee')]"
        for comm_link in leg_page.xpath(comm_path):
            comm = comm_link.text.strip()
            match = re.search(r'\((.+)\)$', comm)
            if match:
                comm = re.sub(r'\((.+)\)$', '', comm).strip()
                mtype = match.group(1).lower()
            else:
                mtype = 'member'
            # Appropriations subcommittees file under the parent committee.
            if comm.endswith('Appropriations Subcommittee'):
                sub = re.match('^(.+) Appropriations Subcommittee$',
                               comm).group(1)
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm,
                             position=mtype)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape SC legislators and their committee assignments."""
    # The site sniffs user agents; without a browser UA it returns a
    # mobile layout that lacks the markup the XPaths below expect.
    self.user_agent = 'Mozilla/5.0'

    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/member.php?chamber=H'
    else:
        url = 'http://www.scstatehouse.gov/member.php?chamber=S'

    listing = lxml.html.fromstring(self.urlopen(url))
    listing.make_links_absolute(url)

    # Role abbreviations used in committee link text.
    role_titles = {
        'Treas.': 'treasurer',
        'Secy.': 'secretary',
        'Secy./Treas.': 'secretary/treasurer',
        'V.C.': 'vice-chair',
        '1st V.C.': 'first vice-chair',
        '2nd V.C.': 'second vice-chair',
        '3rd V.C.': 'third vice-chair',
        'Ex.Officio Member': 'ex-officio member',
        'Chairman': 'chairman',
    }

    for member_link in listing.xpath('//a[contains(@href, "code=")]'):
        full_name = member_link.text
        member_url = member_link.get('href')

        detail = lxml.html.fromstring(self.urlopen(member_url))
        detail.make_links_absolute(member_url)

        party, district, _ = detail.xpath(
            '//p[@style="font-size: 17px; margin: 0 0 0 0; padding: 0;"]/text()')
        if 'Republican' in party:
            party = 'Republican'
        elif 'Democrat' in party:
            party = 'Democratic'

        # "District # - County - Map" -> keep the district number only.
        district = district.split()[1]

        photo_url = detail.xpath(
            '//img[contains(@src,"/members/")]/@src')[0]

        legislator = Legislator(term, chamber, district, full_name,
                                party=party, photo_url=photo_url,
                                url=member_url)

        # Columbia office details; some members have none listed.
        try:
            office_block = detail.xpath(
                '//div[@style="float: left; width: 225px; margin: 10px 5px 0 20px; padding: 0;"]')[0]
            office_addr = office_block.xpath(
                'p[@style="font-size: 13px; margin: 0 0 10px 0; padding: 0;"]')[0].text_content()
            office_phone = office_block.xpath(
                'p[@style="font-size: 13px; margin: 0 0 0 0; padding: 0;"]/text()')[0].strip()
            legislator.add_office('capitol', 'Columbia Address',
                                  address=office_addr, phone=office_phone)
        except IndexError:
            self.warning('no address for {0}'.format(full_name))

        legislator.add_source(member_url)
        legislator.add_source(url)

        # The first committee link is not an assignment; skip it.
        for com in detail.xpath('//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(', '):
                committee, role = com.text_content().rsplit(', ', 1)
                role = role_titles[role]
            else:
                committee, role = com.text, 'member'
            legislator.add_role('committee member', term=term,
                                chamber=chamber, committee=committee,
                                position=role)

        self.save_legislator(legislator)
def scrape(self, term, chambers):
    """Scrape New Hampshire legislators from the bulk Members.txt file.

    The file is tab-separated with a fixed 24-column layout; House member
    page codes come from a lookup form scraped separately.
    """
    special_case_used = False
    url = 'http://gencourt.state.nh.us/downloads/Members.txt'
    # Map "Last, First" -> member code for House member-page URLs.
    option_map = {}
    html = self.get(
        'http://www.gencourt.state.nh.us/house/members/memberlookup.aspx'
    ).text
    doc = lxml.html.fromstring(html)
    for opt in doc.xpath('//option'):
        option_map[opt.text] = opt.get('value')
    data = self.get(url).text
    for line in data.splitlines():
        if line.strip() == "":
            continue
        # Fixed column order of the tab-separated export; unpack count
        # must match exactly or line.split raises ValueError.
        (chamber, fullname, last, first, middle, county, district_num,
         seat, party, street, street2, city, astate, zipcode,
         home_phone, office_phone, fax, email,
         com1, com2, com3, com4, com5, com6) = line.split('\t')
        # chamber_map is a module-level lookup (file-code -> our name).
        chamber = chamber_map[chamber]
        # skip legislators from a chamber we aren't scraping
        if chamber not in chambers:
            continue
        middle = middle.strip()
        last = last.strip('"')
        if middle:
            full = '%s %s %s' % (first, middle, last)
        else:
            full = '%s %s' % (first, last)
        address = street
        if street2:
            address += (' ' + street2)
        address += '\n%s, %s %s' % (city, astate, zipcode)
        # Normalize e.g. "007" -> "7"; prefix county for House districts.
        district = str(int(district_num))
        if county:
            district = '%s %s' % (county, district)
        # When a candidate receives enough write-in votes in the
        # other party's primary, they are listed on the ballot as
        # being a nominee of both parties (eg, 'd+r')
        # Cross-reference this list for official party affiliation:
        # http://www.gencourt.state.nh.us/House/caljourns/journals/2015/HJ_4.pdf
        leg = Legislator(term, chamber, district, full,
                         party=party_map[party])
        # Empty strings become None so blank fields aren't recorded.
        leg.add_office('district', 'Home Address',
                       address=address,
                       phone=home_phone or None)
        leg.add_office('district', 'Office Address',
                       phone=office_phone or None,
                       fax=fax or None,
                       email=email or None)
        if chamber == 'upper':
            leg['url'] = 'http://www.gencourt.state.nh.us/Senate/members/webpages/district%02d.aspx' % int(
                district_num)
        elif chamber == 'lower':
            code = option_map.get('{0}, {1}'.format(last, first))
            if code:
                leg['url'] = 'http://www.gencourt.state.nh.us/house/members/member.aspx?member=' + code
        # Re-uppercase roman numerals that str.title() lowercased.
        romans = r'(?i)\s([IXV]+)(?:\s|$)'
        for com in (com1, com2, com3, com4, com5, com6):
            com = com.strip('"')
            if com:
                com_name = com.title()
                com_name = re.sub(romans,
                                  lambda m: m.group().upper(),
                                  com_name)
                leg.add_role('committee member', term=term,
                             chamber=chamber, committee=com_name)
        if 'url' in leg:
            leg['photo_url'] = self.get_photo(leg['url'], chamber)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Arizona legislators from the session member roster.

    Legacy MemberRoster.asp layout: one table row per member with
    name/district/party/email/room/phone/fax cells.
    """
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod(session)
    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
        session_id, body)
    page = self.get(url).text
    root = html.fromstring(page)
    path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
    # [1:] skips the table header row.
    roster = root.xpath(path)[1:]
    for row in roster:
        position = ''
        vacated = ''
        name, district, party, email, room, phone, fax = row.xpath('td')
        if email.attrib.get('class') == 'vacantmember':
            continue  # Skip any vacant members.
        link = name.xpath('string(a/@href)')
        link = "http://www.azleg.gov" + link
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            # Extra markup after the anchor carries a leadership title.
            position = name.tail.strip()
            name = name[0].text_content().strip()
        linkpage = self.get(link).text
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)
        photos = linkroot.xpath("//img[@name='memberphoto']")
        if len(photos) != 1:
            raise Exception
        photo_url = photos[0].attrib['src']
        district = district.text_content()
        party = party.text_content().strip()
        email = email.text_content().strip()
        if ('Vacated' in email or 'Resigned' in email or
                'Removed' in email):
            # comment out the following 'continue' for historical
            # legislative sessions
            # for the current session, if a legislator has left we will
            # skip him/her to keep from overwriting their information
            continue
            # NOTE(review): the two statements below are intentionally
            # dead while the 'continue' above is active; they (and the
            # 'if vacated' branch further down) only run for historical
            # sessions with the 'continue' commented out.
            vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
            email = ''
        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == 'lower':
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = address + "1700 West Washington\n Room " + room \
            + "\nPhoenix, AZ 85007"
        # Prepend the Phoenix area code when missing.
        phone = phone.text_content().strip()
        if not phone.startswith('602'):
            phone = "602-" + phone
        fax = fax.text_content().strip()
        if not fax.startswith('602'):
            fax = "602-" + fax
        if vacated:
            # Departed member (historical sessions only): record an
            # end date on the implicit first role.
            end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
            leg = Legislator(term, chamber, district, full_name=name,
                             party=party, url=link)
            leg['roles'][0]['end_date'] = end_date
        else:
            leg = Legislator(term, chamber, district, full_name=name,
                             party=party, email=email, url=link,
                             photo_url=photo_url)
        leg.add_office('capitol', 'Capitol Office', address=address,
                       phone=phone, fax=fax)
        if position:
            leg.add_role(position, term, chamber=chamber,
                         district=district, party=party)
        leg.add_source(url)
        #Probably just get this from the committee scraper
        #self.scrape_member_page(link, session, chamber, leg)
        self.save_legislator(leg)