def getAKLeg():
    """Scrape the Alaska House and Senate member listings.

    For each chamber the listing page is fetched, every ``<li>`` inside
    ``div#tab1-2 > ul.people-holder > ul.item`` is treated as one member,
    and that member's own page (the first link in the ``<li>``) is fetched
    to read the session address.

    Each member dict is printed as it is built (followed by a blank line).

    Returns:
        A list of dicts, House members first and then Senate members, each
        with the keys 'Name', 'Website', 'District', 'Party', 'Phone',
        'Email' and 'Address'.

    Raises:
        AttributeError: if an expected element or labelled field
            ("District:", "Party:", "Phone:", "Session Contact") is missing,
            because the lookup result is then ``None``.
        IndexError: if a member page has fewer than two ``div.bioleft``
            elements.
    """
    chambers = ('House', 'Senate')

    # Fetch both listings up front so a failure on either chamber happens
    # before any member is printed.
    tables = [_ak_member_items(chamber.lower()) for chamber in chambers]

    dictList = []
    for body, table in zip(chambers, tables):
        for item in table:
            repInfo = {}
            repInfo['Name'] = unidecode(
                item.find('strong', {'class': 'name'}).string
            ).strip()
            repInfo['Website'] = item.find('a').get('href')

            dl = item.find('dl')
            # The labelled fields all live in the same <dl>; extract its
            # text once instead of once per field.
            details = dl.get_text()

            district = re.search(
                r'District:\s*(\w+)', details, re.DOTALL
            ).group(1)
            repInfo['District'] = 'AK State {0} District {1}'.format(
                body, district
            )
            repInfo['Party'] = re.search(
                r'Party:\s*(\w+)', details, re.DOTALL
            ).group(1)
            repInfo['Phone'] = re.search(
                r'Phone:\s*([0-9-]+)', details, re.DOTALL
            ).group(1)
            repInfo['Email'] = dl.find('a').get('href').replace('mailto:', '')

            member_soup = _ak_fetch_soup(repInfo['Website'])
            # Greedy match: everything after "Session Contact" up to the
            # last literal "99801" (presumably the Juneau ZIP code) in the
            # second div.bioleft.
            bio_text = member_soup.find_all(
                'div', {'class': 'bioleft'}
            )[1].get_text()
            repInfo['Address'] = multiline_strip(
                re.search(
                    r'Session Contact(.+99801)', bio_text, re.DOTALL
                ).group(1)
            )

            # Single-argument call form prints the same text on Python 2
            # and is also valid Python 3.
            print(str(repInfo) + '\n')
            dictList.append(repInfo)

    return dictList


def _ak_fetch_soup(url):
    """Download *url*, parse it with BeautifulSoup, and close the response."""
    response = urlopen(url)
    try:
        return BeautifulSoup(response.read())
    finally:
        response.close()


def _ak_member_items(body):
    """Return the member ``<li>`` elements from one chamber's listing page.

    *body* is the lowercase chamber name ('house' or 'senate') and selects
    the subdomain. The original code ignored it and always fetched the House
    site, so Senate results duplicated the House members.
    NOTE(review): assumes the Senate listing is served from
    senate.legis.state.ak.us with the same markup as the House site - confirm.
    """
    listing = _ak_fetch_soup('http://{0}.legis.state.ak.us/'.format(body))
    return listing.find(
        'div', {'id': 'tab1-2'}
    ).find(
        'ul', {'class': 'people-holder'}
    ).find(
        'ul', {'class': 'item'}
    ).find_all('li')
def get_house_rep(soup):
    """Extract a Texas House member's details from a parsed member page.

    Args:
        soup: parsed page object exposing ``find`` (used here like a
            BeautifulSoup document); the ``div.member-info`` element
            inside it is the only part that is read.

    Returns:
        A dict with the keys 'District', 'Name', 'Phone' and 'Address'.
        'Name' is ``None`` when the heading does not look like
        "Rep. Last, First"; the other fields are always strings.

    Raises:
        AttributeError: if the district number, phone number or Capitol
            address cannot be found in the member-info markup (the regex
            search then returns ``None``).
    """
    member_info = soup.find('div', {'class': 'member-info'})
    # Serialise the element once; all three regex lookups scan this markup.
    markup = str(member_info)

    number = re.search(r'District (\d+)', markup).group(1)
    district = 'TX State House District %s' % number

    # TX House member names are in "Last, First" format:
    def rewrite_name(string):
        # Dots are escaped so "Rep." and "Jr." match literally. The negative
        # lookahead keeps a ", Jr." suffix attached to the last name rather
        # than treating it as the first name; with an unescaped dot it also
        # rejected first names that merely start with "Jr" (e.g. "Jrue").
        search = re.search(r'Rep\. (.+?)(?:, (?!Jr\.))(.+)', string)
        if search is None:
            return None
        first, last = search.group(2).strip(), search.group(1).strip()
        return unidecode(first + ' ' + last).strip()

    name = rewrite_name(member_info.find('h2').get_text())

    # First "(ddd) ddd-dddd" occurrence in the member-info markup.
    phone = re.search(r'\([0-9]{3}\)\s[0-9]{3}-[0-9]{4}', markup).group()

    # Non-greedy: text after "Capitol Address:" through the first 787xx
    # five-digit number.
    address = multiline_strip(
        re.search(
            r'Capitol Address:(.+?787\d{2})', markup, re.DOTALL
        ).group(1)
    )

    return {
        'District': district,
        'Name': name,
        'Phone': phone,
        'Address': address
    }