def fetch_member(self, url, name, session, chamber):
    """Fetch a single Virginia legislator's member page and save them.

    Looks the member up by name in the page's tables, extracts party and
    district, splits the display name into first/middle/last parts, and
    saves a ``Legislator`` object — unless the matcher already knows this
    person for the same district (sub-session duplicate).

    :param url: member-page path, joined onto http://leg1.state.va.us/
    :param name: display name as it appears on the listing page
    :param session: session identifier passed through to ``Legislator``
    :param chamber: 'upper' or 'lower'
    :raises Exception: if the member's table cannot be found on the page
    """
    abbr = {'R': 'Republican', 'D': 'Democrat', 'I': 'Independent'}
    url = "http://leg1.state.va.us/%s" % url
    with self.soup_context(url) as member:
        ex = member.findAll('table', text=re.compile(re.escape(name)))
        if not ex:
            raise Exception("Parse error fetching member %s" % name)
        # Party/district live two siblings past the matched table's parent;
        # split into tokens like ['(R)', ..., '<district>'].
        ex = ex[0].parent.nextSibling.nextSibling.string.split()

        # Some people are "Joe X. Schmoe;Resigned". Fantastic.
        # Some other people are "Joe X. Schmoe (resigned".
        # NOTE: raw string — the old '\;|\(' relied on an invalid string
        # escape sequence that is deprecated in Python 3.
        name = re.split(r';|\(', name)[0]

        name_parts = name.split()
        first_name = name_parts[0]
        last = name_parts[-1]
        # Generational suffixes (II, III, IV, Jr., Sr.) are not surnames.
        if re.match(r'[IV]+$|\bJr\b\.$|\b(Sr)\b\.$', last):
            last_name = name_parts[-2]
        else:
            last_name = last

        if name_parts[1] == last_name:
            middle_name = ''
        else:
            middle_name = name_parts[1]

        # Deal with the Van Houtens of the world
        # also, watch out for their rugged Danish relatives...
        if name_parts[1] == 'Van':
            middle_name = ''
            last_name = name_parts[1] + ' ' + last_name

        last_name = last_name.replace(',', '')
        middle_name = middle_name.replace('.', '')

        party = ex[0][1]       # e.g. '(R)' -> 'R'
        district = ex[-1]
        leg = Legislator(session=session, chamber=chamber,
                         district=district,
                         full_name=name.strip(),
                         first_name=first_name.strip(),
                         last_name=last_name.strip(),
                         middle_name=middle_name.strip(),
                         party=abbr[party])
        leg.add_source(url)

        # Not totally sure how to handle legislators in subsessions, so
        # skip saving anyone the matcher already knows at this district.
        sanitized = leg['full_name'].replace('.', '').lower()
        if self.matcher[chamber][sanitized] and \
           self.matcher[chamber][sanitized][2] == district:
            return
        self.save_legislator(leg)
def scrape_new_legislators(self, chamber, session):
    """ Scrape legislators from 2009 and later. """
    search = 'Senate Members' if chamber == 'upper' else 'House Members'

    leg_list_url = "http://legis.state.sd.us/sessions/%s/"\
        "MemberMenu.aspx" % (session)
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    # The member links sit in the <div> immediately after the chamber label.
    member_div = leg_list.find(text=search).findNext('div')
    for anchor in member_div.findAll('a'):
        full_name = anchor.contents[0].strip()
        # Names are "Last, First [Middle...]".
        first_name = full_name.split(', ')[1].split(' ')[0]
        last_name = full_name.split(',')[0]
        middle_name = ''

        detail_url = "http://legis.state.sd.us/sessions/%s/%s" % (
            session, anchor['href'])
        detail_page = self.soup_parser(self.urlopen(detail_url))

        party = detail_page.find(
            id="ctl00_contentMain_spanParty").contents[0].strip()

        district = detail_page.find(
            id="ctl00_contentMain_spanDistrict").contents[0]
        district = district.strip().lstrip('0')

        occupation_span = detail_page.find(
            id="ctl00_contentMain_spanOccupation")
        occupation = (occupation_span.contents[0].strip()
                      if occupation_span.contents else None)

        legislator = Legislator(session, chamber, district,
                                full_name, first_name, last_name,
                                middle_name, party,
                                occupation=occupation)
        legislator.add_source(detail_url)
        self.save_legislator(legislator)
def scrape_old_legislators(self, chamber, session):
    """ Scrape pre-2009 legislators. """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    # The listing page was renamed for the 2008 session.
    filename = 'district.htm' if int(session) < 2008 \
        else 'MembersDistrict.htm'

    leg_list_url = "http://legis.state.sd.us/sessions/%s/%s" % (
        session, filename)
    leg_list = self.soup_parser(self.urlopen(leg_list_url))

    # Each <h2> is a district heading ("District NN"), followed by a table
    # of that district's members (header row first).
    for heading in leg_list.findAll('h2'):
        district = heading.contents[0].split(' ')[1].lstrip('0')

        for row in heading.findNext('table').findAll('tr')[1:]:
            cells = row.findAll('td')
            # Skip members of the other chamber.
            if cells[1].contents[0].strip() != chamber_name:
                continue

            full_name = row.td.a.contents[0].strip()
            # Names are "Last, First [Middle...]".
            first_name = full_name.split(', ')[1].split(' ')[0]
            last_name = full_name.split(',')[0]
            middle_name = ''

            party = cells[3].contents[0].strip()
            occupation = cells[4].contents[0].strip()

            legislator = Legislator(session, chamber, district,
                                    full_name, first_name,
                                    last_name, middle_name,
                                    party=party,
                                    occupation=occupation)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)