def fetch_member(self, url, name, term, chamber):
    """Scrape one Virginia legislator's detail page and save the record.

    Parameters:
        url: path to the member page, relative to leg6.state.va.us
        name: roster name, possibly suffixed with '-Resigned mm/dd' or
              '-Member mm/dd' for resignations / special elections
        term: term identifier stored on the Legislator record
        chamber: 'upper' or 'lower'
    """
    party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
    party_district_re = re.compile(
        r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

    url = 'http://leg6.state.va.us' + url

    # Handle resignations and special elections, flagged by a suffix
    # like "Smith-Resigned 11/1" or "Jones-Member 2/15".
    match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
    if match:
        action, date = match.groups()
        # Strip only the trailing suffix: maxsplit=1 keeps hyphenated
        # surnames (e.g. "Smith-Jones") intact, where the previous
        # rsplit('-')[0] truncated them to the first segment.
        name = name.rsplit('-', 1)[0]
        if action == 'Resigned':
            pass  # TODO: set end date (from `date`)
        elif action == 'Member':
            pass  # TODO: set start date (from `date`)

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # Heading looks like "(R) - House District 42".
        party_district_line = doc.xpath('//h3/font/text()')[0]
        party, district = party_district_re.match(
            party_district_line).groups()

        leg = Legislator(term, chamber, district, name.strip(),
                         party=party_map[party])
        leg.add_source(url)

        # First link section lists the member's committees.
        for com in doc.xpath('//ul[@class="linkSect"][1]/li/a/text()'):
            leg.add_role('committee member', term=term, chamber=chamber,
                         committee=com)

        self.save_legislator(leg)
def scrape_legislator_data(self, url, chamber):
    """Scrape every legislator listed on the roster page at `url`.

    Each roster cell links to a detail page providing the district,
    party, and committee memberships for the 2010 session.
    """
    party_names = {'R': 'Republican', 'D': 'Democrat'}

    with self.urlopen(url) as page:
        page = BeautifulSoup(page)
        roster = page.find('table', id='ctl00_mainCopy_DataList1')

        for cell in roster('td'):
            name_spans = cell('span')
            if not name_spans:
                self.debug('Found an empty cell in %s. Continuing' % url)
                continue

            full_name = ' '.join(s.string.strip() for s in name_spans)

            # First span holds "First" or "First Middle"; second holds
            # the last name.
            head = name_spans[0].string.strip().split()
            if len(head) == 2:
                first_name, middle_name = head
            else:
                first_name = name_spans[0].string.strip()
                middle_name = ''
            last_name = name_spans[1].string.strip()

            details_url = get_abs_url(url, cell.find('a')['href'])
            with self.urlopen(details_url) as details_page:
                details = BeautifulSoup(details_page)

                district = details.find(
                    'a', id='ctl00_mainCopy_LegisInfo_DISTRICTLabel'
                ).string.strip()
                party = party_names[details.find(
                    'span', id='ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                leg = Legislator('2010', chamber, district, full_name,
                                 first_name, last_name, middle_name, party)
                leg.add_source(details_url)

                membership = details.find(
                    'table', id='ctl00_mainCopy_MembershipGrid')
                # Skip the header row; each row is (role, committee).
                for row in membership('tr')[1:]:
                    cols = row('td')
                    leg.add_role(cols[0].string.strip(), '2010',
                                 chamber=chamber,
                                 committee=cols[1]('a')[0].string.strip())

                self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape a single member detail page and save the Legislator.

    Extracts photo, name parts, district, party, and committee roles
    from the member's bio page.
    """
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[@class="bioPicContainer"]/img/@src')[0]
        full_name = root.xpath('//div[@class="bioPicContainer"]/img/@alt')[0]

        name_parts = full_name.split(' ')
        first_name = last_name = middle_name = None
        if len(name_parts) == 2:
            first_name, last_name = name_parts
            middle_name = ''
        elif len(name_parts) == 3:
            first_name, middle_name, last_name = name_parts
        elif len(name_parts) > 3:
            # Tokens past the third (e.g. suffixes) are dropped.
            first_name = name_parts[0]
            middle_name = name_parts[1]
            last_name = name_parts[2]

        district = root.xpath(
            '//div[@id="District"]//div[@class="widgetContent"]')
        if len(district):
            district = district[0].text.strip()
            # District text may read "Name - details" or "Name. details";
            # keep only the leading name, else truncate to 32 chars.
            if len(district.split(' - ')) > 1:
                district = district.split(' - ')[0]
            elif len(district.split('. ')) > 1:
                district = district.split('. ')[0]
            else:
                district = district[0:32]
        else:
            district = 'NotFound'

        party = root.xpath(
            '//div[@class="bioDescription"]/div')[0].text.strip().split(',')[0]
        # Normalize to the project-standard party name.
        if party == 'Democrat':
            party = 'Democratic'

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, first_name=first_name,
                         middle_name=middle_name, last_name=last_name)
        leg.add_source(member_url)

        comm_div = root.xpath(
            '//div[@id="Column5"]//div[@class="widgetContent"]')
        if len(comm_div):
            comm_div = comm_div[0]
            # Use a relative XPath: the original '/ul/li' was absolute
            # (rooted at the document), so it never matched under this div.
            for li in comm_div.xpath('.//ul/li'):
                # xpath() returns a *list* of text nodes; the original
                # called .strip() on the list itself (AttributeError).
                role = li.xpath('text()')[0].strip()
                comm = li.xpath('a/text()')[0].strip().strip(',')
                if role == 'Member':
                    role = 'committee member'
                leg.add_role(role, term, chamber=chamber, committee=comm)

        self.save_legislator(leg)
def scrape_member(self, chamber, year, member_url):
    """Scrape one legislator detail page and save the record.

    The session is hard-coded to '81' in the Legislator and role below;
    NOTE(review): `year` is accepted but unused — confirm whether it
    should drive the session value.
    """
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        # The member-info table immediately follows the subtitle div.
        table = sdiv.getnext()

        # Photo is located by its fixed ASP.NET control id.
        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        # NOTE(review): these '//'-prefixed XPaths search from the
        # document root rather than relative to `td`; presumably the
        # page contains a single member block so they still resolve
        # correctly — verify against a live page.
        full_name = td.xpath('string(//div[2]/strong)').strip()

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        # Party is the first character of the fourth div ('D' or 'R').
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democrat'
        elif party == 'R':
            party = 'Republican'

        leg = Legislator('81', chamber, district, full_name,
                         party=party, photo_url=photo_url)
        leg.add_source(member_url)

        # Committees are <br>-separated text following the
        # "Committee Membership:" heading; each <br> tail is one name.
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for br in comm_div.xpath('*/br'):
            if br.tail:
                leg.add_role('committee member', '81', chamber=chamber,
                             committee=br.tail.strip())

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Emit two hard-coded example legislators for `chamber`/`term`."""
    self.validate_term(term)

    # Leadership title depends on which chamber we are scraping.
    leadership = ('President of the Senate' if chamber == 'upper'
                  else 'Speaker of the House')

    smith = Legislator(term, chamber, '1st', 'Bob Smith', party='Democrat')
    smith.add_role(leadership, term)
    smith.add_source('http://example.com/Bob_Smith.html')

    johnson = Legislator(term, chamber, '2nd', 'Sally Johnson',
                         party='Republican')
    johnson.add_role('Minority Leader', term)
    johnson.add_source('http://example.com/Sally_Johnson.html')

    self.save_legislator(smith)
    self.save_legislator(johnson)
def scrape(self, chamber, year):
    """Emit two hard-coded example legislators for the 2009 session.

    Raises NoDataForYear for any year other than '2009'.
    """
    if year != '2009':
        raise NoDataForYear

    term = '2009-2010'

    bob = Legislator(term, chamber, '1st', 'Bob Smith', party='Democrat')
    if chamber == 'upper':
        bob.add_role('President of the Senate', term)
    else:
        bob.add_role('Speaker of the House', term)
    bob.add_source('http://example.com/Bob_Smith.html')

    sally = Legislator(term, chamber, '2nd', 'Sally Johnson',
                       party='Republican')
    sally.add_role('Minority Leader', term)
    sally.add_source('http://example.com/Sally_Johnson.html')

    self.save_legislator(bob)
    self.save_legislator(sally)
def scrape(self, chamber, term):
    """Scrape the Arizona member roster for `chamber` during `term`.

    Raises:
        NoDataForPeriod: when no session id exists for the term's session.
    """
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod(session)

    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
        session_id, body)
    with self.urlopen(url) as page:
        root = html.fromstring(page)
        path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
        # [1:] skips the header row.
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            vacated = ''
            # Fixed seven-column roster layout.
            name, district, party, email, room, phone, fax = row.getchildren()

            link = name.xpath('string(a/@href)')
            link = "http://www.azleg.gov" + link
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                # Extra child elements indicate a leadership title; the
                # tail text after the name link is the position.
                position = name.tail.strip()
                name = name[0].text_content().strip()

            district = district.text_content()
            party = party.text_content().strip()
            party = self.get_party(party)
            email = email.text_content().strip()

            # A "Vacated mm/dd/yyyy" note replaces the email when the
            # member left the seat mid-term.
            if re.match('Vacated', email):
                vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
                email = ''

            room = room.text_content().strip()
            # Site omits the Phoenix area code on local numbers.
            phone = phone.text_content().strip()
            if not phone.startswith('602'):
                phone = "602-" + phone
            fax = fax.text_content().strip()
            if not fax.startswith('602'):
                fax = "602-" + fax
            if vacated:
                # Vacated seats get an end date on their initial role and
                # no contact details (the email cell held the note).
                end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                leg = Legislator( term, chamber, district, full_name=name,
                                  party=party, url=link)
                leg['roles'][0]['end_date'] = end_date
            else:
                leg = Legislator( term, chamber, district, full_name=name,
                                  party=party, phone=phone, fax=fax,
                                  room=room, email=email, url=link)

            if position:
                leg.add_role( position, term, chamber=chamber,
                             district=district, party=party)

            leg.add_source(url)

            #Probably just get this from the committee scraper
            #self.scrape_member_page(link, session, chamber, leg)
            self.save_legislator(leg)