def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Scrape one MS member detail page (XML) and save the Legislator.

    leg_link is the member-page path fragment; an empty link means a
    vacant seat (skipped) or a scrape error (raised).
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.urlopen(url)
        # The member page is XML with uppercase tags.
        root = lxml.etree.fromstring(details_page.bytes)
        party = root.xpath('string(//PARTY)')
        district = root.xpath('string(//DISTRICT)')
        # Photo filename is relative to the member-page directory.
        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))
        # NOTE(review): the phone fields below are scraped but never used.
        home_phone = root.xpath('string(//H_PHONE)')
        bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)')
        cap_room = root.xpath('string(//CAP_ROOM)')
        # NOTE(review): anything other than 'D' is treated as Republican;
        # independents/vacancies would be mislabeled — confirm.
        if party == 'D':
            party = 'Democratic'
        else:
            party = 'Republican'
        leg = Legislator(term, chamber, district, leg_name,
                         party=party, role=role, org_info=org_info,
                         url=url, photo_url=photo)
        leg.add_source(url)
        # Build the capitol-office kwargs only from fields that are present.
        kwargs = {}
        if email_name.strip() != "":
            email = '%s@%s.ms.gov' % (email_name, {
                "upper": "senate",
                "lower": "house"
            }[chamber])
            kwargs['email'] = email
        if capital_phone != "":
            kwargs['phone'] = capital_phone
        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS
        leg.add_office('capitol', 'Capitol Office', **kwargs)
        self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        # Dead member links happen; log and keep scraping the rest.
        self.warning(str(e))
def scrape_legislator(self, chamber, term, name, url):
    """Scrape a single legislator's detail page and save the record.

    Fix: removed the unused local `xpath` (a select-option expression
    that was assigned but never evaluated).
    """
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)
    # Header reads e.g. "DISTRICT 07"; take the number, drop leading zeros.
    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
        .split()[1].strip().lstrip('0')
    # Party letter appears in the first <h2>, e.g. "(R - ...)" or "(D]".
    party = page.xpath('//h2').pop().text_content()
    party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    elif party == 'I':
        party = 'Independent'
    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']
    leg = Legislator(term, chamber, district, name, party=party,
                     photo_url=photo_url, url=url)
    leg.add_source(url)
    self.scrape_offices(leg, page)
    self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape one member page and save the Legislator record.

    Fix: the photo_url xpath was executed twice back-to-back; the
    duplicate (dead) query is removed.
    """
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)
        # Portrait image carries the member's name in its alt text.
        photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]
        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')
        district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            district = district[0].text.strip()
            district = clean_district(district)
        # NOTE(review): if the district widget is missing, `district`
        # remains an empty list here — presumably never happens; confirm.
        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            party = 'Other'
        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url, email=email)
        leg.add_source(member_url)
        self.save_legislator(leg)
def scrape_lower(self, term):
    """Scrape Utah House members from the representative roster page."""
    roster_url = "http://le.utah.gov/house2/representatives.jsp"
    doc = lxml.html.fromstring(self.urlopen(roster_url))
    doc.make_links_absolute(roster_url)
    party_names = {"D": "Democratic", "R": "Republican"}
    for row in doc.xpath("//tr")[1:]:
        cells = row.xpath("td")
        district = cells[0].text_content()
        # Unfilled seats are marked "Empty" in the name column.
        if cells[1].text_content() == "Empty":
            self.log("district %s is empty" % district)
            continue
        anchor = cells[1].xpath("a")[0]
        name = anchor.text_content()
        leg_url = anchor.get("href")
        abbr = cells[2].text_content()
        if abbr not in party_names:
            raise ValueError("unknown party")
        party = party_names[abbr]
        # Visit the member page just for the portrait.
        member_doc = lxml.html.fromstring(self.urlopen(leg_url))
        member_doc.make_links_absolute(leg_url)
        photo_url = member_doc.xpath('//img[@alt="photo"]/@src')[0]
        leg = Legislator(term, "lower", district, name, party=party,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(roster_url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape SC legislators from the chamber member listing.

    Fixes: a row without a "[D]"/"[R]" tag previously raised NameError
    (or silently reused the previous row's party/name), and rows without
    a business-contact match reused the previous row's address/phone.
    """
    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/html-pages/housemembers.html'
    else:
        url = 'http://www.scstatehouse.gov/html-pages/senatemembersd.html'
    with self.urlopen(url) as data:
        doc = lxml.html.fromstring(data)
        rows = doc.xpath('//pre/div[@class="sansSerifNormal"]')
        for row in rows:
            member_a = row.xpath('a')[0]
            name_party = member_a.text_content()
            if name_party.find('[D]') != -1:
                party = 'Democratic'
                full_name = name_party.partition('[D]')[0].strip()
            elif name_party.find('[R]') != -1:
                party = 'Republican'
                full_name = name_party.partition('[R]')[0].strip()
            else:
                # No recognizable party tag: skip rather than crash or
                # carry over the previous row's values.
                self.warning("skipping row without party tag: %r" % name_party)
                continue
            # Photo is keyed by the numeric member id in the link href.
            photo_url = ('http://www.scstatehouse.gov/members/gif/' +
                         re.search('(\d+)\.html', member_a.attrib['href']).group(1) +
                         '.jpg')
            other_data = row.text_content().encode('ascii', 'ignore')
            od_result = re.search('^.+District (\d+) - (.+)Count.+$', other_data)
            district = od_result.group(1)
            # Capitol address/phone are optional; default to None instead
            # of leaking the previous member's contact info.
            office_address = None
            office_phone = None
            contentb = re.search('^.+\(C\) (.+,.*\d+).*Bus. (\(\d+\) \d+-\d+).+$', other_data)
            if contentb is not None:
                office_address = contentb.group(1)
                office_phone = contentb.group(2)
            legislator = Legislator(term, chamber, district, full_name,
                                    party=party, photo_url=photo_url,
                                    office_address=office_address,
                                    office_phone=office_phone)
            legislator.add_source(url)
            self.save_legislator(legislator)
def _scrape_speaker_of_the_house(self, url, term, chamber):
    """The speaker of the house has a special page, because he is just
    OH so special</sarcasm>

    Main page url like:
        http://www1.legis.ga.gov/legis/2011_12/house/speaker/index.htm
    but need to scrape:
        http://www1.legis.ga.gov/legis/2011_12/house/speaker/bio.html

    Returns a Legislator, or None if the title div can't be found.
    """
    if url.endswith("index.htm"):
        url = url.replace("index.htm", "bio.html")
    with self.lxml_context(url) as page:
        path = '//div[@id="title"]'
        speaker_info_div = page.xpath(path)
        if speaker_info_div and len(speaker_info_div) == 1:
            # This isn't exactly great but it's the best/quickest solution
            # for now: split the title text and pick out fields by position.
            speaker_info = speaker_info_div[0].text_content().split()
            name = speaker_info[2] + " " + speaker_info[3]
            party = None
            # NOTE(review): "Democrat" here, while other scrapers in this
            # codebase normalize to "Democratic" — confirm which is wanted.
            if "R-" in speaker_info[4]:
                party = "Republican"
            elif "D-" in speaker_info[4]:
                party = "Democrat"
            elif "I-" in speaker_info[4]:
                party = "Independent"
            district = None
            if "district" in speaker_info[6].lower():
                # Strip a trailing ")" from e.g. "7)".
                district = speaker_info[7].strip(")")
            legislator = Legislator(term, chamber, district, name, party=party)
            legislator.add_source(url)
            return legislator
def scrape_senator(self, name, term, url):
    """Scrape one senator's page and save a Legislator record."""
    with self.urlopen(url) as text:
        doc = lxml.html.fromstring(text)
        # The district number lives in the "Senator ..." heading text.
        heading = doc.xpath("string(//*[starts-with(text(), 'Senator ')])")
        district = re.search(r'District (\d+)', heading).group(1)
        try:
            raw = doc.xpath("//b[contains(text(), 'Party')]")[0].getnext().tail
            party = raw.strip()
        except IndexError:
            # No "Party" label on the page.
            party = 'N/A'
        # Normalize site spellings to the conventional party names.
        if party == 'No Party (Independent)':
            party = 'Independent'
        elif party == 'Democrat':
            party = 'Democratic'
        leg = Legislator(term, 'upper', district, name, party=party, url=url)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_rep(self, name, term, url):
    """Scrape a House member's page and save the Legislator record.

    Fix: str.replace returns a new string; the results were previously
    discarded, so ", Jr., " / ", III, " were never collapsed before the
    suffix was re-appended.
    """
    # special case names that confuses name_tools
    if name == "Franklin, A.B.":
        name = "Franklin, A. B."
    elif ", Jr., " in name:
        name = name.replace(", Jr., ", " ")
        name += ", Jr."
    elif ", III, " in name:
        name = name.replace(", III, ", " ")
        name += ", III"
    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)
        # District number comes from the district-map PDF link.
        district = page.xpath("//a[contains(@href, 'Maps')]")[0].attrib["href"]
        district = re.search("district(\d+).pdf", district).group(1)
        # Party appears in the raw page text as "<Party> District".
        if "Democrat District" in text:
            party = "Democratic"
        elif "Republican District" in text:
            party = "Republican"
        elif "Independent District" in text:
            party = "Independent"
        else:
            party = "Other"
        leg = Legislator(term, "lower", district, name, party=party)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_bio(self, term, chamber, district, name, url):
    """Build and return a Legislator from the member's bio frame page."""
    # Appending this query opens the committee section directly,
    # saving a second request.
    url += '&TableRow=1.5.5'
    frameset = self.lxmlize(url)
    content_url = frameset.xpath("//frame[@name='right']/@src")[0]
    doc = self.lxmlize(content_url)
    # The page header ends with the party tag, e.g. "... (R)".
    header_tail = doc.xpath('//div[@id="page_header"]')[0].text.strip()[-3:]
    if '(D)' in header_tail:
        party = 'Democratic'
    elif '(R)' in header_tail:
        party = 'Republican'
    else:
        raise AssertionError("No party found for {name}".format(name=name))
    leg = Legislator(term, chamber, district, name, party=party)
    photos = doc.xpath('//img[contains(@src, "jpg")]/@src')
    if photos:
        leg['photo_url'] = photos[0]
    leg.update(self.scrape_contact_info(doc))
    return leg
def scrape_lower(self, term):
    """Scrape NY Assembly members from the e-mail roster page.

    Party is not listed here, so every record is saved as "Unknown".
    """
    url = "http://assembly.state.ny.us/mem/?sh=email"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        member_links = page.xpath("//a[contains(@href, '/mem/')]")
        email_links = page.xpath("//a[contains(@href, 'mailto')]")
        # Member links and mailto links appear pairwise on the page.
        for link, email in zip(member_links, email_links):
            name = link.text.strip()
            # Skip the header link and empty-seat placeholders.
            if name == "Assembly Members" or "Assembly District" in name:
                continue
            leg_url = link.get("href")
            district = link.xpath("string(../following-sibling::"
                                  "div[@class = 'email2'][1])")
            # Strip ordinal suffixes ("1st", "22nd", "103rd", ...).
            district = district.rstrip("rthnds")
            legislator = Legislator(term, "lower", district, name,
                                    party="Unknown", url=leg_url)
            legislator.add_source(url)
            address = email.text_content().strip()
            if address:
                legislator["email"] = address
            self.save_legislator(legislator)
def scrape_upper(self, term):
    """Scrape NY senators from the senators index page.

    Party affiliation is not listed on this page, so it is recorded
    as "Unknown".
    """
    url = "http://www.nysenate.gov/senators"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    # One last-name link per senator, nested inside the views-row divs.
    xpath = (
        '//div[contains(@class, "views-row")]/'
        'div[contains(@class, "last-name")]/'
        'span[contains(@class, "field-content")]/a')
    for link in page.xpath(xpath):
        # Skip utility links that match the same pattern.
        if link.text in (None, 'Contact', 'RSS'):
            continue
        name = link.text.strip()
        district = link.xpath("string(../../../div[3]/span[1])")
        district = re.match(r"District (\d+)", district).group(1)
        photo_link = link.xpath("../../../div[1]/span/a/img")[0]
        photo_url = photo_link.attrib['src']
        legislator = Legislator(term, 'upper', district,
                                name, party="Unknown",
                                photo_url=photo_url)
        legislator.add_source(url)
        contact_link = link.xpath("../span[@class = 'contact']/a")[0]
        contact_url = contact_link.attrib['href']
        self.scrape_upper_offices(legislator, contact_url)
        # Member page is the contact page without the "/contact" suffix.
        legislator['url'] = contact_url.replace('/contact', '')
        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape WI legislators from the per-chamber contact list."""
    self.validate_term(term)
    if chamber == 'upper':
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
    else:
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"
    with self.urlopen(url) as body:
        page = lxml.html.fromstring(body)
        for row in page.cssselect("#ctl00_C_dgLegData tr"):
            # Only rows containing a link are member rows (skips headers).
            if len(row.cssselect("td a")) > 0:
                rep_url = list(row)[0].cssselect("a[href]")[0].get("href")
                rep_url = 'http://legis.wi.gov/w3asp/contact/' + rep_url
                # First cell text looks like "Last, First (P)".
                legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)', list(row)[0].text_content())
                if legpart:
                    full_name, party = legpart[0]
                    # skip if the legislator is vacant (occurred in 2011 session)
                    if full_name == 'Vacant':
                        continue
                    party = PARTY_DICT[party]
                    # Third cell holds the district number.
                    district = str(int(list(row)[2].text_content()))
                    leg = Legislator(term, chamber, district, full_name,
                                     party=party, url=rep_url)
                    leg.add_source(rep_url)
                    leg = self.add_committees(leg, rep_url, term, chamber)
                    self.save_legislator(leg)
def scrape_upper(self, term):
    """Scrape NY senators; party isn't published here, so "Unknown"."""
    url = "http://www.nysenate.gov/senators"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        for link in page.xpath('//a[contains(@href, "/senator/")]'):
            # Ignore utility links that also point at /senator/.
            if link.text in (None, "Contact", "RSS"):
                continue
            name = link.text.strip()
            raw_district = link.xpath("string(../../../div[3]/span[1])")
            district = re.match(r"District (\d+)", raw_district).group(1)
            portrait = link.xpath("../../../div[1]/span/a/img")[0]
            legislator = Legislator(term, "upper", district, name,
                                    party="Unknown",
                                    photo_url=portrait.attrib["src"])
            legislator.add_source(url)
            contact_anchor = link.xpath("../span[@class = 'contact']/a")[0]
            contact_url = contact_anchor.attrib["href"]
            self.scrape_upper_contact_info(legislator, contact_url)
            # Member page is the contact page minus "/contact".
            legislator["url"] = contact_url.replace("/contact", "")
            self.save_legislator(legislator)
def scrape_legislator(self, chamber, term, name, url):
    """Scrape an Alaska legislator's page and save the record."""
    with self.urlopen(url) as page:
        # Alaska fails at unicode, some of the pages have broken
        # characters. They're not in data we care about so just
        # replace them.
        page = page.decode('utf8', 'replace')
        page = lxml.html.fromstring(page)
        # Collapse runs of whitespace in the passed-in name.
        name = re.sub(r'\s+', ' ', name)
        info = page.xpath('string(//div[@id = "fullpage"])')
        district = re.search(r'District ([\w\d]+)', info).group(1)
        party = re.search(r'Party: (.+) Toll-Free', info).group(1).strip()
        email = re.search(r'Email: ([\w_]+@legis\.state\.ak\.us)',
                          info).group(1)
        if party == 'Democrat':
            # for consistency
            party = 'Democratic'
        leg = Legislator(term, chamber, district, name,
                         party=party, email=email, url=url)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_legislator_data(self, url, chamber):
    """Scrape legislators (and their committee roles) for one chamber.

    NOTE(review): the term is hard-coded to '2010' below, and the party
    value is 'Democrat' rather than the 'Democratic' spelling used by
    other scrapers — confirm both.
    """
    party_fulls = {'R' : 'Republican', 'D' : 'Democrat'}
    with self.urlopen(url) as page:
        page = BeautifulSoup(page)
        for data in page.find('table', id = 'ctl00_mainCopy_DataList1')('td'):
            spans = data('span')
            if len(spans) == 0:
                self.debug('Found an empty cell in %s. Continuing' % url)
                continue
            full_name = ' '.join([span.string.strip() for span in spans])
            # Two words in the first span -> first + middle name.
            if len(spans[0].string.strip().split()) == 2:
                first_name, middle_name = spans[0].string.strip().split()
            else:
                first_name, middle_name = spans[0].string.strip(), ''
            last_name = spans[1].string.strip()
            details_url = get_abs_url(url, data.find('a')['href'])
            with self.urlopen(details_url) as details:
                details = BeautifulSoup(details)
                district = details.find('a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                party = party_fulls[details.find('span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel').string]
                leg = Legislator('2010', chamber, district, full_name,
                                 first_name, last_name, middle_name, party)
                leg.add_source(details_url)
                # Committee memberships come from the membership grid;
                # row 0 is the header.
                comms_table = details.find('table', id = 'ctl00_mainCopy_MembershipGrid')
                for comms_raw_data in comms_table('tr')[1:]:
                    comm_data = comms_raw_data('td')
                    comm_role_type = comm_data[0].string.strip()
                    comm_name = comm_data[1]('a')[0].string.strip()
                    leg.add_role(comm_role_type, '2010', chamber = chamber, committee = comm_name)
                self.save_legislator(leg)
def scrape_upper(self, term):
    """Scrape Oklahoma senators from the Senate roster page."""
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # Second summary table holds the member links.
    for a in doc.xpath('//table[@summary]')[1].xpath('.//td//a[contains(@href, "biographies")]'):
        # Link text looks like "Last, First (R)"; split off the party tag.
        name, party = a.text.rsplit(None, 1)
        if party == '(D)':
            party = 'Democratic'
        elif party == '(R)':
            party = 'Republican'
        # District appears either as tail text of the link's parent or in
        # a sibling span, formatted like "District 12".
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]
        # Rebinds `url` to the member page for the rest of this iteration.
        url = a.get('href')
        leg = Legislator(term, 'upper', district, name, party=party, url=url)
        leg.add_source(url)
        self.scrape_upper_offices(leg, url)
        self.save_legislator(leg)
def scrape_lower(self, term):
    """Scrape Oklahoma House members from the member roster.

    Fix: the vacant-seat check ("House District NN" placeholder names)
    now runs before the member page is fetched, instead of after —
    avoiding a wasted request (and a possible crash on the placeholder
    page) for every vacant seat.
    """
    url = "http://www.okhouse.gov/Members/Default.aspx"
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    for tr in page.xpath("//table[@class='rgMasterTable']/tbody/tr")[1:]:
        name = tr.xpath('.//td[1]/a')[0].text.strip()
        district = tr.xpath('.//td[3]')[0].text_content().strip()
        party = tr.xpath('.//td[4]')[0].text_content().strip()
        party = {'R': 'Republican', 'D': 'Democratic'}[party]
        leg_url = 'http://www.okhouse.gov/District.aspx?District=' + district
        # Vacant seats are listed under a "House District NN" placeholder.
        if name.startswith('House District'):
            self.warning("skipping %s %s" % (name, leg_url))
            continue
        leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
        leg_doc.make_links_absolute(leg_url)
        photo_url = leg_doc.xpath('//a[contains(@href, "HiRes")]/@href')[0]
        leg = Legislator(term, 'lower', district, name, party=party,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(url)
        leg.add_source(leg_url)
        # Scrape offices.
        self.scrape_lower_offices(leg_doc, leg)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape RI legislators from the published per-chamber Excel roster."""
    if chamber == 'upper':
        url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
        rep_type = 'Senator '
    elif chamber == 'lower':
        url = (
            'http://webserver.rilin.state.ri.us/Documents/Representatives.xls')
        rep_type = 'Representative '
    # Download the spreadsheet locally, then read it with xlrd.
    self.urlretrieve(url, 'ri_leg.xls')
    wb = xlrd.open_workbook('ri_leg.xls')
    sh = wb.sheet_by_index(0)
    # Row 0 is the header row.
    for rownum in xrange(1, sh.nrows):
        d = {}
        # excel_mapping: field name -> column index (module-level).
        for field, col_num in excel_mapping.iteritems():
            d[field] = sh.cell(rownum, col_num).value
        dist = str(int(d['district']))
        district_name = dist
        # Drop the "Senator "/"Representative " title prefix.
        full_name = re.sub(rep_type, '', d['full_name']).strip()
        translate = {
            "Democrat"    : "Democratic",
            "Republican"  : "Republican",
            "Independent" : "Independent"
        }
        leg = Legislator(term, chamber, district_name,
                         full_name, '', '', '',
                         translate[d['party']],
                         town_represented=d['town_represented'],
                         email=d['email'])
        leg.add_office('district', 'Address', address=d['address'])
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_2011Leg(self, chamber, term, url):
    """2011 Scraper for legislators"""
    parties = {'(D)': 'Democratic', '(R)': 'Republican'}
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        table = page.xpath('//table[contains(@id, "GridView1")]')[0]
        # Member rows are those whose cells link to a memberpage.
        for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
            params = {}
            district = row.xpath('td/span[contains(@id, "LabelDistrict")]/font')[0].text
            last_name_a = row.xpath('td/a[contains(@id, "HyperLinkLast")]')[0]
            member_url = last_name_a.get('href')
            last_name = last_name_a.text_content().strip()
            # First span holds "First [Middle ...]".
            first_names = row.xpath('td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
            first_name = first_names.split()[0]
            middle_name = ' '.join(first_names.split()[1:])
            party = row.xpath('td/span[contains(@id, "LabelParty")]/font')[0].text
            party = parties[party]
            # Office address is split across two Room labels.
            params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
            params['photo_url'] = row.xpath('td/a[contains(@id, "HyperLinkChairJPG")]/img')[0].attrib['src']
            params['email'] = row.xpath('td/a[contains(@id, "HyperLinkEmail")]')[0].text
            params['phone'] = row.xpath('td/span[contains(@id, "LabelPhone2")]')[0].text
            full_name = first_names + " " + last_name
            leg = Legislator(term, chamber, district, full_name,
                             first_name, last_name, middle_name,
                             party, url=member_url, **params)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, session):
    """Scrape legislators from the chamber listing metadata.

    scrape_leg_page returns a list of dicts; each dict carries the
    member's details plus optional committee ('ctty') entries.
    """
    metainf = self.scrape_leg_page(get_chamber_listing_url( chamber ))
    for leg in metainf:
        p = Legislator( session, chamber, leg['district'], leg['name'],
                        party=leg['party'],
                        # some additional things the website provides:
                        photo_url=leg['image'],
                        url=leg['homepage'],
                        room=leg['room'],
                        phone=leg['phone'],
                        fax=leg['fax'],
                        email=leg['email'],
                        address=leg['addr'])
        for source in leg['source']:
            p.add_source( source )
        try:
            for ctty in leg['ctty']:
                flag='Joint Legislative'
                # Committees whose name starts with "Joint Legislative"
                # are recorded under the "joint" chamber.
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber
                p.add_role( 'committee member',
                            term=session,
                            chamber=ctty_chamber,
                            committee=ctty['name'],
                            position="member")
        except KeyError:
            # Not every member dict has a 'ctty' entry.
            self.log( "XXX: Warning, %s has no scraped Commities" %
                      leg['name'] )
        self.save_legislator( p )
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Scrape a MS member detail page and save the Legislator."""
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        with self.urlopen(url) as details_page:
            # Source is latin1; re-encode and parse leniently as HTML.
            details_page = details_page.decode('latin1').encode('utf8', 'ignore')
            root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
            party = root.xpath('string(//party)')
            district = root.xpath('string(//district)')
            first_name, middle_name, last_name = "", "", ""
            home_phone = root.xpath('string(//h_phone)')
            bis_phone = root.xpath('string(//b_phone)')
            capital_phone = root.xpath('string(//cap_phone)')
            other_phone = root.xpath('string(//oth_phone)')
            org_info = root.xpath('string(//org_info)')
            email_name = root.xpath('string(//email_address)')
            # NOTE(review): builds the address from the raw chamber value
            # ('upper'/'lower'); confirm the domain shouldn't instead be
            # senate/house.
            email = '%s@%s.ms.gov' % (email_name, chamber)
            # NOTE(review): any non-'D' party defaults to Republican.
            if party == 'D':
                party = 'Democratic'
            else:
                party = 'Republican'
            leg = Legislator(term, chamber, district, leg_name, first_name,
                             last_name, middle_name, party, role=role,
                             home_phone = home_phone,
                             bis_phone=bis_phone,
                             capital_phone=capital_phone,
                             other_phone=other_phone,
                             org_info=org_info,
                             email=email,
                             url=url)
            leg.add_source(url)
            self.save_legislator(leg)
    except scrapelib.HTTPError, e:
        # Dead member links happen; log and keep scraping the rest.
        self.warning(str(e))
def test_legislator():
    """Check Legislator construction, add_role, and add_office output."""
    # Positional args: term, chamber, district, full_name, first, last.
    l = Legislator('T1', 'upper', '1', 'Adam Smith', 'Adam', 'Smith')
    assert_equal(l, {'_type': 'person', 'full_name': 'Adam Smith',
                     'first_name': 'Adam', 'last_name': 'Smith',
                     'middle_name': '', 'suffixes': '',
                     'roles': [
                         {'chamber': 'upper', 'term': 'T1',
                          'role': 'member', 'start_date': None,
                          'end_date': None, 'district': '1',
                          'party': ''}],
                     'offices': [],
                     'sources': []})
    # A committee role is appended after the seat role.
    l.add_role('committee member', 'T1', committee='Some Committee',
               position='chairman')
    assert_equal(l['roles'][1], {'role': 'committee member', 'term': 'T1',
                                 'start_date': None, 'end_date': None,
                                 'committee': 'Some Committee',
                                 'position': 'chairman'})
    # Offices: type, name, address, phone, fax, email.
    l.add_office('capitol', 'Statehouse Office', '123 Main St',
                 '123-456-7890', '123-555-5555', '*****@*****.**')
    assert_equal(l['offices'], [{'type': 'capitol',
                                 'name': 'Statehouse Office',
                                 'address': '123 Main St',
                                 'phone': '123-456-7890',
                                 'fax': '123-555-5555',
                                 'email': '*****@*****.**'}])
def scrape(self, chamber, term):
    """Scrape Pennsylvania legislators for the current term.

    Pennsylvania doesn't make member lists easily available for
    previous sessions, unfortunately, so only the latest term works.
    """
    self.validate_term(term, latest_only=True)
    listing_url = legislators_url(chamber)
    with self.urlopen(listing_url) as html:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(listing_url)
        for link in doc.xpath("//a[contains(@href, '_bio.cfm')]"):
            full_name = link.text
            raw_district = link.getparent().getnext().tail.strip()
            district = re.search("District (\d+)", raw_district).group(1)
            # Link text ends with the party tag, e.g. "(R)".
            abbr = link.text[-2]
            party = {'R': 'Republican', 'D': 'Democratic'}.get(abbr, abbr)
            legislator = Legislator(term, chamber, district, full_name,
                                    party=party, url=link.get('href'))
            legislator.add_source(listing_url)
            self.save_legislator(legislator)
def scrape(self, chamber, session):
    """Scrape legislators via the district directory pages."""
    listing_url = self.get_district_list(chamber, session)
    # Maps district -> member-page URL.
    people_pages = self.scrape_directory(listing_url, chamber, session)
    for district in people_pages:
        person_url = people_pages[district]
        metainf = self.process_person(person_url)
        p = Legislator(session, chamber, district, metainf['name'],
                       party=metainf['party'],
                       # some additional things the website provides:
                       occupation=metainf['occupation'],
                       photo_url=metainf['photo_url'],
                       url=metainf['homepage'])
        # Phone and e-mail are optional fields on the member page.
        phone = metainf.get('number')
        email = metainf.get('email')
        p.add_office('capitol', 'Capitol Office',
                     phone=phone,
                     address='200 E. Colfax\nDenver, CO 80203',
                     email=email)
        p.add_source(person_url)
        self.save_legislator(p)
def scrape(self, chamber, term):
    """Scrape RI legislators from the published Excel roster (latest term)."""
    self.validate_term(term, latest_only=True)
    if chamber == 'upper':
        url = ('http://www.rilin.state.ri.us/Documents/Senators.xls')
        rep_type = 'Senator '
    elif chamber == 'lower':
        url = ('http://www.rilin.state.ri.us/Documents/Representatives.xls')
        rep_type = 'Representative '
    with self.urlopen(url) as senator_xls:
        # NOTE(review): binary .xls written in text mode ('w') — confirm
        # this is safe on the platforms this runs on.
        with open('ri_senate.xls', 'w') as f:
            f.write(senator_xls)
    wb = xlrd.open_workbook('ri_senate.xls')
    sh = wb.sheet_by_index(0)
    # Row 0 is the header row.
    for rownum in xrange(1, sh.nrows):
        d = {}
        # excel_mapping: field name -> column index (module-level).
        for field, col_num in excel_mapping.iteritems():
            d[field] = str(sh.cell(rownum, col_num).value)
        district_name = "District " + d['district']
        # Drop the "Senator "/"Representative " title prefix.
        full_name = re.sub(rep_type, '', d['full_name']).strip()
        leg = Legislator(term, chamber, district_name, full_name,
                         '', '', '', d['party'],
                         office_address=d['address'],
                         town_represented=d['town_represented'],
                         email=d['email'])
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_upper(self, chamber, term):
    """Scrape Michigan senators from the Senate member list."""
    url = 'http://www.senate.michigan.gov/members/memberlist.htm'
    doc = lxml.html.fromstring(self.urlopen(url))
    # Rows 1..38 of the roster table; columns are:
    # party, dist, member, office_phone, office_fax, office_loc
    for row in doc.xpath('//table[@width=550]/tr')[1:39]:
        party_td, dist_td, member_td, phone_td, fax_td, loc_td = \
            row.getchildren()
        party = abbr[party_td.text]
        district = dist_td.text_content().strip()
        name = member_td.text_content().strip()
        # Skip unfilled seats.
        if name == 'Vacant':
            self.info('district %s is vacant', district)
            continue
        leg_url = member_td.xpath('a/@href')[0]
        leg = Legislator(term=term, chamber=chamber, district=district,
                         full_name=name, party=party, url=leg_url)
        leg.add_office('capitol', 'Capitol Office',
                       address=loc_td.text,
                       fax=fax_td.text,
                       phone=phone_td.text)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_reps(self, chamber, term):
    """Scrape Ohio House members, one request per district."""
    # There are 99 House districts
    for district in xrange(1, 100):
        # Adjacent string literals are concatenated at compile time, so
        # this is a single URL template with %d filled in.
        rep_url = "http://www.house.state.oh.us/components/" "com_displaymembers/page.php?district=%d" % district
        with self.urlopen(rep_url) as page:
            page = lxml.html.fromstring(page)
            for el in page.xpath('//table[@class="page"]'):
                rep_link = el.xpath("tr/td/title")[0]
                # Link text is "<name> (P)"; last-but-one char is the
                # party letter, and the name is everything before it.
                full_name = rep_link.text
                party = full_name[-2]
                full_name = full_name[0:-3]
                # Vacant-seat placeholder ("Vacant Position", truncated).
                if full_name == "Vacant Posit":
                    continue
                if party == "D":
                    party = "Democratic"
                elif party == "R":
                    party = "Republican"
                leg = Legislator(term, chamber, str(district), full_name,
                                 party=party, url=rep_url)
                leg.add_source(rep_url)
                self.save_legislator(leg)
def scrape_2011Leg(self, chamber, term, url):
    """2011 Scraper for legislators"""
    titles = {'lower': 'Representative', 'upper': 'Senator'}
    # NOTE(review): expands to 'Democrat', while other scrapers use
    # 'Democratic' — confirm which spelling is wanted.
    parties = {'D': 'Democrat', 'R': 'Republican'}
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        table = page.xpath('//table[contains(@id, "GridView1")]')[0]
        # Member rows are those whose cells link to a memberpage.
        for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
            params = {}
            # District name is split across two labels.
            district = row.xpath('td/span[contains(@id, "LabelDis")]/font')[0].text + " " + \
                row.xpath('td/span[contains(@id, "LabelDistrict2")]/font')[0].text
            # Replace any / in district name to allow json file to save.
            district = district.replace('/', '-')
            params['title'] = titles.get(chamber, '')
            last_name = row.xpath('td/a[contains(@id, "HyperLinkLast")]/font')[0].text.strip()
            # First span holds "First [Middle ...]".
            first_names = row.xpath('td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
            first_name = first_names.split()[0]
            middle_name = ' '.join(first_names.split()[1:])
            party = row.xpath('td/span[contains(@id, "LabelParty")]/font')[0].text
            party = party.replace('(', '')
            party = party.replace(')', '')
            party = parties.get(party, '')  # Expand party from initial letter.
            # Office address is split across two Room labels.
            params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
            params['photo_url'] = row.xpath('td/a[contains(@id, "HyperLinkChairJPG")]/img')[0].attrib['src']
            params['email'] = row.xpath('td/a[contains(@id, "HyperLinkEmail")]')[0].text
            params['phone'] = row.xpath('td/span[contains(@id, "LabelPhone2")]')[0].text
            full_name = first_names + " " + last_name
            leg = Legislator(term, chamber, district, full_name,
                             first_name, last_name, middle_name,
                             party, **params)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape IL legislators from the member list for the given term."""
    term_slug = term[:-2]
    url = MEMBER_LIST_URL[chamber] % term_slug
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    party_names = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}
    # Fifth table on the page; first two rows are headers.
    for row in doc.xpath('//table')[4].xpath('tr')[2:]:
        name_td, _, _, district_td, party_td = row.xpath('td')
        district = district_td.text
        party = party_names[party_td.text]
        leg_url = name_td.xpath('a/@href')[0]
        name = name_td.text_content().strip()
        # inactive legislator, skip them for now
        if name.endswith('*'):
            continue
        member_doc = lxml.html.fromstring(self.urlopen(leg_url))
        photo_url = member_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]
        leg = Legislator(term, chamber, district, name, party=party,
                         url=leg_url, photo_url=photo_url)
        leg.add_source(url)
        leg.add_source(leg_url)
        self.save_legislator(leg)
def scrape_rep(self, name, term, url):
    """Scrape a House member's page and save the Legislator record."""
    # Normalize a few names that confuse name_tools: collapse the
    # embedded suffix and re-append it at the end.
    if name == 'Franklin, A.B.':
        name = 'Franklin, A. B.'
    elif ', Jr., ' in name:
        name = name.replace(', Jr., ', ' ') + ', Jr.'
    elif ', III, ' in name:
        name = name.replace(', III, ', ' ') + ', III'
    with self.urlopen(url) as text:
        doc = lxml.html.fromstring(text)
        # District number comes from the district-map PDF link.
        href = doc.xpath("//a[contains(@href, 'district')]")[0].attrib['href']
        district = re.search("district(\d+).pdf", href).group(1)
        # Party appears in the raw page text as "<Party> District".
        for marker, party_name in (("Democrat District", "Democratic"),
                                   ("Republican District", "Republican"),
                                   ("Independent District", "Independent")):
            if marker in text:
                party = party_name
                break
        else:
            party = "Other"
        leg = Legislator(term, 'lower', district, name, party=party, url=url)
        leg.add_source(url)
        self.save_legislator(leg)
def scrape_lower_chamber(self, term):
    """Scrape Oklahoma House members from the member roster grid."""
    url = "http://www.okhouse.gov/Members/Default.aspx"
    page = self.lxmlize(url)
    legislator_nodes = self.get_nodes(
        page,
        '//table[@id="ctl00_ContentPlaceHolder1_RadGrid1_ctl00"]/tbody/tr')
    for legislator_node in legislator_nodes:
        name_node = self.get_node(
            legislator_node,
            './/td[1]/a')
        if name_node is not None:
            name_text = name_node.text.strip()
            # Roster lists names as "Last, First".
            last_name, delimiter, first_name = name_text.partition(',')
            # NOTE(review): str.partition never returns None for these
            # parts, so this check is always true — the real signal of a
            # failed split is an empty `delimiter`; confirm intent.
            if last_name is not None and first_name is not None:
                first_name = first_name.strip()
                last_name = last_name.strip()
                name = ' '.join([first_name, last_name])
            else:
                raise ValueError('Unable to parse name: {}'.format(
                    name_text))
            # Vacant seats use a "House District NN" placeholder name.
            if name.startswith('House District'):
                continue
        district_node = self.get_node(
            legislator_node,
            './/td[3]')
        if district_node is not None:
            district = district_node.text.strip()
        party_node = self.get_node(
            legislator_node,
            './/td[4]')
        if party_node is not None:
            party_text = party_node.text.strip()
        party = self._parties[party_text]
        legislator_url = 'http://www.okhouse.gov/District.aspx?District=' + district
        legislator_page = self.lxmlize(legislator_url)
        photo_url = self.get_node(
            legislator_page,
            '//a[@id="ctl00_ContentPlaceHolder1_imgHiRes"]/@href')
        legislator = Legislator(
            _scraped_name=name_text,
            full_name=name,
            term=term,
            chamber='lower',
            district=district,
            party=party,
            photo_url=photo_url,
            url=legislator_url
        )
        legislator.add_source(url)
        legislator.add_source(legislator_url)
        # Scrape offices.
        self.scrape_lower_offices(legislator_page, legislator)
        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Arizona legislators for one chamber from the member roster.

    Roster rows carry name, district, party, email, room, phone, and fax;
    legislators whose email cell says 'Vacated' are skipped for the
    current session (see comment below).
    """
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod(session)

    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster.asp?Session_ID=%s&body=%s' % (
        session_id, body)
    with self.urlopen(url) as page:
        root = html.fromstring(page)
        path = '//table[@id="%s"]/tr' % {'H': 'house', 'S': 'senate'}[body]
        # first row is the table header
        roster = root.xpath(path)[1:]
        for row in roster:
            position = ''
            vacated = ''
            name, district, party, email, room, phone, fax = row.getchildren()

            link = name.xpath('string(a/@href)')
            link = "http://www.azleg.gov" + link
            if len(name) == 1:
                name = name.text_content().strip()
            else:
                # extra child in the name cell means a leadership position
                # (e.g. Speaker) follows the name
                position = name.tail.strip()
                name = name[0].text_content().strip()

            district = district.text_content()
            party = party.text_content().strip()
            email = email.text_content().strip()

            if 'Vacated' in email:
                # comment out the following 'continue' for historical
                # legislative sessions
                # for the current session, if a legislator has left we will
                # skip him/her to keep from overwriting their information
                continue
                # NOTE: the two lines below are unreachable while the
                # 'continue' above is active (historical-session path only)
                vacated = re.search('[0-9]*/[0-9]*/\d{4}', email).group()
                email = ''

            party = self.get_party(party)
            room = room.text_content().strip()
            if chamber == 'lower':
                address = "House of Representatives\n"
            else:
                address = "Senate\n"
            address = address + "1700 West Washington\n" + room \
                + "\nPhoenix, AZ 85007"

            phone = phone.text_content().strip()
            if not phone.startswith('602'):
                phone = "602-" + phone
            fax = fax.text_content().strip()
            if not fax.startswith('602'):
                fax = "602-" + fax

            if vacated:
                # departed member: record only the end date on their role
                end_date = datetime.datetime.strptime(vacated, '%m/%d/%Y')
                leg = Legislator(term, chamber, district, full_name=name,
                                 party=party, url=link)
                leg['roles'][0]['end_date'] = end_date
            else:
                leg = Legislator(term, chamber, district, full_name=name,
                                 party=party, office_phone=phone,
                                 office_fax=fax, office_address=address,
                                 email=email, url=link)

            if position:
                leg.add_role(position, term, chamber=chamber,
                             district=district, party=party)

            leg.add_source(url)

            # Probably just get this from the committee scraper
            # self.scrape_member_page(link, session, chamber, leg)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Montana legislators for one chamber from the session CSV.

    Names from the CSV are all-caps, so they are fuzzy-matched against
    the committee pages to recover proper capitalization.
    """
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Scrape committees. Also produce a name dictionary that can be
    # used for fuzzy matching between the committee page names and the
    # all-caps csv names.
    # (the loop exhausts the generator; only the final name_dict is used)
    for name_dict, _ in scrape_committees(year, chamber):
        pass

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()

    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)

    district_leg_urls = self._district_legislator_dict()

    for entry in csv_parser:
        if not entry:
            continue

        # City.
        entry['city'] = entry['city'].title()

        # Address.
        entry['address'] = entry['address'].title()

        # District: e.g. "HD 3" / "SD 12" -> chamber prefix + number.
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']

        # Party.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        entry['party'] = party
        # NOTE: the key is deleted right after being set, so 'party' is
        # passed to Legislator() explicitly below rather than via entry.
        del entry['party']

        # Get full name properly capped.
        _fullname = '%s %s' % (entry['first_name'].capitalize(),
                               entry['last_name'].capitalize())

        city_lower = entry['city'].lower()
        fullname = difflib.get_close_matches(
            _fullname, name_dict[city_lower], cutoff=0.5)

        # If there are no close matches with the committee page,
        # use the title-capped first and last name.
        if len(fullname) < 1:
            fullname = _fullname
            # msg = 'No matches found for "%s" with "%s" from %r'
            # self.debug(msg % (_fullname, fullname,
            #                   name_dict[city_lower]))
        else:
            fullname = fullname[0]
            # if _fullname != fullname:
            #     msg = 'matched "%s" with "%s" from %r'
            #     self.debug(msg % (_fullname, fullname,
            #                       name_dict[city_lower]))

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]
        deets = self._scrape_details(detail_url)

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url

        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Maryland legislators for one chamber from the MD Manual.

    Walks the alphabetical member list, then fetches each member's page
    for a photo and capitol/district office contact details.
    """
    urls = {
        'lower': "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
        'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html",
    }
    detail_re = re.compile(
        '\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)')

    with self.urlopen(urls[chamber]) as html:
        doc = lxml.html.fromstring(html)

        # rest of data on this page is <li>s that have anchor tags
        for a in doc.cssselect('li a'):
            link = a.get('href')
            # tags don't close so we get the <li> and <a> content and diff them
            name_text = a.text_content()
            detail_text = a.getparent().text_content().replace(name_text, '')

            # ignore if it is not a valid link
            if link:
                # handle names ("Last, First[, Suffix]")
                names = name_text.split(',')
                last_name = names[0]
                first_name = names[1].strip()
                # TODO: try to trim first name to remove middle initial
                if len(names) > 2:
                    suffixes = names[2]
                else:
                    suffixes = ''

                # handle details, e.g. "(D), District 12"
                details = detail_text.strip()
                party, district = detail_re.match(details).groups()
                party = PARTY_DICT[party]

                leg_url = BASE_URL + link

                leg = Legislator(term, chamber, district,
                                 ' '.join((first_name, last_name)),
                                 first_name, last_name,
                                 party=party, suffixes=suffixes,
                                 url=leg_url)
                leg.add_source(url=leg_url)

                with self.urlopen(leg_url) as leg_html:
                    leg_doc = lxml.html.fromstring(leg_html)
                    img_src = leg_doc.xpath('//img[@align="left"]/@src')
                    if img_src:
                        leg['photo_url'] = BASE_URL + img_src[0]

                    # address extraction
                    # this is pretty terrible, we get address in a format that
                    # looks like:
                    #
                    #   James Senate Office Building, Room 322
                    #   11 Bladen St., Annapolis, MD 21401
                    #   (410) 841-3565, (301) 858-3565; 1-800-492-7122, ext. 3565 (toll free)
                    #   e-mail: [email protected]
                    #   fax: (410) 841-3552, (301) 858-3552
                    #
                    #   Western Maryland Railway Station, 13 Canal St., Room 304, Cumberland, MD 21502
                    #   (301) 722-4780; 1-866-430-9553 (toll free)
                    #   e-mail: [email protected]
                    #   fax: (301) 722-4790
                    #
                    # usually first ul, sometimes first p
                    try:
                        addr_lines = leg_doc.xpath('//ul')[0].text_content().strip().splitlines()
                    except IndexError:
                        addr_lines = leg_doc.xpath('//p')[0].text_content().strip().splitlines()

                    addr_pieces = {'capitol': defaultdict(str),
                                   'district': defaultdict(str)}
                    addr_type = 'capitol'
                    for line in addr_lines:
                        # NOTE: fax and e-mail are tested before the phone
                        # area-code test, since fax lines also contain
                        # "(410)"/"(301)" and would otherwise be
                        # misclassified as phone numbers.
                        if 'toll free' in line:
                            pass  # skip stand alone 1-800 numbers
                        elif 'e-mail' in line:
                            # BUGFIX: the pages print "e-mail: ..." (with a
                            # hyphen); the old replace('email: ', '') never
                            # matched and left the prefix in the address.
                            addr_pieces[addr_type]['email'] = line.replace(
                                'e-mail: ', '').replace('email: ', '').strip()
                        elif 'fax' in line:
                            addr_pieces[addr_type]['fax'] = line.replace('fax: ', '')
                        elif '(410)' in line or '(301)' in line:
                            # BUGFIX: was '(401)' (a Rhode Island area code);
                            # Maryland numbers use 410/301 as the sample
                            # addresses above show.
                            addr_pieces[addr_type]['phone'] = line
                        elif line == '':
                            # a blank line separates capitol from district info
                            addr_type = 'district'
                        else:
                            addr_pieces[addr_type]['address'] += '{0}\n'.format(line)

                    if addr_pieces['capitol']:
                        leg.add_office('capitol', 'Capitol Office',
                                       **addr_pieces['capitol'])
                        leg['email'] = (addr_pieces['capitol']['email'] or
                                        addr_pieces['district']['email'] or
                                        None)
                    if addr_pieces['district']:
                        leg.add_office('district', 'District Office',
                                       **addr_pieces['district'])

                self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Tennessee legislators for one chamber from capitol.tn.gov.

    Uses the archived member index for past terms and the live member
    list for the current one; falls back to the row's own link when the
    conventional member-page URL does not resolve.
    """
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican',
               'I': 'Independent'}

    #testing for chamber
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'
    if term != self.metadata["terms"][-1]["sessions"][0]:
        # historical term -> archived member index
        chamber_url = root_url + url_chamber_name
        chamber_url += '/archives/' + term + 'GA/Members/index.html'
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    page = self.lxmlize(chamber_url)

    for row in page.xpath("//tr"):
        # Skip any a header row.
        if set(child.tag for child in row) == set(['th']):
            continue

        vacancy_check = row.xpath('./td/text()')[1]
        if 'Vacant' in vacancy_check:
            self.logger.warning("Vacant Seat")
            continue

        partyInit = row.xpath('td[3]')[0].text.split()[0]
        party = parties[partyInit]
        district = row.xpath('td[5]/a')[0].text.split()[1]
        address = row.xpath('td[6]')[0].text_content()
        # 301 6th Avenue North Suite
        # expand the two building abbreviations used in the roster
        address = address.replace(
            'LP', 'Legislative Plaza\nNashville, TN 37243')
        address = address.replace(
            'WMB', 'War Memorial Building\nNashville, TN 37243')
        address = '301 6th Avenue North\nSuite ' + address
        phone = [
            x.strip() for x in row.xpath('td[7]//text()') if x.strip()
        ][0]

        email = HTMLParser.HTMLParser().unescape(
            row.xpath('td[1]/a/@href')[0][len("mailto:"):])
        member_url = (root_url + url_chamber_name + '/members/' + abbr +
                      district + '.html')
        member_photo_url = (root_url + url_chamber_name +
                            '/members/images/' + abbr + district + '.jpg')

        try:
            member_page = self.get(member_url, allow_redirects=False).text
        except (TypeError, HTTPError):
            # conventional URL failed; try the link in the roster row
            try:
                member_url = row.xpath('td[2]/a/@href')[0]
                member_page = self.get(member_url,
                                       allow_redirects=False).text
            except (TypeError, HTTPError):
                self.logger.warning("Valid member page does not exist.")
                continue

        member_page = lxml.html.fromstring(member_page)
        try:
            name = member_page.xpath('//div/div/h1/text()')[0]
        except IndexError:
            name = member_page.xpath(
                '//div[@id="membertitle"]/h2/text()')[0]

        # strip the title prefix: "Speaker " (8), "Lt. Governor " (13),
        # "Representative ", or "Senator " (8)
        if 'Speaker' in name:
            full_name = name[8:len(name)]
        elif 'Lt.' in name:
            full_name = name[13:len(name)]
        elif abbr == 'h':
            full_name = name[len("Representative "):len(name)]
        else:
            full_name = name[8:len(name)]

        leg = Legislator(term, chamber, district, full_name.strip(),
                         party=party, url=member_url,
                         photo_url=member_photo_url)
        leg.add_source(chamber_url)
        leg.add_source(member_url)

        # TODO: add district address from this page

        leg.add_office('capitol', 'Nashville Address',
                       address=address, phone=phone, email=email)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Illinois legislators for one chamber from the member list.

    Skips inactive members (names suffixed with '*'), saves a minimal
    record when the bio page is unavailable, and otherwise attaches the
    photo, email, and Springfield/district offices.
    """
    term_slug = term[:-2]
    url = MEMBER_LIST_URL[chamber] % term_slug

    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # member rows start after two header rows in the fifth table
    for row in doc.xpath('//table')[4].xpath('tr')[2:]:
        name, _, _, district, party = row.xpath('td')
        district = district.text
        party = {'D': 'Democratic', 'R': 'Republican',
                 'I': 'Independent'}[party.text]
        leg_url = name.xpath('a/@href')[0]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            name = name.strip('*')
            continue

        leg_html = self.urlopen(leg_url)
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        leg = Legislator(term, chamber, district, name, party=party,
                         url=leg_url)
        leg.add_source(url)

        hotgarbage = ('Senate Biography Information for the 98th General '
                      'Assembly is not currently available.')

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning('No legislator bio available for ' + name)
            self.save_legislator(leg)
            continue

        # percent-encode the photo path so the stored URL is valid
        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        photo_url_parsed = urlparse(photo_url)
        encoded_path = quote(photo_url_parsed.path)
        photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
        leg.update(photo_url=photo_url)

        leg.add_source(leg_url)

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            leg['email'] = email[0].tail

        # function for turning an IL contact info table to office details
        def _table_to_office(table, office_type, office_name):
            addr = ''
            phone = ''
            fax = None
            for row in table.xpath('tr'):
                row = row.text_content().strip()
                # skip rows that aren't part of address
                if 'Office:' in row or row == 'Cook County':
                    continue
                # fax number row ends with FAX
                elif 'FAX' in row:
                    fax = row.replace(' FAX', '')
                # phone number starts with ( [make it more specific?]
                elif row.startswith('('):
                    phone = row
                # everything else is an address
                else:
                    addr += (row + '\n')
            if addr.strip() != ',':
                leg.add_office(office_type, office_name,
                               address=addr.strip(), phone=phone, fax=fax)

        # extract both offices from tables
        # NOTE(review): [3] assumes the xpath always yields >= 4 matching
        # (nested) tables when any match at all -- confirm against the page
        table = leg_doc.xpath(
            '//table[contains(string(), "Springfield Office")]')
        if table:
            _table_to_office(table[3], 'capitol', 'Springfield Office')
        table = leg_doc.xpath(
            '//table[contains(string(), "District Office")]')
        if table:
            _table_to_office(table[3], 'district', 'District Office')

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Washington legislators from the SponsorService XML feed.

    Cross-checks each member against the chamber's current-member web
    listing, skipping members no longer in office, and scrapes offices
    and a photo from each member's home page.
    """
    biennium = "%s-%s" % (term[0:4], term[7:9])
    url = ("http://wslwebservices.leg.wa.gov/SponsorService.asmx/"
           "GetSponsors?biennium=%s" % biennium)

    # these pages are useful for checking if a leg is still in office
    if chamber == 'upper':
        cur_member_url = 'http://www.leg.wa.gov/senate/senators/Pages/default.aspx'
    else:
        cur_member_url = 'http://www.leg.wa.gov/house/representatives/Pages/default.aspx'

    cur_members = self.get(cur_member_url).text
    cur_members_doc = lxml.html.fromstring(cur_members)
    cur_members_doc.make_links_absolute(cur_member_url)

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)

    for member in xpath(page, "//wa:Member"):
        mchamber = xpath(member, "string(wa:Agency)")
        mchamber = {'House': 'lower', 'Senate': 'upper'}[mchamber]
        if mchamber != chamber:
            continue

        name = xpath(member, "string(wa:Name)").strip()
        if name == "":
            continue

        # if the legislator isn't in the listing, skip them
        if name not in cur_members:
            self.warning('%s is no longer in office' % name)
            continue
        else:
            # unpacking the set asserts exactly one Home Page link matched
            leg_url, = set(cur_members_doc.xpath(
                '//span[contains(text(), "%s")]/../..//'
                'a[text()="Home Page"]/@href' % (name)))

        party = xpath(member, "string(wa:Party)")
        party = {'R': 'Republican', 'D': 'Democratic'}.get(party, party)

        district = xpath(member, "string(wa:District)")
        if district == '0':
            # Skip phony district 0.
            continue

        email = xpath(member, "string(wa:Email)")
        phone = xpath(member, "string(wa:Phone)")

        last = xpath(member, "string(wa:LastName)")
        last = last.lower().replace(' ', '')

        scraped_offices = []
        photo_url = ""

        try:
            leg_page = self.get(leg_url).text
            leg_page = lxml.html.fromstring(leg_page)
            leg_page.make_links_absolute(leg_url)

            photo_link = leg_page.xpath(
                "//a[contains(@href, 'publishingimages')]")
            if photo_link:
                photo_url = photo_link[0].attrib['href']
            offices = leg_page.xpath("//table[@cellspacing='0']/tr/td/b[contains(text(), 'Office')]")
            for office in offices:
                office_block = office.getparent()
                office_name = office.text_content().strip().rstrip(":")
                # office lines are the tails of <br> elements
                address_lines = [x.tail for x in office_block.xpath(".//br")]
                # NOTE: Python 2 semantics -- filter() returns a list here
                address_lines = filter(lambda a: a is not None,
                                       address_lines)
                # last line is discarded; second-to-last is the phone
                # (this rebinds the feed-level 'phone' from above)
                _ = address_lines.pop(len(address_lines) - 1)
                phone = address_lines.pop(len(address_lines) - 1)
                address = "\n".join(address_lines)

                obj = {
                    "name": office_name,
                    "phone": phone
                }
                if address.strip() != '':
                    obj['address'] = address

                scraped_offices.append(obj)
        except scrapelib.HTTPError:
            # Sometimes the API and website are out of sync
            # with respect to legislator resignations/appointments
            pass
        except requests.exceptions.ConnectionError:
            # Sometimes the API and website are out of sync
            # with respect to legislator resignations/appointments
            pass

        leg = Legislator(term, chamber, district, name,
                         '', '', '', party,
                         photo_url=photo_url, url=leg_url)
        leg.add_source(leg_url)

        for office in scraped_offices:
            typ = 'district' if 'District' in office['name'] else 'capitol'
            leg.add_office(typ, office.pop('name'), **office)

        self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape one legislator from a roster <option> element.

    The option's text is "Name, Party, District ..."; its value is a
    relative URL to the member's page, which supplies committees, a
    district office, email, and photo. Members with district "[N/A]"
    and members whose page 404s/503s are skipped with a warning.
    """
    url = urlparse.urljoin(self.url, option.attrib['value'])
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)
    if district == '[N/A]':
        msg = 'No district found for %r; skipping.'
        self.logger.warning(msg, name)
        return

    leg = Legislator(term, chamber, district, name, party=party)
    leg.add_source(self.url)

    # Scrape leg page.
    try:
        html = self.urlopen(url)
    except scrapelib.HTTPError as exc:
        # As of July 2014, this only happens when a page has
        # gone missing from their varnish server.
        # if exc.response.status_code is 503:
        self.logger.exception(exc)
        self.logger.warning('Skipping legislator at url: %s' % url)
        # (removed an unused `skipped = True` local here)
        return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)

    # Scrape committees: each table row is (committee, role).
    for tr in doc.xpath('//table//tr'):
        committee, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if 'member' in role.lower():
            role = 'committee member'
        elif 'chair' in role.lower():
            role = 'chair'
        leg.add_role(role, term, chamber=chamber, committee=committee)

    # Scrape offices: the page has exactly two <address> blocks,
    # district address followed by phone.
    dist_office, phone = doc.xpath('//address')
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r' {2,}', '', dist_office)
    phone = phone.text_content().strip()

    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

    leg.update(email=email, photo_url=photo_url)
    leg.add_office(address=dist_office, name='District Office',
                   type='district', phone=phone)
    self.save_legislator(leg)
def scrape(self, term, chambers):
    """Scrape Connecticut legislators from the CGA CSV export.

    One CSV row per member; committee memberships are embedded in the
    'committee member1' column as a ';'-separated list with optional
    '(role)' suffixes.
    """
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    data = self.urlopen(leg_url)
    page = open_csv(data)

    for row in page:
        chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]
        if chamber not in chambers:
            continue

        district = row['dist'].lstrip('0')

        # assemble "First [M] Last [Suffix]" from the name columns
        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Legislator(term, chamber, district,
                         name, first_name=row['first name'],
                         last_name=row['last name'],
                         middle_name=row['middle initial'],
                         suffixes=row['suffix'],
                         party=party,
                         email=row['email'], url=row['URL'],
                         office_phone=row['capitol phone'])

        office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
            row['capitol street address'], row['room number'])
        leg.add_office('capitol', 'Capitol Office',
                       address=office_address,
                       phone=row['capitol phone'])
        # skipping home address for now

        leg.add_source(leg_url)

        # committees look like "Name (role);Name;..."
        for comm in row['committee member1'].split(';'):
            if comm:
                if ' (' in comm:
                    comm, role = comm.split(' (')
                    role = role.strip(')').lower()
                else:
                    role = 'member'
                leg.add_role('committee member', term,
                             chamber='joint',
                             committee=comm.strip(),
                             position=role)

        self.save_legislator(leg)
def scrape_details(self, chamber, term, leg_name, leg_link, role):
    """Scrape one Mississippi member's XML detail record and save them.

    ``leg_link`` is the member path under billstatus.ls.state.ms.us;
    a falsy link is only acceptable for an explicit vacancy.
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.get(url)
        root = lxml.etree.fromstring(details_page.content)
        party = root.xpath('string(//PARTY)')
        district = root.xpath('string(//DISTRICT)')
        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

        home_phone = root.xpath('string(//H_PHONE)')

        home_address = root.xpath('string(//H_ADDRESS)')
        home_address2 = root.xpath('string(//H_ADDRESS2)')
        home_city = root.xpath('string(//H_CITY)')
        home_zip = root.xpath('string(//H_ZIP)')

        home_address_total = "%s\n%s\n%s\n%s" % (
            home_address,
            home_address2,
            home_city,
            home_zip
        )

        # read but currently unused phone variants
        bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
        cap_room = root.xpath('string(//CAP_ROOM)')

        # members whose XML omits the party are special-cased by name;
        # the asserts force removal of the special case once fixed upstream
        if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
            assert not party, "Remove special-casing for this Democrat without a listed party: {}".format(leg_name)
            party = 'Democratic'
        elif leg_name in ('James W. Mathis', 'John Glen Corley'):
            assert not party, "Remove special-casing for this Republican without a listed party: {}".format(leg_name)
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            raise AssertionError(
                "A member with no identifiable party was found: {}".format(leg_name))

        leg = Legislator(term, chamber, district, leg_name,
                         party=party,
                         role=role,
                         org_info=org_info,
                         url=url,
                         photo_url=photo)
        leg.add_source(url)

        # capitol office: build kwargs only from the fields present
        kwargs = {}
        if email_name != "":
            if "@" in email_name:
                email = email_name
            else:
                # bare account name -> chamber-specific state address
                email = '%s@%s.ms.gov' % (email_name,
                                          {"upper": "senate", "lower": "house"}[chamber])
            kwargs['email'] = email
        if capital_phone != "":
            kwargs['phone'] = capital_phone
        if cap_room != "":
            kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            kwargs['address'] = CAP_ADDRESS
        leg.add_office('capitol', 'Capitol Office', **kwargs)

        # district office from the home contact fields, if any
        kwargs = {}
        if home_phone != "":
            kwargs['phone'] = home_phone
        if home_address_total != "":
            kwargs['address'] = home_address_total
        if kwargs != {}:
            leg.add_office('district', 'District Office', **kwargs)

        self.save_legislator(leg)
    # Python 2 exception syntax (file predates py3)
    except scrapelib.HTTPError, e:
        self.warning(str(e))
def scrape(self, chamber, term):
    """Scrape Tennessee legislators (older roster layout) for one chamber.

    Similar to the newer TN scraper but reads fixed roster columns and
    the member page only for the name.
    """
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican'}

    #testing for chamber
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'
    if term != self.metadata["terms"][-1]["sessions"][0]:
        # historical term -> archived member index
        chamber_url = root_url + url_chamber_name + '/archives/' + term + 'GA/Members/index.html'
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    with self.urlopen(chamber_url) as page:
        page = lxml.html.fromstring(page)

        # first row is the header
        for row in page.xpath("//tr")[1:]:
            partyInit = row.xpath('td[2]')[0].text.split()[0]
            party = parties[partyInit]
            district = row.xpath('td[4]/a')[0].text.split()[1]
            phone = row.xpath('td[6]')[0].text
            #special case for Karen D. Camper
            if phone == None:
                phone = row.xpath('td[6]/div')[0].text
            phone = '615-' + phone.split()[0]
            email = row.xpath('td[7]/a')[0].text
            member_url = (root_url + url_chamber_name + '/members/' +
                          abbr + district + '.html')
            member_photo_url = (root_url + url_chamber_name +
                                '/members/images/' + abbr + district +
                                '.jpg')

            with self.urlopen(member_url) as member_page:
                member_page = lxml.html.fromstring(member_page)
                name = member_page.xpath(
                    '//div[@id="membertitle"]/h2')[0].text

                # strip the title prefix by fixed length:
                # "Speaker " (8), "Lt. Governor " (13), "Rep. " (5),
                # "Senator " (8)
                if 'Speaker' in name:
                    full_name = name[8:len(name)]
                elif 'Lt.' in name:
                    full_name = name[13:len(name)]
                elif abbr == 'h':
                    full_name = name[5:len(name)]
                else:
                    full_name = name[8:len(name)]

                leg = Legislator(term, chamber, district, full_name,
                                 party=party, email=email, phone=phone,
                                 url=member_url,
                                 photo_url=member_photo_url)
                leg.add_source(chamber_url)
                leg.add_source(member_url)
                self.save_legislator(leg)
def scrape_session(self, term, chambers, session):
    """Scrape Georgia legislators for one session via the SOAP service.

    Pulls the member listing for the session GUID and saves one
    Legislator per member, attaching a district address when complete.
    """
    session = self.metadata['session_details'][session]
    sid = session['_guid']
    members = self.sservice.GetMembersBySession(sid)['MemberListing']

    for member in members:
        guid = member['Id']
        # print member['Name']
        nick_name, first_name, middle_name, last_name = (
            member['Name'][x] for x in
            ['Nickname', 'First', 'Middle', 'Last'])
        chamber, district = (
            member['District'][x] for x in ['Type', 'Number'])

        party = member['Party']
        if party == 'Democrat':
            party = 'Democratic'

        # print first_name, middle_name, last_name, party
        # print chamber, district

        # prefer the nickname when present
        first_name = nick_name if nick_name else first_name
        # XXX: Due to the upstream handling...
        # if middle_name:
        #     name = "%s %s %s" % (first_name, middle_name, last_name)
        # else:
        # blocked out due to GA putting middle_name in first_name ...
        name = "%s %s" % (first_name, last_name)

        chamber = {"House": 'lower', "Senate": 'upper'}[chamber]

        if party.strip() == '':
            party = 'other'

        legislator = Legislator(
            term, chamber, str(district), name,
            party=party,
            # last_name=last_name,
            # first_name=first_name,
            _guid=guid)
        # if middle_name:
        #     legislator['middle_name'] = middle_name
        # Sadly, upstream isn't good about keeping first names first only,
        # so I'm blocking this out.

        # only add the district office when every address part is present
        ainfo = [member['DistrictAddress'][x]
                 for x in ['Street', 'City', 'State', 'Zip']]
        if not None in ainfo:
            # XXX: Debug this nonsense.
            ainfo = [x.strip() for x in ainfo]
            address = " ".join(ainfo)
            email = member['DistrictAddress']['Email']
            legislator.add_office('district', 'District Address',
                                  address=address, email=email)

        legislator.add_source(self.ssource)
        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape legislators for one chamber from a SharePoint member page.

    Each member is a 'ms-rtestate-field' div; the preceding table cell
    holds the photo, and the profile box is parsed into a key/value
    `info` dict by walking <br>-separated runs of elements.
    """
    url = self.URLs[chamber]
    page = self.lxmlize(url)

    # Each legislator block.
    for block in page.xpath("//div[@class='ms-rtestate-field']")[1:-1]:
        photo_block = block.xpath("ancestor::td/preceding-sibling::td")
        if len(photo_block) == 0:
            continue

        h2s = block.xpath(".//h2/a")
        if len(h2s) != 1:
            # We've got a Vacant person.
            print("Found a Vacant position. Skipping block.")
            continue

        h2, = h2s
        name = h2.text.strip()

        photo_block, = photo_block  # (The <td> before ours was the photo)
        img, = photo_block.xpath("*")
        img = img.attrib['src']

        info = {}
        # Right, now let's get info out of their little profile box.
        for entry in block.xpath(".//p"):
            key = None
            for kvpair in itergraphs(entry.xpath("./*"), 'br'):
                # OK. We either get the tail or the next element
                # (usually an <a> tag)
                if len(kvpair) == 1:
                    key, = kvpair
                    value = key.tail.strip() if key.tail else None
                    if value:
                        value = re.sub("\s+", " ", value).strip()
                elif len(kvpair) == 2:
                    key, value = kvpair
                    if value.text_content().strip() == "arty:":
                        # markup split "Party:" mid-word; shift key/value
                        key = value
                        value = value.tail
                elif len(kvpair) == 3:
                    k1, k2, value = kvpair
                    # As seen with a <stong><strong>Email:</strong></strong>
                    t = lambda x: x.text_content().strip()
                    assert t(k1) == "" or t(k2) == ""
                    if t(k1) != "":
                        key = k1
                    else:
                        key = k2
                else:
                    # Never seen text + an <a> tag, perhaps this can happen.
                    raise ValueError("Too many elements. Something changed")

                key = key.text_content().strip(" :")
                if value is None:
                    # A page has the value in a <strong> tag. D'oh.
                    key, value = (x.strip() for x in key.rsplit(":", 1))

                key = re.sub("\s+", " ", key).strip()
                key = key.replace(":", "")
                if key == "arty":
                    key = "Party"

                info[key] = value

        # normalize district and party text (strip nbsp / stray colons)
        info['District'] = info['District'].encode('ascii',
                                                   'ignore').strip()
        info['Party'] = info['Party'].strip(": ").replace(u"\u00a0", "")

        leg = Legislator(term=term,
                         url=h2.attrib['href'],
                         chamber=chamber,
                         full_name=name,
                         party=info['Party'],
                         district=info['District'],
                         photo_url=img)
        leg.add_source(url)

        # key may have been truncated to 'apitol Phone' by the markup
        phone = info.get('Capitol Phone', info.get('apitol Phone'))
        if hasattr(phone, 'text_content'):
            phone = phone.text_content()

        leg.add_office(type='capitol',
                       name='Capitol Office',
                       address=info['Capitol Address'],
                       phone=phone,
                       email=info['Email'].attrib['href'].replace(
                           "mailto:", ""))

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Rhode Island legislators for one chamber.

    Member data comes from an XLS roster; each member's homepage,
    contact phone (matched by district number), and email are stitched
    in from separate listing pages.
    """
    if chamber == 'upper':
        url = 'http://webserver.rilin.state.ri.us/Documents/Senators.xls'
        rep_type = 'Senator'
        source_url = 'http://www.rilin.state.ri.us/senators/default.aspx'
        source_url_title_replacement = rep_type
        contact_url = 'http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp'
    elif chamber == 'lower':
        url = 'http://webserver.rilin.state.ri.us/Documents/Representatives.xls'
        rep_type = 'Representative'
        source_url = 'http://www.rilin.state.ri.us/representatives/default.aspx'
        source_url_title_replacement = 'Rep. '
        contact_url = 'http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp'

    self.urlretrieve(url, 'ri_leg.xls')

    wb = xlrd.open_workbook('ri_leg.xls')
    sh = wb.sheet_by_index(0)

    # This isn't perfect but it's cheap and better than using the
    # XLS doc as the source URL for all legislators.
    # 374: RI: legislator url
    leg_source_url_map = {}
    leg_page = self.lxmlize(source_url)
    for link in leg_page.xpath('//td[@class="ms-vb2"]'):
        leg_name = link.text_content().replace(
            source_url_title_replacement, '')
        leg_url = link.xpath("..//a")[0].attrib['href']
        leg_source_url_map[leg_name] = leg_url

    # PERF: these pages are the same for every row, so fetch them once
    # instead of once per legislator (they were previously refetched
    # inside the row loop).
    url_names = lxml.html.fromstring(self.get(source_url).text)
    url_names = url_names.xpath('//td[@class="ms-vb2"]/a/@href')
    contact = self.lxmlize(contact_url)
    contact_phone = contact.xpath(
        '//tr[@valign="TOP"]//td[@class="bodyCopy"]/text() | '
        '//td[@class="bodyCopy"]//center/text()')

    for rownum in xrange(1, sh.nrows):
        d = {}
        for field, col_num in excel_mapping.iteritems():
            d[field] = sh.cell(rownum, col_num).value

        if d['full_name'].upper() == "VACANT":
            self.warning("District {}'s seat is vacant".format(
                int(d['district'])))
            continue

        slug = re.match(
            "(?P<class>sen|rep)-(?P<slug>.*)@(rilin\.state\.ri\.us|rilegislature\.gov)",
            d['email'])
        if 'asp' in d['email']:
            # not a real address (points at a contact .asp page)
            d['email'] = None

        if d['email'] is not None:
            info = slug.groupdict()
            info['chamber'] = ("senators" if info['class'] == 'sen'
                               else "representatives")

            url = ("http://www.rilin.state.ri.us/{chamber}/"
                   "{slug}/Pages/Biography.aspx".format(**info))

        dist = str(int(d['district']))
        district_name = dist

        assert d['full_name'].startswith(rep_type), "Improper name found"
        full_name = re.sub(r"^{}(?=\s?[A-Z].*$)".format(rep_type), '',
                           d['full_name']).strip()
        translate = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }

        # match the member to their homepage by comparing the last URL
        # segment against the squashed, lower-cased name
        homepage_url = None
        modified_name = re.sub(r'[^\w\s]', '', full_name)
        # (removed no-op .strip('') calls -- str.strip('') strips nothing)
        modified_name = modified_name.replace(' ', '').lower()

        for el in url_names:
            if 'default.aspx' in el:
                el = el.replace('default.aspx', '')
            if el[-1] == '/':
                el = el[:-1]
            el = el.lower()
            url_name_array = el.split('/')
            if url_name_array[-1] in modified_name:
                # remove '/default.aspx' and add last name
                homepage_url = source_url[:-12] + url_name_array[-1]

        kwargs = {
            "town_represented": d['town_represented'],
        }

        # the contact list alternates district numbers and phone numbers;
        # the phone sits two entries after the matching district
        phone = None
        for el in contact_phone:
            if len(el) <= 2 and dist == el:
                number = contact_phone.index(el)
                phone = contact_phone[number + 2]
                phone = phone.strip()

        email = None
        if d['email'] is not None:
            email = d['email']

        if homepage_url is not None:
            kwargs['url'] = homepage_url

        # BUGFIX: was `d['address'] is ''` -- identity comparison against
        # a string literal is implementation-dependent; use equality.
        if d['address'] == '':
            d['address'] = 'No Address Found'

        leg = Legislator(term, chamber, district_name, full_name,
                         '', '', '', translate[d['party']], **kwargs)
        # BUGFIX: office name was misspelled 'Dictrict Office'
        leg.add_office('district', 'District Office',
                       address=d['address'], phone=phone, email=email)
        leg.add_source(source_url)
        leg.add_source(contact_url)
        if homepage_url:
            leg.add_source(homepage_url)
        self.save_legislator(leg)
def scrape(self, term, chambers):
    """Scrape New Jersey legislators from the Access-DB roster export.

    Joins the 'Roster' table with 'LegBio' (photos keyed on
    'Roster Key') and saves one Legislator per active member.
    """
    year_abr = term[0:4]

    self._init_mdb(year_abr)

    roster_csv = self.access_to_csv('Roster')
    bio_csv = self.access_to_csv('LegBio')

    # Roster Key -> photo URL
    photos = {}
    for rec in bio_csv:
        photos[rec['Roster Key']] = rec['URLPicture']

    for rec in roster_csv:
        first_name = rec["Firstname"]
        middle_name = rec["MidName"]
        last_name = rec["LastName"]
        suffix = rec["Suffix"]
        full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
        # collapse the double space left by an empty middle name,
        # then drop the trailing space left by an empty suffix
        full_name = full_name.replace('  ', ' ')
        full_name = full_name[0: len(full_name) - 1]

        district = int(rec["District"])
        party = rec["Party"]
        if party == 'R':
            party = "Republican"
        elif party == 'D':
            party = "Democratic"
        else:
            party = party

        chamber = rec["House"]
        if chamber == 'A':
            chamber = "lower"
        elif chamber == 'S':
            chamber = "upper"

        leg_status = rec["LegStatus"]
        # skip Deceased/Retired members
        if leg_status != 'Active':
            continue

        title = rec["Title"]
        legal_position = rec["LegPos"]
        phone = rec["Phone"] or None
        email = None
        if rec["Email"]:
            email = rec["Email"]

        try:
            photo_url = photos[rec['Roster Key']]
        except KeyError:
            photo_url = ''
            self.warning('no photo url for %s', rec['Roster Key'])

        url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
               str(int(rec['Roster Key'])))
        address = '{0}\n{1}, {2} {3}'.format(rec['Address'], rec['City'],
                                             rec['State'], rec['Zipcode'])
        gender = {'M': 'Male', 'F': 'Female'}[rec['Sex']]

        leg = Legislator(term, chamber, str(district), full_name,
                         first_name, last_name, middle_name, party,
                         suffixes=suffix, title=title,
                         legal_position=legal_position,
                         url=url, photo_url=photo_url, gender=gender)
        leg.add_office('district', 'District Office', address=address,
                       phone=phone, email=email)
        leg.add_source(url)
        leg.add_source('http://www.njleg.state.nj.us/downloads.asp')
        self.save_legislator(leg)
def scrape_reps(self, chamber, term):
    """Scrape Ohio House members, one page per district (1-99).

    Each district page lists the member's contact block and a table of
    committee assignments; both are parsed and attached to the saved
    Legislator.
    """
    # There are 99 House districts
    for district in xrange(1, 100):
        rep_url = ('http://www.house.state.oh.us/components/'
                   'com_displaymembers/page.php?district=%d' % district)
        with self.urlopen(rep_url) as page:
            page = lxml.html.fromstring(page)

            # Partition the children of the "info" td into groups
            # separated by <strong> headers; each group is one labeled
            # section of the contact block.
            ranges = []
            cur = []
            info = page.xpath('//td[@class="info"]/*')
            for r in info:
                if r.tag == 'strong':
                    ranges.append(cur)
                    cur = []
                else:
                    cur.append(r)
            ranges.append(cur)

            # NOTE(review): assumes the 5th section (index 4) is the
            # address block, with its last element dropped — position
            # dependent on the page layout; confirm against a live page.
            block = ranges[4][:-1]
            address = ", ".join([x.tail.strip() for x in block])
            phone = page.xpath(
                "//strong[contains(text(), 'Phone')]")[0].tail
            fax = page.xpath(
                "//strong[contains(text(), 'Fax')]")[0].tail

            for el in page.xpath('//table[@class="page"]'):
                rep_link = el.xpath('tr/td/title')[0]
                full_name = rep_link.text
                # The party letter sits in the second-to-last position
                # of the title text, e.g. "Jane Doe (R)".
                party = full_name[-2]
                full_name = full_name[0:-3]

                # Truncated form of "Vacant Position" after the slice.
                if full_name == 'Vacant Posit':
                    continue

                if party == "D":
                    party = "Democratic"
                elif party == "R":
                    party = "Republican"

                leg = Legislator(term, chamber, str(district), full_name,
                                 party=party, url=rep_url)
                leg.add_office('capitol', 'Capitol Office',
                               address=address, phone=phone, fax=fax)
                # Yet, no email.

                # Committee table: rows of (name, role) cells.
                committees = page.xpath("//table[@class='billLinks']")[0]
                for committee in committees.xpath(".//tr"):
                    td = committee.xpath(".//td")
                    # A row without exactly two cells ends the listing.
                    if len(td) != 2:
                        break

                    name, role = td
                    name, role = name.text_content(), role.text_content()
                    name, role = name.strip(), role.strip()
                    # Skip decorative and header rows.
                    if name[0] == "|":
                        continue
                    if name.strip() == "Committee Name":
                        continue

                    chmbr = chamber
                    if "joint" in name.lower():
                        chmbr = "joint"
                    if name in JOINT_COMMITTEE_OVERRIDE:
                        chmbr = "joint"

                    leg.add_role('committee member', term=term,
                                 chamber=chmbr, committee=name,
                                 position=role)

                leg.add_source(rep_url)
                self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Tennessee legislators for one chamber.

    Uses the archive roster URL for non-current terms and the live
    member index otherwise; member names and photo URLs are derived
    from the district number.
    """
    self.validate_term(term, latest_only=False)
    root_url = 'http://www.capitol.tn.gov/'
    parties = {'D': 'Democratic', 'R': 'Republican',
               'CCR': 'Carter County Republican',
               'I': 'Independent'}

    #testing for chamber
    if chamber == 'upper':
        url_chamber_name = 'senate'
        abbr = 's'
    else:
        url_chamber_name = 'house'
        abbr = 'h'

    # Older terms live under an /archives/<term>GA/ path.
    if term != self.metadata["terms"][-1]["sessions"][0]:
        chamber_url = root_url + url_chamber_name
        chamber_url += '/archives/' + term + 'GA/Members/index.html'
    else:
        chamber_url = root_url + url_chamber_name + '/members/'

    page = self.urlopen(chamber_url)
    page = lxml.html.fromstring(page)

    for row in page.xpath("//tr")[1:]:
        # Skip any a header row.
        if set(child.tag for child in row) == set(['th']):
            continue

        partyInit = row.xpath('td[2]')[0].text.split()[0]
        party = parties[partyInit]
        district = row.xpath('td[4]/a')[0].text.split()[1]
        address = row.xpath('td[5]')[0].text_content()
        # 301 6th Avenue North Suite
        # Expand the building abbreviations used in the roster table.
        address = address.replace(
            'LP', 'Legislative Plaza\nNashville, TN 37243')
        address = address.replace(
            'WMB', 'War Memorial Building\nNashville, TN 37243')
        address = '301 6th Avenue North\nSuite ' + address

        phone = row.xpath('td[6]')[0].text
        #special case for Karen D. Camper
        if phone == None:
            phone = row.xpath('td[6]/div')[0].text
        phone = '615-' + phone.split()[0]

        email = row.xpath('td[7]/a')[0].text
        # Member page and photo URLs follow a fixed pattern built from
        # the chamber abbreviation and district number.
        member_url = (root_url + url_chamber_name + '/members/' + abbr +
                      district + '.html')
        member_photo_url = (root_url + url_chamber_name +
                            '/members/images/' + abbr + district + '.jpg')

        member_page = self.urlopen(member_url)
        member_page = lxml.html.fromstring(member_page)
        name = member_page.xpath('//div[@id="membertitle"]/h2')[0].text
        # Strip the honorific prefix by a fixed character offset:
        # "Speaker " (8), "Lt. Governor " (13), "Rep. " (5 for house),
        # otherwise "Senator " (8).  Offsets depend on the site's
        # exact title strings.
        if 'Speaker' in name:
            full_name = name[8:len(name)]
        elif 'Lt.' in name:
            full_name = name[13:len(name)]
        elif abbr == 'h':
            full_name = name[5:len(name)]
        else:
            full_name = name[8:len(name)]

    leg = Legislator(term, chamber, district, full_name.strip(),
                     party=party, email=email, url=member_url,
                     photo_url=member_photo_url)
    leg.add_source(chamber_url)
    leg.add_source(member_url)

    # TODO: add district address from this page
    leg.add_office('capitol', 'Nashville Address',
                   address=address, phone=phone)

    self.save_legislator(leg)
def scrape(self, chamber, term):
    """Collect Iowa legislators for the 2011-2012 session.

    Walks the chamber roster table, builds one Legislator per member
    link, and records every committee membership listed on the member's
    detail page.
    """
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    chamber_name = 'senate' if chamber == 'upper' else 'house'
    url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name

    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)
    roster = page.xpath('//table[@class="legis"]')[0]

    for member_link in roster.xpath(
            ".//a[contains(@href, 'legislator.aspx')]"):
        member_name = member_link.text.strip()
        member_district = member_link.xpath("string(../../td[2])")
        member_party = member_link.xpath("string(../../td[3])")
        member_email = member_link.xpath("string(../../td[5])")

        if member_party == 'Democrat':
            member_party = 'Democratic'

        pid = re.search("PID=(\d+)", member_link.attrib['href']).group(1)
        photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                     "?GA=84&PID=%s" % pid)

        leg = Legislator(term, chamber, member_district, member_name,
                         party=member_party, email_address=member_email,
                         photo_url=photo_url)
        leg.add_source(url)

        detail_page = lxml.html.fromstring(
            self.urlopen(member_link.attrib['href']))

        for comm_link in detail_page.xpath(
                "//a[contains(@href, 'committee')]"):
            comm = comm_link.text.strip()
            # A trailing "(role)" suffix names the member's position.
            role_match = re.search(r'\((.+)\)$', comm)
            mtype = 'member'
            if role_match:
                comm = re.sub(r'\((.+)\)$', '', comm).strip()
                mtype = role_match.group(1).lower()

            if comm.endswith('Appropriations Subcommittee'):
                # Subcommittees roll up under the main Appropriations
                # committee.
                sub = re.match('^(.+) Appropriations Subcommittee$',
                               comm).group(1)
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub, position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm, position=mtype)

        self.save_legislator(leg)
def scrape(self, term, chambers):
    """Scrape New Jersey legislators for *term* from the ROSTER/LEGBIO
    DBF exports.

    Saves one Legislator per active roster record, with a district
    office built from the roster address columns.
    """
    year_abr = term[0:4]
    file_url, db = self.get_dbf(year_abr, 'ROSTER')
    bio_url, bio_db = self.get_dbf(year_abr, 'LEGBIO')

    # Map roster key -> photo URL from the bio table.
    photos = {}
    for rec in bio_db:
        photos[rec['roster_key']] = rec['urlpicture']

    for rec in db:
        first_name = rec["firstname"]
        middle_name = rec["midname"]
        last_name = rec["lastname"]
        suffix = rec["suffix"]
        # Join only the non-empty name parts.  The previous
        # concatenate-then-chop approach (full_name[0:len(full_name)-1])
        # silently removed the final character of the suffix whenever a
        # suffix was present (e.g. "Jr." became "Jr").
        full_name = ' '.join(
            part for part in (first_name, middle_name, last_name, suffix)
            if part and part.strip())

        district = int(rec["district"])

        party = rec["party"]
        if party == 'R':
            party = "Republican"
        elif party == 'D':
            party = "Democratic"

        chamber = rec["house"]
        if chamber == 'A':
            chamber = "lower"
        elif chamber == 'S':
            chamber = "upper"

        leg_status = rec["legstatus"]
        # skip Deceased/Retired members
        if leg_status != 'Active':
            continue

        title = rec["title"]
        legal_position = rec["legpos"]
        phone = rec["phone"]
        if 'email' in rec:
            email = rec["email"]
        else:
            email = ''

        # Guard the photo lookup the same way the CSV-based scraper
        # does: a missing bio record should not abort the whole run.
        try:
            photo_url = photos[rec['roster_key']]
        except KeyError:
            photo_url = ''
            self.warning('no photo url for %s', rec['roster_key'])

        url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
               str(int(rec['roster_key'])))
        address = '{0}\n{1}, {2} {3}'.format(rec['address'], rec['city'],
                                             rec['state'], rec['zipcode'])
        gender = {'M': 'Male', 'F': 'Female'}[rec['sex']]

        leg = Legislator(term, chamber, str(district), full_name,
                         first_name, last_name, middle_name, party,
                         suffixes=suffix, title=title,
                         legal_position=legal_position, email=email,
                         url=url, photo_url=photo_url, gender=gender)
        leg.add_source(url)
        leg.add_source(file_url)
        leg.add_office('district', 'District Office', address=address,
                       phone=rec['phone'])
        self.save_legislator(leg)
def _scrape_representative(self, url, term, parties):
    """
    Returns a Legislator object representing a member of the lower
    legislative chamber.
    """
    #url = self.get(url).text.replace('<br>', '')
    member_page = self.lxmlize(url)

    photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
    # A bare "/.jpg" src means the member has no photo uploaded.
    if photo_url.endswith('/.jpg'):
        photo_url = None

    scraped_name, district_text = member_page.xpath(
        '//div[@class="member-info"]/h2')
    scraped_name = scraped_name.text_content().strip().replace('Rep. ', '')
    # Collapse internal runs of whitespace to single spaces.
    scraped_name = ' '.join(scraped_name.split())

    name = scraped_name
    district_text = district_text.text_content().strip()
    district = str(self.district_re.search(district_text).group(1))

    # Vacant house "members" are named after their district numbers:
    if re.match(r'^\d+$', scraped_name):
        return None

    party = parties[district]

    legislator = Legislator(term, 'lower', district, name, party=party,
                            url=url, _scraped_name=scraped_name)
    if photo_url is not None:
        legislator['photo_url'] = photo_url
    legislator.add_source(url)

    def office_name(element):
        """Returns the office address type."""
        # The <h4> immediately preceding the office <p> carries the
        # label, e.g. "Capitol Address:".
        return element.xpath('preceding-sibling::h4[1]/text()')[0] \
            .rstrip(':')

    offices_text = [{
        'name': office_name(p_tag),
        'type': office_name(p_tag).replace(' Address', '').lower(),
        'details': p_tag.text_content()
    } for p_tag in member_page.xpath(
        '//h4/following-sibling::p[@class="double-space"]')]

    # NOTE: entries may be appended to offices_text *during* this loop
    # (see the multiple-district-office case below); appending while
    # iterating is intentional so the split offices are also processed.
    for office_text in offices_text:
        details = office_text['details'].strip()

        # A few member pages have blank office listings:
        if details == '':
            continue

        # At the time of writing, this case of multiple district
        # offices occurs exactly once, for the representative at
        # District 43:
        if details.count('Office') > 1:
            district_offices = [
                district_office.strip()
                for district_office in re.findall(
                    r'(\w+ Office.+?(?=\w+ Office|$))', details,
                    flags=re.DOTALL)
            ]
            offices_text += [{
                'name': re.match(r'\w+ Office', office).group(),
                'type': 'district',
                'details': re.search(
                    r'(?<=Office).+(?=\w+ Office|$)?', office,
                    re.DOTALL).group()
            } for office in district_offices]

        match = self.address_re.search(details)
        if match is not None:
            # Normalize line endings and strip trailing spaces on each
            # line of the matched address.
            address = re.sub(
                ' +$', '',
                match.group().replace('\r', '').replace('\n\n', '\n'),
                flags=re.MULTILINE)
        else:
            # No valid address found in the details.
            continue

        phone_number = extract_phone(details)
        fax_number = extract_fax(details)

        legislator.add_office(office_text['type'], office_text['name'],
                              address=address, phone=phone_number,
                              fax=fax_number)

    return legislator
def scrape_member(self, chamber, year, member_url):
    """Scrape a single Kentucky legislator's detail page and save the
    resulting Legislator.

    Extracts name/party from the page's first <strong> element,
    district and extra roles from the <b> elements, and the Frankfort
    address / annex phone from the contact table.
    """
    with self.urlopen(member_url) as member_page:
        member = {}
        member_root = lxml.html.fromstring(member_page)

        table = member_root.xpath('//body/div[2]/table')[0]
        imgtag = member_root.xpath('//body/div[2]/table//img')
        member['photo_url'] = imgtag[0].get('src')

        # First <strong> holds "Title First ... Last (P)".
        name_list = [
            mem.text for mem in table.iterdescendants(tag='strong')
        ][0].split(' ')
        member['full_name'] = ' '.join(name_list[1:-1]).strip()

        party = name_list[-1]
        party = re.sub(r'\(|\)', '', party)
        if party == 'R':
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'
        elif party == 'I':
            party = 'Independent'
        member['party'] = party

        boldList = [bold.text for bold in table.iterdescendants(tag='b')]
        for item in boldList:
            # 'is None' instead of '== None'; dict.has_key() was also
            # replaced with setdefault (has_key does not exist on py3).
            if item is None:
                continue
            elif 'District' in item:
                district = item.split(' ')[-1]
                member['district'] = district.strip()
            else:
                member.setdefault('additionalRoles', []).append(item)

        contact_rows = member_root.xpath(
            '//body/div[2]/div[1]/table/tr/td/table[1]/tr')
        for row in contact_rows:
            row_text = self.get_child_text(row)
            if len(row_text) > 0:
                if row_text[0] == 'Frankfort Address(es)':
                    member['office_address'] = '\n'.join(row_text[1:])
                if row_text[0] == 'Phone Number(s)':
                    for item in row_text:
                        # Use the first capitol annex phone
                        if item.startswith('Annex:'):
                            member['office_phone'] = item.replace(
                                'Annex:', '').strip()
                            break

        # .get() so a page without the contact rows no longer raises
        # KeyError here; the legislator is saved with empty contact info.
        leg = Legislator(year, chamber, member['district'],
                         member['full_name'],
                         party=member['party'],
                         photo_url=member['photo_url'],
                         office_address=member.get('office_address', ''),
                         office_phone=member.get('office_phone', ''))
        leg.add_source(member_url)

        for role in member.get('additionalRoles', []):
            leg.add_role(role, year, member['party'])

        self.save_legislator(leg)
def scrape_house(self, term):
    """Scrape Puerto Rico House members from the chamber roster page.

    The page has two tables: district-based members first, then
    at-large members (with fewer columns).  Capitol phone/fax come from
    labeled <b> elements; district office details are parsed from free
    text.
    """
    url = 'http://www.camaraderepresentantes.org/cr_legs.asp'
    party_map = {'PNP': 'Partido Nuevo Progresista',
                 'PPD': u'Partido Popular Democr\xe1tico'}

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        tables = doc.xpath('//table[@width="90%"]')

        # first table is district-based, second is at-large
        for table, at_large in zip(tables, [False, True]):
            for tr in table.xpath('.//tr')[1:]:
                tds = tr.getchildren()
                if not at_large:
                    # tds: name, district, addr, phone, office, email
                    name = tds[0]
                    district = tds[1].text_content().lstrip('0')
                    capitol_office = tds[2]
                    phone = tds[3]
                    email = tds[5]

                    # district offices
                    district_office = tds[4]
                    district_addr = []
                    district_phone = None
                    district_fax = None
                    pieces = district_office.xpath('.//text()')
                    for piece in pieces:
                        if piece.startswith('Tel'):
                            district_phone = PHONE_RE.findall(piece)[0]
                        elif piece.startswith('Fax'):
                            district_fax = PHONE_RE.findall(piece)[0]
                        else:
                            district_addr.append(piece)
                    if district_addr:
                        district_addr = ' '.join(district_addr)
                else:
                    # name, addr, phone, email
                    name = tds[0]
                    district = 'At-Large'
                    capitol_office = tds[1]
                    phone = tds[2]
                    email = tds[3]
                    district_addr = None

                # cleanup is same for both tables
                name = re.sub(r'\s+', ' ',
                              name.text_content().strip().replace(
                                  u'\xa0', ' '))
                email = email.xpath('.//a/@href')[0]
                # BUG FIX: str.strip('mailto:') strips the *character
                # set* {m,a,i,l,t,o,:} from both ends, mangling any
                # address that ends in one of those letters (e.g.
                # ".com").  Remove the scheme prefix explicitly instead.
                if email.startswith('mailto:'):
                    email = email[len('mailto:'):]

                numbers = {}
                for b in phone.xpath('b'):
                    numbers[b.text] = b.tail.strip()

                # capitol_office as provided is junk
                # things like 'Basement', and '2nd Floor'

                # urls @ http://www.camaraderepresentantes.org/legs2.asp?r=BOKCADHRTZ
                # where random chars are tr's id
                # NOTE(review): leg_url is computed but the Legislator
                # below is given url=url (the roster page) — possibly
                # intentional; confirm before changing.
                leg_url = ('http://www.camaraderepresentantes.org/'
                           'legs2.asp?r=' + tr.get('id'))

                leg = Legislator(term, 'lower', district, name,
                                 party='unknown', email=email, url=url)
                leg.add_office('capitol', 'Oficina del Capitolio',
                               phone=numbers.get('Tel:') or None,
                               # could also add TTY
                               #tty=numbers.get('TTY:') or None,
                               fax=numbers.get('Fax:') or None)
                if district_addr:
                    leg.add_office('district', 'Oficina de Distrito',
                                   address=district_addr,
                                   phone=district_phone,
                                   fax=district_fax)
                leg.add_source(url)
                self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Illinois legislators for one chamber from the member list
    page, then each member's detail page for photo, email, and the
    Springfield/district office tables.
    """
    term_slug = term[:-2]
    url = MEMBER_LIST_URL[chamber] % term_slug

    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # NOTE(review): the member roster is assumed to be the 5th table on
    # the page, with two header rows — position-dependent on the site
    # layout.
    for row in doc.xpath('//table')[4].xpath('tr')[2:]:
        name, _, _, district, party = row.xpath('td')
        district = district.text
        party = {'D': 'Democratic', 'R': 'Republican',
                 'I': 'Independent'}[party.text]
        leg_url = name.xpath('a/@href')[0]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            continue

        leg_html = self.urlopen(leg_url)
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]

        leg = Legislator(term, chamber, district, name, party=party,
                         url=leg_url, photo_url=photo_url)
        leg.add_source(url)
        leg.add_source(leg_url)

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            leg['email'] = email[0].tail

        # function for turning an IL contact info table to office details
        # (closes over `leg` so it can attach the office directly).
        def _table_to_office(table, office_type, office_name):
            addr = ''
            phone = ''
            fax = None
            for row in table.xpath('tr'):
                row = row.text_content().strip()
                # skip rows that aren't part of address
                if 'Office:' in row or row == 'Cook County':
                    continue
                # fax number row ends with FAX
                elif 'FAX' in row:
                    fax = row.replace(' FAX', '')
                # phone number starts with ( [make it more specific?]
                elif row.startswith('('):
                    phone = row
                # everything else is an address
                else:
                    addr += (row + '\n')
            leg.add_office(office_type, office_name,
                           address=addr.strip(), phone=phone, fax=fax)

        # extract both offices from tables
        # NOTE(review): index [3] picks a specific nested table among
        # those whose text mentions the office — layout-dependent.
        table = leg_doc.xpath(
            '//table[contains(string(), "Springfield Office")]')[3]
        _table_to_office(table, 'capitol', 'Springfield Office')

        table = leg_doc.xpath(
            '//table[contains(string(), "District Office")]')[3]
        _table_to_office(table, 'district', 'District Office')

        self.save_legislator(leg)
def parse_legislator(self, tr, term, chamber,
                     strip=methodcaller('strip'),
                     xpath='td[contains(@class, "views-field-field-%s-%s")]%s',
                     xp={'url': ('lname-value-1', '/a/@href'),
                         'district': ('district-value', '/text()'),
                         'party': ('party-value', '/text()'),
                         'full_name': ('feedbackurl-value', '/a/text()'),
                         'address': ('feedbackurl-value', '/p/text()')},
                     titles={'upper': 'senator', 'lower': 'member'},
                     funcs={
                         'full_name': lambda s: s.replace(
                             'Contact Senator', '').strip(),
                         'address': parse_address,
                     }):
    '''
    Given a tr element, get specific data from it.

    Returns a Legislator with an `offices` list: the first parsed
    address becomes the Capitol Office, the rest District Offices.
    '''
    rubberstamp = lambda _: _
    tr_xpath = tr.xpath

    res = {}
    for k, v in xp.items():
        f = funcs.get(k, rubberstamp)
        v = (titles[chamber],) + v
        # List comprehension (eager) instead of map() so len() also
        # works on Python 3, where map returns an iterator.
        v = [f(strip(s)) for s in tr_xpath(xpath % v)]
        if len(v) == 1:
            res[k] = v[0]
        else:
            res[k] = v

    # Photo.
    try:
        res['photo_url'] = tr_xpath('td/p/img/@src')[0]
    except IndexError:
        pass

    # Addresses.  Build a fresh list rather than calling list.remove()
    # inside the iteration: the old code mutated the list while
    # iterating it, which silently skipped the element following each
    # removed (zip-less) entry.
    addresses = []
    for parsed in res['address']:
        if not parsed:
            continue
        addr = dict(parsed)
        try:
            addr['zip'] = addr['zip'].replace('CA ', '')
        except KeyError:
            # No zip? Toss.
            continue
        addresses.append(addr)

    # Re-key the addresses
    addresses[0].update(type='capitol', name='Capitol Office')
    offices = [addresses[0]]
    for office in addresses[1:]:
        office.update(type='district', name='District Office')
        offices.append(office)

    for office in offices:
        street = office['street']
        street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                    office['zip'])
        office['address'] = street
        office['fax'] = None
        office['email'] = None
        del office['street'], office['city'], office['zip']

    res['offices'] = offices
    del res['address']

    # Remove junk from assembly member names.
    junk = 'Contact Assembly Member '
    res['full_name'] = res['full_name'].replace(junk, '')

    # convert party
    if res['party'] == 'Democrat':
        res['party'] = 'Democratic'

    # strip leading zero
    res['district'] = str(int(res['district']))

    # Add a source for the url.
    leg = Legislator(term, chamber, **res)
    leg.update(**res)

    return leg
def _scrape_upper_chamber(self, term):
    """Scrape Minnesota senators.

    Phase 1 collects phone/url/photo/email per member from the index
    page; phase 2 merges in the authoritative CSV roster and saves one
    Legislator per CSV row.
    """
    index_url = 'http://www.senate.mn/members/index.php'
    doc = lxml.html.fromstring(self.get(index_url).text)
    doc.make_links_absolute(index_url)

    leg_data = defaultdict(dict)

    # get all the tds in a certain div
    tds = doc.xpath(
        '//div[@id="hide_show_alpha_all"]//td[@style="vertical-align:top;"]'
    )
    for td in tds:
        # each td has 2 <a>s- site & email
        main_link, email = td.xpath('.//a')
        # get name
        name = main_link.text_content().split(' (')[0]
        leg = leg_data[name]
        leg['office_phone'] = filter(
            lambda string: re.match(r'\d{3}-\d{3}-\d{4}', string),
            td.xpath('.//p/text()'))[0].strip()
        leg['url'] = main_link.get('href')
        leg['photo_url'] = td.xpath('./preceding-sibling::td//img/@src')[0]
        if 'mailto:' in email.get('href'):
            leg['email'] = email.get('href').replace('mailto:', '')

    self.info('collected preliminary data on %s legislators',
              len(leg_data))
    assert leg_data

    # use CSV for most of data
    csv_url = 'http://www.senate.mn/members/member_list_ascii.php?ls='
    csvfile = self.get(csv_url).text

    for row in csv.DictReader(StringIO(csvfile)):
        if not row['First Name']:
            continue
        name = '%s %s' % (row['First Name'], row['Last Name'])
        party = self._parties[row['Party']]
        # Touch the defaultdict so the key exists even for members not
        # seen on the index page.
        leg_data[name]
        if 'email' in leg_data[name]:
            email = leg_data[name].pop('email')
        else:
            email = None

        leg = Legislator(term, 'upper', row['District'].lstrip('0'),
                         name, party=party,
                         first_name=row['First Name'],
                         last_name=row['Last Name'],
                         **leg_data[name])

        row["Zipcode"] = row["Zipcode"].strip()

        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get('Address'), row.get('Office Building')]
        address2_fields = [row.get('Address2'), row.get('Office Address')]
        row['Address'] = next(
            (a for a in address1_fields if a is not None), False)
        row['Address2'] = next(
            (a for a in address2_fields if a is not None), False)

        # BUG FIX: the old code tested a bare generator expression,
        # which is always truthy, so the elif/else district-office
        # branches below were unreachable.  Use any() — guarded, since
        # Address2 may be the False sentinel — to actually test for a
        # capitol-complex address.
        if row['Address2'] and any(
                a in row['Address2'] for a in
                ['95 University Avenue W',
                 '100 Rev. Dr. Martin Luther King']):
            leg.add_office('capitol', 'Capitol Office',
                address='{Room} {Address}\n{Address2}\n{City}, {State} '
                        '{Zipcode}'.format(Room=row['Rm. Number'], **row),
                email=email, phone=leg.get('office_phone'))
        elif row['Address2']:
            leg.add_office(
                'district', 'District Office',
                address='{Address}\n{Address2}\n{City}, {State} {Zipcode}'.
                format(**row), email=email)
        else:
            leg.add_office(
                'district', 'District Office',
                address='{Address}\n{City}, {State} {Zipcode}'.format(
                    **row), email=email)

        leg.add_source(csv_url)
        leg.add_source(index_url)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Iowa legislators for the latest term.

    Walks the roster table for one chamber, then each member's detail
    page for office contact info (by known div ids) and committee
    memberships.
    """
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
        chamber_name = 'senate'
    else:
        chamber_name = 'house'

    url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    table = page.xpath('//table[@class="legis"]')[0]
    for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        district = link.xpath("string(../../td[2])")
        party = link.xpath("string(../../td[3])")
        email = link.xpath("string(../../td[5])")

        if party == 'Democrat':
            party = 'Democratic'

        # Photos are served by PID, extracted from the member link.
        pid = re.search("PID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                     "?GA=84&PID=%s" % pid)

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email, photo_url=photo_url, url=url)
        leg.add_source(url)

        leg_page = lxml.html.fromstring(
            self.urlopen(link.attrib['href']))

        # Contact fields live in divs with fixed ASP.NET control ids;
        # each div holds a (label, value) pair of spans.
        office_data = {
            "email": "ctl00_cphMainContent_divEmailLegis",
            "home_phone": "ctl00_cphMainContent_divPhoneHome",
            "home_addr": "ctl00_cphMainContent_divAddrHome",
            "office_phone": "ctl00_cphMainContent_divPhoneCapitol",
        }
        metainf = {}

        for attr in office_data:
            path = office_data[attr]
            info = leg_page.xpath("//div[@id='%s']" % path)
            if len(info) != 1:
                continue
            info = info[0]
            # Second span is the value; the first is the label.
            _, data = [x.text_content() for x in info.xpath("./span")]
            data = data.strip()
            if data == "":
                continue
            metainf[attr] = data

        if "home_phone" in metainf or "home_addr" in metainf:
            home_args = {}
            if "home_phone" in metainf:
                home_args['phone'] = metainf['home_phone']
            if "home_addr" in metainf:
                home_args['address'] = metainf['home_addr']
            leg.add_office('district', 'Home Office', **home_args)

        if "email" in metainf or "office_phone" in metainf:
            cap_args = {}
            if "email" in metainf:
                cap_args['email'] = metainf['email']
            if "office_phone" in metainf:
                cap_args['phone'] = metainf['office_phone']
            leg.add_office('capitol', 'Capitol Office', **cap_args)

        comm_path = "//a[contains(@href, 'committee')]"
        for comm_link in leg_page.xpath(comm_path):
            comm = comm_link.text.strip()
            # Trailing "(role)" suffix names the member's position.
            match = re.search(r'\((.+)\)$', comm)
            if match:
                comm = re.sub(r'\((.+)\)$', '', comm).strip()
                mtype = match.group(1).lower()
            else:
                mtype = 'member'

            if comm.endswith('Appropriations Subcommittee'):
                # Subcommittees roll up under Appropriations.
                sub = re.match('^(.+) Appropriations Subcommittee$',
                               comm).group(1)
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub, position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm, position=mtype)

        self.save_legislator(leg)
def _scrape_lower_chamber(self, term):
    """Scrape Minnesota House members from the alphabetical index.

    Name, district and party are parsed out of the bolded link text
    ("Name (NNX P)"); address/phone/email come from the text nodes that
    follow <br> tags in the info cell.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page, '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    need_special_email_case = False
    for legislator_node in legislator_nodes:
        photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')

        info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

        name_text = self.get_node(info_nodes[0], './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0)
        name = name.replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district_text = district_match.group(0)
        district = district_text.replace('(', '').lstrip('0').strip()

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0)
        party_text = party_text.replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling'
            '::br]')
        address = '\n'.join((info_texts[0], info_texts[1]))

        # BUG FIX: initialize per-member so a failed validation can no
        # longer raise NameError on the first member or silently reuse
        # the previous member's phone/e-mail on later ones.
        phone = None
        email = None

        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        # E-mail markup is screwed-up and inconsistent.
        try:
            email_node = info_nodes[1]
            email_text = email_node.text
        except IndexError:
            # Primarily for Dan Fabian.
            # BUG FIX: the old code assigned the fallback text to
            # email_node (never read again) and left email_text unset,
            # which raised NameError whenever this branch ran.
            email_text = info_texts[3]
            need_special_email_case = True

        email_text = email_text.replace('Email: ', '').strip()
        if validate_email_address(email_text):
            email = email_text

        legislator = Legislator(
            term=term,
            chamber='lower',
            district=district,
            full_name=name,
            party=party,
            email=email,
            photo_url=photo_url,
        )
        legislator.add_source(url)

        legislator.add_office(
            type='capitol',
            name="Capitol Office",
            address=address,
            phone=phone,
            email=email,
        )

        self.save_legislator(legislator)

    # If the workaround branch never fired, the site markup was likely
    # fixed and the fallback can be removed.
    if not need_special_email_case:
        self.logger.warning('Special e-mail handling no longer required.')
def scrape_legislators(self, term, chamber, leg_page, member_url,
                       main_url, member):
    """Build and save one Nebraska Legislator from a member detail page.

    All fields live in a fixed nested table; rows 1-6 hold (in order)
    name, address, phone, email, district, and party.  Members flagged
    via ``member.tail`` are skipped.
    """
    row_base = '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr'

    def cell(suffix):
        """Return the first node matched by the shared row prefix plus
        *suffix* (e.g. '[5]/td[2]')."""
        return leg_page.xpath(row_base + suffix)[0]

    full_name = cell('[1]/td/h2').text
    parts = full_name.split()
    # The first token is an honorific; drop it and rebuild the name.
    if len(parts) == 3:
        first_name, middle_name, last_name = parts[1], '', parts[2]
        full_name = ' '.join((first_name, last_name))
    else:
        first_name, middle_name, last_name = parts[1], parts[2], parts[3]
        full_name = ' '.join((first_name, middle_name, last_name))

    district = cell('[5]/td[2]').text
    party = cell('[6]/td[2]').text.strip()
    full_address = cell('[2]/td[2]').text
    phone = cell('[3]/td[2]').text
    email = cell('[4]/td[2]/a').text

    if member.tail:
        logger.info("Skipping legislator because: %s" % (member.tail))
        return

    if chamber == 'lower':
        photo_url = leg_page.xpath(
            '//img[contains(@src, "representatives")]')[0].get('src')
    else:
        candidates = leg_page.xpath('//img[contains(@src, "senators")]')
        photo_url = candidates[0].get('src') if candidates else None

    if party == 'Democrat':
        party = 'Democratic'

    kwargs = {"url": member_url}
    if photo_url:
        kwargs['photo_url'] = photo_url

    leg = Legislator(term, chamber, district, full_name, first_name,
                     last_name, middle_name, party, **kwargs)
    leg.add_office('district', 'District Office', address=full_address,
                   phone=phone, email=email)
    leg.add_source(member_url)
    leg.add_source(main_url)
    self.save_legislator(leg)