def get_member(self, session, chamber, kpid):
    """Yield a ``Person`` built from the API record of one legislator.

    The member's JSON record is fetched from ``ksapi.url`` and combined
    with the photo found on the member's kslegislature.org bio page.

    Args:
        session: Session identifier such as ``"2019-2020"``; must be one
            of the keys of the slug table below.
        chamber: Value passed through as the person's ``primary_org``.
        kpid: Member identifier used in both the API and bio page URLs.

    Yields:
        A single ``Person`` with party, district, image, occupation,
        Capitol Office contact details, the API source, and (when the bio
        page could be fetched) a link to that page.

    Raises:
        KeyError: If ``session`` has no entry in the slug table.
        ValueError: If the bio page loads but does not contain exactly one
            ``img.profile-picture`` element.
    """
    url = "%smembers/%s" % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)["content"]

    party = content["PARTY"]
    if party == "Democrat":
        party = "Democratic"

    # Maps the session name to the path segment used by kslegislature.org.
    slug = {
        "2013-2014": "b2013_14",
        "2015-2016": "b2015_16",
        "2017-2018": "b2017_18",
        "2019-2020": "b2019_20",
    }[session]
    leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        # Tuple unpacking deliberately requires exactly one match.
        (photo_url,) = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src'
        )
    except scrapelib.HTTPError:
        self.warning(
            "{}'s legislator bio page not found".format(content["FULLNAME"])
        )
        leg_url = ""
        photo_url = ""

    person = Person(
        name=content["FULLNAME"],
        district=str(content["DISTRICT"]),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {"occupation": content["OCCUPATION"]}

    address = "\n".join(
        [
            "Room {}".format(content["OFFICENUM"]),
            "Kansas State Capitol Building",
            "300 SW 10th St.",
            "Topeka, KS 66612",
        ]
    )

    note = "Capitol Office"
    person.add_contact_detail(type="address", value=address, note=note)
    person.add_contact_detail(type="email", value=content["EMAIL"], note=note)
    if content["OFFPH"]:
        person.add_contact_detail(type="voice", value=content["OFFPH"], note=note)

    person.add_source(url)
    # leg_url is blanked when the bio page is missing; do not attach an
    # empty URL as a link in that case.
    if leg_url:
        person.add_link(leg_url)

    yield person
def _scrape_lower_chamber(self):
    """Scrape the lower-chamber roster and yield a ``Person`` per member.

    The roster table at ``self._reps_url`` is walked row by row (the first
    three rows are skipped). Rows whose last name is ``"Vacant"`` are
    passed to ``self._save_vacant_legislator`` and are not yielded. Every
    other row triggers a fetch of the member's details page, from which the
    email address and photo are taken.

    Yields:
        ``Person`` objects for seated members.
    """
    self.info("Scraping lower chamber for legislators.")

    chamber = "lower"
    office_note = "Capitol Office"

    # Parties for members whose roster row has a blank party column. Keys
    # are the full names as built below (a leading space means the first
    # name column was empty).
    party_override = {" Green": "Democratic", " Sisco": "Republican"}

    roster_url = self._reps_url
    page = lxml.html.fromstring(self.get(roster_url).text)

    # This is the ASP.net table container
    table_xpath = "//table[@id='theTable']"
    table = page.xpath(table_xpath)[0]

    for tr in table.xpath("tr")[3:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if tr.attrib.get("class") == "dxgvEmptyDataRow":
            self.warning("No House members found")
            return

        tds = tr.xpath("td")
        last_name = tds[1].text_content().strip()
        first_name = tds[2].text_content().strip()
        full_name = "{} {}".format(first_name, last_name)
        district = str(int(tds[3].text_content().strip()))
        is_vacant = last_name == "Vacant"

        party = tds[4].text_content().strip()
        if party == "D":
            party = "Democratic"
        elif party == "R":
            party = "Republican"

        if party == "":
            # Seated members may have a known party in party_override;
            # consult it BEFORE the generic fallback, otherwise the
            # override can never apply. Workaround for now: anything still
            # unknown becomes "Other".
            if is_vacant:
                party = "Other"
            else:
                party = party_override.get(full_name, "Other")

        phone = tds[6].text_content().strip()
        room = tds[7].text_content().strip()
        address = self._assumed_address_fmt.format(room)

        if is_vacant:
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {"first_name": first_name, "last_name": last_name}
            person.add_contact_detail(
                type="address", value=address, note=office_note
            )
            if phone:
                person.add_contact_detail(
                    type="voice", value=phone, note=office_note
                )
            person.add_source(roster_url)
            self._save_vacant_legislator(person)
        else:
            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)

            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {"first_name": first_name, "last_name": last_name}
            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)

            # Take the first mailto: link in the addresses block; a bare
            # "mailto:" with no address counts as no email.
            email = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"] '
                '//a[starts-with(@href,"mailto:")]/@href'
            )
            if len(email) > 0 and email[0].lower() != "mailto:":
                email = email[0].split(":")[1]
            else:
                email = None

            person.add_contact_detail(
                type="address", value=address, note=office_note
            )
            if phone:
                person.add_contact_detail(
                    type="voice", value=phone, note=office_note
                )
            if email:
                person.add_contact_detail(
                    type="email", value=email, note=office_note
                )

            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src'
            )
            if len(picture) > 0:
                person.image = picture[0]

            yield person