def scrape_member_page(self, chamber, url): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) for legislator in page.xpath( "//div[contains(concat(' ', normalize-space(@class), ' '), " "' memberModule ')]" ): img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"] data = legislator.xpath(".//div[@class='data']")[0] homepage = data.xpath(".//a[@class='black']")[0] full_name = homepage.text_content() if "Vacant" in full_name: continue homepage = homepage.attrib["href"] party = data.xpath(".//span[@class='partyLetter']")[0].text_content() party = {"R": "Republican", "D": "Democratic"}[party] office_lines = data.xpath("child::text()") phone = office_lines.pop(-1) if re.search(r"(Leader|Whip|Speaker)", office_lines[0]): office_lines.pop(0) office = "\n".join(office_lines) h3 = data.xpath("./h3") if len(h3): h3 = h3[0] district = h3.xpath("./br")[0].tail.replace("District", "").strip() else: district = re.findall(r"\d+\.png", legislator.attrib["style"])[ -1 ].split(".", 1)[0] full_name = re.sub(r"\s+", " ", full_name).strip() email = ( "rep{0:0{width}}@ohiohouse.gov" if chamber == "lower" else "sd{0:0{width}}@ohiosenate.gov" ).format(int(district), width=2) leg = Person( name=full_name, district=district, party=party, primary_org=chamber, image=img, ) leg.add_contact_detail(type="address", value=office, note="Capitol Office") leg.add_contact_detail(type="voice", value=phone, note="Capitol Office") leg.add_contact_detail(type="email", value=email, note="Capitol Office") self.scrape_homepage(leg, chamber, homepage) leg.add_source(url) leg.add_link(homepage) yield leg
def test_full_person(): person = ScrapePerson("Tom Sawyer") person.add_identifier("1") person.add_name("Tommy", start_date="1880") person.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") person.add_link("http://example.com/link") person.add_source("http://example.com/source") # import person pd = person.as_dict() PersonImporter("jid").import_data([pd]) # get person from db and assert it imported correctly p = Person.objects.get() assert "ocd-person" in p.id assert p.name == person.name assert p.identifiers.all()[0].identifier == "1" assert p.identifiers.all()[0].scheme == "" assert p.other_names.all()[0].name == "Tommy" assert p.other_names.all()[0].start_date == "1880" assert p.contact_details.all()[0].type == "phone" assert p.contact_details.all()[0].value == "555-555-1234" assert p.contact_details.all()[0].note == "this is fake" assert p.links.all()[0].url == "http://example.com/link" assert p.sources.all()[0].url == "http://example.com/source"
def scrape_lower(self, chamber): url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx" table = ["website", "district", "name", "party", "location", "phone", "email"] data = self.get(url).text doc = lxml.html.fromstring(data) # skip two rows at top for row in doc.xpath('//table[@id="grvRepInfo"]/*'): tds = row.xpath(".//td") if len(tds) == 0: continue metainf = {} for i in range(0, len(table)): metainf[table[i]] = tds[i] district = str(int(metainf["district"].text_content().strip())) party = metainf["party"].text_content().strip() phone = metainf["phone"].text_content().strip() email = metainf["email"].text_content().strip() name = metainf["name"].text_content().strip() if name == "Vacant" or re.match(r"^District \d{1,3}$", name): self.warning( "District {} appears vacant, and will be skipped".format(district) ) continue leg_url = metainf["website"].xpath("./a")[0].attrib["href"] office = metainf["location"].text_content().strip() office = re.sub( " HOB", " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933", office, ) office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office) try: photo_url = self.get_photo_url(leg_url)[0] except (scrapelib.HTTPError, IndexError): photo_url = "" self.warning("no photo url for %s", name) person = Person( name=name, district=district, party=abbr[party], primary_org="lower", image=photo_url, ) person.add_link(leg_url) person.add_source(leg_url) person.add_contact_detail( type="address", value=office, note="Capitol Office" ) person.add_contact_detail(type="voice", value=phone, note="Capitol Office") person.add_contact_detail(type="email", value=email, note="Capitol Office") yield person
def scrape_chamber(self, chamber): client = ApiClient(self) session = self.latest_session() base_url = "http://iga.in.gov/legislative" api_base_url = "https://api.iga.in.gov" chamber_name = "senate" if chamber == "upper" else "house" r = client.get("chamber_legislators", session=session, chamber=chamber_name) all_pages = client.unpaginate(r) for leg in all_pages: firstname = leg["firstName"] lastname = leg["lastName"] party = leg["party"] link = leg["link"] api_link = api_base_url + link html_link = base_url + link.replace("legislators/", "legislators/legislator_") try: html = get_with_increasing_timeout(self, html_link, fail=True, kwargs={"verify": False}) except scrapelib.HTTPError: self.logger.warning("Legislator's page is not available.") continue doc = lxml.html.fromstring(html.text) doc.make_links_absolute(html_link) address, phone = doc.xpath("//address") address = address.text_content().strip() address = "\n".join([ln.strip() for ln in address.split("\n")]) phone = phone.text_content().strip() try: district = (doc.xpath("//span[@class='district-heading']") [0].text.lower().replace("district", "").strip()) except IndexError: self.warning("skipping legislator w/o district") continue image_link = base_url + link.replace("legislators/", "portraits/legislator_") legislator = Person( primary_org=chamber, district=district, name=" ".join([firstname, lastname]), party=party, image=image_link, ) legislator.add_contact_detail(type="address", note="Capitol Office", value=address) legislator.add_contact_detail(type="voice", note="Capitol Office", value=phone) legislator.add_link(html_link) legislator.add_source(html_link) legislator.add_source(api_link) yield legislator
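# The Indiana scraper above calls a get_with_increasing_timeout() helper defined
# elsewhere in the project. A minimal sketch of the retry idea, assuming a
# scrapelib-style scraper whose .get() passes keyword arguments (e.g. verify)
# through to requests; the backoff schedule and the exact meaning of fail= are
# illustrative assumptions, not the project's actual implementation.
import time

import requests


def get_with_increasing_timeout(scraper, url, fail=False, kwargs=None):
    """Retry a GET with progressively longer timeouts before giving up."""
    kwargs = kwargs or {}
    last_error = None
    for timeout in (10, 30, 90):  # assumed backoff schedule
        try:
            return scraper.get(url, timeout=timeout, **kwargs)
        except requests.exceptions.Timeout as exc:
            last_error = exc
            scraper.warning("timed out after %ss on %s, retrying", timeout, url)
            time.sleep(1)
    if fail:
        raise last_error
    return None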
def scrape_senator_page(self, chamber, url): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) for legislator in page.xpath( "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), " "' portraitContainer ')]" ): img = legislator.xpath( ".//div[@class='profileThumbnailBoundingBox']/@style" )[0] img = img[img.find("(") + 1 : img.find(")")] full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0] homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[ 0 ].attrib["href"] district = legislator.xpath(".//div[@class='profileDistrict']" "/a/text()")[ 0 ].split("#")[1] if "Vacant" in full_name: continue homepage = self.get(homepage_url).text page = lxml.html.fromstring(homepage) phone = page.xpath("//div[@class='phone']/span/text()")[0] address_lines = page.xpath("//div[@class='address']/descendant::*/text()") address = "\n".join(address_lines) party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0] if "Republican" in party_image: party = "Republican" elif "Democrat" in party_image: party = "Democratic" email = ( "rep{0:0{width}}@ohiohouse.gov" if chamber == "lower" else "sd{0:0{width}}@ohiosenate.gov" ).format(int(district), width=2) leg = Person( name=full_name, district=district, primary_org=chamber, image=img, party=party, ) leg.add_contact_detail(type="address", value=address, note="Capitol Office") leg.add_contact_detail(type="voice", value=phone, note="Capitol Office") leg.add_contact_detail(type="email", value=email, note="Capitol Office") leg.add_source(url) leg.add_link(homepage_url) yield leg
def scrape_rep(self, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    main = page.xpath('//div[@id="main-info"]')[0]
    if "Resigned" in main.text_content():
        self.info("Member resigned {}".format(url))
        return  # don't yield anything
    if "Deceased" in main.text_content():
        self.info("Member is deceased {}".format(url))
        return  # don't yield anything

    name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
    name = re.sub(r"\s+", " ", name)

    district_number = page.xpath(
        '//span[contains(text(), "House District:")]'
        "/following-sibling::span/text()")[0].strip()
    # remove anything after first whitespace
    district_number = re.sub(r"\s.*", "", district_number.strip())

    email = None
    email_content = page.xpath(
        '//a[./i[contains(@class,"fa-envelope")]]/text()')
    if email_content and email_content[0].strip():
        email = email_content[0].strip()

    photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

    party = self.get_rep_table_by_header(page, "Party Affiliation").text.strip()
    party = _party_map[party[0]]  # standardize

    main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
    address = [t.strip() for t in main_p_text if t.strip()][0]

    person = Person(
        name=name,
        district=district_number,
        primary_org="lower",
        party=party,
        image=photo_url,
    )
    person.add_contact_detail(type="address", value=address, note="District Office")
    if email:
        person.add_contact_detail(type="email", value=email, note="District Office")

    person.add_link(url)
    person.add_source(url)
    yield person
def handle_list_item(self, row):
    if not row["First Name"]:
        return
    name = "{} {}".format(row["First Name"], row["Last Name"])
    party = PARTIES[row["Party"]]
    leg = Person(
        name=name,
        district=row["District"].lstrip("0"),
        party=party,
        primary_org="upper",
        role="Senator",
        image=self.extra_info[name]["image"],
    )
    leg.add_link(self.extra_info[name]["url"])
    leg.add_contact_detail(type="voice",
                           value=self.extra_info[name]["office_phone"],
                           note="capitol")
    if "email" in self.extra_info[name]:
        leg.add_contact_detail(type="email",
                               value=self.extra_info[name]["email"],
                               note="capitol")

    row["Zipcode"] = row["Zipcode"].strip()

    # Accommodate multiple address column naming conventions.
    address1_fields = [row.get("Address"), row.get("Office Building")]
    address2_fields = [row.get("Address2"), row.get("Office Address")]
    row["Address"] = next((a for a in address1_fields if a is not None), False)
    row["Address2"] = next((a for a in address2_fields if a is not None), False)

    # Capitol office if the second address line names one of the capitol-complex
    # buildings; otherwise treat the address as a district office.
    if row["Address2"] and any(
            a in row["Address2"]
            for a in ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]):
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        if "Rm. Number" in row:
            address = "{0} {1}".format(row["Rm. Number"], address)
        leg.add_contact_detail(type="address", value=address, note="capitol")
    elif row["Address2"]:
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")
    else:
        address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")

    leg.add_source(self.url)
    leg.add_source(self._html_url)
    return leg
def scrape_lower_legislator(self, url, leg_info): page = self.lxmlize(url) name = page.xpath( '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip() if name.startswith("District ") or name.startswith("Vacant "): self.warning("Seat is vacant: {}".format(name)) return photo = page.xpath( '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"] party_flags = { "Democrat": "Democratic", "Republican": "Republican", "Independent": "Independent", } party_info = page.xpath( '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()' )[0].strip() party = party_flags[party_info] try: email = page.xpath( '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()' )[0].strip() except IndexError: email = None district = leg_info["dist"].replace("Dist", "").strip() person = Person(name=name, party=party, district=district, primary_org="lower", image=photo) contacts = [ (leg_info["office"], "address"), (leg_info["phone"], "voice"), (email, "email"), ] for value, key in contacts: if value: person.add_contact_detail(type=key, value=value, note="District Office") person.add_source(url) person.add_link(url) yield person
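# Several of these scrapers call self.lxmlize(url), a small convenience method
# on the shared scraper base class. A plausible sketch of what it does,
# assuming a scrapelib-style .get() and lxml; the project's actual helper may
# differ in details.
import lxml.html


def lxmlize(scraper, url):
    """Fetch a URL and return an lxml tree with links made absolute."""
    page = lxml.html.fromstring(scraper.get(url).text)
    page.make_links_absolute(url)
    return page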
def handle_list_item(self, item):
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    address = "\n".join((info_texts[0], info_texts[1]))

    # Initialize to None so a failed validation does not leave these unbound.
    phone = None
    email = None

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)

    yield rep
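# handle_list_item() above depends on validate_phone_number() and
# validate_email_address() helpers defined elsewhere in the module. A minimal
# sketch, assuming simple regex checks; the real validators may be stricter.
import re

PHONE_RE = re.compile(r"^\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")


def validate_phone_number(text):
    """Return True if text looks like a 10-digit US phone number."""
    return bool(PHONE_RE.match(text.strip()))


def validate_email_address(text):
    """Return True if text looks like a plausible email address."""
    return bool(EMAIL_RE.match(text.strip()))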
def get_member(self, session, chamber, kpid): url = "%smembers/%s" % (ksapi.url, kpid) content = json.loads(self.get(url).text)["content"] party = content["PARTY"] if party == "Democrat": party = "Democratic" slug = { "2013-2014": "b2013_14", "2015-2016": "b2015_16", "2017-2018": "b2017_18", "2019-2020": "b2019_20", }[session] leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug, kpid) try: legislator_page = self.lxmlize(leg_url) (photo_url, ) = legislator_page.xpath('//img[@class="profile-picture"]/@src') except scrapelib.HTTPError: self.warning("{}'s legislator bio page not found".format( content["FULLNAME"])) leg_url = "" photo_url = "" person = Person( name=content["FULLNAME"], district=str(content["DISTRICT"]), primary_org=chamber, party=party, image=photo_url, ) person.extras = {"occupation": content["OCCUPATION"]} address = "\n".join([ "Room {}".format(content["OFFICENUM"]), "Kansas State Capitol Building", "300 SW 10th St.", "Topeka, KS 66612", ]) note = "Capitol Office" person.add_contact_detail(type="address", value=address, note=note) person.add_contact_detail(type="email", value=content["EMAIL"], note=note) if content["OFFPH"]: person.add_contact_detail(type="voice", value=content["OFFPH"], note=note) person.add_source(url) person.add_link(leg_url) yield person
def _scrape_legislator(self, row, chamber): name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0] name = " ".join([ line.strip() for line in name_cell.text_content().split("\n") if len(line.strip()) > 0 ]) party_letter = row.xpath( './td[@class="rosterCell partyCell"]/text()')[0].strip() party = dict(D="Democratic", R="Republican")[party_letter] chamber_abbr = self._chamber_map[chamber] district = (row.xpath('./td[@class="rosterCell seatCell"]' "/text()")[0].replace(chamber_abbr, "").strip()) try: email = (row.xpath('./td[@class="rosterCell emailCell"]' "/a/@href")[0].replace("mailto:", "").strip()) except IndexError: email = None phone = (row.xpath('./td[@class="rosterCell phoneCell"]' "/text()")[0].strip() or None) details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"]) response = self.get(details_url) details_page = lxml.html.fromstring(response.text) address_lines = (details_page.xpath( '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]' '/p[contains(text(), "Address")]')[0].text_content().replace( "Address", "").split("\n")) address = "\n".join( [line.strip() for line in address_lines if len(line.strip()) > 0]) legislator = Person(name=name, district=district, party=party, primary_org=chamber) legislator.add_contact_detail(type="address", value=address, note="Capitol Office") if phone is not None: legislator.add_contact_detail(type="voice", value=phone, note="Capitol Office") if email is not None: legislator.add_contact_detail(type="email", value=email, note="E-mail") legislator.add_link(details_url) legislator.add_source(self._roster_url) yield legislator
def scrape_senator(self, district): link = "https://legislature.maine.gov/District-{}".format(district) page = lxml.html.fromstring(self.get(link).text) page.make_links_absolute(link) main = page.xpath('//div[@id="main"]/div[@id="content"]')[0] title = main.xpath("h1")[0].text # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)... title_match = re.match( r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title) _, name, party = title_match.groups() name = re.sub(r"\s+", " ", name.strip()) party = _party_map[party] image_url = address = phone = email = None for p in main.xpath("p"): if p.xpath(".//img") and not image_url: image_url = p.xpath(".//img/@src")[0] continue field, _, value = p.text_content().partition(":") value = value.strip() if field in ("Address", "Mailing Address"): address = value elif field in ("Phone", "Home Phone"): phone = value elif field == "Email": email = value person = Person( name=name, district=district, image=image_url, primary_org="upper", party=party, ) person.add_link(link) person.add_source(link) if address: person.add_contact_detail(type="address", value=address, note="District Office") if phone: person.add_contact_detail(type="voice", value=clean_phone(phone), note="District Phone") person.add_contact_detail(type="email", value=email, note="District Email") yield person
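# scrape_senator() above relies on a module-level _party_map and a clean_phone()
# helper that are not shown in this section. A minimal sketch of both, assuming
# single-letter party codes and a preference for ###-###-#### phone formatting;
# the exact normalization rules are assumptions.
import re

_party_map = {"D": "Democratic", "R": "Republican", "I": "Independent"}


def clean_phone(raw):
    """Normalize a scraped phone string to ###-###-#### where possible."""
    digits = re.sub(r"\D", "", raw)
    if len(digits) == 10:
        return "{}-{}-{}".format(digits[:3], digits[3:6], digits[6:])
    return raw.strip()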
def table_row_to_legislator_and_profile_url(table_row_element, chamber): """Derive a Legislator from an HTML table row lxml Element, and a link to their profile""" td_elements = table_row_element.xpath("td") ( role_element, name_element, district_element, party_element, phone_element, email_element, ) = td_elements # Name comes in the form Last, First # last_name_first_name = name_element.text_content().strip() # full_name = last_name_first_name_to_full_name(last_name_first_name) full_name = name_element.text_content().strip() if full_name.count(", ") == 1: full_name = " ".join(full_name.split(", ")[::-1]).strip() district = district_element.text_content().strip() party = party_element.text_content().strip() if party == "Democrat": party = "Democratic" elif party == "Unaffiliated": party = "Independent" role = role_element.text_content().strip() address = co_address_from_role(role) phone = phone_element.text_content().strip() email = email_element.text_content().strip() (profile_url, ) = name_element.xpath("a/@href") print(chamber, district, party) legislator = Person(primary_org=chamber, name=full_name, district=district, party=party) legislator.add_contact_detail(type="address", value=address, note="Capitol Office") if phone: legislator.add_contact_detail(type="voice", value=phone, note="Capitol Office") if email: legislator.add_contact_detail(type="email", value=email, note="Capitol Office") return legislator, profile_url
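# table_row_to_legislator_and_profile_url() above calls co_address_from_role(),
# which turns a member's role into a Colorado capitol mailing address. The real
# mapping is not shown here; this is a hypothetical sketch for illustration,
# with placeholder chamber labels rather than actual room assignments.
def co_address_from_role(role):
    """Return a capitol mailing address for the given role (placeholder rooms)."""
    base = "200 E. Colfax Avenue\nDenver, CO 80203"
    if "Senat" in role:
        return "State Capitol, Senate Chamber\n" + base
    return "State Capitol, House Chamber\n" + base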
def scrape_chamber(self, session):
    session_key = SESSION_KEYS[session]
    legislators_response = self.api_client.get("legislators", session=session_key)

    for legislator in legislators_response:
        url_name = legislator["WebSiteUrl"].split("/")[-1]
        chamber_name = "house" if legislator["Chamber"] == "H" else "senate"
        img = "https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg".format(
            chamber_name, url_name
        )

        party = legislator["Party"]
        if party == "Democrat":
            party = "Democratic"

        person = Person(
            name="{} {}".format(legislator["FirstName"], legislator["LastName"]),
            primary_org={"S": "upper", "H": "lower"}[legislator["Chamber"]],
            party=party,
            district=legislator["DistrictNumber"],
            image=img,
        )
        person.add_link(legislator["WebSiteUrl"])
        person.add_source(legislator["WebSiteUrl"])

        if legislator["CapitolAddress"]:
            person.add_contact_detail(
                type="address",
                value=legislator["CapitolAddress"],
                note="Capitol Office",
            )
        if legislator["CapitolPhone"]:
            person.add_contact_detail(
                type="voice",
                value=legislator["CapitolPhone"],
                note="Capitol Office",
            )
        person.add_contact_detail(
            type="email", value=legislator["EmailAddress"], note="Capitol Office"
        )

        yield person
def scrape_chamber(self, chamber=None): if chamber == "upper": url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Senators.xls" rep_type = "Senator" contact_url = ( "http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp" ) elif chamber == "lower": url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Representatives.xls" rep_type = "Representative" contact_url = ( "http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp" ) contact_page = self.lxmlize(contact_url) contact_info_by_district = {} for row in contact_page.xpath('//tr[@valign="TOP"]'): tds = row.xpath("td") (detail_link,) = tds[link_col_ix].xpath(".//a/@href") # Ignore name (2nd col). We have a regex built up below for the spreadsheet name # I don't want to touch district, _, email, phone = [ td.text_content().strip() for td in tds[:link_col_ix] ] contact_info_by_district[district] = { "email": email, "phone": phone, "detail_link": detail_link, } self.urlretrieve(url, "ri_leg.xls") wb = xlrd.open_workbook("ri_leg.xls") sh = wb.sheet_by_index(0) for rownum in range(1, sh.nrows): d = { field: sh.cell(rownum, col_num).value for field, col_num in excel_mapping.items() } # Convert float to an int, and then to string, the required format district = str(int(d["district"])) if d["full_name"].upper() == "VACANT": self.warning("District {}'s seat is vacant".format(district)) continue contact_info = contact_info_by_district[district] # RI is very fond of First M. Last name formats and # they're being misparsed upstream, so fix here (first, middle, last) = ("", "", "") full_name = re.sub( r"^{}(?=\s?[A-Z].*$)".format(rep_type), "", d["full_name"] ).strip() if re.match(r"^\S+\s[A-Z]\.\s\S+$", full_name): (first, middle, last) = full_name.split() # Note - if we ever need to speed this up, it looks like photo_url can be mapped # from the detail_link a la /senators/Paolino/ -> /senators/pictures/Paolino.jpg detail_page = self.lxmlize(contact_info["detail_link"]) try: (photo_url,) = detail_page.xpath('//div[@class="ms-WPBody"]//img/@src') except ValueError: photo_url = "" person = Person( primary_org=chamber, district=district, name=full_name, party=translate[d["party"]], image=photo_url, ) person.extras["town_represented"] = d["town_represented"] person.extras["name_first"] = first person.extras["name_middle"] = middle person.extras["name_last"] = last person.add_link(detail_link) if d["address"]: person.add_contact_detail( type="address", value=d["address"], note="District Office" ) if contact_info["phone"]: person.add_contact_detail( type="voice", value=contact_info["phone"], note="District Office" ) if contact_info["email"]: person.add_contact_detail( type="email", value=contact_info["email"], note="District Office" ) person.add_source(contact_url) person.add_source(contact_info["detail_link"]) yield person
def scrape_member(self, chamber, member_url): member_page = self.get(member_url).text doc = lxml.html.fromstring(member_page) doc.make_links_absolute(member_url) photo_url = doc.xpath('//a[@class="download"]/@href')[0] name_pieces = doc.xpath( '//div[@class="row profile-top"]/h2/text()')[0].split() full_name = " ".join(name_pieces[1:-1]).strip() party = name_pieces[-1] if party == "(R)": party = "Republican" elif party == "(D)": party = "Democratic" elif party == "(I)": party = "Independent" sidebar = doc.xpath( '//div[@class="relativeContent col-sm-4 col-xs-12"]')[0] district = sidebar.xpath('//div[@class="circle"]/h3/text()')[0] district = district.lstrip("0") person = Person( name=full_name, district=district, party=party, primary_org=chamber, image=photo_url, ) person.add_source(member_url) person.add_link(member_url) info = {} sidebar_items = iter(sidebar.getchildren()) for item in sidebar_items: if item.tag == "p": info[item.text] = next(sidebar_items) address = "\n".join(info["Legislative Address"].xpath("./text()")) phone = None fax = None phone_numbers = info["Phone Number(s)"].xpath("./text()") for num in phone_numbers: kind, num = num.split(": ") if kind == "LRC": if num.endswith(" (fax)"): fax = num.replace(" (fax)", "") else: phone = num email = info["Email"].text if phone: person.add_contact_detail(type="voice", value=phone, note="Capitol Office") if fax: person.add_contact_detail(type="fax", value=fax, note="Capitol Office") if email: person.add_contact_detail(type="email", value=email, note="Capitol Office") if address.strip() == "": self.warning("Missing Capitol Office!!") else: person.add_contact_detail(type="address", value=address, note="Capitol Office") yield person
def scrape_chamber(self, chamber):
    # The url for each rep is unfindable (by me), and the parts needed to make
    # it up do not appear in the html or js. We can find basic information on
    # the main rep page, and sponsor info on a version of their individual page
    # called using only their sponsor ID (which we have to scrape from ALISON).
    # We can't get detailed information without another ID, which I have not
    # been able to find.
    if chamber == "upper":
        member_list_url = self._base_url + "Senate/ALSenators.aspx"
        legislator_base_url = self._base_url + "ALSenator.aspx"
    elif chamber == "lower":
        member_list_url = self._base_url + "House/ALRepresentatives.aspx"
        legislator_base_url = self._base_url + "ALRepresentative.aspx"

    page = self.lxmlize(member_list_url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@class="container container-main"]/table/tr/td/input')

    legislator_url_template = (legislator_base_url + "?OID_SPONSOR="
                               "{oid_sponsor}&OID_PERSON={oid_person}")

    html_parser = HTMLParser()

    for legislator_node in legislator_nodes:
        # Set identifiers internal to AlisonDB.
        # Have to do this to OID_SPONSOR because they don't know
        # how to HTML and I'm making links absolute out of convenience.
        try:
            oid_sponsor = legislator_node.attrib["longdesc"].split("/")[-1]
            oid_person = legislator_node.attrib["alt"]
        except KeyError:
            continue

        legislator_url = legislator_url_template.format(
            oid_sponsor=oid_sponsor, oid_person=oid_person)

        legislator_page = self.lxmlize(legislator_url)

        name_text = self.get_node(
            legislator_page,
            '//span[@id="ContentPlaceHolder1_lblMember"]').text_content()

        # This just makes processing the text easier.
        name_text = name_text.lower()

        # Skip vacant seats.
        if "vacant" in name_text:
            continue

        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            "/@src",
        )

        # Another check for vacant seats
        if "VACANT.jpeg" in photo_url or "pending.jpeg" in photo_url:
            continue

        # Removes titles and curly-quoted nicknames.
        name = html_parser.unescape(
            re.sub(r"(?i)(representative|senator|“.*”)", "",
                   name_text).strip().title())

        # Assemble full name by reversing last name, first name format.
        name_parts = [x.strip() for x in name.split(",")]
        full_name = "{0} {1}".format(name_parts[1], name_parts[0])

        info_node = self.get_node(
            legislator_page,
            '//div[@id="ContentPlaceHolder1_TabSenator_body"]//table',
        )

        district_text = self.get_node(info_node, "./tr[2]/td[2]").text_content()
        # Strip non-breaking spaces before parsing the district number.
        district_text = district_text.replace(u"\u00a0", u"")
        if chamber == "upper":
            district = district_text.replace("Senate District", "").strip()
        elif chamber == "lower":
            district = district_text.replace("House District", "").strip()

        party_text = self.get_node(info_node, "./tr[1]/td[2]").text_content()

        if not full_name.strip() and party_text == "()":
            self.warning(
                "Found empty seat for district {}; skipping".format(district))
            continue

        if party_text.strip() in self._parties.keys():
            party = self._parties[party_text.strip()]
        else:
            party = None

        phone_number = (self.get_node(
            info_node, "./tr[4]/td[2]").text_content().strip())

        fax_number = (self.get_node(
            info_node,
            "./tr[5]/td[2]").text_content().strip().replace("\u00a0", ""))

        suite_text = self.get_node(info_node, "./tr[7]/td[2]").text_content()
        office_address = "{}\n11 S. Union Street\nMontgomery, AL 36130".format(
            suite_text)

        email_address = self.get_node(info_node, "./tr[11]/td[2]").text_content()

        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            "/@src",
        )

        # add basic leg info and main office
        person = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type="address",
                                  value=office_address,
                                  note="Capitol Office")
        if phone_number:
            person.add_contact_detail(type="voice",
                                      value=phone_number,
                                      note="Capitol Office")
        if fax_number:
            person.add_contact_detail(type="fax",
                                      value=fax_number,
                                      note="Capitol Office")
        if email_address:
            person.add_contact_detail(type="email",
                                      value=email_address,
                                      note="Capitol Office")

        self.add_committees(legislator_page, person, chamber, legislator_url)

        person.add_link(legislator_url)
        person.add_source(legislator_url)
        person.add_source(member_list_url)

        yield person
def scrape(self): base_url = "http://news.legislature.ne.gov/dist" # there are 49 districts for district in range(1, 50): rep_url = base_url + str(district).zfill(2) full_name = None address = None phone = None email = None photo_url = None try: page = self.lxmlize(rep_url) info_node = self.get_node( page, '//div[@class="container view-front"]' '//div[@class="col-sm-4 col-md-3 ltc-col-right"]' '/div[@class="block-box"]', ) full_name = self.get_node(info_node, "./h2/text()[normalize-space()]") full_name = re.sub(r"^Sen\.[\s]+", "", full_name).strip() if full_name == "Seat Vacant": continue address_node = self.get_node( info_node, './address[@class="feature-content"]') email = self.get_node( address_node, './a[starts-with(@href, "mailto:")]/text()') contact_text_nodes = self.get_nodes( address_node, "./text()[following-sibling::br]") address_sections = [] for text in contact_text_nodes: text = text.strip() if not text: continue phone_match = re.search(r"Phone:", text) if phone_match: phone = re.sub(r"^Phone:[\s]+", "", text) continue # If neither a phone number nor e-mail address. address_sections.append(text) address = "\n".join(address_sections) photo_url = ( "http://www.nebraskalegislature.gov/media/images/blogs" "/dist{:2d}.jpg").format(district) # Nebraska is offically nonpartisan. party = "Nonpartisan" person = Person( name=full_name, district=str(district), party=party, image=photo_url, primary_org="legislature", ) person.add_link(rep_url) person.add_source(rep_url) note = "Capitol Office" person.add_contact_detail(type="address", value=address, note=note) if phone: person.add_contact_detail(type="voice", value=phone, note=note) if email: person.add_contact_detail(type="email", value=email, note=note) yield person except scrapelib.HTTPError: self.warning("could not retrieve %s" % rep_url)
def scrape(self):
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(primary_org=chamber, name=name, district=district, party=party)

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            # Prepend the scheme if the spreadsheet omits it.
            if not legislator_url.startswith("http"):
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"],
            row["room number"],
        )
        # extra_office_fields = dict()

        email = row["email"].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError(
                    "Problematic email found: {}".format(email))

        leg.add_contact_detail(type="address",
                               value=office_address,
                               note="Capitol Office")
        leg.add_contact_detail(type="voice",
                               value=row["capitol phone"],
                               note="Capitol Office")
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type="address",
                                   value=home_address,
                                   note="District Office")
            if row["home phone"].strip():
                leg.add_contact_detail(type="voice",
                                       value=row["home phone"],
                                       note="District Office")

        leg.add_source(leg_url)

        for comm_name in row["committee member1"].split(";"):
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name,
                                       classification="committee",
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
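# The Connecticut scrape() above checks the CSV header row against a
# module-level HEADERS list and then parses the download with open_csv(). A
# minimal sketch of open_csv(), assuming the response body is an ordinary
# comma-delimited file with a header row; the real helper may handle encoding
# or dialect details differently.
import csv
from io import StringIO


def open_csv(response):
    """Wrap an HTTP response in a DictReader keyed by the CSV header row."""
    return csv.DictReader(StringIO(response.text))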
def _parse_person(self, row, chamber, seat_map): # Capture legislator vitals. first_name = row["FirstName"] middle_name = row["MiddleName"] last_name = row["LastName"] full_name = "{} {} {}".format(first_name, middle_name, last_name) full_name = re.sub(r"[\s]{2,}", " ", full_name) if chamber == "lower": district = "{} {}".format(row["County"], int(row["District"])).strip() else: district = str(int(row["District"])).strip() party = self.party_map[row["party"].upper()] email = row["WorkEmail"] if district == "0": self.warning("Skipping {}, district is set to 0".format(full_name)) return person = Person(primary_org=chamber, district=district, name=full_name, party=party) extras = { "first_name": first_name, "middle_name": middle_name, "last_name": last_name, } person.extras = extras if email: office = "Capitol" if email.endswith( "@leg.state.nh.us") else "District" person.add_contact_detail(type="email", value=email, note=office + " Office") # Capture legislator office contact information. district_address = "{}\n{}\n{}, {} {}".format(row["Address"], row["address2"], row["city"], row["State"], row["Zipcode"]).strip() phone = row["Phone"].strip() if not phone: phone = None if district_address: office = "Capitol" if chamber == "upper" else "District" person.add_contact_detail(type="address", value=district_address, note=office + " Office") if phone: office = "Capitol" if "271-" in phone else "District" person.add_contact_detail(type="voice", value=phone, note=office + " Office") # Retrieve legislator portrait. profile_url = None if chamber == "upper": profile_url = self.senate_profile_url.format(row["District"]) elif chamber == "lower": try: seat_number = seat_map[row["seatno"]] profile_url = self.house_profile_url.format(seat_number) except KeyError: pass if profile_url: person.image = self._get_photo(profile_url, chamber) person.add_source(profile_url) return person
def _scrape_senator(self, url, parties): # logger.info(f'Generating senator person object from {url}') """ Returns a Person object representing a member of the upper legislative chamber. """ # Scrape legislator information from roster URL # Example: view-source:https://senate.texas.gov/member.php?d=1 member_page = self.lxmlize(url) photo_url = member_page.xpath('//img[@id="memhead"]/@src')[0] scraped_name_district_text = member_page.xpath( '//div[@class="pgtitle"]/text()')[0] scraped_name, district_text = scraped_name_district_text.split(":") name = " ".join(scraped_name.replace("Senator ", "").split()).strip() district = str(district_text.split()[1]).strip() # Vacant house "members" are named after their district numbers: if re.match(r"^District \d+$", name): return None bio = " ".join(member_page.xpath('//div[@class="bio"]/text()')) party = parties[district] person = Person( name=name, district=district, party=party, primary_org="upper", biography=bio, ) if photo_url is not None: person.image = photo_url person.add_link(url) person.add_source(url) office_ids = [] # Get offices based on table headers for th_tag in member_page.xpath('//table[@class="memdir"]/tr/th'): # logger.warn([th_tag.xpath('text()'),th_tag.xpath('@id')]) id = th_tag.xpath("@id")[0] if th_tag.xpath("@id") else "" label = th_tag.xpath("text()")[0].strip() if th_tag.xpath( "text()") else "" if id != "" and label != "": office_ids.append({"id": id, "label": label}) # logger.warn(office_ids) for office in office_ids: # logger.warn(office) row = member_page.xpath( f'//table[@class="memdir"]/tr/td[@headers="{office["id"]}"]') # A few member pages have broken ids for office listings: if len(row) == 0: row = member_page.xpath( '//table[@class="memdir"]/tr/td[@headers="dDA1"]') if len(row) > 0: details = " ".join(row[0].xpath("text()")).strip() details = details.replace("\r", "").replace("\n", "") # logger.warn(details) # A few member pages have blank office listings: if details == "": continue match = self.address_re.search(details) if match is not None: address = re.sub( " +$", "", match.group().replace("\r", "").replace("\n", ""), flags=re.MULTILINE, ) else: # No valid address found in the details. continue phone_number = extract_phone(details) fax_number = extract_fax(details) if address: person.add_contact_detail(type="address", value=address, note=office["label"]) if phone_number: person.add_contact_detail(type="voice", value=phone_number, note=office["label"]) if fax_number: person.add_contact_detail(type="fax", value=fax_number, note=office["label"]) yield person
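# The Texas senator scraper above (and the representative scraper later in this
# section) leans on extract_phone()/extract_fax() helpers alongside a
# precompiled self.address_re. A minimal sketch of the phone/fax extraction,
# assuming fax numbers are labeled with the word "fax" in the office blurb; the
# project's actual regexes may differ.
import re

PHONE_PATTERN = re.compile(r"\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}")
FAX_PATTERN = re.compile(
    r"fax[:\s]*(\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})", re.IGNORECASE)


def extract_fax(text):
    """Return the first number explicitly labeled as a fax, if any."""
    match = FAX_PATTERN.search(text)
    return match.group(1) if match else None


def extract_phone(text):
    """Return the first phone-looking number that is not the labeled fax."""
    fax = extract_fax(text)
    for match in PHONE_PATTERN.finditer(text):
        if match.group() != fax:
            return match.group()
    return None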
def scrape_chamber(self, chamber, session): if chamber == "upper": chamber_slug = "Senate" elif chamber == "lower": chamber_slug = "Assembly" session_slug = self.jurisdiction.session_slugs[session] leg_base_url = "http://www.leg.state.nv.us/App/Legislator/A/%s/%s/" % ( chamber_slug, session_slug, ) leg_json_url = ( "http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s" % (session_slug, chamber_slug)) resp = json.loads(self.get(leg_json_url).text) for item in resp: # empty district empty_names = ["District No", "Vacant"] if any(name in item["FullName"] for name in empty_names): continue name_parts = item["FullName"].split(",") last, first = name_parts[:2] item["FullName"] = "{first} {last}".format(last=last.strip(), first=first.strip()) person = Person( name=item["FullName"], district=item["DistrictNbr"], party=item["Party"], primary_org=chamber, image=item["PhotoURL"], ) capitol_phone = item["LCBPhone"] if capitol_phone: person.add_contact_detail(type="voice", value=capitol_phone, note="Capitol Office") leg_url = leg_base_url + item["DistrictNbr"] # hack to get the legislator ID html = self.get(leg_url).text for ln in html.split("\n"): if "GetLegislatorDetails" in ln: leg_id = ln.split(",")[1].split("'")[1] # fetch the json used by the page leg_details_url = ( "https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id=" .format(session_slug) + leg_id) leg_resp = json.loads(self.get(leg_details_url).text) details = leg_resp["legislatorDetails"] address = details["Address1"] address2 = details["Address2"] if address2: address += " " + address2 address += "\n%s, NV %s" % (details["City"], details["Zip"]) phone = details["LCBPhone"] email = details["LCBEmail"] if address: person.add_contact_detail(type="address", value=address, note="District Office") if phone: person.add_contact_detail(type="voice", value=phone, note="District Office") if email: person.add_contact_detail(type="email", value=email, note="District Office") person.add_link(leg_details_url) person.add_source(leg_details_url) yield person
def scrape(self, session=None): if not session: session = self.jurisdiction.legislative_sessions[-1]["name"] self.info("no session specified, using %s", session) year_abr = session[0:4] self._init_mdb(int(year_abr)) roster_csv = self.access_to_csv("Roster") bio_csv = self.access_to_csv("LegBio") photos = {} for rec in bio_csv: photos[rec["Roster Key"]] = rec["URLPicture"] for rec in roster_csv: first_name = rec["Firstname"] middle_name = rec["MidName"] last_name = rec["LastName"] suffix = rec["Suffix"] full_name = first_name + " " + middle_name + " " + last_name + " " + suffix full_name = full_name.replace(" ", " ") full_name = full_name[0:len(full_name) - 1] district = str(int(rec["District"])) party = rec["Party"] if party == "R": party = "Republican" elif party == "D": party = "Democratic" else: party = party chamber = rec["House"] if chamber == "A": chamber = "lower" elif chamber == "S": chamber = "upper" leg_status = rec["LegStatus"] # skip Deceased/Retired members if leg_status != "Active": continue phone = rec["Phone"] or None email = None if rec["Email"]: email = rec["Email"] # Email has been removed from the Access DB, but it's # still [email protected] and [email protected] - many # reps have these emails on their personal pages even if # they're gone from the DB file if not email: email = self._construct_email(chamber, rec["Sex"], last_name) try: photo_url = photos[rec["Roster Key"]] except KeyError: photo_url = "" self.warning("no photo url for %s", rec["Roster Key"]) url = "http://www.njleg.state.nj.us/members/bio.asp?Leg=" + str( int(rec["Roster Key"])) address = "{0}\n{1}, {2} {3}".format(rec["Address"], rec["City"], rec["State"], rec["Zipcode"]) gender = {"M": "Male", "F": "Female"}[rec["Sex"]] person = Person( name=full_name, district=district, primary_org=chamber, party=party, image=photo_url, gender=gender, ) person.add_link(url) person.add_source(url) person.add_source("http://www.njleg.state.nj.us/downloads.asp") person.add_contact_detail(type="address", value=address, note="District Office") if phone is not None: person.add_contact_detail(type="voice", value=phone, note="District Office") if email is not None: person.add_contact_detail(type="email", value=email, note="District Office") yield person
def scrape_legislator(self, name, chamber, url, contact_page): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) party = page.xpath("string(//span[contains(@id, 'Party')])") party = party.strip() if party == "Democrat": party = "Democratic" district = page.xpath("string(//span[contains(@id, 'District')])") district = district.strip().lstrip("0") occupation = page.xpath("string(//span[contains(@id, 'Occupation')])") occupation = occupation.strip() (photo_url, ) = page.xpath('//img[contains(@id, "_imgMember")]/@src') office_phone = page.xpath( "string(//span[contains(@id, 'CapitolPhone')])").strip() legislator = Person( primary_org=chamber, image=photo_url, name=name, party=party, district=district, ) legislator.extras["occupation"] = occupation if office_phone.strip() != "": legislator.add_contact_detail(type="voice", value=office_phone, note="Capitol Office") # SD removed email from the detail pages but it's still in the # contact page, shared for all congress people member_id = re.search(r"Member=(\d+)", url).group(1) # find the profile block by finding a link inside it to their # detail page profile_link = contact_page.xpath( '//ul[@id="contact-list"]//a[contains(@href, "Member=%s")]' % (member_id, )) if profile_link: # look for the adjacent email mailto link profile_link = profile_link[0] profile_block = profile_link.getparent().getparent().getparent() email_link = profile_block.xpath( './span/span/a[@class="mail-break"]') if email_link: email = email_link[0].text email = email.lstrip() email = email.rstrip() if email: legislator.add_contact_detail(type="email", value=email, note="Capitol Office") home_address = [ x.strip() for x in page.xpath( '//td/span[contains(@id, "HomeAddress")]/text()') if x.strip() ] if home_address: home_address = "\n".join(home_address) home_phone = page.xpath( "string(//span[contains(@id, 'HomePhone')])").strip() legislator.add_contact_detail(type="address", value=home_address, note="District Office") if home_phone: legislator.add_contact_detail(type="voice", value=home_phone, note="District Office") legislator.add_source(url) legislator.add_link(url) committees = page.xpath( '//div[@id="divCommittees"]/span/section/table/tbody/tr/td/a') for committee in committees: self.scrape_committee(legislator, url, committee, chamber) yield legislator
def scrape_chamber(self, chamber):
    if chamber == "lower":
        url = "http://www.scstatehouse.gov/member.php?chamber=H"
    else:
        url = "http://www.scstatehouse.gov/member.php?chamber=S"

    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get("href")

        if full_name.startswith("Senator"):
            full_name = full_name.replace("Senator ", "")
        if full_name.startswith("Representative"):
            full_name = full_name.replace("Representative ", "")

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if "Resigned effective" in leg_html:
            self.info("Resigned")
            continue

        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')

        if "Republican" in party:
            party = "Republican"
        elif "Democrat" in party:
            party = "Democratic"

        # District # - County - Map
        district = district.split()[1]

        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ""

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        # capitol office address
        try:
            capitol_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[1]')
                [0]).decode()
            if capitol_address:
                capitol_address = parse_address(capitol_address)
                person.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol address for {0}".format(full_name))

        # capitol office phone
        try:
            capitol_phone = (
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[2]')
                [0].text_content().strip())
            label, number = parse_phone(capitol_phone)
            if number:
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol phone for {0}".format(full_name))

        # home address
        try:
            home_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Home Address"]/../p[1]')
                [0]).decode()
            if home_address:
                home_address = parse_address(home_address)
                person.add_contact_detail(type="address",
                                          value=home_address,
                                          note="District Office")
        except IndexError:
            self.warning("no home address for {0}".format(full_name))

        # home or business phone
        try:
            home_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[2]')
                [0].text_content().strip())
            label, number = parse_phone(home_phone)
            if number:
                label = ("Primary Office"
                         if label == "Business" else "District Office")
                person.add_contact_detail(type="voice", value=number, note=label)
        except IndexError:
            self.warning(
                "no home or business phone for {0}".format(full_name))

        # business or home phone
        try:
            business_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[3]')
                [0].text_content().strip())
            label, number = parse_phone(business_phone)
            if number:
                label = ("Primary Office"
                         if label == "Business" else "District Office")
                person.add_contact_detail(type="voice", value=number, note=label)
        except IndexError:
            pass

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(", "):
                committee, role = com.text_content().rsplit(", ", 1)

                # known roles
                role = {
                    "Treas.": "treasurer",
                    "Secy.": "secretary",
                    "Secy./Treas.": "secretary/treasurer",
                    "V.C.": "vice-chair",
                    "1st V.C.": "first vice-chair",
                    "Co 1st V.C.": "co-first vice-chair",
                    "2nd V.C.": "second vice-chair",
                    "3rd V.C.": "third vice-chair",
                    "Ex.Officio Member": "ex-officio member",
                    "Chairman": "chairman",
                }[role]
            else:
                committee = com.text
                role = "member"

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification="committee",
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def scrape(self, session=None): if session is None: session = self.latest_session() year_slug = self.jurisdiction.get_year_slug(session) # Load all members via the private API legislator_dump_url = "http://legislature.vermont.gov/people/loadAll/{}".format( year_slug ) json_data = self.get(legislator_dump_url).text legislators = json.loads(json_data)["data"] # Parse the information from each legislator for info in legislators: # Strip whitespace from strings info = {k: v.strip() for k, v in info.items()} # Skip duplicate record for Christopher Mattos (appointed Rep September 2017) if info["PersonID"] == "29034": self.info("skipping first Christopher Mattos record") continue # Gather photo URL from the member's page member_url = "http://legislature.vermont.gov/people/single/{}/{}".format( year_slug, info["PersonID"] ) page = self.lxmlize(member_url) (photo_url,) = page.xpath('//img[@class="profile-photo"]/@src') # Also grab their state email address state_email = page.xpath( '//dl[@class="summary-table profile-summary"]/' 'dt[text()="Email"]/following-sibling::dd[1]/a/text()' ) if state_email: (state_email,) = state_email else: state_email = None district = info["District"].replace(" District", "") leg = Person( primary_org=self.CHAMBERS[info["Title"]], district=district, party=info["Party"].replace("Democrat", "Democratic"), name="{0} {1}".format(info["FirstName"], info["LastName"]), image=photo_url, ) leg.add_contact_detail( note="Capitol Office", type="address", value="Vermont State House\n115 State Street\nMontpelier, VT 05633", ) if state_email: leg.add_contact_detail( note="Capitol Office", type="email", value=state_email ) leg.add_contact_detail( note="District Office", type="address", value="{0}{1}\n{2}, {3} {4}".format( info["MailingAddress1"], ( "\n" + info["MailingAddress2"] if info["MailingAddress2"].strip() else "" ), info["MailingCity"], info["MailingState"], info["MailingZIP"], ), ) if info["HomePhone"]: leg.add_contact_detail( note="District Office", type="voice", value=info["HomePhone"] ) district_email = info["Email"] or info["HomeEmail"] or info["WorkEmail"] if district_email: leg.add_contact_detail( note="District Office", type="email", value=district_email ) leg.add_link(member_url) leg.add_source(legislator_dump_url) leg.add_source(member_url) yield leg
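# The Vermont scrape() above maps the feed's Title field to a chamber through
# self.CHAMBERS. A plausible class-level mapping, assuming the API reports the
# titles "Senator" and "Representative"; sketch only.
CHAMBERS = {
    "Senator": "upper",
    "Representative": "lower",
}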
def _scrape_representative(self, url, parties): # logger.info(f'Generating representative person object from {url}') """ Returns a Person object representing a member of the lower legislative chamber. """ # url = self.get(url).text.replace('<br>', '') member_page = self.lxmlize(url) photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0] if photo_url.endswith("/.jpg"): photo_url = None scraped_name, district_text = member_page.xpath( '//div[@class="member-info"]/h2') scraped_name = scraped_name.text_content().strip().replace("Rep. ", "") scraped_name = " ".join(scraped_name.split()) name = " ".join(scraped_name.split(", ")[::-1]) district_text = district_text.text_content().strip() district = str(self.district_re.search(district_text).group(1)) # Vacant house "members" are named after their district numbers: if re.match(r"^District \d+$", scraped_name): return None party = parties[district] person = Person(name=name, district=district, party=party, primary_org="lower") if photo_url is not None: person.image = photo_url person.add_link(url) person.add_source(url) def office_name(element): """Returns the office address type.""" return element.xpath("preceding-sibling::h4[1]/text()")[0].rstrip( ":") offices_text = [{ "name": office_name(p_tag), "type": office_name(p_tag).replace(" Address", "").lower(), "details": p_tag.text_content(), } for p_tag in member_page.xpath( '//h4/following-sibling::p[@class="double-space"]')] for office_text in offices_text: details = office_text["details"].strip() # A few member pages have blank office listings: if details == "": continue # At the time of writing, this case of multiple district # offices occurs exactly once, for the representative at # District 43: if details.count("Office") > 1: district_offices = [ district_office.strip() for district_office in re.findall( r"(\w+ Office.+?(?=\w+ Office|$))", details, flags=re.DOTALL) ] offices_text += [{ "name": re.match(r"\w+ Office", office).group(), "type": "district", "details": re.search(r"(?<=Office).+(?=\w+ Office|$)?", office, re.DOTALL).group(), } for office in district_offices] match = self.address_re.search(details) if match is not None: address = re.sub( " +$", "", match.group().replace("\r", "").replace("\n\n", "\n"), flags=re.MULTILINE, ) else: # No valid address found in the details. continue phone_number = extract_phone(details) fax_number = extract_fax(details) if address: person.add_contact_detail(type="address", value=address, note=office_text["name"]) if phone_number: person.add_contact_detail(type="voice", value=phone_number, note=office_text["name"]) if fax_number: person.add_contact_detail(type="fax", value=fax_number, note=office_text["name"]) yield person
def handle_list_item(self, item): link = item.xpath('.//div[contains(@class, "rep_style")]/a')[0] name = link.text_content().strip() if "Vacant" in name or "Resigned" in name or "Pending" in name: return party = item.xpath( './/div[contains(@class, "party_style")]/text()')[0].strip() party = {"D": "Democratic", "R": "Republican"}[party] district = item.xpath( './/div[contains(@class, "district_style")]/text()')[0].strip() leg_url = link.get("href") split_url = parse.urlsplit(leg_url) member_id = parse.parse_qs(split_url.query)["MemberId"][0] image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format( member_id) name = fix_name(name) rep = Person( name=name, district=district, party=party, primary_org="lower", role="Representative", image=image, ) rep.add_link(leg_url) rep.add_source(leg_url) rep.add_source(self.url) self.scrape_page(RepDetail, leg_url, obj=rep) # look for email in the list from the PDF directory - ideally # we'd find a way to better index the source data which # wouldn't require guessing the email, but this does at least # confirm that it's correct # deal with some stuff that ends up in name that won't work in # email, spaces, quotes, high latin1 email_name = rep.name.replace('"', "").replace("La ", "La").replace("ñ", "n") (last, *other) = re.split(r"[-\s,]+", email_name) # deal with a missing nickname used in an email address if "Patricia" in other: other.append("Pat") # search through all possible first names and nicknames # present - needed for some of the more elaborate concoctions found_email = False for first in other: email = "*****@*****.**" % (first, last) if email in self.member_emails: # it's bad if we can't uniquely match emails, so throw an error if email in self.claimed_member_emails: raise ValueError( "Email address %s matches multiple reps - %s and %s." % (email, rep.name, self.claimed_member_emails[email])) self.claimed_member_emails[email] = rep.name rep.add_contact_detail(type="email", value=email, note="Capitol Office") rep.add_source(self.directory_pdf_url) found_email = True break if not found_email: log.warning("Rep %s does not have an email in the directory PDF." % (rep.name, )) return rep
def scrape_table(self, chamber): url = self.urls[chamber] html = self.get(url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) seen = set() for row in doc.xpath( '//div[contains(@class, "member-index-cell")]/div/div'): img_cell, text_cell = row.getchildren() if "to be announced" in text_cell.text_content().lower(): continue leg_a = text_cell.xpath('.//a')[0] leg_url = leg_a.attrib['href'] name = leg_a.text district = re.search(r"District (\d{1,2}[ABCD]?)", text_cell.text_content()).group(1) key = name + district if key in seen: # leadership listed twice, skip the 2nd continue seen.add(key) photo_url = img_cell.xpath("a/img/@src")[0] # get details html = self.get(leg_url).text ldoc = lxml.html.fromstring(html) ldoc.make_links_absolute(leg_url) party = _get_table_item(ldoc, "Party").text if party == "Democrat": party = "Democratic" capitol_info = _get_table_item(ldoc, "Annapolis Info") addr_lines, phone_lines = capitol_info.xpath("dl/dd") address = [ s.strip() for s in addr_lines.text_content().split('\n') if s.strip() ] address = "\n".join(address) phone = None fax = None for line in phone_lines.text_content().split('\n'): if "Phone" in line: phone = re.findall(r"Phone (\d{3}-\d{3}-\d{4})", line)[0] elif "Fax" in line: # Number oddities: one has two dashes, one has a dash and then a space. line = line.replace("--", "-").replace("- ", "-") fax = re.findall(r"Fax (\d{3}-\d{3}-\d{4})", line)[0] email_path = ldoc.xpath('//a[contains(@href, "mailto:")]/@href') emails = set() for path in email_path: emails.add(re.match(r"mailto:([^?]+)", path).group(1)) if not emails: email = None elif len(emails) == 1: email = emails.pop() else: raise AssertionError("Multiple email links found on page") img_src = ldoc.xpath('//img[@class="sponimg"]/@src') if img_src: photo_url = img_src[0] names = name.split(", ") name = " ".join([names[1], names[0]] + names[2:]) leg = Person( primary_org=chamber, district=district, name=name, party=party, image=photo_url, ) leg.add_source(url=leg_url) leg.add_link(url=leg_url) if address: leg.add_contact_detail(type="address", value=address, note="Capitol Office") if phone: leg.add_contact_detail(type="voice", value=phone, note="Capitol Office") if fax: leg.add_contact_detail(type="fax", value=fax, note="Capitol Office") if email: leg.add_contact_detail(type="email", value=email, note="Capitol Office") yield leg
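# scrape_table() above pulls labeled cells from the Maryland member detail page
# through a module-level _get_table_item() helper. A minimal sketch, assuming
# each label lives in its own element with the value in the next sibling; the
# real page structure and XPath are not shown in this section.
def _get_table_item(doc, name):
    """Return the element that immediately follows the label matching name."""
    labels = doc.xpath('//*[normalize-space(text()) = "{}"]'.format(name))
    return labels[0].getnext() if labels else None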
def legislators(self, latest_only): legs = {} for member, chamber, term, url in self._memberships(latest_only): name, _, _, district, party = member.xpath("td") district = district.text detail_url = name.xpath("a/@href")[0] if party.text_content().strip() == "": party = "Independent" else: party = {"D": "Democratic", "R": "Republican", "I": "Independent"}[ party.text ] name = name.text_content().strip() # inactive legislator, skip them for now if name.endswith("*"): name = name.strip("*") continue name = AKA.get(name, name) if name in legs: p, terms = legs[name] terms.append((chamber, district, term, party)) else: p = Person(name, party=party) legs[name] = p, [(chamber, district, term, party)] p.add_source(url) p.add_source(detail_url) p.add_link(detail_url) birth_date = BIRTH_DATES.get(name, None) if birth_date: p.birth_date = birth_date leg_html = self.get(detail_url).text leg_doc = lxml.html.fromstring(leg_html) leg_doc.make_links_absolute(detail_url) hotgarbage = ( "Senate Biography Information for the 98th General " "Assembly is not currently available." ) if hotgarbage in leg_html: # The legislator's bio isn't available yet. self.logger.warning("No legislator bio available for " + name) continue photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0] p.image = photo_url p.contact_details = [] # email email = leg_doc.xpath('//b[text()="Email: "]') if email: p.add_contact_detail( type="email", value=email[0].tail.strip(), note="Capitol Office" ) offices = { "Capitol Office": '//table[contains(string(), "Springfield Office")]', "District Office": '//table[contains(string(), "District Office")]', } for location, xpath in offices.items(): table = leg_doc.xpath(xpath) if table: for type, value in self._table_to_office(table[3]): if type in ("fax", "voice") and not validate_phone_number( value ): continue p.add_contact_detail(type=type, value=value, note=location) return legs