def test_full_person():
    """Round-trip a fully populated ScrapePerson through PersonImporter."""
    person = ScrapePerson("Tom Sawyer")
    person.add_identifier("1")
    person.add_name("Tommy", start_date="1880")
    person.add_contact_detail(type="phone", value="555-555-1234", note="this is fake")
    person.add_link("http://example.com/link")
    person.add_source("http://example.com/source")

    # import person
    PersonImporter("jid").import_data([person.as_dict()])

    # get person from db and assert it imported correctly
    imported = Person.objects.get()
    assert "ocd-person" in imported.id
    assert imported.name == person.name

    identifier = imported.identifiers.all()[0]
    assert identifier.identifier == "1"
    assert identifier.scheme == ""

    other_name = imported.other_names.all()[0]
    assert other_name.name == "Tommy"
    assert other_name.start_date == "1880"

    contact = imported.contact_details.all()[0]
    assert contact.type == "phone"
    assert contact.value == "555-555-1234"
    assert contact.note == "this is fake"

    assert imported.links.all()[0].url == "http://example.com/link"
    assert imported.sources.all()[0].url == "http://example.com/source"
def handle_list_item(self, item):
    """Build a Senator Person from one roster list item.

    Returns None (skips) for vacant seats; otherwise returns the Person
    after queueing a SenDetail scrape of the member's page.
    """
    raw = " ".join(item.xpath(".//text()"))
    name = re.sub(r"\s+", " ", raw).replace(" ,", ",").strip()
    if "Vacant" in name:
        return

    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    # the site says "Democrat"; normalize to the canonical party name
    if party == "Democrat":
        party = "Democratic"

    leg_url = item.get("href")
    leg = Person(
        name=fix_name(name),
        district=district,
        party=party,
        primary_org="upper",
        role="Senator",
    )
    leg.add_link(leg_url)
    leg.add_source(self.url)
    leg.add_source(leg_url)
    self.scrape_page(SenDetail, leg_url, obj=leg)
    return leg
def scrape_member_page(self, chamber, url):
    """Scrape every legislator card on an Ohio member-list page.

    Yields a Person per ``memberModule`` div, skipping vacant seats.
    Email addresses are synthesized from the district number using the
    chamber-specific pattern.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
        "//div[contains(concat(' ', normalize-space(@class), ' '), "
        "' memberModule ')]"
    ):
        img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"]
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()
        if "Vacant" in full_name:
            continue
        homepage = homepage.attrib["href"]
        party = data.xpath(".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        # text children of the data div: office address lines, then phone last
        office_lines = data.xpath("child::text()")
        phone = office_lines.pop(-1)
        # drop a leadership title line ("Leader"/"Whip"/"Speaker") if present
        if re.search(r"(Leader|Whip|Speaker)", office_lines[0]):
            office_lines.pop(0)
        office = "\n".join(office_lines)
        h3 = data.xpath("./h3")
        if len(h3):
            # district appears after the <br> inside the h3
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            # fall back to the district number embedded in the style's png name
            district = re.findall(r"\d+\.png", legislator.attrib["style"])[
                -1
            ].split(".", 1)[0]
        full_name = re.sub(r"\s+", " ", full_name).strip()
        # e.g. district 3 -> rep03@ohiohouse.gov / sd03@ohiosenate.gov
        email = (
            "rep{0:0{width}}@ohiohouse.gov"
            if chamber == "lower"
            else "sd{0:0{width}}@ohiosenate.gov"
        ).format(int(district), width=2)
        leg = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=img,
        )
        leg.add_contact_detail(type="address", value=office, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_contact_detail(type="email", value=email, note="Capitol Office")
        self.scrape_homepage(leg, chamber, homepage)
        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def scrape_legislator(self, chamber, name, url):
    """Scrape one legislator detail page and yield the resulting Person."""
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    # "DISTRICT 12" heading -> "12" (leading zeros stripped)
    heading = doc.xpath('//h1[contains(., "DISTRICT")]/text()').pop()
    district = heading.split()[1].strip().lstrip("0")

    # party letter appears in parens in the last h2, e.g. "(R - ..."
    party_heading = doc.xpath("//h2").pop().text_content()
    party_letter = re.search(r"\((R|D|I)[ \-\]]", party_heading).group(1)
    party = {
        "D": "Democratic",
        "R": "Republican",
        "I": "Independent",
    }[party_letter]

    photo_url = doc.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib["src"]

    leg = Person(name, district=district, party=party,
                 image=photo_url, primary_org=chamber)
    leg.add_link(url)
    leg.add_source(url)
    self.scrape_offices(leg, doc)
    yield leg
def scrape_member(self, chamber, link):
    """Yield a Person built from one row of the Iowa member roster."""
    name = link.text.strip()
    leg_url = link.get("href")
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")

    # normalize the site's party labels to canonical names
    if party == "Democrat":
        party = "Democratic"
    elif party == "No Party Specified":
        party = "Independent"

    pid = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
    photo_url = (
        "https://www.legis.iowa.gov/photo?action=getPhoto&ga=%s&pid=%s"
        % (self.latest_session(), pid)
    )

    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url,
    )
    leg.add_link(leg_url)
    leg.add_source(leg_url)

    # follow the member's own page for contact details
    leg_page = lxml.html.fromstring(self.get(link.attrib["href"]).text)
    self.scrape_member_page(leg, leg_page)
    yield leg
def scrape_lower(self, chamber):
    """Scrape Michigan House members from the public roster table.

    Yields a Person per table row, skipping vacant districts.
    ``chamber`` is unused in this body.
    """
    url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx"
    # column order of the roster table
    table = ["website", "district", "name", "party", "location", "phone", "email"]
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath(".//td")
        if len(tds) == 0:
            continue
        # map column names to their cells
        metainf = {}
        for i in range(0, len(table)):
            metainf[table[i]] = tds[i]
        district = str(int(metainf["district"].text_content().strip()))
        party = metainf["party"].text_content().strip()
        phone = metainf["phone"].text_content().strip()
        email = metainf["email"].text_content().strip()
        name = metainf["name"].text_content().strip()
        # vacant seats show "Vacant" or a bare "District N" placeholder
        if name == "Vacant" or re.match(r"^District \d{1,3}$", name):
            self.warning(
                "District {} appears vacant, and will be skipped".format(district)
            )
            continue
        leg_url = metainf["website"].xpath("./a")[0].attrib["href"]
        office = metainf["location"].text_content().strip()
        # expand the site's office abbreviations into full mailing addresses
        office = re.sub(
            " HOB",
            " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933",
            office,
        )
        office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office)
        try:
            photo_url = self.get_photo_url(leg_url)[0]
        except (scrapelib.HTTPError, IndexError):
            # best-effort: missing photo is non-fatal
            photo_url = ""
            self.warning("no photo url for %s", name)
        person = Person(
            name=name,
            district=district,
            party=abbr[party],
            primary_org="lower",
            image=photo_url,
        )
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(
            type="address", value=office, note="Capitol Office"
        )
        person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        person.add_contact_detail(type="email", value=email, note="Capitol Office")
        yield person
def scrape_chamber(self, chamber):
    """Scrape Indiana legislators for one chamber via the IGA API.

    Basic data comes from the paginated API listing; address, phone and
    district are scraped from each member's HTML page. Members whose HTML
    page is unavailable or lacks a district heading are skipped.
    """
    client = ApiClient(self)
    session = self.latest_session()
    base_url = "http://iga.in.gov/legislative"
    api_base_url = "https://api.iga.in.gov"
    chamber_name = "senate" if chamber == "upper" else "house"
    r = client.get("chamber_legislators", session=session, chamber=chamber_name)
    all_pages = client.unpaginate(r)
    for leg in all_pages:
        firstname = leg["firstName"]
        lastname = leg["lastName"]
        party = leg["party"]
        link = leg["link"]
        api_link = api_base_url + link
        # HTML page lives under .../legislators/legislator_<slug>
        html_link = base_url + link.replace(
            "legislators/", "legislators/legislator_")
        try:
            html = get_with_increasing_timeout(self, html_link, fail=True,
                                               kwargs={"verify": False})
        except scrapelib.HTTPError:
            self.logger.warning("Legislator's page is not available.")
            continue
        doc = lxml.html.fromstring(html.text)
        doc.make_links_absolute(html_link)
        # expects exactly two <address> blocks: mailing address, then phone
        address, phone = doc.xpath("//address")
        address = address.text_content().strip()
        address = "\n".join([ln.strip() for ln in address.split("\n")])
        phone = phone.text_content().strip()
        try:
            district = (doc.xpath("//span[@class='district-heading']")
                        [0].text.lower().replace("district", "").strip())
        except IndexError:
            self.warning("skipping legislator w/o district")
            continue
        # portrait image follows the same slug convention as the HTML page
        image_link = base_url + link.replace(
            "legislators/", "portraits/legislator_")
        legislator = Person(
            primary_org=chamber,
            district=district,
            name=" ".join([firstname, lastname]),
            party=party,
            image=image_link,
        )
        legislator.add_contact_detail(type="address", note="Capitol Office",
                                      value=address)
        legislator.add_contact_detail(type="voice", note="Capitol Office",
                                      value=phone)
        legislator.add_link(html_link)
        legislator.add_source(html_link)
        legislator.add_source(api_link)
        yield legislator
def scrape_senator_page(self, chamber, url):
    """Scrape Ohio senators from the senators listing page.

    Yields a Person per ``portraitContainer`` div, following each member's
    homepage for phone, address and party.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
        "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
        "' portraitContainer ')]"
    ):
        # photo URL is embedded in the style attribute as url(...)
        img = legislator.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style"
        )[0]
        img = img[img.find("(") + 1 : img.find(")")]
        full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
        homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[
            0
        ].attrib["href"]
        # district text looks like "...#<number>"
        district = legislator.xpath(".//div[@class='profileDistrict']" "/a/text()")[
            0
        ].split("#")[1]
        if "Vacant" in full_name:
            continue
        # NOTE(review): rebinds ``page`` to the member's homepage; safe here
        # only because the listing xpath above already returned a list.
        homepage = self.get(homepage_url).text
        page = lxml.html.fromstring(homepage)
        phone = page.xpath("//div[@class='phone']/span/text()")[0]
        address_lines = page.xpath("//div[@class='address']/descendant::*/text()")
        address = "\n".join(address_lines)
        party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
        # NOTE(review): ``party`` stays unbound (NameError) if the image name
        # contains neither "Republican" nor "Democrat" — confirm no other
        # parties appear on this page.
        if "Republican" in party_image:
            party = "Republican"
        elif "Democrat" in party_image:
            party = "Democratic"
        # e.g. district 3 -> sd03@ohiosenate.gov
        email = (
            "rep{0:0{width}}@ohiohouse.gov"
            if chamber == "lower"
            else "sd{0:0{width}}@ohiosenate.gov"
        ).format(int(district), width=2)
        leg = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            image=img,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_contact_detail(type="email", value=email, note="Capitol Office")
        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
def scrape_rep(self, url):
    """Scrape a House member's page; yields a Person.

    Yields nothing when the member has resigned or is deceased.

    BUG FIX: the original used ``raise StopIteration`` to end the
    generator early.  Under PEP 479 (mandatory since Python 3.7) a
    StopIteration raised inside a generator body is converted to a
    RuntimeError, crashing the scrape; a plain ``return`` is the correct
    way to finish without yielding.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    main = page.xpath('//div[@id="main-info"]')[0]
    if "Resigned" in main.text_content():
        print("Member resigned {}".format(url))
        return  # don't yield anything
    if "Deceased" in main.text_content():
        print("Member is deceased {}".format(url))
        return  # don't yield anything
    name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
    name = re.sub(r"\s+", " ", name)
    district_number = page.xpath(
        '//span[contains(text(), "House District:")]'
        "/following-sibling::span/text()")[0].strip()
    # remove anything after first whitespace
    district_number = re.sub(r"\s.*", "", district_number.strip())
    email = None
    email_content = page.xpath(
        '//a[./i[contains(@class,"fa-envelope")]]/text()')
    if email_content and email_content[0].strip():
        email = email_content[0].strip()
    photo_url = page.xpath('//header[@id="home"]/img/@src')[0]
    party = self.get_rep_table_by_header(page, "Party Affiliation").text.strip()
    party = _party_map[party[0]]  # standardize
    main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
    # first non-empty paragraph is the district office address
    address = [t.strip() for t in main_p_text if t.strip()][0]
    person = Person(
        name=name,
        district=district_number,
        primary_org="lower",
        party=party,
        image=photo_url,
    )
    person.add_contact_detail(type="address", value=address, note="District Office")
    if email:
        person.add_contact_detail(type="email", value=email, note="District Office")
    person.add_link(url)
    person.add_source(url)
    yield person
def test_basic_invalid_person():
    """A Person must validate with a name and fail validation without one."""
    bob = Person("Bob B. Johnson")
    bob.add_source(url="http://example.com")
    bob.validate()  # valid while the name is present
    bob.name = None
    with pytest.raises(ScrapeValueError):
        bob.validate()
def get_member(self, session, chamber, kpid):
    """Yield a Person for one Kansas member identified by ``kpid``.

    Core data comes from the ksapi JSON endpoint; the photo is scraped
    from the member's kslegislature.org bio page when it exists.
    """
    url = "%smembers/%s" % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)["content"]
    party = content["PARTY"]
    if party == "Democrat":
        party = "Democratic"
    # map the session name onto the URL slug used by kslegislature.org
    slug = {
        "2013-2014": "b2013_14",
        "2015-2016": "b2015_16",
        "2017-2018": "b2017_18",
        "2019-2020": "b2019_20",
    }[session]
    leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug, kpid)
    try:
        legislator_page = self.lxmlize(leg_url)
        (photo_url, ) = legislator_page.xpath('//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        # bio page missing: drop both the link and the photo
        self.warning("{}'s legislator bio page not found".format(
            content["FULLNAME"]))
        leg_url = ""
        photo_url = ""
    person = Person(
        name=content["FULLNAME"],
        district=str(content["DISTRICT"]),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {"occupation": content["OCCUPATION"]}
    address = "\n".join([
        "Room {}".format(content["OFFICENUM"]),
        "Kansas State Capitol Building",
        "300 SW 10th St.",
        "Topeka, KS 66612",
    ])
    note = "Capitol Office"
    person.add_contact_detail(type="address", value=address, note=note)
    person.add_contact_detail(type="email", value=content["EMAIL"], note=note)
    if content["OFFPH"]:
        person.add_contact_detail(type="voice", value=content["OFFPH"], note=note)
    person.add_source(url)
    person.add_link(leg_url)
    yield person
def handle_list_item(self, row):
    """Build a Senator Person from one spreadsheet row.

    Returns None for rows without a first name.  Address columns vary by
    spreadsheet vintage, so both naming conventions are accommodated.

    BUG FIX: the capitol/district address branch originally tested a bare
    generator expression, which is always truthy, so every member was
    filed under a "capitol" address.  The check now uses any() and guards
    against ``Address2`` being False (no second address column).
    """
    if not row["First Name"]:
        return
    name = "{} {}".format(row["First Name"], row["Last Name"])
    party = PARTIES[row["Party"]]
    leg = Person(
        name=name,
        district=row["District"].lstrip("0"),
        party=party,
        primary_org="upper",
        role="Senator",
        image=self.extra_info[name]["image"],
    )
    leg.add_link(self.extra_info[name]["url"])
    leg.add_contact_detail(type="voice",
                           value=self.extra_info[name]["office_phone"],
                           note="capitol")
    if "email" in self.extra_info[name]:
        leg.add_contact_detail(type="email",
                               value=self.extra_info[name]["email"],
                               note="capitol")

    row["Zipcode"] = row["Zipcode"].strip()
    # Accommodate for multiple address column naming conventions.
    address1_fields = [row.get("Address"), row.get("Office Building")]
    address2_fields = [row.get("Address2"), row.get("Office Address")]
    row["Address"] = next((a for a in address1_fields if a is not None), False)
    row["Address2"] = next((a for a in address2_fields if a is not None), False)

    # capitol-complex buildings get a "capitol" note; everything else "district"
    if row["Address2"] and any(
        a in row["Address2"]
        for a in ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]
    ):
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
            **row)
        if "Rm. Number" in row:
            address = "{0} {1}".format(row["Rm. Number"], address)
        leg.add_contact_detail(type="address", value=address, note="capitol")
    elif row["Address2"]:
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
            **row)
        leg.add_contact_detail(type="address", value=address, note="district")
    else:
        address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")

    leg.add_source(self.url)
    leg.add_source(self._html_url)
    return leg
def _scrape_legislator(self, row, chamber):
    """Yield a Person built from one row of the Montana roster table.

    Follows the member's detail page to pull the mailing address.
    """
    name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
    # collapse the multi-line cell text into a single-spaced name
    name = " ".join([
        line.strip()
        for line in name_cell.text_content().split("\n")
        if len(line.strip()) > 0
    ])
    party_letter = row.xpath(
        './td[@class="rosterCell partyCell"]/text()')[0].strip()
    party = dict(D="Democratic", R="Republican")[party_letter]
    chamber_abbr = self._chamber_map[chamber]
    # strip the chamber abbreviation prefix off the seat label
    district = (row.xpath('./td[@class="rosterCell seatCell"]'
                          "/text()")[0].replace(chamber_abbr, "").strip())
    try:
        email = (row.xpath('./td[@class="rosterCell emailCell"]'
                           "/a/@href")[0].replace("mailto:", "").strip())
    except IndexError:
        email = None
    phone = (row.xpath('./td[@class="rosterCell phoneCell"]'
                       "/text()")[0].strip() or None)
    details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"])
    response = self.get(details_url)
    details_page = lxml.html.fromstring(response.text)
    # the "Address" paragraph on the detail page carries the mailing address
    address_lines = (details_page.xpath(
        '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
        '/p[contains(text(), "Address")]')[0].text_content().replace(
            "Address", "").split("\n"))
    address = "\n".join(
        [line.strip() for line in address_lines if len(line.strip()) > 0])
    legislator = Person(name=name, district=district, party=party,
                        primary_org=chamber)
    legislator.add_contact_detail(type="address", value=address,
                                  note="Capitol Office")
    if phone is not None:
        legislator.add_contact_detail(type="voice", value=phone,
                                      note="Capitol Office")
    if email is not None:
        legislator.add_contact_detail(type="email", value=email,
                                      note="E-mail")
    legislator.add_link(details_url)
    legislator.add_source(self._roster_url)
    yield legislator
def scrape_senator(self, district):
    """Scrape the Maine senator for *district* and yield a Person.

    Name and party are parsed from the page title; image, address, phone
    and email come from the profile paragraphs.

    BUG FIX: the email contact detail was previously added
    unconditionally, producing a contact with value None when the profile
    had no "Email" field — it is now guarded like address and phone.
    """
    link = "https://legislature.maine.gov/District-{}".format(district)
    page = lxml.html.fromstring(self.get(link).text)
    page.make_links_absolute(link)
    main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
    title = main.xpath("h1")[0].text
    # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
    title_match = re.match(
        r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title)
    _, name, party = title_match.groups()
    name = re.sub(r"\s+", " ", name.strip())
    party = _party_map[party]
    image_url = address = phone = email = None
    # profile paragraphs are "Field: value" pairs, plus one image paragraph
    for p in main.xpath("p"):
        if p.xpath(".//img") and not image_url:
            image_url = p.xpath(".//img/@src")[0]
            continue
        field, _, value = p.text_content().partition(":")
        value = value.strip()
        if field in ("Address", "Mailing Address"):
            address = value
        elif field in ("Phone", "Home Phone"):
            phone = value
        elif field == "Email":
            email = value
    person = Person(
        name=name,
        district=district,
        image=image_url,
        primary_org="upper",
        party=party,
    )
    person.add_link(link)
    person.add_source(link)
    if address:
        person.add_contact_detail(type="address", value=address,
                                  note="District Office")
    if phone:
        person.add_contact_detail(type="voice", value=clean_phone(phone),
                                  note="District Phone")
    if email:
        person.add_contact_detail(type="email", value=email,
                                  note="District Email")
    yield person
def test_save_object_basics():
    """save_object must dump exactly one JSON file into the person bucket."""
    scraper = Scraper(juris, "/tmp/")
    person = Person("Michael Jordan")
    person.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(person)

    # ensure object is saved in right place
    expected_filename = "person_" + person._id + ".json"
    assert expected_filename in scraper.output_names["person"]
    json_dump.assert_called_once_with(person.as_dict(), mock.ANY, cls=mock.ANY)
def scrape_lower_legislator(self, url, leg_info):
    """Scrape one lower-house member page; yields a Person.

    Skips (returns without yielding) when the seat is vacant.
    """
    page = self.lxmlize(url)

    name = page.xpath(
        '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip()
    if name.startswith("District ") or name.startswith("Vacant "):
        self.warning("Seat is vacant: {}".format(name))
        return

    photo = page.xpath(
        '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"]

    party_flags = {
        "Democrat": "Democratic",
        "Republican": "Republican",
        "Independent": "Independent",
    }
    raw_party = page.xpath(
        '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
    )[0].strip()
    party = party_flags[raw_party]

    try:
        email = page.xpath(
            '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
        )[0].strip()
    except IndexError:
        email = None

    district = leg_info["dist"].replace("Dist", "").strip()

    person = Person(name=name, party=party, district=district,
                    primary_org="lower", image=photo)

    # attach whichever district-office details are actually present
    for value, detail_type in (
        (leg_info["office"], "address"),
        (leg_info["phone"], "voice"),
        (email, "email"),
    ):
        if value:
            person.add_contact_detail(type=detail_type, value=value,
                                      note="District Office")

    person.add_source(url)
    person.add_link(url)
    yield person
def test_person_add_membership_org():
    """add_membership should create one related Membership linking person and org."""
    person = Person("Bob B. Bear")
    person.add_source("http://example.com")
    org = Organization("test org", classification="unknown")
    person.add_membership(org, role="member", start_date="2007",
                          end_date=datetime.date(2015, 5, 8))

    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == "2007"
    assert membership.end_date == datetime.date(2015, 5, 8)
def test_save_related():
    """save_object must also dump objects attached via _related."""
    scraper = Scraper(juris, "/tmp/")
    person = Person("Michael Jordan")
    person.add_source("http://example.com")
    org = Organization("Chicago Bulls", classification="committee")
    org.add_source("http://example.com")
    person._related.append(org)

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(person)

    expected_calls = [
        mock.call(person.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(org.as_dict(), mock.ANY, cls=mock.ANY),
    ]
    assert json_dump.mock_calls == expected_calls
def scrape_chamber(self, chamber):
    """Scrape Pennsylvania legislators for one chamber.

    Email addresses live on a separate contacts page, so the roster page
    and the contacts page are both fetched up front.
    """
    leg_list_url = utils.urls["people"][chamber]
    page = self.get(leg_list_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(leg_list_url)
    # email addresses are hidden away on a separate page now, at
    # least for Senators
    contact_url = utils.urls["contacts"][chamber]
    contact_page = self.get(contact_url).text
    contact_page = lxml.html.fromstring(contact_page)
    for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
        # names are listed "Last, First"; reverse into "First Last"
        full_name = " ".join(link.text.split(", ")[::-1]).strip()
        full_name = re.sub(r"\s+", " ", full_name)
        district = link.getparent().getnext().tail.strip()
        district = re.search(r"District (\d+)", district).group(1)
        # party letter sits just before the closing paren, e.g. "(R)" -> "R"
        party = link.getparent().tail.strip()[-2]
        if party == "R":
            party = "Republican"
        elif party == "D":
            party = "Democratic"
        elif party == "I":
            party = "Independent"
        url = link.get("href")
        leg_id = url.split("?id=")[1]
        person = Person(name=full_name, district=district,
                        party=party, primary_org=chamber)
        person.add_link(leg_list_url)
        person.add_source(leg_list_url)
        # Scrape email, offices, photo.
        page = self.get(url).text
        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(url)
        email = self.scrape_email_address(contact_page, leg_id)
        self.scrape_offices(url, doc, person, email)
        self.scrape_photo_url(url, doc, person)
        yield person
def scrape_chamber(self, session):
    """Yield a Person for each Oregon legislator in *session* via the API."""
    session_key = SESSION_KEYS[session]
    legislators_reponse = self.api_client.get("legislators", session=session_key)
    chamber_orgs = {"S": "upper", "H": "lower"}
    for legislator in legislators_reponse:
        url_name = legislator["WebSiteUrl"].split("/")[-1]
        chamber_name = "house" if legislator["Chamber"] == "H" else "senate"
        img = "https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg".format(
            chamber_name, url_name
        )
        # normalize the API's party label
        party = legislator["Party"]
        if party == "Democrat":
            party = "Democratic"
        person = Person(
            name="{} {}".format(legislator["FirstName"], legislator["LastName"]),
            primary_org=chamber_orgs[legislator["Chamber"]],
            party=party,
            district=legislator["DistrictNumber"],
            image=img,
        )
        person.add_link(legislator["WebSiteUrl"])
        person.add_source(legislator["WebSiteUrl"])
        capitol_note = "Capitol Office"
        if legislator["CapitolAddress"]:
            person.add_contact_detail(type="address",
                                      value=legislator["CapitolAddress"],
                                      note=capitol_note)
        if legislator["CapitolPhone"]:
            person.add_contact_detail(type="voice",
                                      value=legislator["CapitolPhone"],
                                      note=capitol_note)
        person.add_contact_detail(type="email",
                                  value=legislator["EmailAddress"],
                                  note=capitol_note)
        yield person
def handle_list_item(self, item):
    """Build a Representative Person from one roster card and yield it.

    BUG FIX: ``phone`` and ``email`` were only assigned inside their
    validation branches, so an invalid phone/email raised NameError at the
    add_contact_detail calls.  They now default to None and are attached
    only when valid.
    """
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]
    # card text like "Name (07B, DFL)" -> name, district, party abbreviation
    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]
    # first two text chunks are address lines, the third is a phone number
    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    address = "\n".join((info_texts[0], info_texts[1]))

    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)
    yield rep
def scrape_upper_chamber(self, term):
    """Scrape Oklahoma senators from the Senate roster page.

    ``term`` is unused in this body.  Name, party and district come from
    the roster table; offices are filled in by scrape_upper_offices.
    """
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    for a in doc.xpath("//table[@summary]")[0].xpath(
            './/td//a[contains(@href, "biographies")]'):
        # district number follows the link either as tail text or in a
        # sibling span
        tail = a.xpath("..")[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath("../../span")[1].text.split()[1]
        if a.text is None or a.text.strip() == "Vacant":
            self.warning(
                "District {} appears to be empty".format(district))
            continue
        else:
            # link text like "Name (R)" -> name + mapped party
            match = re.match(r"(.+) \(([A-Z])\)", a.text.strip())
            if match:
                name, party = match.group(1), self._parties[match.group(2)]
            else:
                self.warning(
                    "District {} appears to have empty Representative name,party"
                    .format(district))
                continue
        # NOTE: rebinds ``url`` from the roster page to the member page
        url = a.get("href")
        person = Person(primary_org="upper",
                        district=district,
                        name=name.strip(),
                        party=party)
        person.add_link(url)
        person.add_source(url)
        self.scrape_upper_offices(person, url)
        yield person
def scrape_chamber(self, chamber):
    """Scrape Delaware legislators for one chamber from the JSON API."""
    list_urls = {
        "upper": "https://legis.delaware.gov/json/Senate/GetSenators",
        "lower": "https://legis.delaware.gov/json/House/GetRepresentatives",
    }
    source_urls = {
        "upper": "https://legis.delaware.gov/Senate",
        "lower": "https://legis.delaware.gov/House",
    }
    url = list_urls[chamber]
    source_url = source_urls[chamber]

    data = self.post(url).json()["Data"]
    for item in data:
        # Vacant district
        if item["PersonFullName"] is None:
            self.warning("District {} was detected as vacant".format(
                item["DistrictNumber"]))
            continue

        leg_url = ("https://legis.delaware.gov/"
                   + "LegislatorDetail?personId={}".format(item["PersonId"]))
        doc = self.lxmlize(leg_url)
        image_url = doc.xpath("//img/@src")[0]

        leg = Person(
            name=item["PersonFullName"],
            district=str(item["DistrictNumber"]),
            party=PARTY[item["PartyCode"]],
            primary_org=chamber,
            image=image_url,
        )
        self.scrape_contact_info(leg, doc)
        leg.add_link(leg_url, note="legislator page")
        leg.add_source(source_url, note="legislator list page")
        yield leg
def handle_list_item(self, item):
    """Build a Senator Person from a roster entry, resolving redirects.

    Returns None for vacant seats; otherwise returns the Person after
    queueing a SenDetail scrape of the (possibly redirected) member URL.
    """
    raw = " ".join(item.xpath(".//text()"))
    name = re.sub(r"\s+", " ", raw).replace(" ,", ",").strip()
    if "Vacant" in name:
        return

    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    if party == "Democrat":
        party = "Democratic"

    leg_url = item.get("href")

    # roster lists "Last, First ..."; swap the first two comma pieces
    pieces = name.split(", ")
    pieces[:2] = pieces[1::-1]  # reverse first two
    name = " ".join(pieces)

    leg = Person(
        name=name,
        district=district,
        party=party,
        primary_org="upper",
        role="Senator",
    )

    # follow a single redirect (if any) to the canonical member page
    response = requests.head(leg_url)
    if 300 <= response.status_code < 400:
        leg_url = response.headers["Location"]
        if leg_url.startswith("/"):
            leg_url = "https://www.flsenate.gov" + leg_url

    leg.add_link(leg_url)
    leg.add_source(self.url)
    leg.add_source(leg_url)
    self.scrape_page(SenDetail, leg_url, obj=leg)
    return leg
def handle_list_item(self, item):
    """Yield a Person for an active member link, following their detail page.

    Skips members whose link text marks them resigned/vacated/retired, and
    members who moved to the other chamber per CHAMBER_MOVES.
    """
    name = item.text
    lowered = name.lower()
    if "resigned" in lowered or "vacated" in lowered or "retired" in lowered:
        return
    if name in CHAMBER_MOVES and self.chamber != CHAMBER_MOVES[name]:
        return

    name, action, date = clean_name(name)

    leg = Person(name=name)
    leg.add_source(self.url)
    leg.add_source(item.get("href"))
    leg.add_link(item.get("href"))
    yield from self.scrape_page(
        self.detail_page,
        item.get("href"),
        session=self.kwargs["session"],
        committees=self.kwargs["committees"],
        obj=leg,
    )
    yield leg
def scrape(self): base_url = "http://news.legislature.ne.gov/dist" # there are 49 districts for district in range(1, 50): rep_url = base_url + str(district).zfill(2) full_name = None address = None phone = None email = None photo_url = None try: page = self.lxmlize(rep_url) info_node = self.get_node( page, '//div[@class="container view-front"]' '//div[@class="col-sm-4 col-md-3 ltc-col-right"]' '/div[@class="block-box"]', ) full_name = self.get_node(info_node, "./h2/text()[normalize-space()]") full_name = re.sub(r"^Sen\.[\s]+", "", full_name).strip() if full_name == "Seat Vacant": continue address_node = self.get_node( info_node, './address[@class="feature-content"]') email = self.get_node( address_node, './a[starts-with(@href, "mailto:")]/text()') contact_text_nodes = self.get_nodes( address_node, "./text()[following-sibling::br]") address_sections = [] for text in contact_text_nodes: text = text.strip() if not text: continue phone_match = re.search(r"Phone:", text) if phone_match: phone = re.sub(r"^Phone:[\s]+", "", text) continue # If neither a phone number nor e-mail address. address_sections.append(text) address = "\n".join(address_sections) photo_url = ( "http://www.nebraskalegislature.gov/media/images/blogs" "/dist{:2d}.jpg").format(district) # Nebraska is offically nonpartisan. party = "Nonpartisan" person = Person( name=full_name, district=str(district), party=party, image=photo_url, primary_org="legislature", ) person.add_link(rep_url) person.add_source(rep_url) note = "Capitol Office" person.add_contact_detail(type="address", value=address, note=note) if phone: person.add_contact_detail(type="voice", value=phone, note=note) if email: person.add_contact_detail(type="email", value=email, note=note) yield person except scrapelib.HTTPError: self.warning("could not retrieve %s" % rep_url)
def scrape_chamber(self, chamber):
    """Scrape Alabama legislators for one chamber from ALISON.

    Member detail pages are reachable only via two internal IDs
    (OID_SPONSOR/OID_PERSON) harvested from the member-list markup.
    """
    # the url for each rep is unfindable (by me)
    # and the parts needed to make it up do not appear in the html or js.
    # we can find basic information on the main rep page, and sponsor
    # info on a version of their indivdual page called using only their
    # sponsor ID (which we have to scrape from ALISON)
    # we can't get detailed information without another ID
    # which I have not been able to find.
    if chamber == "upper":
        member_list_url = self._base_url + "Senate/ALSenators.aspx"
        legislator_base_url = self._base_url + "ALSenator.aspx"
    elif chamber == "lower":
        member_list_url = self._base_url + "House/ALRepresentatives.aspx"
        legislator_base_url = self._base_url + "ALRepresentative.aspx"
    page = self.lxmlize(member_list_url)
    legislator_nodes = self.get_nodes(
        page,
        '//div[@class="container container-main"]/table/tr/td/input')
    legislator_url_template = (legislator_base_url + "?OID_SPONSOR="
                               "{oid_sponsor}&OID_PERSON={oid_person}")
    html_parser = HTMLParser()
    for legislator_node in legislator_nodes:
        # Set identifiers internal to AlisonDB.
        # Have to do this to OID_SPONSOR because they don't know
        # how to HTML and I'm making links absolute out of convenience.
        try:
            oid_sponsor = legislator_node.attrib["longdesc"].split("/")[-1]
            oid_person = legislator_node.attrib["alt"]
        except KeyError:
            continue
        legislator_url = legislator_url_template.format(
            oid_sponsor=oid_sponsor, oid_person=oid_person)
        legislator_page = self.lxmlize(legislator_url)
        name_text = self.get_node(
            legislator_page,
            '//span[@id="ContentPlaceHolder1_lblMember"]').text_content()
        # This just makes processing the text easier.
        name_text = name_text.lower()
        # Skip vacant seats.
        if "vacant" in name_text:
            continue
        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            "/@src",
        )
        # Another check for vacant seats
        if "VACANT.jpeg" in photo_url or "pending.jpeg" in photo_url:
            continue
        # Removes titles and nicknames.  The curly quotes in the regex
        # strip nicknames rendered as “...” in the page text.
        name = html_parser.unescape(
            re.sub(r"(?i)(representative|senator|“.*”)", "",
                   name_text).strip().title())
        # Assemble full name by reversing last name, first name format.
        name_parts = [x.strip() for x in name.split(",")]
        full_name = "{0} {1}".format(name_parts[1], name_parts[0])
        info_node = self.get_node(
            legislator_page,
            '//div[@id="ContentPlaceHolder1_TabSenator_body"]//table',
        )
        district_text = self.get_node(info_node, "./tr[2]/td[2]").text_content()
        district_text = district_text.replace(" ", u"")
        if chamber == "upper":
            district = district_text.replace("Senate District", "").strip()
        elif chamber == "lower":
            district = district_text.replace("House District", "").strip()
        party_text = self.get_node(info_node, "./tr[1]/td[2]").text_content()
        if not full_name.strip() and party_text == "()":
            self.warning(
                "Found empty seat, for district {}; skipping".format(
                    district))
            continue
        if party_text.strip() in self._parties.keys():
            party = self._parties[party_text.strip()]
        else:
            party = None
        phone_number = (self.get_node(
            info_node, "./tr[4]/td[2]").text_content().strip())
        fax_number = (self.get_node(
            info_node,
            "./tr[5]/td[2]").text_content().strip().replace("\u00a0", ""))
        suite_text = self.get_node(info_node, "./tr[7]/td[2]").text_content()
        office_address = "{}\n11 S. Union Street\nMontgomery, AL 36130".format(
            suite_text)
        email_address = self.get_node(info_node, "./tr[11]/td[2]").text_content()
        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            "/@src",
        )
        # add basic leg info and main office
        person = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.add_contact_detail(type="address",
                                  value=office_address,
                                  note="Capitol Office")
        if phone_number:
            person.add_contact_detail(type="voice",
                                      value=phone_number,
                                      note="Capitol Office")
        if fax_number:
            person.add_contact_detail(type="fax",
                                      value=fax_number,
                                      note="Capitol Office")
        if email_address:
            person.add_contact_detail(type="email",
                                      value=email_address,
                                      note="Capitol Office")
        self.add_committees(legislator_page, person, chamber, legislator_url)
        person.add_link(legislator_url)
        person.add_source(legislator_url)
        person.add_source(member_list_url)
        yield person
def scrape_people(self):
    """Yield a single hard-coded Person record with one source attached."""
    # Minimal example scrape: one fixed person, one source URL.
    jordan = Person("Michael Jordan")
    jordan.add_source("http://example.com")
    yield jordan
def scrape_member(self, chamber, member_url):
    """Scrape one legislator's profile page and yield a Person.

    :param chamber: chamber identifier; stored as the person's primary org.
    :param member_url: absolute URL of the member's profile page.
    """
    html = self.get(member_url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(member_url)

    photo_url = doc.xpath('//a[@class="download"]/@href')[0]

    # Header text looks like "<title> First [Middle] Last (P)": drop the
    # leading title word and peel the trailing party letter off the end.
    header_words = doc.xpath(
        '//div[@class="row profile-top"]/h2/text()')[0].split()
    full_name = " ".join(header_words[1:-1]).strip()
    # Unrecognized party tokens pass through unchanged, as before.
    party = {
        "(R)": "Republican",
        "(D)": "Democratic",
        "(I)": "Independent",
    }.get(header_words[-1], header_words[-1])

    sidebar = doc.xpath(
        '//div[@class="relativeContent col-sm-4 col-xs-12"]')[0]
    district = sidebar.xpath('//div[@class="circle"]/h3/text()')[0].lstrip("0")

    person = Person(
        name=full_name,
        district=district,
        party=party,
        primary_org=chamber,
        image=photo_url,
    )
    person.add_source(member_url)
    person.add_link(member_url)

    # Sidebar children alternate: a <p> label element immediately followed
    # by the node holding that label's value.
    info = {}
    children = iter(sidebar.getchildren())
    for child in children:
        if child.tag == "p":
            info[child.text] = next(children)

    address = "\n".join(info["Legislative Address"].xpath("./text()"))
    email = info["Email"].text

    phone = fax = None
    for line in info["Phone Number(s)"].xpath("./text()"):
        label, number = line.split(": ")
        if label != "LRC":
            continue
        if number.endswith(" (fax)"):
            fax = number.replace(" (fax)", "")
        else:
            phone = number

    # Same emission order as the original: voice, fax, email, then address.
    for detail_type, value in (("voice", phone), ("fax", fax), ("email", email)):
        if value:
            person.add_contact_detail(
                type=detail_type, value=value, note="Capitol Office")

    if address.strip() == "":
        self.warning("Missing Capitol Office!!")
    else:
        person.add_contact_detail(
            type="address", value=address, note="Capitol Office")

    yield person
def scrape_chamber(self, chamber=None):
    """Scrape Rhode Island legislators for one chamber.

    Member data comes from a published .xls roster; phone, email, and a
    per-member detail link come from a separate contact-list page keyed
    by district number.  Relies on the module-level ``link_col_ix``,
    ``excel_mapping``, and ``translate`` tables.

    :param chamber: "upper" (Senate) or "lower" (House).
    :raises ValueError: if *chamber* is neither "upper" nor "lower".
    :yields: Person objects with district, party, photo, and contacts.
    """
    if chamber == "upper":
        url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Senators.xls"
        rep_type = "Senator"
        contact_url = (
            "http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp"
        )
    elif chamber == "lower":
        url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Representatives.xls"
        rep_type = "Representative"
        contact_url = (
            "http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp"
        )
    else:
        # Previously an unknown chamber fell through to an UnboundLocalError
        # on `url`; fail fast with a clear message instead.
        raise ValueError(
            "chamber must be 'upper' or 'lower', got {!r}".format(chamber)
        )

    # Build district -> {email, phone, detail_link} from the contact page.
    contact_page = self.lxmlize(contact_url)
    contact_info_by_district = {}
    for row in contact_page.xpath('//tr[@valign="TOP"]'):
        tds = row.xpath("td")
        (detail_link,) = tds[link_col_ix].xpath(".//a/@href")
        # Ignore the name (2nd col); the spreadsheet name is parsed below.
        district, _, email, phone = [
            td.text_content().strip() for td in tds[:link_col_ix]
        ]
        contact_info_by_district[district] = {
            "email": email,
            "phone": phone,
            "detail_link": detail_link,
        }

    self.urlretrieve(url, "ri_leg.xls")
    wb = xlrd.open_workbook("ri_leg.xls")
    sh = wb.sheet_by_index(0)

    for rownum in range(1, sh.nrows):
        d = {
            field: sh.cell(rownum, col_num).value
            for field, col_num in excel_mapping.items()
        }

        # Convert float to an int, then to string — the key format used
        # by contact_info_by_district.
        district = str(int(d["district"]))

        if d["full_name"].upper() == "VACANT":
            self.warning("District {}'s seat is vacant".format(district))
            continue

        contact_info = contact_info_by_district[district]

        # RI is very fond of "First M. Last" name formats and they're
        # being misparsed upstream, so fix here.
        (first, middle, last) = ("", "", "")
        full_name = re.sub(
            r"^{}(?=\s?[A-Z].*$)".format(rep_type), "", d["full_name"]
        ).strip()
        if re.match(r"^\S+\s[A-Z]\.\s\S+$", full_name):
            (first, middle, last) = full_name.split()

        # Note - if we ever need to speed this up, it looks like photo_url
        # can be mapped from the detail_link a la
        # /senators/Paolino/ -> /senators/pictures/Paolino.jpg
        detail_page = self.lxmlize(contact_info["detail_link"])
        try:
            (photo_url,) = detail_page.xpath(
                '//div[@class="ms-WPBody"]//img/@src'
            )
        except ValueError:
            photo_url = ""

        person = Person(
            primary_org=chamber,
            district=district,
            name=full_name,
            party=translate[d["party"]],
            image=photo_url,
        )
        person.extras["town_represented"] = d["town_represented"]
        person.extras["name_first"] = first
        person.extras["name_middle"] = middle
        person.extras["name_last"] = last

        # BUG FIX: the original called add_link(detail_link), a stale
        # variable left over from the contact-page loop above (it always
        # held the *last* row's link); link this member's own detail page.
        person.add_link(contact_info["detail_link"])

        if d["address"]:
            person.add_contact_detail(
                type="address", value=d["address"], note="District Office"
            )
        if contact_info["phone"]:
            person.add_contact_detail(
                type="voice", value=contact_info["phone"], note="District Office"
            )
        if contact_info["email"]:
            person.add_contact_detail(
                type="email", value=contact_info["email"], note="District Office"
            )

        person.add_source(contact_url)
        person.add_source(contact_info["detail_link"])
        yield person