def test_same_name_people():
    """Exercise SameNameError handling in PersonImporter.

    Two scraped people with identical names are ambiguous unless they can be
    disambiguated by birth_date. The test walks through: ambiguous import
    (error), single import (ok), re-import of a duplicate name (error),
    disambiguation via birth dates (ok), a third same-name person without a
    birth date (error), and finally an update plus a new insert triggered by
    a changed birth date.
    """
    create_jurisdiction()
    o = Organization.objects.create(name="WWE", jurisdiction_id="jid")
    # importing two people with the same name to a pristine database should error
    p1 = ScrapePerson("Dwayne Johnson", image="http://example.com/1")
    p2 = ScrapePerson("Dwayne Johnson", image="http://example.com/2")
    with pytest.raises(SameNameError):
        PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()])
    # importing one person should pass
    PersonImporter("jid").import_data([p1.as_dict()])
    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)
    # importing another person with the same name should fail
    with pytest.raises(SameNameError):
        PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()])
    # adding birth dates should pass
    p1.birth_date = "1970"
    p2.birth_date = "1930"
    resp = PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()])
    # p1 already exists (update); p2 is now distinguishable (insert)
    assert resp["person"]["insert"] == 1
    assert resp["person"]["noop"] == 0
    assert resp["person"]["update"] == 1
    assert Person.objects.count() == 2
    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)
    # adding a third person with the same name but without a birthday should error
    p3 = ScrapePerson("Dwayne Johnson", image="http://example.com/3")
    with pytest.raises(SameNameError):
        PersonImporter("jid").import_data([p3.as_dict()])
    # and now test that an update works and we can insert a new one with the same name
    p1.image = "http://example.com/1.jpg"
    p2.birth_date = "1931"  # change birth_date, means a new insert
    resp = PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp["person"]["insert"] == 1
    assert resp["person"]["noop"] == 0
    assert resp["person"]["update"] == 1
def _parse_person(self, row, chamber, seat_map): # Capture legislator vitals. first_name = row["FirstName"] middle_name = row["MiddleName"] last_name = row["LastName"] full_name = "{} {} {}".format(first_name, middle_name, last_name) full_name = re.sub(r"[\s]{2,}", " ", full_name) if chamber == "lower": district = "{} {}".format(row["County"], int(row["District"])).strip() else: district = str(int(row["District"])).strip() party = self.party_map[row["party"].upper()] email = row["WorkEmail"] if district == "0": self.warning("Skipping {}, district is set to 0".format(full_name)) return person = Person(primary_org=chamber, district=district, name=full_name, party=party) extras = { "first_name": first_name, "middle_name": middle_name, "last_name": last_name, } person.extras = extras if email: office = "Capitol" if email.endswith( "@leg.state.nh.us") else "District" person.add_contact_detail(type="email", value=email, note=office + " Office") # Capture legislator office contact information. district_address = "{}\n{}\n{}, {} {}".format(row["Address"], row["address2"], row["city"], row["State"], row["Zipcode"]).strip() phone = row["Phone"].strip() if not phone: phone = None if district_address: office = "Capitol" if chamber == "upper" else "District" person.add_contact_detail(type="address", value=district_address, note=office + " Office") if phone: office = "Capitol" if "271-" in phone else "District" person.add_contact_detail(type="voice", value=phone, note=office + " Office") # Retrieve legislator portrait. profile_url = None if chamber == "upper": profile_url = self.senate_profile_url.format(row["District"]) elif chamber == "lower": try: seat_number = seat_map[row["seatno"]] profile_url = self.house_profile_url.format(seat_number) except KeyError: pass if profile_url: person.image = self._get_photo(profile_url, chamber) person.add_source(profile_url) return person
def _scrape_representative(self, url, parties):
    # logger.info(f'Generating representative person object from {url}')
    """
    Yield a Person object representing a member of the lower
    legislative chamber, or nothing when the seat is vacant.

    ``parties`` maps district strings to party names.
    """
    # url = self.get(url).text.replace('<br>', '')
    member_page = self.lxmlize(url)
    photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
    # A bare "/.jpg" src means the member has no photo uploaded.
    if photo_url.endswith("/.jpg"):
        photo_url = None
    scraped_name, district_text = member_page.xpath(
        '//div[@class="member-info"]/h2')
    scraped_name = scraped_name.text_content().strip().replace("Rep. ", "")
    scraped_name = " ".join(scraped_name.split())
    # Roster names are "Last, First"; flip to "First Last".
    name = " ".join(scraped_name.split(", ")[::-1])
    district_text = district_text.text_content().strip()
    district = str(self.district_re.search(district_text).group(1))

    # Vacant house "members" are named after their district numbers:
    if re.match(r"^District \d+$", scraped_name):
        return None

    party = parties[district]
    person = Person(name=name, district=district, party=party, primary_org="lower")

    if photo_url is not None:
        person.image = photo_url

    person.add_link(url)
    person.add_source(url)

    def office_name(element):
        """Returns the office address type (the preceding <h4> label)."""
        return element.xpath("preceding-sibling::h4[1]/text()")[0].rstrip(
            ":")

    # One entry per office paragraph; "type" is e.g. "capitol"/"district".
    offices_text = [{
        "name": office_name(p_tag),
        "type": office_name(p_tag).replace(" Address", "").lower(),
        "details": p_tag.text_content(),
    } for p_tag in member_page.xpath(
        '//h4/following-sibling::p[@class="double-space"]')]

    # NOTE: the loop deliberately appends to ``offices_text`` while
    # iterating it (Python list iteration picks up appended items), so the
    # extra district offices split off below get processed too.
    for office_text in offices_text:
        details = office_text["details"].strip()

        # A few member pages have blank office listings:
        if details == "":
            continue

        # At the time of writing, this case of multiple district
        # offices occurs exactly once, for the representative at
        # District 43:
        if details.count("Office") > 1:
            district_offices = [
                district_office.strip()
                for district_office
                in re.findall(r"(\w+ Office.+?(?=\w+ Office|$))", details, flags=re.DOTALL)
            ]
            # NOTE(review): the trailing "(?=\w+ Office|$)?" lookahead is
            # optional and therefore a no-op — the ".+" is greedy to the end
            # of each pre-split chunk regardless. Confirm before changing.
            offices_text += [{
                "name": re.match(r"\w+ Office", office).group(),
                "type": "district",
                "details": re.search(r"(?<=Office).+(?=\w+ Office|$)?", office, re.DOTALL).group(),
            } for office in district_offices]

        match = self.address_re.search(details)
        if match is not None:
            # Collapse blank lines and strip trailing spaces per line.
            address = re.sub(
                " +$",
                "",
                match.group().replace("\r", "").replace("\n\n", "\n"),
                flags=re.MULTILINE,
            )
        else:
            # No valid address found in the details.
            continue

        phone_number = extract_phone(details)
        fax_number = extract_fax(details)

        if address:
            person.add_contact_detail(type="address", value=address, note=office_text["name"])
        if phone_number:
            person.add_contact_detail(type="voice", value=phone_number, note=office_text["name"])
        if fax_number:
            person.add_contact_detail(type="fax", value=fax_number, note=office_text["name"])

    yield person
def _scrape_senator(self, url, parties):
    """
    Yield a Person object representing a member of the upper
    legislative chamber, or nothing when the seat is vacant.

    ``parties`` maps district strings to party names.
    """
    # Scrape legislator information from roster URL
    # Example: view-source:https://senate.texas.gov/member.php?d=1
    member_page = self.lxmlize(url)
    photo_url = member_page.xpath('//img[@id="memhead"]/@src')[0]
    scraped_name_district_text = member_page.xpath(
        '//div[@class="pgtitle"]/text()')[0]
    scraped_name, district_text = scraped_name_district_text.split(":")
    name = " ".join(scraped_name.replace("Senator ", "").split()).strip()
    district = str(district_text.split()[1]).strip()

    # Vacant house "members" are named after their district numbers:
    if re.match(r"^District \d+$", name):
        return None

    bio = " ".join(member_page.xpath('//div[@class="bio"]/text()'))
    party = parties[district]

    person = Person(
        name=name,
        district=district,
        party=party,
        primary_org="upper",
        biography=bio,
    )

    if photo_url is not None:
        person.image = photo_url

    person.add_link(url)
    person.add_source(url)

    # Collect (header id, label) pairs from the office table headers; each
    # header id keys the <td> cells belonging to that office.
    office_ids = []
    for th_tag in member_page.xpath('//table[@class="memdir"]/tr/th'):
        # Renamed from ``id`` to avoid shadowing the builtin.
        header_id = th_tag.xpath("@id")[0] if th_tag.xpath("@id") else ""
        label = th_tag.xpath("text()")[0].strip() if th_tag.xpath(
            "text()") else ""
        if header_id != "" and label != "":
            office_ids.append({"id": header_id, "label": label})

    for office in office_ids:
        row = member_page.xpath(
            f'//table[@class="memdir"]/tr/td[@headers="{office["id"]}"]')

        # A few member pages have broken ids for office listings:
        if len(row) == 0:
            row = member_page.xpath(
                '//table[@class="memdir"]/tr/td[@headers="dDA1"]')

        if len(row) > 0:
            details = " ".join(row[0].xpath("text()")).strip()
            details = details.replace("\r", "").replace("\n", "")

            # A few member pages have blank office listings:
            if details == "":
                continue

            match = self.address_re.search(details)
            if match is not None:
                address = re.sub(
                    " +$",
                    "",
                    match.group().replace("\r", "").replace("\n", ""),
                    flags=re.MULTILINE,
                )
            else:
                # No valid address found in the details.
                continue

            phone_number = extract_phone(details)
            fax_number = extract_fax(details)

            if address:
                person.add_contact_detail(type="address", value=address, note=office["label"])
            if phone_number:
                person.add_contact_detail(type="voice", value=phone_number, note=office["label"])
            if fax_number:
                person.add_contact_detail(type="fax", value=fax_number, note=office["label"])

    yield person
def legislators(self, latest_only):
    """Return {name: (Person, [(chamber, district, term, party), ...])}.

    Iterates membership rows, merging repeat appearances of the same
    legislator (same post-AKA name) into one Person with multiple terms,
    then scrapes each detail page for photo and contact info.
    """
    legs = {}
    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath("td")
        district = district.text
        detail_url = name.xpath("a/@href")[0]
        if party.text_content().strip() == "":
            party = "Independent"
        else:
            party = {"D": "Democratic", "R": "Republican", "I": "Independent"}[
                party.text
            ]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith("*"):
            # NOTE(review): this assignment is dead — the stripped name is
            # discarded by the ``continue``. Possibly leftover from when
            # starred members were processed.
            name = name.strip("*")
            continue

        # Normalize known alias names.
        name = AKA.get(name, name)

        if name in legs:
            # Seen in an earlier term: reuse the Person, record the new term.
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        hotgarbage = (
            "Senate Biography Information for the 98th General "
            "Assembly is not currently available."
        )

        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning("No legislator bio available for " + name)
            continue

        photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url

        # Reset contact details so a later term's page replaces (not
        # appends to) details gathered from an earlier term's page.
        p.contact_details = []

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            p.add_contact_detail(
                type="email", value=email[0].tail.strip(), note="Capitol Office"
            )

        offices = {
            "Capitol Office": '//table[contains(string(), "Springfield Office")]',
            "District Office": '//table[contains(string(), "District Office")]',
        }

        for location, xpath in offices.items():
            table = leg_doc.xpath(xpath)
            if table:
                # NOTE(review): ``table[3]`` is a magic index into the
                # matched tables — verify against the page layout. The
                # loop variable ``type`` also shadows the builtin.
                for type, value in self._table_to_office(table[3]):
                    if type in ("fax", "voice") and not validate_phone_number(
                        value
                    ):
                        continue

                    p.add_contact_detail(type=type, value=value, note=location)

    return legs
def scrape_chamber(self, chamber, session):
    """Yield a Person for every seated member of *chamber* in *session*.

    Scrapes the Wisconsin legislature roster page, follows each member's
    "Details" link (forced to https), and pulls name, party, district,
    photo, and contact information from the detail page. Vacant seats are
    skipped.
    """
    url = "https://docs.legis.wisconsin.gov/{}/legislators/{}".format(
        session, {
            "upper": "senate",
            "lower": "assembly"
        }[chamber])
    body = self.get(url).text
    page = lxml.html.fromstring(body)
    page.make_links_absolute(url)

    for row in page.xpath(
            ".//div[@class='box-content']/div[starts-with(@id,'district')]"
    ):
        if row.xpath(
                ".//a/@href") and not row.xpath(".//a[text()='Vacant']"):
            rep_url = row.xpath(".//a[text()='Details']/@href")[0]
            # BUG FIX: this previously used rep_url.strip("https://"),
            # but str.strip treats its argument as a *set of characters*
            # and would also eat leading/trailing 'h','t','p','s',':','/'
            # belonging to the URL itself. Split off the scheme instead
            # to force https safely.
            rep_url = "https://" + rep_url.split("://", 1)[-1]
            rep_doc = lxml.html.fromstring(self.get(rep_url).text)
            rep_doc.make_links_absolute(rep_url)

            full_name = (rep_doc.xpath('.//div[@id="district"]/h1/text()')
                         [0].replace("Senator ", "").replace("Representative ", ""))

            party = rep_doc.xpath('.//div[@id="district"]//small/text()')
            if len(party) > 0:
                # Text looks like "(R - Town)"; keep the abbreviation.
                party = PARTY_DICT[party[0].split("-")[0].strip("(").strip()]
            else:
                party = None

            # District number is the last path segment of the second link.
            district = rep_doc.xpath(
                './/div[@id="district"]/h3/a/@href')[1]
            district = district.split("/")[-1]
            district = str(int(district))

            # email
            email = rep_doc.xpath("//span[@class='info email']/a/text()")
            if email:
                email = email[0]
            else:
                email = ""

            assert party is not None, "{} is missing party".format(full_name)

            person = Person(name=full_name, district=district,
                            primary_org=chamber, party=party)

            img = rep_doc.xpath('.//div[@id="district"]/img/@src')
            if img:
                person.image = img[0]

            # office ####
            address_lines = rep_doc.xpath(
                './/span[@class="info office"]/text()')
            address = "\n".join([
                line.strip() for line in address_lines if line.strip() != ""
            ])
            person.add_contact_detail(type="address", value=address, note="Capitol Office")

            # Index [1] skips the label text node before the number —
            # presumably "Telephone:"/"Fax:"; verify against the markup.
            phone = rep_doc.xpath(
                './/span[@class="info telephone"]/text()')
            if phone:
                phone = re.sub(r"\s+", " ", phone[1]).strip()
                person.add_contact_detail(type="voice", value=phone, note="Capitol Office")

            fax = rep_doc.xpath('.//span[@class="info fax"]/text()')
            if fax:
                fax = re.sub(r"\s+", " ", fax[1]).strip()
                person.add_contact_detail(type="fax", value=fax, note="Capitol Office")

            if email:
                person.add_contact_detail(type="email", value=email, note="Capitol Office")

            person.add_link(rep_url)
            person.add_source(rep_url)
            yield person
def _scrape_upper_chamber(self):
    """Yield a Person for each seated senator.

    Reads the senator roster table, skips vacancies, then scrapes each
    senator's detail page for address, phone, fax, email, and photo.
    """
    self.info("Scraping upper chamber for legislators.")

    chamber = "upper"
    url = self._senators_url
    source_url = url
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    table = page.xpath('//*[@id="content-2"]//table//tr')

    # Loop-invariant: compile the phone pattern once, not per row.
    phone_pattern = re.compile(r"\(\d{3}\) \d{3}-\d{4}")

    rowcount = 0
    for tr in table:
        rowcount += 1
        # the first two rows are headers, skip:
        if rowcount <= 2:
            continue

        tds = tr.xpath("td")
        full_name = tds[0].xpath("div/a")[0].text_content().strip()

        if full_name.startswith(("Vacant", "Vacancy")) or full_name.endswith("Vacant"):
            self.warning("Skipping vacancy, named '{}'".format(full_name))
            continue

        # Cell text looks like "D-05" / "R-12".
        party_and_district = tds[1].text_content().strip().split("-")
        if party_and_district[0] == "D":
            party = "Democratic"
        elif party_and_district[0] == "R":
            party = "Republican"
        else:
            # BUG FIX: previously an unknown abbreviation fell through
            # silently, reusing the previous row's ``party`` (or raising
            # NameError on the first data row). Fail loudly instead.
            raise ValueError(
                "Unknown party abbreviation {!r} for {}".format(
                    party_and_district[0], full_name
                )
            )
        district = party_and_district[1].lstrip("0")

        # Roster-table phone; overwritten below by the detail-page number.
        phone = tds[3].xpath("div")[0].text_content().strip()

        url = self._senator_details_url.format(int(district))
        details_page = self.get(url).text
        if "currently vacant" in details_page:
            continue

        person = Person(name=full_name, primary_org=chamber,
                        district=district, party=party)
        person.add_source(source_url)
        person.add_source(url)
        person.add_link(url)

        page = lxml.html.fromstring(details_page)
        photo_url = page.xpath(
            '//*[@id="content-2"]//img[contains(@src, "uploads")]/@src')[0]

        # First <p> of the contact widget, minus the "Capitol Office:" label.
        contact_info = [
            line.strip()
            for line in page.xpath('//div[@class="textwidget"]/p[1]')
            [0].text_content().split("\n")
            if "Capitol Office:" not in line
        ]
        # First two lines are the street address.
        address = "\n".join(contact_info[:2])
        email = next((line for line in contact_info if "@" in line), None)

        phone_numbers = [
            line for line in contact_info
            if phone_pattern.search(line) is not None
        ]
        phone = phone_pattern.search(phone_numbers[0]).group()
        fax = next(
            (phone_pattern.search(phone_number).group()
             for phone_number in phone_numbers
             if "fax" in phone_number.lower()),
            None,
        )

        person.add_contact_detail(type="address", value=address, note="Capitol Office")
        person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        if fax:
            person.add_contact_detail(type="fax", value=fax, note="Capitol Office")
        if email:
            person.add_contact_detail(type="email", value=email, note="Capitol Office")

        person.image = photo_url
        yield person
def _scrape_lower_chamber(self):
    """Yield a Person for each House member listed on the roster table.

    Vacant seats are handed to ``self._save_vacant_legislator`` instead of
    being yielded. Returns early (yielding nothing) when the term's table
    is still empty.
    """
    self.info("Scraping lower chamber for legislators.")

    chamber = "lower"
    roster_url = self._reps_url
    page = self.get(roster_url).text
    page = lxml.html.fromstring(page)

    # This is the ASP.net table container
    table_xpath = "//table[@id='theTable']"
    table = page.xpath(table_xpath)[0]
    # First three rows are header chrome.
    for tr in table.xpath("tr")[3:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if tr.attrib.get("class") == "dxgvEmptyDataRow":
            self.warning("No House members found")
            return

        tds = tr.xpath("td")
        last_name = tds[1].text_content().strip()
        first_name = tds[2].text_content().strip()
        full_name = "{} {}".format(first_name, last_name)
        district = str(int(tds[3].text_content().strip()))
        party = tds[4].text_content().strip()
        if party == "D":
            party = "Democratic"
        elif party == "R":
            party = "Republican"
        if party.strip() == "":
            # Workaround for now.
            party = "Other"

        phone = tds[6].text_content().strip()
        room = tds[7].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else "")

        if last_name == "Vacant":
            # Record the vacancy but do not yield it.
            person = Person(name=full_name, primary_org=chamber,
                            district=district, party=party)
            person.extras = {
                "first_name": first_name,
                "last_name": last_name
            }

            person.add_contact_detail(type="address", value=address, note="Capitol Office")
            if phone.strip():
                person.add_contact_detail(type="voice", value=phone, note="Capitol Office")

            person.add_source(roster_url)
            self._save_vacant_legislator(person)
        else:
            # NOTE(review): this branch appears dead — ``party`` can never
            # be "" here because the "Other" fallback above already replaced
            # empty values; the leading-space keys also look unlikely to
            # match a "First Last" full_name. Confirm intent before fixing.
            party_override = {
                " Green": "Democratic",
                " Sisco": "Republican"
            }
            if party == "" and full_name in party_override:
                party = party_override[full_name]

            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)
            person = Person(name=full_name, primary_org=chamber,
                            district=district, party=party)
            person.extras = {
                "first_name": first_name,
                "last_name": last_name
            }
            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)
            email = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"] '
                '//a[starts-with(@href,"mailto:")]/@href')
            # Skip bare "mailto:" hrefs with no address.
            if len(email) > 0 and email[0].lower() != "mailto:":
                email = email[0].split(":")[1]
            else:
                email = None

            person.add_contact_detail(type="address", value=address, note="Capitol Office")
            if phone:
                person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            if email:
                person.add_contact_detail(type="email", value=email, note="Capitol Office")

            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if len(picture) > 0:
                person.image = picture[0]

            yield person