def process_page(self):
    # legislator roster is embedded as JSON in the Next.js data blob
    data_elem = CSS("#__NEXT_DATA__").match_one(self.root).text_content()
    data = json.loads(data_elem)
    for item in data["props"]["pageProps"]["legrosterData"][0]:
        first = item["First_Name"]
        middle = item["Middle_Name"]
        last = item["Last_Name"]
        suffix = item["Suffix"]
        member_id = item["BioLink"].split("/")[2]
        url = "https://www.njleg.state.nj.us" + item["BioLink"]
        party = {"D": "Democratic", "R": "Republican"}[item["Party"]]
        district = item["Roster_District"]
        chamber = "upper" if item["Roster_House"] == "Senate" else "lower"

        if middle:
            name = f"{first} {middle} {last}"
        else:
            name = f"{first} {last}"
        if suffix:
            name += f", {suffix}"

        p = ScrapePerson(
            name=name,
            given_name=first,
            family_name=last,
            state="nj",
            chamber=chamber,
            party=party,
            district=district,
        )

        p.add_source(self.source.url)
        p.add_source(url)
        p.add_link(url)

        api_url = (
            f"https://www.njleg.state.nj.us/api/legislatorData/legislatorBio/{member_id}"
        )
        p.add_source(api_url)

        yield LegDetail(p, source=api_url)
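# The roster above is read from the Next.js "__NEXT_DATA__" script tag rather
# than from rendered HTML. A minimal standalone sketch of that pattern with
# plain lxml + json (hypothetical helper, separate from the scraper classes):
import json
import lxml.html

def next_data_payload(html: str) -> dict:
    """Parse the JSON blob that Next.js embeds in every server-rendered page."""
    root = lxml.html.fromstring(html)
    script = root.get_element_by_id("__NEXT_DATA__")
    return json.loads(script.text_content())

# usage sketch:
# roster = next_data_payload(page_html)["props"]["pageProps"]["legrosterData"][0]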
def process_page(self):
    # drop the leading title from the name
    name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]

    p = ScrapePerson(
        name=name,
        state="ok",
        chamber="lower",
        party=self.party_css.match_one(self.root).text,
        district=self.district_css.match_one(self.root).text.split()[1],
    )
    p.image = self.image_selector.match_one(self.root).get("href")

    contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
    assert contact_url.startswith(
        "https://www.okhouse.gov/Members/Contact.aspx?District="
    )
    p.add_link(contact_url, note="Contact Form")

    # capitol address
    check_capitol_address = (
        CSS(".districtheadleft").match(self.root)[0].text_content().strip()
    )
    if check_capitol_address == "Capitol Address:":
        capitol_address_div = (
            CSS(".districtheadleft + div")
            .match(self.root)[0]
            .text_content()
            .strip()
            .splitlines()
        )
        p.capitol_office.address = "; ".join(
            ln.strip() for ln in capitol_address_div[:-1]
        )
        p.capitol_office.voice = capitol_address_div[-1].strip()

    return p
def process_page(self):
    member_code = self.data["MemberCode"]
    image = f"https://malegislature.gov/Legislators/Profile/170/{member_code}.jpg"
    chamber = "upper" if self.data["Branch"] == "Senate" else "lower"

    party = self.data["Party"]
    if party == "Unenrolled":
        party = "Independent"

    p = ScrapePerson(
        name=self.data["Name"],
        state="ma",
        party=party,
        district=self.data["District"],
        chamber=chamber,
        image=image,
        email=self.data["EmailAddress"],
    )

    room_num = self.data["RoomNumber"]
    if room_num:
        capitol_address = f"24 Beacon St., Room {room_num}; Boston, MA 02133"
        p.capitol_office.address = capitol_address

    # phone number and fax number (if it exists) are both from capitol office address
    phone = self.data["PhoneNumber"]
    numbers_only_phone_length = 10
    if phone:
        # there are 3 formats for phone numbers (some must be adjusted for
        # extensions):
        #   61772228007309       is 617 722 2800 x7309
        #   (617) 722-1660       is (617) 722-1660
        #   (617) 722-2800 x7306 is (617) 722-2800 x7306
        if (
            len(phone) > numbers_only_phone_length
            and " " not in phone
            and "x" not in phone
        ):
            phone = phone[:10] + " x" + phone[10:]
        p.capitol_office.voice = phone

    try:
        fax = self.data["FaxNumber"]
        if fax:
            p.capitol_office.fax = fax
    except SelectorError:
        pass

    if self.data["LeadershipPosition"]:
        p.extras["leadership position"] = self.data["LeadershipPosition"]
    p.extras["member code"] = member_code

    p.add_source(self.source.url)
    p.add_source(list_url())
    p.add_link(f"https://malegislature.gov/Legislators/Profile/{member_code}")
    return p
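# A standalone sketch of the digits-only extension normalization used above
# (hypothetical helper name; the scraper does this inline):
def split_extension(phone: str, base_len: int = 10) -> str:
    """Insert ' x' after the 10-digit base when a bare digit string carries an extension."""
    if len(phone) > base_len and " " not in phone and "x" not in phone:
        return phone[:base_len] + " x" + phone[base_len:]
    return phone

assert split_extension("61772228007309") == "6177222800 x7309"
assert split_extension("(617) 722-1660") == "(617) 722-1660"          # already formatted
assert split_extension("(617) 722-2800 x7306") == "(617) 722-2800 x7306"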
def process_item(self, item):
    title = CSS("td").match(item)[0].text_content().strip()
    if title == "Representative":
        chamber = "lower"
    elif title == "Senator":
        chamber = "upper"

    district = CSS("td").match(item)[2].text_content()

    party = CSS("td").match(item)[3].text_content()
    if party == "Democrat":
        party = "Democratic"

    p = ScrapePerson(
        name="",
        state="co",
        party=party,
        chamber=chamber,
        district=district,
    )

    p.capitol_office.voice = CSS("td").match(item)[4].text_content().strip()
    p.email = CSS("td").match(item)[5].text_content().strip()

    detail_link = CSS("td a").match_one(item).get("href")
    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")

    return LegDetail(p, source=detail_link)
def process_item(self, item): name = CSS("h3").match_one(item).text_content() district = CSS("p.list-district").match_one(item).text_content() district = re.search(r"District\s(\d+)", district).groups()[0] img = CSS("img").match_one(item).get("src") p = ScrapePerson( name=name, state="in", chamber=self.chamber, district=district, party=self.party, image=img, ) if len(CSS("p").match(item)) > 2: title = CSS("p").match(item)[0].text_content() p.extras["title"] = title detail_link = CSS("a").match_one(item).get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") return RedSenDetail(p, source=detail_link)
def process_item(self, item):
    name_party = CSS("span").match(item)[0].text_content().strip().split(" - ")
    name = name_party[0].strip()

    party = name_party[1].strip()
    if party == "(D)":
        party = "Democratic"
    elif party == "(R)":
        party = "Republican"
    elif party == "(DTS)":
        party = "Independent"

    district = CSS("span").match(item)[1].text_content().strip()
    district = re.search(r"District:\s(.+)", district).groups()[0].strip()

    p = ScrapePerson(
        name=name,
        state="nm",
        chamber=self.chamber,
        district=district,
        party=party,
    )

    detail_link = CSS("a").match_one(item).get("href")
    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")

    img = CSS("img").match_one(item).get("src")
    p.image = img

    return LegDetail(p, source=detail_link)
def process_item(self, row):
    if not row["First Name"]:
        return
    name = "{} {}".format(row["First Name"], row["Last Name"])
    party = PARTIES[row["Party"]]

    leg = ScrapePerson(
        name=name,
        district=row["District"].lstrip("0"),
        party=party,
        state="mn",
        chamber="upper",
        image=self.extra_info[name]["image"],
    )

    if "url" in self.extra_info[name]:
        leg.add_link(self.extra_info[name]["url"])
    if "office_phone" in self.extra_info[name]:
        leg.capitol_office.voice = self.extra_info[name]["office_phone"]
    if "email" in self.extra_info[name]:
        leg.email = self.extra_info[name]["email"]

    row["Zipcode"] = row["Zipcode"].strip()
    # only treat Address2 as part of a capitol address when it matches a
    # known capitol building (any() is required; a bare generator expression
    # is always truthy)
    if any(
        a in row["Address2"]
        for a in ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]
    ):
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        if "Rm. Number" in row:
            address = "{0} {1}".format(row["Rm. Number"], address)
        leg.capitol_office.address = address

    leg.add_source(self.source.url)
    leg.add_source(SEN_HTML_URL)
    return leg
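# Why the guard above needs any(): a bare generator expression is always
# truthy, no matter what it would yield. A minimal illustration, not scraper
# code:
gen = (x > 10 for x in [1, 2, 3])
assert bool(gen) is True                        # truthy even though no element is
assert any(x > 10 for x in [1, 2, 3]) is False  # any() actually evaluates it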
def process_item(self, item): name = CSS("a.membername").match_one(item).text_content() name = re.search(r"(Senator|Representative)\s(.+)", name).groups()[1] party = CSS("a.membername").match_one(item).tail.strip() if party == "(D)": party = "Democratic" elif party == "(R)": party = "Republican" district = CSS("div.district a").match_one(item).text_content().strip() district = re.search(r"District\s(.+)", district).groups()[0] p = ScrapePerson( name=name, state="sc", chamber=self.chamber, district=district, party=party, ) detail_link = CSS("div.district a").match_one(item).get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") img = CSS("img").match_one(item).get("src") p.image = img return LegDetail(p, source=URL(detail_link, timeout=20))
def process_item(self, item): name = CSS("div a").match(item)[1].text_content() district = ( CSS("div .esg-content.eg-senators-grid-element-1") .match_one(item) .text_content() .split("|")[1] .strip() .lower() ) district = re.search(r"district\s(\d+)", district).groups()[0] img = CSS("div img").match_one(item).get("data-lazysrc") p = ScrapePerson( name=name, state="in", chamber=self.chamber, district=district, party=self.party, image=img, ) city = ( CSS("div .esg-content.eg-senators-grid-element-27") .match_one(item) .text_content() ) p.extras["city"] = city detail_link = CSS("div a").match(item)[1].get("href") p.add_link(detail_link, note="homepage") p.add_source(self.source.url) p.add_source(detail_link) return BlueSenDetail(p, source=detail_link)
def process_item(self, item): name = CSS("header").match_one(item).text_content() district = CSS("div.district").match_one(item).text_content() district = re.search(r"House\sDistrict\s(\d+)", district).groups()[0] img = CSS("img").match_one(item).get("src") p = ScrapePerson( name=name, state="in", chamber=self.chamber, district=district, party=self.party, image=img, ) p.extras["city"] = CSS("div.city").match_one(item).text_content() detail_link = item.get("href") p.add_link(detail_link, note="homepage") detail_link_full = detail_link + "/full" p.add_source(detail_link_full) p.add_source(self.source.url) return BlueRepDetail(p, source=detail_link_full)
def process_item(self, item):
    chamber_id = item["district"]["chamberType"]

    p = ScrapePerson(
        state="ga",
        chamber=self.chamber_types[chamber_id],
        district=str(item["district"]["number"]),
        name=item["fullName"],
        family_name=item["name"]["familyName"],
        given_name=item["name"]["first"],
        suffix=item["name"]["suffix"] or "",
        party=self.party_ids[item["party"]],
    )

    # district address
    da = item["districtAddress"]
    if da["email"]:
        p.email = da["email"]
    if da["phone"]:
        p.district_office.voice = da["phone"]
    if da["fax"]:
        p.district_office.fax = da["fax"]
    if da["address1"]:
        p.district_office.address = da["address1"]
        if da["address2"]:
            p.district_office.address += "; " + da["address2"]
        p.district_office.address += "; {city}, {state} {zip}".format(**da)
        p.district_office.address = p.district_office.address.strip()

    # photos
    if not item["photos"]:
        pass
    elif len(item["photos"]) == 1:
        # strip off ?size=mpSm for full size
        p.image = item["photos"][0]["url"].split("?")[0]
    else:
        raise Exception("unknown photos configuration: " + str(item["photos"]))

    # extras
    p.extras["residence"] = item["residence"]
    p.extras["city"] = item["city"].strip()
    p.extras["georgia_id"] = item["id"]

    url = (
        f"https://www.legis.ga.gov/members/{self.chamber_names[chamber_id]}/"
        f"{item['id']}?session={item['sessionId']}"
    )
    p.add_source(url, note="Initial list page (requires authorization token)")

    source = URL(
        f"https://www.legis.ga.gov/api/members/detail/{item['id']}"
        f"?session=1029&chamber={chamber_id}",
        headers={"Authorization": get_token()},
    )
    return LegDetail(p, source=source)
def process_item(self, item): name_dirty = CSS("h4 span").match_one(item).text_content().strip() if re.search(r"Vacant", name_dirty): self.skip() name_dirty = name_dirty.split(", ") last_name = name_dirty[0] first_name = name_dirty[1] name = first_name + " " + last_name district = CSS("i.fa.fa-map").match_one( item).getnext().text_content().strip() party = CSS("i.fa.fa-users").match_one( item).getnext().text_content().strip() if party == "Democrat": party = "Democratic" email = CSS("a").match(item)[2].text_content().strip() img = CSS("img").match_one(item).get("src") p = ScrapePerson( name=name, state="la", party=party, district=district, chamber=self.chamber, email=email, image=img, ) detail_link = CSS("a").match(item)[1].get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") return LegislatorDetail(p, source=detail_link)
def process_page(self):
    for item in self.data["Data"]:
        name = item["PersonFullName"]
        party_code = item["PartyCode"]
        party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        party = party_dict[party_code]
        district = item["DistrictNumber"]

        p = ScrapePerson(
            name=name,
            state="de",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        p.add_source(self.source.url)
        detail_link = URL(
            f"https://legis.delaware.gov/LegislatorDetail?personId={item['PersonId']}"
        )
        p.add_source(detail_link.url)
        p.add_link(detail_link.url, note="homepage")

        yield LegDetail(p, source=detail_link.url)
def process_item(self, item):
    member, party, district, contact_link, phone, office = item.getchildren()

    name = member.text_content()
    district = district.text_content()

    # skip vacant districts
    if "Interim District" in name:
        self.skip()

    # each of these <td>s has a single link
    leg_url = CSS("a").match_one(member).get("href")
    contact_url = CSS("a").match_one(contact_link).get("href")
    # construct this URL based on observation elsewhere on senate.michigan.gov
    image_url = (
        f"https://senate.michigan.gov/_images/{district}{ord_suffix(district)}.jpg"
    )

    p = ScrapePerson(
        **split_name(name),
        state="mi",
        chamber="upper",
        district=district,
        party=self.PARTY_MAP[party.text],
        image=image_url,
    )
    p.capitol_office.voice = str(phone.text_content())
    p.capitol_office.address = str(office.text_content())
    p.add_source(self.source.url)
    p.add_link(leg_url)
    p.add_link(contact_url, note="Contact")
    return p
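# The image URL above appends an English ordinal suffix to the district number
# ("1st", "22nd", ...). The scraper imports its own ord_suffix; a minimal
# sketch of such a helper for reference:
def _ord_suffix_sketch(n) -> str:
    n = int(n)
    if 10 <= n % 100 <= 13:  # 11th, 12th, 13th are irregular
        return "th"
    return {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")

assert [f"{d}{_ord_suffix_sketch(d)}" for d in (1, 2, 3, 11, 22)] == [
    "1st", "2nd", "3rd", "11th", "22nd"
]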
def process_item(self, item): name_dirty = CSS("a").match_one(item).text_content().strip().split( ", ") name = name_dirty[1] + " " + name_dirty[0] district = CSS("br").match(item)[-1].tail.strip() district = re.search(r"District\s(.+)", district).groups()[0] party = CSS("b").match_one(item).tail.strip() if party == "(D)": party = "Democratic" elif party == "(R)": party = "Republican" elif party == "(I)": party = "Independent" p = ScrapePerson( name=name, state="pa", chamber=self.chamber, district=district, party=party, ) detail_link = CSS("a").match_one(item).get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") return LegDetail(p, source=URL(detail_link, timeout=10))
def process_item(self, item):
    # skip header rows
    if (
        len(CSS("td").match(item)) == 1
        or CSS("td").match(item)[0].get("class") == "header"
    ):
        self.skip()

    first_link = CSS("td a").match(item)[0]
    name = first_link.text_content()
    detail_link = first_link.get("href")

    district = CSS("td").match(item)[3].text_content()
    party_letter = CSS("td").match(item)[4].text_content()
    party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
    party = party_dict[party_letter]

    p = ScrapePerson(
        name=name,
        state="il",
        party=party,
        chamber=self.chamber,
        district=district,
    )

    p.add_source(self.source.url)
    p.add_source(detail_link)
    p.add_link(detail_link, note="homepage")

    return LegDetail(p, source=detail_link)
def process_item(self, item): name = CSS("h3").match_one(item).text_content() if name == " - Vacant Seat": self.skip() party = CSS("small").match_one(item).text_content() if party == "Democrat": party = "Democratic" district = CSS("p").match(item)[0].text_content() district = ( re.search(r"District:\r\n(.+)", district).groups()[0].strip().lstrip("0") ) p = ScrapePerson( name=name, state="ky", party=party, chamber=self.chamber, district=district, ) detail_link = item.get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") return LegDetail(p, source=detail_link)
def process_page(self):
    p = ScrapePerson(
        state="fl",
        chamber="lower",
        name=fix_name(self.input.name),
        party=str(self.input.party),
        district=str(self.input.district),
        image=self.input.image,
    )

    for otype in ("district", "capitol"):
        odoc = self.root.xpath(f"//h3[@id='{otype}-office']/following-sibling::ul")
        if odoc:
            odoc = odoc[0]
        else:
            continue
        spans = odoc.xpath(".//span")

        office = p.capitol_office if otype == "capitol" else p.district_office
        office.address = "; ".join(
            line.strip()
            for line in spans[0].text_content().strip().splitlines()
            if line.strip()
        )
        office.voice = spans[1].text_content().strip()

    return p
def process_item(self, item): name = CSS("a").match(item)[2].text_content() name = re.sub(r"Contact Assembly Member", "", name).strip() party = CSS("td").match(item)[2].text_content().strip() if party == "Democrat": party = "Democratic" district = CSS("td").match(item)[1].text_content().strip().lstrip("0") # District 18 has a vacant spot if name == "edit": self.skip("skipping Vacant seat in District {}".format(district)) photo_url = CSS("img").match(item, min_items=0) if photo_url: photo_url = photo_url[0].get("src") p = ScrapePerson( name=name, state="ca", chamber="lower", district=district, party=party, image=photo_url, ) capitol_office_header = CSS("h3").match(item)[0].text_content() capitol_office_text = ( XPath( "//*[@id='block-views-view-members-block-1']/div/div/div/table/tbody/tr[1]/td[4]/text()" ) .match(item)[1] .strip() ) capitol_office_text, capitol_office_phone = capitol_office_text.split("; ") capitol_office_address = capitol_office_header + capitol_office_text p.capitol_office.address = capitol_office_address p.capitol_office.voice = capitol_office_phone district_offices = XPath(".//td/p[1]/text()").match(item) for office in district_offices: district_address, district_phone = office.split("; ") p.add_office( classification="district", address=district_address.strip(), voice=district_phone.strip(), ) url = CSS("a").match(item)[0].get("href") p.add_link(url) p.add_source(self.source.url) return p
def process_item(self, item):
    tds = item.getchildren()
    email, name, party, seat, phone = tds

    chamber, district = seat.text_content().strip().split()
    url = str(name.xpath("a/@href")[0])

    person = ScrapePerson(
        name=clean_name(name.text_content()),
        state="mt",
        party=party.text_content().strip(),
        chamber=("upper" if chamber == "SD" else "lower"),
        district=district,
    )
    person.add_link(url)
    person.add_source(url)

    phone = phone.text_content().strip()
    if len(phone) == 14:
        person.capitol_office.voice = phone
    elif len(phone) > 30:
        person.capitol_office.voice = phone.split(" ")[0]

    email = email.xpath("./a/@href")
    if email:
        person.email = email[0].split(":", 1)[1]

    return person
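# The length checks above distinguish a single formatted number from a cell
# holding several. "(406) 444-4800" is exactly 14 characters; a much longer
# cell presumably packs multiple space-separated numbers, of which the first
# is kept. A fabricated sample under that assumption:
assert len("(406) 444-4800") == 14
packed = "406-444-4800 406-444-4801 406-444-4802"
assert len(packed) > 30 and packed.split(" ")[0] == "406-444-4800"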
def process_item(self, item): if CSS("td").match(item)[1].text_content().strip() == "Vacant": return elif CSS("td").match(item)[1].text_content().strip() == "Martin, Greg": return else: name_dirty = CSS("td").match(item)[1].text_content().strip().split( ", ") name = name_dirty[1] + " " + name_dirty[0] if "Speaker" in name: name = re.sub(r"Speaker ", "", name) party = CSS("td").match(item)[2].text_content().strip() if party == "D": party = "Democratic" elif party == "R": party = "Republican" district = CSS("td").match(item)[4].text_content().strip() district = re.search(r"District\s(.+)", district).groups()[0] p = ScrapePerson( name=name, state="tn", chamber=self.chamber, district=district, party=party, ) detail_link = CSS("td a").match(item)[1].get("href") p.add_source(self.source.url) p.add_source(detail_link) p.add_link(detail_link, note="homepage") email = CSS("td a").match(item)[0].get("href") email = re.search(r"mailto:(.+)", email).groups()[0] p.email = email # this is also being grabbed above in capitol_office.address office_room = CSS("td").match(item)[5].text_content().strip() p.extras["office"] = office_room return LegDetail(p, source=detail_link)
def process_page(self): for bio in CSS(".bSenBio__infoIt").match(self.root): if "Party:" in bio.text_content(): party = bio.text_content().split(":")[1].strip() p = ScrapePerson( name=self.name_css.match_one(self.root).text, state="ok", chamber="upper", party=party, image=self.image_css.match_one(self.root).get("href"), district=self.district_css.match_one(self.root).text.strip().split()[1], ) p.capitol_office.address = self.address_css.match_one(self.root).text p.capitol_office.voice = self.phone_css.match_one(self.root).text p.add_link( self.contact_link_sel.match_one(self.root).get("href"), "Contact Form" ) return p
def process_item(self, item):
    try:
        name = name_title = CSS("a").match(item)[0].text_content()
    except SelectorError:
        self.skip("header row")

    if "--" in name_title:
        name, title = [word.strip() for word in name.split("--")]

    _, district, party, email, room, capitol_phone = item.getchildren()

    district = district.text_content()

    party = party.text_content()
    if party == "R":
        party = "Republican"
    elif party == "D":
        party = "Democratic"

    email = email.text_content()
    if email.startswith("Email: "):
        email = email.replace("Email: ", "").lower() + "@azleg.gov"
    else:
        email = ""

    room = room.text_content()
    if self.chamber == "lower":
        address = "House of Representatives\n "
    elif self.chamber == "upper":
        address = "Senate\n "
    address = address + "1700 West Washington\n " + room + "\nPhoenix, AZ 85007"

    capitol_phone = capitol_phone.text_content()

    # min_items=0 keeps match() from raising when a row has no photo,
    # so the truthiness check below is meaningful
    image = CSS("td a img").match(item, min_items=0)
    if image:
        image = image[0].get("src")

    p = ScrapePerson(
        name=name,
        state="az",
        chamber=self.chamber,
        district=district,
        party=party,
        email=email,
        image=image,
    )
    p.capitol_office.address = address
    p.capitol_office.voice = capitol_phone
    p.add_source(self.source.url)
    p.add_link(CSS("a").match(item)[0].get("href"))

    if "--" in name_title:
        p.extras["title"] = title

    return p
def process_item(self, item): if "Vacant" in item.text_content(): self.skip("vacant") link = item.xpath(".//a")[0] url = link.get("href") ( name, party, district, ) = re.match(r"\s+([^\(]+)\((\w+)\)\s+District-(\d+)", link.text).groups() contact = item.getchildren()[1].getchildren()[0:3] office = contact[0].text_content().strip() phone = contact[1].text_content().strip() email = contact[2].text_content().strip() p = ScrapePerson( **split_name(name), state="mi", chamber="lower", district=district, party=party, email=email, ) if url.startswith("http:/r"): url = url.replace("http:/", "http://") p.add_link(url) p.add_source(self.source.url) p.capitol_office.voice = phone p.capitol_office.address = office return p
def process_item(self, item):
    website, district, name, party, office, phone, email = item.getchildren()

    # skip header row
    if website.tag == "th":
        self.skip()

    office = office.text_content()
    for abbr, full in self.office_names.items():
        office = office.replace(abbr, full)

    p = ScrapePerson(
        name=name.text_content(),
        state="mi",
        chamber="lower",
        district=district.text_content().lstrip("0"),
        party=party.text_content(),
        email=email.text_content(),
    )

    link = CSS("a").match_one(website).get("href")
    if link.startswith("http:/r"):
        link = link.replace(":/", "://")
    p.add_link(link)
    p.add_source(self.source.url)
    p.capitol_office.voice = phone.text_content()
    p.capitol_office.address = office
    return p
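# Both Michigan list pages above repair hrefs that arrive with a single slash
# after the scheme. A minimal illustration of the guard (fabricated href):
link = "http:/repjanedoe.example.com"
if link.startswith("http:/r"):
    link = link.replace(":/", "://")
assert link == "http://repjanedoe.example.com"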
def process_page(self):
    # construct person from the details from above
    p = ScrapePerson(
        state="oh",
        chamber="lower",
        district=self.input.district,
        name=self.input.name,
        party=self.input.party,
        image=self.input.image,
    )
    p.add_source(self.input.url)
    p.add_link(self.input.url)

    divs = CSS(".member-info-bar-module").match(self.root)
    # last div is contact details
    contact_details = CSS(".member-info-bar-value").match(divs[-1])
    for div in contact_details:
        dtc = div.text_content()
        if ", OH" in dtc:
            # join parts of the div together to make whole address
            children = div.getchildren()
            p.capitol_office.address = "; ".join(
                [
                    children[0].text.strip(),
                    children[0].tail.strip(),
                    children[1].tail.strip(),
                ]
            )
        elif "Phone:" in dtc:
            p.capitol_office.voice = dtc.split(": ")[1]
        elif "Fax:" in dtc:
            p.capitol_office.fax = dtc.split(": ")[1]

    return p
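# The address join above relies on lxml's text/tail model: text after a child
# element belongs to that child's .tail, not to the parent. A self-contained
# illustration with fabricated markup shaped like the member info bar:
import lxml.html

div = lxml.html.fromstring(
    "<div><span>77 S. High St</span>Floor N<span></span>Columbus, OH 43215</div>"
)
children = div.getchildren()
assert children[0].text == "77 S. High St"
assert children[0].tail == "Floor N"
assert children[1].tail == "Columbus, OH 43215"
# "; ".join of those three pieces yields "77 S. High St; Floor N; Columbus, OH 43215"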
def process_item(self, item): try: link = CSS("a").match(item)[1] except SelectorError: self.skip() data = { "last_name": link.text_content(), "url": link.get("href"), } for key, label in self.LABELS.items(): data[key] = CSS(f"[id$={label}]").match_one( item).text_content().strip() party = {"(D)": "Democratic", "(R)": "Republican"}[data["party"]] address = "Hawaii State Capitol, Room " + data["room"] chamber = "upper" if data["chamber"] == "S" else "lower" p = ScrapePerson( name=data["first_name"] + " " + data["last_name"], state="hi", chamber=chamber, district=data["district"], given_name=data["first_name"], family_name=data["last_name"], party=party, email=data["email"], ) p.capitol_office.address = address p.capitol_office.voice = data["voice"] p.capitol_office.fax = data["fax"] p.add_source(data["url"]) p.add_link(data["url"]) return p
def process_page(self):
    # annapolis_info = (
    #     XPath("//dt[text()='Annapolis Info']/following-sibling::dd[1]")
    #     .match_one(self.root)
    #     .text_content()
    # )
    # interim_info = (
    #     XPath("//dt[text()='Interim Info']/following-sibling::dd[1]")
    #     .match_one(self.root)
    #     .text_content()
    # )

    # email is formatted mailto:<addr>?body...
    email = SimilarLink("mailto:").match_one(self.root).get("href")
    email = email.split(":", 1)[1].split("?")[0]

    p = ScrapePerson(
        name=CSS("h2").match_one(self.root).text.split(" ", 1)[1],
        state="md",
        image=self.image_sel.match_one(self.root).get("src"),
        party=self.extract_dd("Party"),
        district=self.extract_dd("District"),
        chamber=None,
        email=email,
    )
    p.add_link(self.source.url)
    p.add_source(self.source.url)
    return p
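# mailto links on these profile pages carry a ?body=... query string; the
# two-step split above keeps only the address. Fabricated example:
href = "mailto:first.last@house.state.md.us?body=Dear%20Delegate"
assert href.split(":", 1)[1].split("?")[0] == "first.last@house.state.md.us"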
def process_page(self): party = {"D": "Democratic", "R": "Republican"}[self.input.party] photo = CSS("img#ContentPlaceHolder1_imgPhoto1").match_one(self.root).get("src") p = ScrapePerson( state="mo", party=party, image=photo, chamber="lower", district=self.input.district, name=f"{self.input.first_name} {self.input.last_name}", given_name=self.input.first_name, family_name=self.input.last_name, ) # TODO # p.extras["hometown"] = self.input.hometown p.capitol_office.voice = self.input.voice p.capitol_office.address = ( "MO House of Representatives; 201 West Capitol Avenue; " f"Room {self.input.room}; Jefferson City MO 65101 " ) p.add_link(self.input.url) p.add_source(self.input.url) return p
def process_page(self):
    p = ScrapePerson(
        name=self.input.name,
        state="tx",
        party=self.input.party,
        district=self.input.district,
        chamber="lower",
        image=self.input.image,
    )

    def office_name(element):
        """Returns the office address type."""
        return element.xpath("preceding-sibling::h4[1]/text()")[0].rstrip(":")

    offices_text = [
        {
            "label": office_name(p_tag),
            "type": office_name(p_tag),
            "details": p_tag.text_content(),
        }
        for p_tag in self.root.xpath(
            '//h4/following-sibling::p[@class="double-space"]'
        )
    ]

    for office_text in offices_text:
        details = office_text["details"].strip()

        # A few member pages have blank office listings:
        if details == "":
            continue

        # At the time of writing, this case of multiple district
        # offices occurs exactly once, for the representative at
        # District 4:
        if details.count("Office") > 1:
            district_offices = [
                district_office.strip()
                for district_office in re.findall(
                    r"(\w+ Office.+?(?=\w+ Office|$))", details, flags=re.DOTALL
                )
            ]
            # the split offices are appended to the list being iterated, so
            # they are processed on later passes; skip the combined blob
            offices_text += [
                {
                    "label": re.match(r"\w+ Office", office).group(),
                    "type": "District Address",
                    "details": re.search(
                        r"(?<=Office).+(?=\w+ Office|$)?", office, re.DOTALL
                    ).group(),
                }
                for office in district_offices
            ]
            continue

        process_address(details, p, office_text)

    return p
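# A small illustration of the office-splitting regex above on a fabricated
# two-office blob (real member pages are messier):
import re

sample = "Capitol Office Room E2.408 Austin TX District Office 100 Main St Waco TX"
chunks = [
    c.strip()
    for c in re.findall(r"(\w+ Office.+?(?=\w+ Office|$))", sample, flags=re.DOTALL)
]
assert chunks == [
    "Capitol Office Room E2.408 Austin TX",
    "District Office 100 Main St Waco TX",
]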