class HouseDetail(HtmlPage):
    """Scrape an Oklahoma House member's detail page into a Person."""

    image_selector = SimilarLink("https://www.okhouse.gov/Members/Pictures/HiRes/")
    prefix = "#ctl00_ContentPlaceHolder1_lbl"
    name_css = CSS(prefix + "Name")
    district_css = CSS(prefix + "District")
    party_css = CSS(prefix + "Party")

    def process_page(self):
        # drop the leading title word (e.g. "Representative") from the name
        name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
        p = Person(
            name=name,
            state="ok",
            # BUG FIX: this scrapes okhouse.gov (the House), so chamber is
            # "lower" — it was previously hard-coded to "upper".
            chamber="lower",
            party=self.party_css.match_one(self.root).text,
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        p.image = self.image_selector.match_one(self.root).get("href")

        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District="
        )
        p.add_link(contact_url, note="Contact Form")

        # capitol address: only present when the page's first .districtheadleft
        # block is labeled "Capitol Address:"
        check_capitol_address = (
            CSS(".districtheadleft").match(self.root)[0].text_content().strip()
        )
        if check_capitol_address == "Capitol Address:":
            capitol_address_div = (
                CSS(".districtheadleft + div")
                .match(self.root)[0]
                .text_content()
                .strip()
                .splitlines()
            )
            # the last line is the phone number; the rest form the address
            p.capitol_office.address = "; ".join(
                ln.strip() for ln in capitol_address_div[:-1]
            )
            p.capitol_office.phone = capitol_address_div[-1].strip()
        return p
def process_item(self, item):
    """Build a partial Indiana Senate record from one roster grid cell."""
    anchors = CSS("div a").match(item)
    name = anchors[1].text_content()

    district_text = (
        CSS("div .esg-content.eg-senators-grid-element-1")
        .match_one(item)
        .text_content()
        .split("|")[1]
        .strip()
        .lower()
    )
    district = re.search(r"district\s(\d+)", district_text).groups()[0]

    photo = CSS("div img").match_one(item).get("data-lazysrc")

    person = ScrapePerson(
        name=name,
        state="in",
        chamber=self.chamber,
        district=district,
        party=self.party,
        image=photo,
    )

    city = (
        CSS("div .esg-content.eg-senators-grid-element-27")
        .match_one(item)
        .text_content()
    )
    person.extras["city"] = city

    detail_link = anchors[1].get("href")
    person.add_link(detail_link, note="homepage")
    person.add_source(self.source.url)
    person.add_source(detail_link)

    return BlueSenDetail(person, source=detail_link)
def process_item(self, item):
    """Build a partial Indiana House record from one roster card."""
    full_name = CSS("header").match_one(item).text_content()

    district_text = CSS("div.district").match_one(item).text_content()
    district = re.search(r"House\sDistrict\s(\d+)", district_text).groups()[0]

    photo = CSS("img").match_one(item).get("src")

    person = ScrapePerson(
        name=full_name,
        state="in",
        chamber=self.chamber,
        district=district,
        party=self.party,
        image=photo,
    )
    person.extras["city"] = CSS("div.city").match_one(item).text_content()

    homepage = item.get("href")
    person.add_link(homepage, note="homepage")

    # the "/full" variant of the member page carries the detail data
    detail_source = homepage + "/full"
    person.add_source(detail_source)
    person.add_source(self.source.url)

    return BlueRepDetail(person, source=detail_source)
def process_item(self, item):
    """Build a New Mexico legislator from a roster row ("Name - (Party)")."""
    spans = CSS("span").match(item)

    name_party = spans[0].text_content().strip().split(" - ")
    name = name_party[0].strip()
    party_code = name_party[1].strip()
    # "(DTS)" (declined to state) maps to Independent;
    # unrecognized values pass through unchanged, as before
    party = {
        "(D)": "Democratic",
        "(R)": "Republican",
        "(DTS)": "Independent",
    }.get(party_code, party_code)

    district_text = spans[1].text_content().strip()
    district = re.search(r"District:\s(.+)", district_text).groups()[0].strip()

    person = ScrapePerson(
        name=name,
        state="nm",
        chamber=self.chamber,
        district=district,
        party=party,
    )

    detail_link = CSS("a").match_one(item).get("href")
    person.add_source(self.source.url)
    person.add_source(detail_link)
    person.add_link(detail_link, note="homepage")

    person.image = CSS("img").match_one(item).get("src")

    return LegDetail(person, source=detail_link)
def process_item(self, item):
    """Build a committee record with secretary contact info from a list row."""
    name = CSS("strong").match(item)[0].text_content()
    # the table's header row is labeled "Committees" — not a real committee
    if name == "Committees":
        self.skip()

    committee = ScrapeCommittee(
        name=name,
        chamber=self.chamber,
    )

    details = CSS("p").match(item)[0].text_content().strip()
    secretary, email, phone = re.search(
        r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", details
    ).groups()
    committee.extras["secretary"] = secretary.strip()
    committee.extras["email"] = email.strip()
    committee.extras["phone"] = phone.strip()

    detail_link = CSS("a").match(item)[0].get("href")
    committee.add_source(self.source.url)
    committee.add_source(detail_link)
    committee.add_link(detail_link, note="homepage")

    return DetailCommitteePage(committee, source=detail_link)
def process_page(self):
    """Populate committee membership, skipping public (non-legislator) members."""
    com = self.input

    # an empty Membership fieldset means the committee has no members at all
    membership_text = (
        CSS("div.Membership fieldset").match_one(self.root).text_content().strip()
    )
    if membership_text == "":
        raise SkipItem("empty committee")

    rows = CSS("fieldset div.area-holder ul.list li span.col01").match(self.root)

    num_members = 0
    for row in rows:
        role = row.getnext().text_content().strip()

        # Public Members are not legislators; leave them off the roster
        if role == "Public Member":
            continue
        if role == "Member":
            role = "member"
        # NOTE(review): counter assumed to cover every non-public member
        # (chairs included) — confirm against the original indentation
        num_members += 1

        full_name = CSS("span span").match_one(row).text_content().strip()
        # strip the "Representative"/"Senator" title prefix
        full_name = re.search(r"(Representative|Senator)\s(.+)", full_name).groups()[1]
        com.add_member(full_name, role)

    if not num_members:
        raise SkipItem("only public members")
    return com
def process_page(self):
    """Add chair and member rosters to a NY Senate committee."""
    com = self.input
    com.add_source(self.source.url)
    com.add_link(self.source.url, note="homepage")

    # a few committees don't have chair positions
    try:
        chair_role = (
            CSS(".c-chair-block--position")
            .match_one(self.root)
            .text_content()
            .lower()
        )
        chair_name = CSS(".c-chair--title").match_one(self.root).text_content()
        com.add_member(chair_name, chair_role)
    except SelectorError:
        pass

    # membership grid rows alternate odd/even CSS classes
    member_rows = (
        "//div[contains(@class, 'c-senators-container')]"
        "//div[@class='view-content']"
        "/div[contains(@class, 'odd') or contains(@class, 'even')]"
    )
    try:
        for row in XPath(member_rows).match(self.root):
            name = CSS(".nys-senator--name").match_one(row).text_content()
            role = CSS(".nys-senator--position").match_one(row).text_content().lower()
            if role == "":
                role = "member"
            com.add_member(name, role)
    except SelectorError:
        pass

    return com
def process_item(self, item):
    """Build an Arizona legislator from one roster-table row.

    Rows whose first cell has no link are header rows and are skipped
    (self.skip() is assumed to raise, halting processing of this row).
    """
    try:
        # some entries look like "Name -- Title"; keep the combined value too
        name = name_title = CSS("a").match(item)[0].text_content()
    except SelectorError:
        self.skip("header row")
    if "--" in name_title:
        name, title = [word.strip() for word in name.split("--")]
    # row cells are: (photo/link, district, party, email, room, phone)
    _, district, party, email, room, capitol_phone = item.getchildren()
    district = district.text_content()
    party = party.text_content()
    if party == "R":
        party = "Republican"
    elif party == "D":
        party = "Democratic"
    email = email.text_content()
    if email.startswith("Email: "):
        # the page shows only the local part; the domain is always azleg.gov
        email = email.replace("Email: ", "").lower() + "@azleg.gov"
    else:
        email = ""
    room = room.text_content()
    # NOTE(review): if self.chamber is neither "lower" nor "upper", `address`
    # is never bound and the next line raises NameError — confirm callers
    # only use those two values
    if self.chamber == "lower":
        address = "House of Representatives\n "
    elif self.chamber == "upper":
        address = "Senate\n "
    address = address + "1700 West Washington\n " + room + "\nPhoenix, AZ 85007"
    capitol_phone = capitol_phone.text_content()
    # NOTE(review): match() may raise rather than return [] when no photo
    # exists — confirm the selector's min_items default
    image = CSS("td a img").match(item)
    if image:
        image = image[0].get("src")
    p = ScrapePerson(
        name=name,
        state="az",
        chamber=self.chamber,
        district=district,
        party=party,
        email=email,
        image=image,
    )
    p.capitol_office.address = address
    p.capitol_office.voice = capitol_phone
    p.add_source(self.source.url)
    p.add_link(CSS("a").match(item)[0].get("href"))
    if "--" in name_title:
        p.extras["title"] = title
    return p
def process_item(self, item):
    """Create a committee stub from a list entry and queue its detail page."""
    anchor = CSS("a").match(item)[0]
    committee = ScrapeCommittee(
        name=anchor.text_content(),
        classification="committee",
        chamber=self.chamber,
    )
    detail_url = anchor.get("href")
    committee.add_source(detail_url)
    committee.add_link(detail_url, note="homepage")
    return CommitteeDetail(committee, source=detail_url)
def process_page(self):
    """Augment a North Dakota legislator with photo, addresses, phones, email."""
    p = self.input

    img = CSS("div.field-person-photo img").match_one(self.root).get("src")
    p.image = img

    bio_info = CSS("div.pane-content ul li").match(self.root)
    if len(bio_info) > 0:
        p.extras["bio info"] = []
        for info in bio_info:
            # BUG FIX: the old code did `+= info`, which extended the list
            # with the <li>'s raw lxml child elements; store its text instead.
            p.extras["bio info"].append(info.text_content())

    # district office address is optional
    try:
        street = CSS("div.street-address").match_one(self.root).text_content().strip()
        town = CSS("span.locality").match_one(self.root).text_content().strip()
        zip_code = CSS("span.postal-code").match_one(self.root).text_content().strip()
        p.district_office.address = street + ", " + town + ", ND " + zip_code
    except SelectorError:
        pass

    # phone labels precede the numbers; route each to the matching field
    try:
        phones = XPath(
            "//*[@id='block-system-main']//div[contains(text(), 'phone')]"
        ).match(self.root)
        for phone in phones:
            phone_type = phone.text_content().strip()
            phone_number = phone.getnext().text_content().strip()
            if phone_type == "Cellphone:":
                p.extras["Cell phone"] = phone_number
            elif phone_type == "Home Telephone:":
                p.extras["Home phone"] = phone_number
            elif phone_type == "Office Telephone:":
                p.district_office.voice = phone_number
    except SelectorError:
        pass

    email = (
        XPath("//*[@id='block-system-main']//div[contains(text(), 'Email')]")
        .match_one(self.root)
        .getnext()
        .text_content()
        .strip()
    )
    p.email = email

    # fax is optional
    try:
        fax = (
            XPath("//*[@id='block-system-main']//div[contains(text(), 'Fax')]")
            .match_one(self.root)
            .getnext()
            .text_content()
            .strip()
        )
        p.district_office.fax = fax
    except SelectorError:
        pass

    return p
def process_page(self):
    """Add image, extra address, capitol phone, and website to a Maine legislator.

    The page layout varies: most members use <strong> labels, some use <b>,
    and the District 22 page has an extra leading <strong> — hence the
    special-casing below.
    """
    p = self.input

    img = CSS("div#content p img").match_one(self.root).get("src")
    p.image = img

    # District 22's extra <strong> shifts which element's tail holds the address
    if self.source.url == "https://legislature.maine.gov/District-22":
        addr = CSS("div#content p strong").match(self.root)[2].tail.strip()
    else:
        addr = (
            CSS("div#content p strong")
            .match(self.root)[1]
            .tail.strip()
            .lstrip(":")
            .strip()
        )
    # record only if it differs from the district office address already set
    if addr != p.district_office.address:
        p.extras["Additional address"] = addr

    # the "State ..." phone label may be a <strong> or a <b>; try both
    try:
        state_phone = (
            XPath("//*[@id='content']/p/strong[contains(text(), 'State')]")
            .match_one(self.root)
            .tail.strip()
        )
        state_phone = state_phone.lstrip(":").strip()
        p.capitol_office.voice = state_phone
    except SelectorError:
        pass

    try:
        state_phone = (
            XPath("//*[@id='content']/p/b[contains(text(), 'State')]")
            .match_one(self.root)
            .tail.strip()
        )
        state_phone = state_phone.lstrip(":").strip()
        p.capitol_office.voice = state_phone
    except SelectorError:
        pass

    # the link sometimes sits one more sibling past the "Website" label
    website = (
        XPath("//*[@id='content']/p/strong[contains(text(), 'Website')]")
        .match_one(self.root)
        .getnext()
    )
    if website.get("href") is None:
        website = website.getnext().get("href")
    else:
        website = website.get("href")
    p.add_link(website, note="website")

    return p
def process_page(self):
    """Populate a committee with chairs, members, address, and press link."""
    com = self.input
    com.add_source(self.source.url)
    com.add_link(self.source.url, note="homepage")

    try:
        chairs = CSS(".chair-info").match(self.root)
    except SelectorError:
        raise SkipItem("skipping committee without full information")

    # in case there are co-chairs
    num_chairs = len(chairs)
    for chair in chairs:
        chair_name = CSS(".comm-chair-name").match_one(chair).text_content().strip()
        # the role lives in a preceding <header>; with co-chairs, each chair's
        # header sits num_chairs siblings back
        chair_role = (
            XPath(f"..//preceding-sibling::header[{num_chairs}]")
            .match_one(chair)
            .text_content()
            .strip()
            .lower()
        )
        com.add_member(chair_name, chair_role)

    # some committees only have chairs and no members list
    try:
        for p in CSS("#comm-membership ul li").match(self.root):
            name = p.text_content().strip()
            role = "member"
            com.add_member(name, role)
    except SelectorError:
        pass

    # some committees have temporary addresses, others have permanent ones:
    # the address block yields either three text nodes (temp label, room, zip)
    # or two (room, zip) — the unpack's ValueError distinguishes them
    try:
        temp, room, zip = XPath(
            "//section[@id='comm-addr']/div[@class='mod-inner']//text()"
        ).match(self.root)
        com.extras["address"] = f"{temp}: {room}; {zip}"
    except ValueError:
        room, zip = XPath(
            "//section[@id='comm-addr']/div[@class='mod-inner']//text()"
        ).match(self.root)
        com.extras["address"] = f"{room}; {zip}"

    # some committees have press releases
    try:
        news_link = CSS("#page-content .read-more").match(self.root)[0].get("href")
        com.add_link(news_link)
    except SelectorError:
        pass

    return com
def process_page(self):
    """Build a Bill from the overview page and yield its bill-text tab.

    Nevada marks carryover bills with trailing '*'s in the identifier;
    those are rewritten to a suffixed form via the CARRYOVERS session map
    before the Bill is created.
    """
    # identifiers starting with "S" (SB, SJR, ...) are Senate bills
    chamber = "upper" if self.input.identifier.startswith("S") else "lower"
    short_title = self.get_column_div("Summary").text
    long_title = CSS("#title").match_one(self.root).text

    if "*" in self.input.identifier:
        stars = re.search(r"\*+", self.input.identifier).group()
        if (
            self.input.session in CARRYOVERS
            and stars in CARRYOVERS[self.input.session]
        ):
            # replace the stars with "-<mapped suffix>" from CARRYOVERS
            self.input.identifier = re.sub(
                r"\*+",
                "-" + CARRYOVERS[self.input.session][stars],
                self.input.identifier,
            )
        else:
            # unknown carryover marker: log and bail out of this bill
            self.logger.error(
                f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
            )
            return

    bill = Bill(
        identifier=self.input.identifier,
        legislative_session=self.input.session,
        title=short_title,
        chamber=chamber,
    )
    bill.subject = self.input.subjects
    # use the pretty source URL
    bill.add_source(self.input.source_url)
    bill.add_title(long_title)

    # sponsor columns are optional on some bills
    try:
        sponsors = self.get_column_div("Primary Sponsor")
        self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
    except SelectorError:
        pass
    try:
        cosponsors = self.get_column_div("Co-Sponsor")
        self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
    except SelectorError:
        pass
    # TODO: figure out cosponsor div name, can't find any as of Feb 2021
    self.add_actions(bill, chamber)

    # BDR (bill draft request) number, when present in the summary text
    bdr = extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr

    text_url = self.source.url.replace("Overview", "Text")
    yield BillTabText(bill, source=text_url)
def process_item(self, item):
    """Create a partial senator record from a roster entry.

    Names arrive in ALL-CAPS with an honorific prefix, so they are cleaned
    and converted to title case.
    """
    raw_name = CSS("span.name").match_one(item).text_content().strip()
    name = re.sub(r"^Hon\.", "", raw_name, flags=re.IGNORECASE).strip().title()

    party = CSS("span.partido").match_one(item).text_content().strip()
    # Translate to English since being an Independent is a universal construct
    if party == "Independiente":
        party = "Independent"

    detail_link = CSS("a").match_one(item).get("href")
    partial = PartialSen(name=name, party=party, source=self.source.url)
    return SenDetail(partial, source=detail_link)
def process_page(self):
    """Populate a Pennsylvania committee with its chairs and members.

    The page lists up to two chairs (the majority chair first, then the
    minority/Democratic chair), followed by separate majority and minority
    member lists.
    """
    com = self.input

    chair_name_sel = (
        "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText a"
    )
    chair_role_sel = (
        "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText div"
    )

    # main chair — some committees have none
    try:
        chair_member = CSS(chair_name_sel).match(self.root)[0].text.strip()
        chair_member_role = CSS(chair_role_sel).match(self.root)[0].text.strip()
        # BUG FIX: this add_member used to sit in the *next* try block, so a
        # missing main chair raised an uncaught NameError instead of being
        # skipped.
        com.add_member(fix_name(chair_member), chair_member_role)
    except IndexError:
        pass

    # Democratic / minority chair — also optional
    try:
        demo_chair_member = CSS(chair_name_sel).match(self.root)[1].text.strip()
        demo_chair_member_role = CSS(chair_role_sel).match(self.root)[1].text.strip()
        com.add_member(fix_name(demo_chair_member), demo_chair_member_role)
    except IndexError:
        pass

    # majority and minority rosters share the same row structure
    for list_selector in (
        ".Widget.CteeInfo-MajorityList .MemberInfoList-MemberWrapper.Member",
        ".Widget.CteeInfo-MinorityList .MemberInfoList-MemberWrapper.Member",
    ):
        for mem in CSS(list_selector).match(self.root):
            # BUG FIX: the name lookup used to sit inside the same try as the
            # position lookup, so a row without a name raised NameError in the
            # except branch; only the position is optional.
            member_name = CSS("div a").match_one(mem).text.strip()
            try:
                member_position = CSS(".position").match_one(mem).text.strip()
            except SelectorError:
                member_position = "member"
            com.add_member(fix_name(member_name), member_position)

    return com
def process_item(self, item):
    """Build an Illinois legislator from a roster-table row."""
    cells = CSS("td").match(item)
    # header rows have a single cell or a "header"-classed first cell
    if len(cells) == 1 or cells[0].get("class") == "header":
        self.skip()

    first_link = CSS("td a").match(item)[0]
    name = first_link.text_content()
    detail_link = first_link.get("href")

    district = cells[3].text_content()
    party = {"D": "Democratic", "R": "Republican", "I": "Independent"}[
        cells[4].text_content()
    ]

    person = ScrapePerson(
        name=name,
        state="il",
        party=party,
        chamber=self.chamber,
        district=district,
    )
    person.add_source(self.source.url)
    person.add_source(detail_link)
    person.add_link(detail_link, note="homepage")
    return LegDetail(person, source=detail_link)
def process_item(self, item):
    """Build a Louisiana legislator from one member card."""
    raw_name = CSS("h4 span").match_one(item).text_content().strip()
    if re.search(r"Vacant", raw_name):
        self.skip()

    # names are displayed "Last, First"
    parts = raw_name.split(", ")
    name = parts[1] + " " + parts[0]

    district = CSS("i.fa.fa-map").match_one(item).getnext().text_content().strip()
    party = CSS("i.fa.fa-users").match_one(item).getnext().text_content().strip()
    if party == "Democrat":
        party = "Democratic"

    email = CSS("a").match(item)[2].text_content().strip()
    photo = CSS("img").match_one(item).get("src")

    person = ScrapePerson(
        name=name,
        state="la",
        party=party,
        district=district,
        chamber=self.chamber,
        email=email,
        image=photo,
    )

    detail_link = CSS("a").match(item)[1].get("href")
    person.add_source(self.source.url)
    person.add_source(detail_link)
    person.add_link(detail_link, note="homepage")
    return LegislatorDetail(person, source=detail_link)
class LegPage(HtmlPage):
    """Parse a Nebraska senator's page into a Person record."""

    name_css = CSS("h1.mt-0")
    district_css = CSS(".col-9 h2")
    image_css = CSS("img#sen-image")
    address_css = CSS("address")

    def process_page(self):
        name = self.name_css.match_one(self.root).text.replace("Sen. ", "").strip()
        district = self.district_css.match_one(self.root).text.split()[1]
        image = self.image_css.match_one(self.root).get("src")
        addrlines = self.address_css.match_one(self.root).text_content()

        # The <address> block looks like:
        #   Room 11th Floor
        #   P.O. Box 94604
        #   Lincoln, NE 68509
        #   (402) 471-2733
        #   Email: [email protected]
        # Every line before the phone number belongs to the mailing address.
        address = []
        phone = None
        email = None
        collecting_address = True
        for raw_line in addrlines.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith("(402)"):
                phone = line
                collecting_address = False
            if line.startswith("Email:"):
                email = line.replace("Email: ", "")
            if collecting_address:
                address.append(line)

        p = Person(
            chamber="legislature",
            party="Nonpartisan",
            state="ne",
            district=district,
            image=image,
            name=name,
            email=email,
        )
        p.capitol_office.address = "; ".join(address)
        p.capitol_office.voice = phone
        p.add_source(self.source.url)
        p.add_link(self.source.url)
        return p
def process_item(self, item):
    """Create a partial House member from a media card and queue its detail page."""
    name = CSS(".mediaCaptionTitle").match_one(item).text
    subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text
    style_attr = CSS(".photo").match_one(item).get("style")
    image = background_image_re.findall(style_attr)[0]

    # subtitle looks like "District 25 | D"
    district_part, party_code = subtitle.split(" | ")
    district = district_part.split()[1]
    party = {"D": "Democratic", "R": "Republican"}[party_code]

    return HouseDetail(
        HousePartial(
            name=name,
            district=district,
            party=party,
            url=item.get("href"),
            image=image,
        )
    )
class SenateCommitteeList(HtmlListPage):
    """List MI Senate standing committees and Appropriations subcommittees."""

    source = "https://committees.senate.michigan.gov/"
    selector = CSS("form .col-md-6 ul li")
    chamber = "upper"

    def process_item(self, item):
        # the section heading (<h3>) says which group this item belongs to;
        # its position relative to the item varies with markup depth, so try
        # the shallow lookup first, then the deeper one
        try:
            title = XPath("..//preceding-sibling::h3/text()").match(item)
        except SelectorError:
            title = XPath("../../..//preceding-sibling::h3/text()").match(item)
        for comm_name in title:
            if (
                comm_name == "Standing Committees"
                or comm_name == "Appropriations Subcommittees"
            ):
                name_link = CSS("a").match_one(item)
                name = name_link.text_content()
                source = name_link.get("href")
                if comm_name == "Standing Committees":
                    com = ScrapeCommittee(name=name, chamber=self.chamber)
                else:
                    # subcommittees hang off the Appropriations committee
                    com = ScrapeCommittee(
                        name=name,
                        classification="subcommittee",
                        chamber=self.chamber,
                        parent="Appropriations",
                    )
                return SenateCommitteeDetail(com, source=source)
            else:
                # items under any other heading are not scraped
                self.skip()
class LegList(HtmlListPage):
    """Kentucky legislator roster cards."""

    selector = CSS("a.Legislator-Card.col-md-4.col-sm-6.col-xs-12")

    def process_item(self, item):
        name = CSS("h3").match_one(item).text_content()
        if name == " - Vacant Seat":
            self.skip()

        party = CSS("small").match_one(item).text_content()
        if party == "Democrat":
            party = "Democratic"

        # district text looks like "District:\r\n 07" — strip leading zeros
        raw_district = CSS("p").match(item)[0].text_content()
        district = (
            re.search(r"District:\r\n(.+)", raw_district)
            .groups()[0]
            .strip()
            .lstrip("0")
        )

        person = ScrapePerson(
            name=name,
            state="ky",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        detail_link = item.get("href")
        person.add_source(self.source.url)
        person.add_source(detail_link)
        person.add_link(detail_link, note="homepage")
        return LegDetail(person, source=detail_link)
def process_item(self, item):
    """Build a House committee from a card and queue its members page.

    The card's links include a members page (used as the detail source),
    usually a homepage, and possibly other links.
    """
    link = (
        XPath(".//div[contains(@class, 'container')]//a[contains(@href, 'members')]")
        .match(item)[0]
        .get("href")
    )
    name = CSS("h2 a").match(item)[0].text_content()
    com = ScrapeCommittee(name=name, chamber=self.chamber)

    # BUG FIX: initialize the flag; it was previously only assigned when a
    # homepage link was found, so `if not homepage` could raise NameError.
    homepage = False
    # hoisted out of the loop: the homepage anchor is loop-invariant
    homepage_anchor = XPath(
        ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
    ).match_one(item)
    for links in XPath(".//div[contains(@class, 'container')]//a").match(item):
        url = links.get("href")
        # the members link is the detail source, not an extra link
        if url == link:
            continue
        if links == homepage_anchor:
            com.add_link(url, note="homepage")
            homepage = True
        else:
            com.add_link(url)
    if not homepage:
        self.warn("no homepage found")
    com.add_source(self.source.url)
    return HouseCommitteeDetail(com, source=link)
class HouseCommitteeList(HtmlListPage):
    """Minnesota House committee cards."""

    selector = CSS(".mb-3 .card-body")
    source = "https://www.house.leg.state.mn.us/committees"
    chamber = "lower"

    def process_item(self, item):
        """Build a committee from a card and queue its members page."""
        link = (
            XPath(
                ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
            )
            .match(item)[0]
            .get("href")
        )
        name = CSS("h2 a").match(item)[0].text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)

        # BUG FIX: initialize the flag; it was previously only assigned when
        # a homepage link was found, so `if not homepage` could raise
        # NameError.
        homepage = False
        # hoisted out of the loop: the homepage anchor is loop-invariant
        homepage_anchor = XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
        ).match_one(item)
        for links in XPath(".//div[contains(@class, 'container')]//a").match(item):
            url = links.get("href")
            # the members link is the detail source, not an extra link
            if url == link:
                continue
            if links == homepage_anchor:
                com.add_link(url, note="homepage")
                homepage = True
            else:
                com.add_link(url)
        if not homepage:
            self.warn("no homepage found")
        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=link)
def process_page(self):
    """Build a Maryland legislator record from a member detail page."""
    # email links are formatted mailto:<addr>?body... — keep only <addr>
    mailto = SimilarLink("mailto:").match_one(self.root).get("href")
    email = mailto.split(":", 1)[1].split("?")[0]

    person = Person(
        # heading is "<Title> <Name>"; drop the title word
        name=CSS("h2").match_one(self.root).text.split(" ", 1)[1],
        state="md",
        image=self.image_sel.match_one(self.root).get("src"),
        party=self.extract_dd("Party"),
        district=self.extract_dd("District"),
        chamber=None,
        email=email,
    )
    person.add_link(self.source.url)
    person.add_source(self.source.url)
    return person
def process_page(self):
    """Yield a legislator detail task per member of the NJ roster.

    The roster is embedded in the page as Next.js __NEXT_DATA__ JSON.
    """
    data_elem = CSS("#__NEXT_DATA__").match_one(self.root).text_content()
    roster = json.loads(data_elem)["props"]["pageProps"]["legrosterData"][0]

    for member in roster:
        first = member["First_Name"]
        middle = member["Middle_Name"]
        last = member["Last_Name"]
        suffix = member["Suffix"]
        member_id = member["BioLink"].split("/")[2]
        url = "https://www.njleg.state.nj.us" + member["BioLink"]
        party = {"D": "Democratic", "R": "Republican"}[member["Party"]]
        district = member["Roster_District"]
        chamber = "upper" if member["Roster_House"] == "Senate" else "lower"

        name = f"{first} {middle} {last}" if middle else f"{first} {last}"
        if suffix:
            name += f", {suffix}"

        person = ScrapePerson(
            name=name,
            given_name=first,
            family_name=last,
            state="nj",
            chamber=chamber,
            party=party,
            district=district,
        )
        person.add_source(self.source.url)
        person.add_source(url)
        person.add_link(url)

        # the bio API endpoint is the actual detail source
        api_url = f"https://www.njleg.state.nj.us/api/legislatorData/legislatorBio/{member_id}"
        person.add_source(api_url)
        yield LegDetail(person, source=api_url)
class House(HtmlListPage):
    """Puerto Rico House roster (exactly 49 members)."""

    source = URL(
        "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
    )
    selector = CSS("ul.list-article li", num_items=49)

    def process_item(self, item):
        bio_lines = (
            CSS("div.biodiv a").match_one(item).text_content().strip().split("\n")
        )
        # first line: honorific + name; strip the "Hon." prefix
        name = re.sub(
            r"^Hon\.", "", bio_lines[0].strip(), flags=re.IGNORECASE
        ).strip()

        # third line: district description
        district = bio_lines[2].strip()
        if district == "Representante por Acumulación":
            district = "At-Large"
        else:
            district = re.search(
                r"Representante\sdel\sDistrito\s(.+)", district
            ).groups()[0]

        partial = PartialRep(name=name, district=district, source=self.source.url)
        detail_link = CSS("a").match_one(item).get("href")
        return RepDetail(partial, source=detail_link)
class Representatives(HtmlListPage):
    """Missouri House roster grid.

    Note: there is a CSV, but it requires a bunch of ASP.net hoops to
    actually get.
    """

    source = URL(
        "https://house.mo.gov/MemberGridCluster.aspx?year=2021&code=R+&filter=clear"
    )
    selector = CSS("tr")

    def process_item(self, item):
        cells = CSS("td").match(item, min_items=0, max_items=8)
        # header rows have no <td> cells
        if not cells:
            self.skip()

        _, last, first, district, party, town, phone, room = cells
        # vacant seats show "Vacant" in the last-name column
        if last.text_content() == "Vacant":
            self.skip()

        return HouseDetail(
            HousePartial(
                last_name=last.text_content(),
                first_name=first.text_content(),
                district=int(district.text_content()),
                party=party.text_content(),
                hometown=town.text_content().strip(),
                voice=phone.text_content(),
                room=room.text_content(),
                url=CSS("a").match_one(last).get("href"),
            )
        )
class LegList(HtmlListPage):
    """Pennsylvania legislator roster entries."""

    selector = CSS("div.MemberInfoList-MemberWrapper")

    def process_item(self, item):
        # names are displayed "Last, First"
        name_parts = CSS("a").match_one(item).text_content().strip().split(", ")
        name = name_parts[1] + " " + name_parts[0]

        district = CSS("br").match(item)[-1].tail.strip()
        district = re.search(r"District\s(.+)", district).groups()[0]

        party_code = CSS("b").match_one(item).tail.strip()
        # unrecognized party strings pass through unchanged, as before
        party = {
            "(D)": "Democratic",
            "(R)": "Republican",
            "(I)": "Independent",
        }.get(party_code, party_code)

        person = ScrapePerson(
            name=name,
            state="pa",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        detail_link = CSS("a").match_one(item).get("href")
        person.add_source(self.source.url)
        person.add_source(detail_link)
        person.add_link(detail_link, note="homepage")
        # detail pages can be slow, hence the explicit timeout
        return LegDetail(person, source=URL(detail_link, timeout=10))
class Legislators(HtmlListPage):
    """South Carolina member list."""

    selector = CSS("div.member")

    def process_item(self, item):
        member_link = CSS("a.membername").match_one(item)
        # drop the "Senator"/"Representative" title prefix
        name = re.search(
            r"(Senator|Representative)\s(.+)", member_link.text_content()
        ).groups()[1]

        party = member_link.tail.strip()
        if party == "(D)":
            party = "Democratic"
        elif party == "(R)":
            party = "Republican"

        district_link = CSS("div.district a").match_one(item)
        district = re.search(
            r"District\s(.+)", district_link.text_content().strip()
        ).groups()[0]

        person = ScrapePerson(
            name=name,
            state="sc",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        detail_link = district_link.get("href")
        person.add_source(self.source.url)
        person.add_source(detail_link)
        person.add_link(detail_link, note="homepage")

        person.image = CSS("img").match_one(item).get("src")

        # detail pages can be slow, hence the long timeout
        return LegDetail(person, source=URL(detail_link, timeout=20))
def process_page(self):
    """Add a photo and capitol contact info to the input legislator.

    The capitol address is assembled from the tails of the three sibling
    elements that follow the "Contact Information" heading — fragile if the
    page layout changes.
    """
    p = self.input

    img = CSS("img.rounded").match_one(self.root).get("src")
    p.image = img

    contact_info = XPath("//strong[contains(text(), 'Contact Information')]").match(
        self.root
    )[0]
    cap_addr = contact_info.getnext().tail.strip()
    cap_addr += " "
    cap_addr += contact_info.getnext().getnext().tail.strip()
    cap_addr += " "
    cap_addr += contact_info.getnext().getnext().getnext().tail.strip()
    p.capitol_office.address = cap_addr

    # the phone block is optional; skip silently when the label is absent
    try:
        phone = (
            XPath("//strong[contains(text(), 'Phone:')]")
            .match(self.root)[0]
            .tail.strip()
        )
        # keep only the number itself, dropping any trailing text
        phone = re.search(r"(\d{3}-\d{3}-\d{4})(.+)?", phone).groups()[0]
        p.capitol_office.voice = phone
    except SelectorError:
        pass

    return p