class SenateDetail(HtmlPage):
    """Detail page for one Oklahoma state senator (oksenate.gov)."""

    # selectors for the individual fields of the senator's bio page
    name_css = CSS(".field--name-title")
    image_css = CSS(".bSenBio__media-btn")
    district_css = CSS(".bDistrict h2")
    address_css = CSS(".bSenBio__address p")
    phone_css = CSS(".bSenBio__tel a")
    contact_link_sel = SimilarLink(r"https://oksenate.gov/contact-senator\?sid=")

    def get_source_from_input(self):
        """Return the URL to fetch, taken from the ``url`` key of the input."""
        return self.input["url"]

    def get_data(self):
        """Build and return a ``Person`` from the senator's bio page.

        The party comes from the ``.bSenBio__infoIt`` item containing
        "Party:"; if several items contain it, the last one wins.  When no
        item contains it, ``party`` is never assigned and building the
        ``Person`` fails with a ``NameError`` subclass.
        """
        root = self.root

        for info_item in CSS(".bSenBio__infoIt").match(root):
            info_text = info_item.text_content()
            if "Party:" not in info_text:
                continue
            # text after the first colon is the party name
            party = info_text.split(":")[1].strip()

        full_name = self.name_css.match_one(root).text
        senator = Person(
            name=full_name,
            state="ok",
            chamber="upper",
            party=party,
            image=self.image_css.match_one(root).get("href"),
            # second whitespace-separated token of the district heading
            district=self.district_css.match_one(root).text.strip().split()[1],
        )

        office = senator.capitol_office
        office.address = self.address_css.match_one(root).text
        # NOTE(review): other scrapers in this file set `.voice` rather than
        # `.phone` -- confirm which attribute the office object expects
        office.phone = self.phone_css.match_one(root).text

        contact_href = self.contact_link_sel.match_one(root).get("href")
        senator.add_link(contact_href, "Contact Form")
        return senator
def process_item(self, item):
    """Build a ``Person`` from one row of the Hawaii legislator listing.

    The second ``<a>`` in the row supplies the last name and profile URL;
    every other field is read from the element whose ``id`` ends with the
    suffix configured in ``self.LABELS``.
    """
    try:
        profile_link = CSS("a").match(item)[1]
    except SelectorError:
        # NOTE(review): presumably skip() raises to abandon this row, since
        # the code below needs `profile_link` -- confirm
        self.skip()

    record = {
        "last_name": profile_link.text_content(),
        "url": profile_link.get("href"),
    }
    for field, id_suffix in self.LABELS.items():
        element = CSS(f"[id$={id_suffix}]").match_one(item)
        record[field] = element.text_content().strip()

    party_names = {"(D)": "Democratic", "(R)": "Republican"}
    party = party_names[record["party"]]
    address = "Hawaii State Capitol, Room " + record["room"]
    # "S" marks a senator; anything else is treated as the lower chamber
    if record["chamber"] == "S":
        chamber = "upper"
    else:
        chamber = "lower"

    first, last = record["first_name"], record["last_name"]
    legislator = Person(
        name=first + " " + last,
        state="hi",
        chamber=chamber,
        district=record["district"],
        given_name=first,
        family_name=last,
        party=party,
        email=record["email"],
    )

    office = legislator.capitol_office
    office.address = address
    office.voice = record["voice"]
    office.fax = record["fax"]

    profile_url = record["url"]
    legislator.add_source(profile_url)
    legislator.add_link(profile_url)
    return legislator
def process_item(self, item):
    """Build a ``Person`` from one row of the Michigan Senate member table.

    The row must have exactly six child cells: member, party, district,
    contact link, phone and office.
    """
    cells = item.getchildren()
    member_cell, party_cell, district_cell, contact_cell, phone_cell, office_cell = cells

    full_name = member_cell.text_content()
    district_no = district_cell.text_content()

    # vacant districts are listed as "Interim District ..." -- skip them
    if "Interim District" in full_name:
        self.skip()

    # the member and contact cells each hold a single link
    member_href = CSS("a").match_one(member_cell).get("href")
    contact_href = CSS("a").match_one(contact_cell).get("href")

    # URL pattern built from observation elsewhere on senate.michigan.gov
    photo = f"https://senate.michigan.gov/_images/{district_no}{ord_suffix(district_no)}.jpg"

    senator = Person(
        **split_name(full_name),
        state="mi",
        chamber="upper",
        district=district_no,
        party=self.PARTY_MAP[party_cell.text],
        image=photo,
    )

    office = senator.capitol_office
    office.voice = str(phone_cell.text_content())
    office.address = str(office_cell.text_content())

    senator.add_source(self.source.url)
    senator.add_link(member_href)
    senator.add_link(contact_href, note="Contact")
    return senator
def process_page(self):
    """Build a ``Person`` from an Oklahoma House member's district page.

    Reads name, party and district through the ``name_css``, ``party_css``
    and ``district_css`` selectors, the portrait link through
    ``image_selector``, and derives the contact-form URL from the page URL.
    The capitol address and phone are filled in only when the first
    ``.districtheadleft`` element is labelled "Capitol Address:".

    Returns:
        The populated ``Person``.

    Raises:
        AssertionError: if the derived contact URL does not point at the
            okhouse.gov member contact page.
    """
    # drop the first whitespace-separated token of the name element
    name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
    p = Person(
        name=name,
        state="ok",
        # this page is scraped from okhouse.gov (the House), so the member
        # belongs to the lower chamber; it was previously tagged "upper"
        chamber="lower",
        party=self.party_css.match_one(self.root).text,
        # second whitespace-separated token of the district text
        district=self.district_css.match_one(self.root).text.split()[1],
    )
    p.image = self.image_selector.match_one(self.root).get("href")

    # the contact form lives at the same URL with a different page name
    contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
    assert contact_url.startswith(
        "https://www.okhouse.gov/Members/Contact.aspx?District="
    )
    p.add_link(contact_url, note="Contact Form")

    # capitol address: only trusted when the first heading says so
    check_capitol_address = (
        CSS(".districtheadleft").match(self.root)[0].text_content().strip()
    )
    if check_capitol_address == "Capitol Address:":
        capitol_address_div = (
            CSS(".districtheadleft + div")
            .match(self.root)[0]
            .text_content()
            .strip()
            .splitlines()
        )
        # every line but the last is address text; the last is the phone
        p.capitol_office.address = "; ".join(
            [ln.strip() for ln in capitol_address_div[:-1]]
        )
        # NOTE(review): other scrapers in this file set `.voice` rather than
        # `.phone` -- confirm which attribute the office object expects
        p.capitol_office.phone = capitol_address_div[-1].strip()

    return p
def process_page(self):
    """Finish an Ohio House ``Person`` using the member's detail page.

    Identity fields come from ``self.input`` (gathered from the listing);
    this page contributes the capitol office address, phone and fax.
    """
    partial = self.input
    member = Person(
        state="oh",
        chamber="lower",
        district=partial.district,
        name=partial.name,
        party=partial.party,
        image=partial.image,
    )
    member.add_source(self.input.url)
    member.add_link(self.input.url)

    # the last info-bar module holds the contact details
    modules = CSS(".member-info-bar-module").match(self.root)
    for value_div in CSS(".member-info-bar-value").match(modules[-1]):
        text = value_div.text_content()
        if ", OH" in text:
            # the address is spread over the text and tails of the div's
            # first two children; stitch the pieces together
            kids = value_div.getchildren()
            line_one = kids[0].text.strip()
            line_two = kids[0].tail.strip()
            line_three = kids[1].tail.strip()
            member.capitol_office.address = "; ".join([line_one, line_two, line_three])
        elif "Phone:" in text:
            member.capitol_office.voice = text.split(": ")[1]
        elif "Fax:" in text:
            member.capitol_office.fax = text.split(": ")[1]

    return member
class LegPage(HtmlPage):
    """Detail page for a single Nebraska legislator."""

    name_css = CSS("h1.mt-0")
    district_css = CSS(".col-9 h2")
    image_css = CSS("img#sen-image")
    address_css = CSS("address")

    def get_source_from_input(self):
        """Use the input itself as the source to fetch."""
        return self.input

    def process_page(self):
        """Build a ``Person`` from the page heading and ``<address>`` block.

        The address block looks like:

            Room 11th Floor
            P.O. Box 94604
            Lincoln, NE 68509
            (402) 471-2733
            Email: <address>

        Lines are collected as address text until the phone line (one that
        starts with "(402)") is reached.
        """
        root = self.root
        heading = self.name_css.match_one(root).text
        name = heading.replace("Sen. ", "").strip()
        # second whitespace-separated token of the district heading
        district = self.district_css.match_one(root).text.split()[1]
        image = self.image_css.match_one(root).get("src")
        contact_text = self.address_css.match_one(root).text_content()

        address_parts = []
        phone = None
        email = None
        collecting_address = True
        for raw_line in contact_text.splitlines():
            entry = raw_line.strip()
            if not entry:
                continue
            if entry.startswith("(402)"):
                # the phone number ends the address portion
                phone = entry
                collecting_address = False
            elif entry.startswith("Email:"):
                email = entry.replace("Email: ", "")
            if collecting_address:
                address_parts.append(entry)

        legislator = Person(
            chamber="legislature",
            party="Nonpartisan",
            state="ne",
            district=district,
            image=image,
            name=name,
            email=email,
        )
        legislator.capitol_office.address = "; ".join(address_parts)
        legislator.capitol_office.voice = phone

        page_url = self.source.url
        legislator.add_source(page_url)
        legislator.add_link(self.source.url)
        return legislator
def process_page(self):
    """Build a Missouri House ``Person`` from listing data plus the photo.

    Everything except the photo comes from ``self.input``; the detail page
    is only used for the image ``src``.
    """
    member = self.input

    party_names = {"D": "Democratic", "R": "Republican"}
    party = party_names[member.party]

    photo_element = CSS("img#ContentPlaceHolder1_imgPhoto1").match_one(self.root)
    photo = photo_element.get("src")

    rep = Person(
        state="mo",
        party=party,
        image=photo,
        chamber="lower",
        district=member.district,
        name=f"{member.first_name} {member.last_name}",
        given_name=member.first_name,
        family_name=member.last_name,
    )
    # TODO
    # rep.extras["hometown"] = self.input.hometown

    office = rep.capitol_office
    office.voice = member.voice
    # fixed capitol street address; only the room number varies per member
    office.address = f"MO House of Representatives; 201 West Capitol Avenue; Room {member.room}; Jefferson City MO 65101 "

    rep.add_link(member.url)
    rep.add_source(member.url)
    return rep
class RepList(HtmlListPage):
    """Listing of Michigan House members, one table row per representative."""

    source = "https://www.house.mi.gov/MHRPublic/frmRepListMilenia.aspx?all=true"
    # the selector is declared to match exactly 111 rows (header included)
    selector = CSS("#grvRepInfo tr", num_items=111)
    # abbreviations used in the office column -> full building names
    office_names = {
        "SHOB": "South House Office Building",
        "NHOB": "North House Office Building",
        "CB": "Capitol Building",
    }

    def process_item(self, item):
        """Build a ``Person`` from one table row.

        The row must have exactly seven child cells: website, district,
        name, party, office, phone and email.
        """
        cells = item.getchildren()
        site_cell, district_cell, name_cell, party_cell, office_cell, phone_cell, email_cell = cells

        # the header row is made of <th> cells
        if site_cell.tag == "th":
            self.skip()

        office_text = office_cell.text_content()
        for short_name, long_name in self.office_names.items():
            office_text = office_text.replace(short_name, long_name)

        rep = Person(
            name=name_cell.text_content(),
            state="mi",
            chamber="lower",
            # districts are zero-padded in the table
            district=district_cell.text_content().lstrip("0"),
            party=party_cell.text_content(),
            email=email_cell.text_content(),
        )
        website_href = CSS("a").match_one(site_cell).get("href")
        rep.add_link(website_href)
        rep.add_source(self.source.url)
        rep.capitol_office.voice = phone_cell.text_content()
        rep.capitol_office.address = office_text
        return rep
def process_page(self):
    """Build a Maryland ``Person`` from a member detail page.

    The chamber is left as ``None``; this method does not determine it.
    """
    # NOTE: the "Annapolis Info" and "Interim Info" blocks are not read yet;
    # they can be reached with
    #   XPath("//dt[text()='Annapolis Info']/following-sibling::dd[1]")
    #   XPath("//dt[text()='Interim Info']/following-sibling::dd[1]")
    # followed by .match_one(self.root).text_content()

    # the email link is formatted as mailto:<addr>?body...
    mailto_href = SimilarLink("mailto:").match_one(self.root).get("href")
    after_scheme = mailto_href.split(":", 1)[1]
    email = after_scheme.split("?")[0]

    heading = CSS("h2").match_one(self.root).text
    member = Person(
        # drop everything up to and including the first space of the heading
        name=heading.split(" ", 1)[1],
        state="md",
        image=self.image_sel.match_one(self.root).get("src"),
        party=self.extract_dd("Party"),
        district=self.extract_dd("District"),
        chamber=None,
        email=email,
    )

    page_url = self.source.url
    member.add_link(page_url)
    member.add_source(self.source.url)
    return member
def process_item(self, item):
    """Build a Michigan House ``Person`` from one row of the member table.

    The row must have exactly seven child cells: website, district, name,
    party, office, phone and email.
    """
    cells = item.getchildren()
    site_cell, district_cell, name_cell, party_cell, office_cell, phone_cell, email_cell = cells

    # the header row is made of <th> cells
    if site_cell.tag == "th":
        self.skip()

    # expand building abbreviations using self.office_names
    office_text = office_cell.text_content()
    for short_name, long_name in self.office_names.items():
        office_text = office_text.replace(short_name, long_name)

    rep = Person(
        name=name_cell.text_content(),
        state="mi",
        chamber="lower",
        # districts are zero-padded in the table
        district=district_cell.text_content().lstrip("0"),
        party=party_cell.text_content(),
        email=email_cell.text_content(),
    )
    website_href = CSS("a").match_one(site_cell).get("href")
    rep.add_link(website_href)
    rep.add_source(self.source.url)
    rep.capitol_office.voice = phone_cell.text_content()
    rep.capitol_office.address = office_text
    return rep
def process_item(self, item):
    """Turn one media-grid entry into a ``HousePartial``.

    The caption subtitle has the form ``District 25 | D``; the portrait URL
    is extracted from the inline ``style`` of the ``.photo`` element with
    ``background_image_re``.
    """
    caption_title = CSS(".mediaCaptionTitle").match_one(item).text
    caption_subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text

    photo_style = CSS(".photo").match_one(item).get("style")
    image_url = background_image_re.findall(photo_style)[0]

    # e.g. "District 25 | D" -> ("District 25", "D")
    district_part, party_code = caption_subtitle.split(" | ")
    district_no = district_part.split()[1]
    party_names = {"D": "Democratic", "R": "Republican"}
    party = party_names[party_code]

    return HousePartial(
        name=caption_title,
        district=district_no,
        party=party,
        url=item.get("href"),
        image=image_url,
    )
def process_item(self, item):
    """Turn one row of the Missouri House member grid into a ``HousePartial``.

    Rows without any ``<td>`` and rows whose last-name cell reads "Vacant"
    are skipped.  Remaining rows must have exactly eight cells.
    """
    cells = CSS("td").match(item, min_items=0, max_items=8)
    if not cells:
        # no data cells in this row (e.g. a header row)
        self.skip()

    _, last_cell, first_cell, district_cell, party_cell, town_cell, phone_cell, room_cell = cells
    if last_cell.text_content() == "Vacant":
        self.skip()

    return HousePartial(
        last_name=last_cell.text_content(),
        first_name=first_cell.text_content(),
        district=int(district_cell.text_content()),
        party=party_cell.text_content(),
        hometown=town_cell.text_content().strip(),
        voice=phone_cell.text_content(),
        room=room_cell.text_content(),
        # the last-name cell links to the member's page
        url=CSS("a").match_one(last_cell).get("href"),
    )
def process_addresses(self, item):
    """Return ``(district, capitol)`` parsed addresses for one member.

    An item carries one to three ``.full-addr`` blocks; the first is used
    as the district office and the last as the Capitol office.
    """
    blocks = CSS(".full-addr").match(item, min_items=1, max_items=3)

    # district address #1 is the first block
    district_address = parse_address_lines(block_to_text(blocks[0]))
    # the Capitol address is always the last block
    capitol_address = parse_address_lines(block_to_text(blocks[-1]))

    # TODO: handle district address #2 if it exists
    return district_address, capitol_address
class HouseDetail(HtmlPage):
    """Detail page for one Oklahoma House member (okhouse.gov)."""

    # link to the member's high-resolution portrait
    image_selector = SimilarLink("https://www.okhouse.gov/Members/Pictures/HiRes/")
    # common id prefix of the ASP.NET label elements on the page
    prefix = "#ctl00_ContentPlaceHolder1_lbl"
    name_css = CSS(prefix + "Name")
    district_css = CSS(prefix + "District")
    party_css = CSS(prefix + "Party")

    def get_source_from_input(self):
        """Return the URL to fetch, taken from the ``url`` key of the input."""
        return self.input["url"]

    def process_page(self):
        """Build a ``Person`` from the member's district page.

        Reads name, party and district from the label elements, the
        portrait link through ``image_selector``, and derives the
        contact-form URL from the page URL.  The capitol address and phone
        are filled in only when the first ``.districtheadleft`` element is
        labelled "Capitol Address:".

        Returns:
            The populated ``Person``.

        Raises:
            AssertionError: if the derived contact URL does not point at
                the okhouse.gov member contact page.
        """
        # drop the first whitespace-separated token of the name element
        name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
        p = Person(
            name=name,
            state="ok",
            # House members sit in the lower chamber; this was previously
            # tagged "upper", which mislabelled every representative
            chamber="lower",
            party=self.party_css.match_one(self.root).text,
            # second whitespace-separated token of the district text
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        p.image = self.image_selector.match_one(self.root).get("href")

        # the contact form lives at the same URL with a different page name
        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District="
        )
        p.add_link(contact_url, note="Contact Form")

        # capitol address: only trusted when the first heading says so
        check_capitol_address = (
            CSS(".districtheadleft").match(self.root)[0].text_content().strip()
        )
        if check_capitol_address == "Capitol Address:":
            capitol_address_div = (
                CSS(".districtheadleft + div")
                .match(self.root)[0]
                .text_content()
                .strip()
                .splitlines()
            )
            # every line but the last is address text; the last is the phone
            p.capitol_office.address = "; ".join(
                [ln.strip() for ln in capitol_address_div[:-1]]
            )
            # NOTE(review): other scrapers in this file set `.voice` rather
            # than `.phone` -- confirm which attribute the office expects
            p.capitol_office.phone = capitol_address_div[-1].strip()

        return p
class HawaiiLegislators(HtmlListPage):
    """Listing of Hawaii legislators, one grid row per member."""

    # the listing is obtained by submitting the page's form with "Show All"
    source = FormSource(
        "https://www.capitol.hawaii.gov/members/legislators.aspx", "//form", "Show All"
    )
    selector = CSS("#ctl00_ContentPlaceHolderCol1_GridView1 tr")
    # output field -> suffix of the id of the element that holds it
    LABELS = {
        "first_name": "LabelFirst",
        "party": "LabelParty",
        "room": "LabelRoom2",
        "voice": "LabelPhone2",
        "fax": "LabelFAX2",
        "email": "HyperLinkEmail",
        "chamber": "LabelDis",
        "district": "LabelDistrict",
    }

    def process_item(self, item):
        """Build a ``Person`` from one row of the grid.

        The second ``<a>`` in the row supplies the last name and profile
        URL; every other field is read from the element whose ``id`` ends
        with the suffix configured in ``LABELS``.
        """
        try:
            profile_link = CSS("a").match(item)[1]
        except SelectorError:
            # NOTE(review): presumably skip() raises to abandon this row,
            # since the code below needs `profile_link` -- confirm
            self.skip()

        record = {
            "last_name": profile_link.text_content(),
            "url": profile_link.get("href"),
        }
        for field, id_suffix in self.LABELS.items():
            element = CSS(f"[id$={id_suffix}]").match_one(item)
            record[field] = element.text_content().strip()

        party_names = {"(D)": "Democratic", "(R)": "Republican"}
        party = party_names[record["party"]]
        address = "Hawaii State Capitol, Room " + record["room"]
        # "S" marks a senator; anything else is treated as the lower chamber
        if record["chamber"] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        first, last = record["first_name"], record["last_name"]
        legislator = Person(
            name=first + " " + last,
            state="hi",
            chamber=chamber,
            district=record["district"],
            given_name=first,
            family_name=last,
            party=party,
            email=record["email"],
        )

        office = legislator.capitol_office
        office.address = address
        office.voice = record["voice"]
        office.fax = record["fax"]

        profile_url = record["url"]
        legislator.add_source(profile_url)
        legislator.add_link(profile_url)
        return legislator
def get_data(self):
    """Build and return a ``Person`` from an Oklahoma senator's bio page.

    The party comes from the ``.bSenBio__infoIt`` item containing "Party:";
    if several items contain it, the last one wins.  When no item contains
    it, ``party`` is never assigned and building the ``Person`` fails with
    a ``NameError`` subclass.
    """
    root = self.root

    for info_item in CSS(".bSenBio__infoIt").match(root):
        info_text = info_item.text_content()
        if "Party:" not in info_text:
            continue
        # text after the first colon is the party name
        party = info_text.split(":")[1].strip()

    full_name = self.name_css.match_one(root).text
    senator = Person(
        name=full_name,
        state="ok",
        chamber="upper",
        party=party,
        image=self.image_css.match_one(root).get("href"),
        # second whitespace-separated token of the district heading
        district=self.district_css.match_one(root).text.strip().split()[1],
    )

    office = senator.capitol_office
    office.address = self.address_css.match_one(root).text
    # NOTE(review): other scrapers in this file set `.voice` rather than
    # `.phone` -- confirm which attribute the office object expects
    office.phone = self.phone_css.match_one(root).text

    contact_href = self.contact_link_sel.match_one(root).get("href")
    senator.add_link(contact_href, "Contact Form")
    return senator
class HouseList(HtmlListPage):
    """Ohio House directory: one media-grid link per representative."""

    source = "https://www.legislature.ohio.gov/legislators/house-directory"
    # the selector is declared to match exactly 99 entries
    selector = CSS(".mediaGrid a[target='_blank']", num_items=99)

    def process_item(self, item):
        """Turn one grid entry into a ``HousePartial``.

        The caption subtitle has the form ``District 25 | D``; the portrait
        URL is extracted from the inline ``style`` of the ``.photo``
        element with ``background_image_re``.
        """
        caption_title = CSS(".mediaCaptionTitle").match_one(item).text
        caption_subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text

        photo_style = CSS(".photo").match_one(item).get("style")
        image_url = background_image_re.findall(photo_style)[0]

        # e.g. "District 25 | D" -> ("District 25", "D")
        district_part, party_code = caption_subtitle.split(" | ")
        district_no = district_part.split()[1]
        party_names = {"D": "Democratic", "R": "Republican"}
        party = party_names[party_code]

        return HousePartial(
            name=caption_title,
            district=district_no,
            party=party,
            url=item.get("href"),
            image=image_url,
        )
class HouseList(HtmlListPage):
    """Missouri House member grid: one table row per representative."""

    # note: there is a CSV, but it requires a bunch of ASP.net hoops to actually get
    source = URL(
        "https://house.mo.gov/MemberGridCluster.aspx?year=2021&code=R+&filter=clear"
    )
    selector = CSS("tr")

    def process_item(self, item):
        """Turn one table row into a ``HousePartial``.

        Rows without any ``<td>`` and rows whose last-name cell reads
        "Vacant" are skipped.  Remaining rows must have exactly eight cells.
        """
        cells = CSS("td").match(item, min_items=0, max_items=8)
        if not cells:
            # no data cells in this row (e.g. a header row)
            self.skip()

        _, last_cell, first_cell, district_cell, party_cell, town_cell, phone_cell, room_cell = cells
        if last_cell.text_content() == "Vacant":
            self.skip()

        return HousePartial(
            last_name=last_cell.text_content(),
            first_name=first_cell.text_content(),
            district=int(district_cell.text_content()),
            party=party_cell.text_content(),
            hometown=town_cell.text_content().strip(),
            voice=phone_cell.text_content(),
            room=room_cell.text_content(),
            # the last-name cell links to the member's page
            url=CSS("a").match_one(last_cell).get("href"),
        )
class PersonDetail(HtmlPage):
    """Detail page for one Maryland legislator."""

    image_sel = CSS("img.details-page-image-padding")

    def get_source_from_input(self):
        """Return the ``url`` entry of the input, coerced to ``str``."""
        return str(self.input["url"])

    def parse_address_block(self, block):
        """Split a free-text contact block into address, phones and faxes.

        Lines are assigned to the "address" group until a line starting
        with "Phone" or "Fax" switches the group; the switching line itself
        belongs to the new group.  Phone and fax numbers are every
        ``ddd-ddd-dddd`` match found in their groups.

        Returns:
            ``{"address": str, "phones": list[str], "faxes": list[str]}``
            with the address lines joined by "; ".
        """
        number_pattern = r"\d{3}-\d{3}-\d{4}"
        grouped = {"address": [], "phone": [], "fax": []}
        current = "address"

        for raw_line in block.splitlines():
            text = raw_line.strip()
            if not text:
                continue
            if text.startswith("Phone"):
                current = "phone"
            elif text.startswith("Fax"):
                current = "fax"
            grouped[current].append(text)

        # pull the actual numbers out of the grouped lines
        phones = [
            number
            for text in grouped["phone"]
            for number in re.findall(number_pattern, text)
        ]
        faxes = [
            number
            for text in grouped["fax"]
            for number in re.findall(number_pattern, text)
        ]
        return {
            "address": "; ".join(grouped["address"]),
            "phones": phones,
            "faxes": faxes,
        }

    def extract_dd(self, name):
        """Return the text of the first ``<dd>`` following the ``<dt>`` named *name*."""
        selector = XPath(f"//dt[text()='{name}']/following-sibling::dd[1]")
        return selector.match_one(self.root).text_content()

    def process_page(self):
        """Build a ``Person`` from the page.

        The chamber is left as ``None``; this method does not determine it.
        """
        # NOTE: the "Annapolis Info" and "Interim Info" blocks are not read
        # yet; they can be reached with
        #   XPath("//dt[text()='Annapolis Info']/following-sibling::dd[1]")
        #   XPath("//dt[text()='Interim Info']/following-sibling::dd[1]")
        # followed by .match_one(self.root).text_content()

        # the email link is formatted as mailto:<addr>?body...
        mailto_href = SimilarLink("mailto:").match_one(self.root).get("href")
        after_scheme = mailto_href.split(":", 1)[1]
        email = after_scheme.split("?")[0]

        heading = CSS("h2").match_one(self.root).text
        member = Person(
            # drop everything up to and including the heading's first space
            name=heading.split(" ", 1)[1],
            state="md",
            image=self.image_sel.match_one(self.root).get("src"),
            party=self.extract_dd("Party"),
            district=self.extract_dd("District"),
            chamber=None,
            email=email,
        )

        page_url = self.source.url
        member.add_link(page_url)
        member.add_source(self.source.url)
        return member
class AssemblyList(HtmlListPage):
    """New York Assembly member listing: one ``section.mem-item`` per member."""

    source = URL("https://assembly.state.ny.us/mem/")
    # the selector is declared to match exactly 150 members
    selector = CSS("section.mem-item", num_items=150)
    # party data is supplied by a separate page, keyed by district
    dependencies = {"party_mapping": PartyAugmentation()}

    def process_addresses(self, item):
        """Return ``(district, capitol)`` parsed addresses for one member.

        An item carries one to three ``.full-addr`` blocks; the first is
        used as the district office and the last as the Capitol office.
        """
        blocks = CSS(".full-addr").match(item, min_items=1, max_items=3)

        # district address #1 is the first block
        district_address = parse_address_lines(block_to_text(blocks[0]))
        # the Capitol address is always the last block
        capitol_address = parse_address_lines(block_to_text(blocks[-1]))

        # TODO: handle district address #2 if it exists
        return district_address, capitol_address

    def process_item(self, item):
        """Build a ``Person`` from one member section.

        Email, Twitter and Facebook are optional; each falls back to an
        empty string when its selector fails.
        """
        # the element id is the zero-padded district number
        district = str(int(item.get("id")))
        image = CSS(".mem-pic a img").match_one(item).get("src")
        name_link = CSS(".mem-name a").match_one(item)
        district_addr, capitol_addr = self.process_addresses(item)

        try:
            email = CSS(".mem-email a").match_one(item).text.strip()
        except SelectorError:
            email = ""

        try:
            twitter_icon = CSS(".fa-twitter").match_one(item)
            # the icon's parent link ends with the account handle
            twitter = twitter_icon.getparent().get("href").split("/")[-1]
        except SelectorError:
            twitter = ""

        try:
            facebook_icon = CSS(".fa-facebook").match_one(item)
            facebook = facebook_icon.getparent().get("href").split("/")[-1]
        except SelectorError:
            facebook = ""

        # the party is the second element of this district's mapping entry
        party = self.party_mapping[district][1]

        member = Person(
            state="ny",
            chamber="lower",
            image=image,
            party=party,
            district=district,
            name=name_link.text.strip(),
            email=email,
        )
        member.add_link(url=name_link.get("href"))
        member.add_source(url=name_link.get("href"))

        if twitter:
            member.ids["twitter"] = twitter
        if facebook:
            member.ids["facebook"] = facebook

        district_office = member.district_office
        district_office.address = district_addr["address"]
        district_office.voice = district_addr["phone"]
        district_office.fax = district_addr["fax"]

        capitol_office = member.capitol_office
        capitol_office.address = capitol_addr["address"]
        capitol_office.voice = capitol_addr["phone"]
        capitol_office.fax = capitol_addr["fax"]
        return member
def find_rows(self):
    """Return the rows of the first ``table.wikitable`` with 150+ rows.

    Returns ``None`` when no such table is found on the page.
    """
    for table in CSS("table.wikitable").match(self.root):
        candidate_rows = CSS("tr").match(table)
        if len(candidate_rows) < 150:
            # too small to be the table we are looking for
            continue
        return candidate_rows
    return None
def process_item(self, item):
    """Build a New York Assembly ``Person`` from one member section.

    Email, Twitter and Facebook are optional; each falls back to an empty
    string when its selector fails.
    """
    # the element id is the zero-padded district number
    district = str(int(item.get("id")))
    image = CSS(".mem-pic a img").match_one(item).get("src")
    name_link = CSS(".mem-name a").match_one(item)
    district_addr, capitol_addr = self.process_addresses(item)

    try:
        email = CSS(".mem-email a").match_one(item).text.strip()
    except SelectorError:
        email = ""

    try:
        twitter_icon = CSS(".fa-twitter").match_one(item)
        # the icon's parent link ends with the account handle
        twitter = twitter_icon.getparent().get("href").split("/")[-1]
    except SelectorError:
        twitter = ""

    try:
        facebook_icon = CSS(".fa-facebook").match_one(item)
        facebook = facebook_icon.getparent().get("href").split("/")[-1]
    except SelectorError:
        facebook = ""

    # the party is the second element of this district's mapping entry
    party = self.party_mapping[district][1]

    member = Person(
        state="ny",
        chamber="lower",
        image=image,
        party=party,
        district=district,
        name=name_link.text.strip(),
        email=email,
    )
    member.add_link(url=name_link.get("href"))
    member.add_source(url=name_link.get("href"))

    if twitter:
        member.ids["twitter"] = twitter
    if facebook:
        member.ids["facebook"] = facebook

    district_office = member.district_office
    district_office.address = district_addr["address"]
    district_office.voice = district_addr["phone"]
    district_office.fax = district_addr["fax"]

    capitol_office = member.capitol_office
    capitol_office.address = capitol_addr["address"]
    capitol_office.voice = capitol_addr["phone"]
    capitol_office.fax = capitol_addr["fax"]
    return member