p = Person( chamber="legislature", party="Nonpartisan", state="ne", district=district, image=image, name=name, email=email, ) p.capitol_office.address = "; ".join(address) p.capitol_office.voice = phone p.add_source(self.source.url) p.add_link(self.source.url) return p class LegPageGenerator(ListPage): source = NullSource() """ NE is an interesting test case for Spatula, since there are individual senator pages but no real index that's useful at all. Right now this is using a dummy source page to spawn the 49 subpage scrapers. """ def process_page(self): for n in range(1, 50): yield LegPage( source=f"http://news.legislature.ne.gov/dist{n:02d}/") legislators = PeopleWorkflow(LegPageGenerator)
state="oh",
chamber="lower",
district=self.input.district,
name=self.input.name,
party=self.input.party,
image=self.input.image,
)
# The member's own URL is recorded both as the data source and as a link.
p.add_source(self.input.url)
p.add_link(self.input.url)

divs = CSS(".member-info-bar-module").match(self.root)
# last div is contact details
contact_details = CSS(".member-info-bar-value").match(divs[-1])
for div in contact_details:
    dtc = div.text_content()
    # Each value is classified by a marker substring found in its text.
    if ", OH" in dtc:
        # join parts of the div together to make whole address:
        # the first child's text, then the tail text following the first
        # and second children.
        # NOTE(review): assumes at least two children with non-None
        # text/tail -- confirm against the live markup.
        children = div.getchildren()
        p.capitol_office.address = "; ".join(
            [children[0].text.strip(), children[0].tail.strip(), children[1].tail.strip()]
        )
    elif "Phone:" in dtc:
        # Keep the piece that follows the first ": ".
        p.capitol_office.voice = dtc.split(": ")[1]
    elif "Fax:" in dtc:
        p.capitol_office.fax = dtc.split(": ")[1]

return p


house_members = PeopleWorkflow(HouseList)
person = Person(
    name="{FirstName} {LastName}".format(**item_dict),
    given_name=item_dict["FirstName"],
    family_name=item_dict["LastName"],
    state="ak",
    party=item_dict["Party"],
    # A chamber code of "S" maps to "upper"; any other value maps to "lower".
    chamber=("upper" if chamber == "S" else "lower"),
    district=item_dict["District"],
    # The photo URL is derived from the member code.
    image=f"http://akleg.gov/images/legislators/{code}.jpg",
    email=item_dict["EMail"],
)
# The detail link is built from the session number and the member code.
person.add_link(
    "http://www.akleg.gov/basis/Member/Detail/{}?code={}".format(self.session_num, code)
)
person.add_source("http://w3.akleg.gov/")

if item_dict["Phone"]:
    # Prefix "907-" and split the value after its first three characters.
    # NOTE(review): presumably Phone is a bare 7-digit string -- confirm.
    phone = "907-" + item_dict["Phone"][0:3] + "-" + item_dict["Phone"][3:]
    person.capitol_office.voice = phone

# A capitol address is only set when the Building field is exactly "CAPITOL".
if item_dict["Building"] == "CAPITOL":
    person.capitol_office.address = "State Capitol Room {}; Juneau AK, 99801".format(
        item_dict["Room"]
    )

return person


legislators = PeopleWorkflow(Legislators)
name=name, family_name=last, given_name=first, state="sd", district=item["District"].lstrip("0"), chamber="upper" if item["MemberType"] == "S" else "lower", party=item["Politics"], email=item["EmailState"], image= "https://lawmakerdocuments.blob.core.usgovcloudapi.net/photos/" + item["Picture"].lower(), ) address = item["HomeAddress1"] if item["HomeAddress2"]: address += "; " + item["HomeAddress2"] address += f"{item['HomeCity']}, {item['HomeState']} {item['HomeZip']}" p.district_office.address = address p.district_office.voice = item["HomePhone"] p.capitol_office.voice = item["CapitolPhone"] p.extras["occupation"] = item["Occupation"] url = f"https://sdlegislature.gov/Legislators/Profile/{item['SessionMemberId']}/Detail" p.add_link(url) p.add_source(url) return p legislators = PeopleWorkflow(DirectoryListing)
def process_item(self, item):
    """Build a ``Person`` from one table row of the member listing.

    The row must have exactly seven child cells, in the order: website,
    district, name, party, office, phone, email. Header rows are skipped
    via ``self.skip()``.
    """
    (
        website_cell,
        district_cell,
        name_cell,
        party_cell,
        office_cell,
        phone_cell,
        email_cell,
    ) = item.getchildren()

    # Header rows use <th> cells and carry no member data.
    if website_cell.tag == "th":
        self.skip()

    # Expand every abbreviation listed in ``self.office_names``.
    office_address = office_cell.text_content()
    for short_form, long_form in self.office_names.items():
        office_address = office_address.replace(short_form, long_form)

    member = Person(
        name=name_cell.text_content(),
        state="mi",
        chamber="lower",
        district=district_cell.text_content().lstrip("0"),
        party=party_cell.text_content(),
        email=email_cell.text_content(),
    )

    # The link comes from the anchor inside the website cell.
    member.add_link(CSS("a").match_one(website_cell).get("href"))
    member.add_source(self.source.url)

    member.capitol_office.voice = phone_cell.text_content()
    member.capitol_office.address = office_address
    return member


senators = PeopleWorkflow(SenList)
reps = PeopleWorkflow(RepList)
email=email, ) p.add_link(self.source.url) p.add_source(self.source.url) return p class PersonList(HtmlListPage): selector = XPath("//div[@id='myDIV']//div[@class='p-0 member-index-cell']") def process_item(self, item): dd_text = XPath(".//dd/text()").match(item) district = dd_text[2].strip().split()[1] party = dd_text[4].strip() return PersonDetail( dict( chamber="upper" if "senate" in self.source.url else "lower", district=district, party=party, ), source=str(XPath(".//dd/a[1]/@href").match_one(item)), ) house_members = PeopleWorkflow( PersonList( source="http://mgaleg.maryland.gov/mgawebsite/Members/Index/house")) senate_members = PeopleWorkflow( PersonList( source="http://mgaleg.maryland.gov/mgawebsite/Members/Index/senate"))
return SenateDetail(self.input) class DelegateDetail(MemberDetail): role = "Delegate" chamber = "lower" def process_page(self): p = super().process_page() lis_id = get_lis_id(self.chamber, self.input.url) if lis_id: lis_id = "{}{:04d}".format(lis_id[0], int(lis_id[1:])) p.image = f"http://memdata.virginiageneralassembly.gov/images/display_image/{lis_id}" return p class SenateList(MemberList): chamber = "upper" selector = XPath('//div[@class="lColRt"]/ul/li/a') next_page_cls = SenatePhotoDetail class DelegateList(MemberList): chamber = "lower" selector = XPath('//div[@class="lColLt"]/ul/li/a') next_page_cls = DelegateDetail senators = PeopleWorkflow(SenateList) delegates = PeopleWorkflow(DelegateList)
p.district_office.address = p.district_office.address.strip() # photos if not item["photos"]: pass elif len(item["photos"]) == 1: p.image = item["photos"][0]["url"].split("?")[ 0] # strip off ?size=mpSm for full size else: raise Exception("unknown photos configuration: " + str(item["photos"])) # extras p.extras["residence"] = item["residence"] p.extras["city"] = item["city"] p.extras["georgia_id"] = item["id"] if item["dateVacated"]: p.end_date = item["dateVacated"] url = ( f"https://www.legis.ga.gov/members/{self.chamber_names[chamber_id]}/" f"{item['id']}?session={item['sessionId']}") p.add_source(url) p.add_link(url) return p legislators = PeopleWorkflow( DirectoryListing(source="https://www.legis.ga.gov/api/members/list/1029"))
district_css = CSS(".bDistrict h2")
address_css = CSS(".bSenBio__address p")
phone_css = CSS(".bSenBio__tel a")
contact_link_sel = SimilarLink(r"https://oksenate.gov/contact-senator\?sid=")

def process_page(self):
    """Build a ``Person`` for the senator shown on this detail page.

    The party is read from the bio item whose text contains "Party:"; name,
    image, district, address, phone and contact link each come from the
    selectors defined on the class.
    """
    # NOTE(review): if no bio item contains "Party:", ``party`` is never
    # bound and the Person(...) call below raises -- confirm every page
    # carries a party row.
    for bio in CSS(".bSenBio__infoIt").match(self.root):
        bio_text = bio.text_content()
        if "Party:" in bio_text:
            # Keep the piece between the first and second ":".
            party = bio_text.split(":")[1].strip()

    p = Person(
        name=self.name_css.match_one(self.root).text,
        state="ok",
        chamber="upper",
        party=party,
        image=self.image_css.match_one(self.root).get("href"),
        # The district is the second whitespace-separated token of the heading.
        district=self.district_css.match_one(self.root).text.strip().split()[1],
    )
    p.capitol_office.address = self.address_css.match_one(self.root).text
    # Office phone numbers are stored under ``voice``; ``phone`` is not the
    # attribute used for them anywhere else.
    p.capitol_office.voice = self.phone_css.match_one(self.root).text
    p.add_link(self.contact_link_sel.match_one(self.root).get("href"), "Contact Form")
    return p


house_members = PeopleWorkflow(HouseList)
senate_members = PeopleWorkflow(SenateList)
] address = "\n".join((info_texts[0], info_texts[1])) phone_text = info_texts[2] # if validate_phone_number(phone_text): phone = phone_text email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip() # if validate_email_address(email_text): email = email_text rep = Person( name=name, district=district, party=party, state="mn", chamber="lower", image=photo_url, email=email, ) rep.add_link(url) rep.add_source(self.source.url) rep.capitol_office.address = address rep.capitol_office.phone = phone return rep reps = PeopleWorkflow(RepList) sens = PeopleWorkflow(SenList)
except SelectorError:
    # No match for the selector tried above: fall back to an empty value.
    facebook = ""

# The party is the second element of this district's party_mapping entry.
party = self.party_mapping[district][1]

p = Person(
    state="ny",
    chamber="lower",
    image=image,
    party=party,
    district=district,
    name=name.text.strip(),
    email=email,
)
# The href of the name element is used as both the link and the source.
p.add_link(url=name.get("href"))
p.add_source(url=name.get("href"))

# Social ids are only recorded when a non-empty value was found.
if twitter:
    p.ids["twitter"] = twitter
if facebook:
    p.ids["facebook"] = facebook

# Copy address, phone and fax for both offices.
p.district_office.address = district_addr["address"]
p.district_office.voice = district_addr["phone"]
p.district_office.fax = district_addr["fax"]
p.capitol_office.address = capitol_addr["address"]
p.capitol_office.voice = capitol_addr["phone"]
p.capitol_office.fax = capitol_addr["fax"]

return p


assembly_members = PeopleWorkflow(AssemblyList)