def process_item(self, item):
    link = (
        XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        )
        .match(item)[0]
        .get("href")
    )
    name = CSS("h2 a").match(item)[0].text_content()
    com = ScrapeCommittee(name=name, chamber=self.chamber)

    # track whether a homepage link was found so the warning below works
    homepage = False
    for links in XPath(".//div[contains(@class, 'container')]//a").match(item):
        url = links.get("href")
        if url == link:
            continue
        if links == XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
        ).match_one(item):
            com.add_link(url, note="homepage")
            homepage = True
        else:
            com.add_link(url)

    if not homepage:
        self.warn("no homepage found")

    com.add_source(self.source.url)
    return HouseCommitteeDetail(com, source=link)
def process_page(self):
    com = self.input
    com.add_source(self.source.url)
    com.add_link(self.source.url, note="homepage")

    room, time = XPath("//div[@class='col-sm-12 pb-2']//p[2]/text()").match(self.root)

    if re.search("On Call", time):
        time = time.split(" -")[0]

    com.extras["room"] = room.strip()
    com.extras["meeting schedule"] = time.strip()

    for link in XPath(
        '//div[contains(@class, "media-body")]//a[contains(@href, "member_bio")]'
    ).match(self.root):
        name = link.text_content().split(",")[0]
        if name:
            try:
                positions = ("chair", "vice chair", "ranking minority member")
                position = XPath("..//preceding-sibling::b/text()").match(link)
                for role in position:
                    position_str = role.lower()
                    if position_str not in positions:
                        raise ValueError("unknown position")
            except SelectorError:
                position_str = "member"
            com.add_member(name, position_str)

    return com
def process_page(self):
    com = self.input
    com.add_source(self.source.url)
    com.add_link(self.source.url, note="homepage")

    time, room = (
        CSS(".border-0 .pl-2").match(self.root)[0].text_content().split("in ")
    )
    time = time.split("Meets:")[1]

    com.extras["room"] = room.strip()
    com.extras["meeting schedule"] = time.strip()

    for p in XPath('//div[@class="media pl-2 py-4"]').match(self.root):
        name = (
            XPath(".//div[@class='media-body']/span/b/text()")
            .match(p)[0]
            .replace("Rep.", "")
            .split("(R)")[0]
            .split("(DFL")[0]
            .strip()
        )

        positions = ["committee chair", "vice chair", "republican lead"]
        if name:
            # default to "member" so role is always defined, even when the
            # position label is present but not one of the known positions
            role = "member"
            try:
                position = CSS("span b u").match(p)[0].text_content().lower()
                if position in positions:
                    role = position
            except SelectorError:
                role = "member"
            com.add_member(name, role)

    return com
def process_item(self, item):
    try:
        title = XPath("..//preceding-sibling::h3/text()").match(item)
    except SelectorError:
        title = XPath("../../..//preceding-sibling::h3/text()").match(item)

    for comm_name in title:
        if (
            comm_name == "Standing Committees"
            or comm_name == "Appropriations Subcommittees"
        ):
            name_link = CSS("a").match_one(item)
            name = name_link.text_content()
            source = name_link.get("href")
            if comm_name == "Standing Committees":
                com = ScrapeCommittee(name=name, chamber=self.chamber)
            else:
                com = ScrapeCommittee(
                    name=name,
                    classification="subcommittee",
                    chamber=self.chamber,
                    parent="Appropriations",
                )
            return SenateCommitteeDetail(com, source=source)
        else:
            self.skip()
def process_item(self, item):
    href = XPath("@href").match_one(item)
    if not href.startswith("http"):
        href = f"https://ultrasignup.com{href}"
    race_id = href.split("=")[-1]
    return RaceResultDetail(
        dict(race_id=race_id, race_results_url=href, **self.input),
        source=href,
    )
def process_item(self, item):
    name = CSS("a").match(item)[2].text_content()
    name = re.sub(r"Contact Assembly Member", "", name).strip()

    party = CSS("td").match(item)[2].text_content().strip()
    if party == "Democrat":
        party = "Democratic"

    district = CSS("td").match(item)[1].text_content().strip().lstrip("0")

    # District 18 has a vacant spot
    if name == "edit":
        self.skip("skipping Vacant seat in District {}".format(district))

    photo_url = CSS("img").match(item, min_items=0)
    if photo_url:
        photo_url = photo_url[0].get("src")

    p = ScrapePerson(
        name=name,
        state="ca",
        chamber="lower",
        district=district,
        party=party,
        image=photo_url,
    )

    capitol_office_header = CSS("h3").match(item)[0].text_content()
    capitol_office_text = (
        XPath(
            "//*[@id='block-views-view-members-block-1']/div/div/div/table/tbody/tr[1]/td[4]/text()"
        )
        .match(item)[1]
        .strip()
    )
    capitol_office_text, capitol_office_phone = capitol_office_text.split("; ")
    capitol_office_address = capitol_office_header + capitol_office_text

    p.capitol_office.address = capitol_office_address
    p.capitol_office.voice = capitol_office_phone

    district_offices = XPath(".//td/p[1]/text()").match(item)
    for office in district_offices:
        district_address, district_phone = office.split("; ")
        p.add_office(
            classification="district",
            address=district_address.strip(),
            voice=district_phone.strip(),
        )

    url = CSS("a").match(item)[0].get("href")
    p.add_link(url)
    p.add_source(self.source.url)

    return p
def process_page(self): p = self.input img = CSS("div.field-person-photo img").match_one(self.root).get("src") p.image = img bio_info = CSS("div.pane-content ul li").match(self.root) if len(bio_info) > 0: p.extras["bio info"] = [] for info in bio_info: p.extras["bio info"] += info try: street = (CSS("div.street-address").match_one( self.root).text_content().strip()) town = CSS("span.locality").match_one( self.root).text_content().strip() zip_code = (CSS("span.postal-code").match_one( self.root).text_content().strip()) address = street + ", " + town + ", ND " + zip_code p.district_office.address = address except SelectorError: pass try: phones = XPath( "//*[@id='block-system-main']//div[contains(text(), 'phone')]" ).match(self.root) for phone in phones: phone_type = phone.text_content().strip() phone_number = phone.getnext().text_content().strip() if phone_type == "Cellphone:": p.extras["Cell phone"] = phone_number elif phone_type == "Home Telephone:": p.extras["Home phone"] = phone_number elif phone_type == "Office Telephone:": p.district_office.voice = phone_number except SelectorError: pass email = (XPath( "//*[@id='block-system-main']//div[contains(text(), 'Email')]"). match_one(self.root).getnext().text_content().strip()) p.email = email try: fax = (XPath( "//*[@id='block-system-main']//div[contains(text(), 'Fax')]"). match_one(self.root).getnext().text_content().strip()) p.district_office.fax = fax except SelectorError: pass return p
def process_page(self): com = self.input com.add_source(self.source.url) com.add_link(self.source.url, note="homepage") try: chairs = CSS(".chair-info").match(self.root) except SelectorError: raise SkipItem("skipping committee without full information") # in case there are co-chairs num_chairs = len(chairs) for chair in chairs: chair_name = CSS(".comm-chair-name").match_one(chair).text_content().strip() chair_role = ( XPath(f"..//preceding-sibling::header[{num_chairs}]") .match_one(chair) .text_content() .strip() .lower() ) com.add_member(chair_name, chair_role) # some committees only have chairs and no members list try: for p in CSS("#comm-membership ul li").match(self.root): name = p.text_content().strip() role = "member" com.add_member(name, role) except SelectorError: pass # some committees have temporary addresses, others have permanent ones try: temp, room, zip = XPath( "//section[@id='comm-addr']/div[@class='mod-inner']//text()" ).match(self.root) com.extras["address"] = f"{temp}: {room}; {zip}" except ValueError: room, zip = XPath( "//section[@id='comm-addr']/div[@class='mod-inner']//text()" ).match(self.root) com.extras["address"] = f"{room}; {zip}" # some committees have press releases try: news_link = CSS("#page-content .read-more").match(self.root)[0].get("href") com.add_link(news_link) except SelectorError: pass return com
def process_page(self): p = self.input img = CSS("div#content p img").match_one(self.root).get("src") p.image = img if self.source.url == "https://legislature.maine.gov/District-22": addr = CSS("div#content p strong").match(self.root)[2].tail.strip() else: addr = ( CSS("div#content p strong") .match(self.root)[1] .tail.strip() .lstrip(":") .strip() ) if addr != p.district_office.address: p.extras["Additional address"] = addr try: state_phone = ( XPath("//*[@id='content']/p/strong[contains(text(), 'State')]") .match_one(self.root) .tail.strip() ) state_phone = state_phone.lstrip(":").strip() p.capitol_office.voice = state_phone except SelectorError: pass try: state_phone = ( XPath("//*[@id='content']/p/b[contains(text(), 'State')]") .match_one(self.root) .tail.strip() ) state_phone = state_phone.lstrip(":").strip() p.capitol_office.voice = state_phone except SelectorError: pass website = ( XPath("//*[@id='content']/p/strong[contains(text(), 'Website')]") .match_one(self.root) .getnext() ) if website.get("href") is None: website = website.getnext().get("href") else: website = website.get("href") p.add_link(website, note="website") return p
def process_item(self, item):
    dd_text = XPath(".//dd/text()").match(item)
    district = dd_text[2].strip().split()[1]
    party = dd_text[4].strip()
    return PersonDetail(
        dict(
            chamber="upper" if "senate" in self.source.url else "lower",
            district=district,
            party=party,
        ),
        source=str(XPath(".//dd/a[1]/@href").match_one(item)),
    )
def process_item(self, item):
    dd_text = XPath(".//dd/text()").match(item)
    district = dd_text[2].strip().split()[1]
    party = dd_text[4].strip()
    url = str(XPath(".//dd/a[1]/@href").match_one(item))
    if "Details" not in url:
        raise SkipItem(f"skipping {url}")
    return PersonDetail(
        dict(
            chamber="upper" if "senate" in self.source.url else "lower",
            district=district,
            party=party,
        ),
        source=url,
    )
def process_page(self):
    com = self.input

    try:
        members = XPath("//*[@id='committeesIntroRoster']/div/div/div/a").match(
            self.root
        )
        for member in members:
            member_dirty = member.text_content().strip().split("\n")
            mem_name = member_dirty[0].strip() + " " + member_dirty[1].strip()
            role = (
                member.getparent().getprevious().getprevious().text_content().strip()
            )
            if role.strip() == "":
                role = "member"
            com.add_member(mem_name, role)
        # many 'ex officio' roles for House Subcommittees, Joint Committees,
        # and Joint Subcommittees
    except SelectorError:
        raise SkipItem("empty committee")

    try:
        extra_info = CSS("div#bodyContent b").match(self.root)
        for title in extra_info:
            position = title.text_content().strip()
            name = title.getnext().tail.strip()
            com.extras[position] = name
    except SelectorError:
        pass

    return com
class LegList(HtmlListPage):
    selector = XPath(".//form/table[1]/tr")

    def process_item(self, item):
        # skip header rows
        if (
            len(CSS("td").match(item)) == 1
            or CSS("td").match(item)[0].get("class") == "header"
        ):
            self.skip()

        first_link = CSS("td a").match(item)[0]
        name = first_link.text_content()
        detail_link = first_link.get("href")

        district = CSS("td").match(item)[3].text_content()
        party_letter = CSS("td").match(item)[4].text_content()
        party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        party = party_dict[party_letter]

        p = ScrapePerson(
            name=name,
            state="il",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=detail_link)
def process_page(self):
    p = self.input

    capitol_addr_lst = XPath(".//*[@id='district']/span[1]/text()").match(self.root)
    capitol_addr = ""
    for line in capitol_addr_lst:
        capitol_addr += line.strip()
        capitol_addr += " "
    p.capitol_office.address = capitol_addr.strip()

    try:
        fax = (
            CSS("span.info.fax")
            .match_one(self.root)
            .text_content()
            .strip()
            .split("\n")
        )
        fax = fax[-1].strip()
        p.capitol_office.fax = fax
    except SelectorError:
        pass

    try:
        staff_spans = CSS("span.info.staff span").match(self.root)
        for num, span in enumerate(grouper(staff_spans[1:], 2)):
            staff_name = span[0].text_content().strip()
            staff_email = span[1].text_content().strip()
            p.extras["staff" + str(num + 1)] = staff_name
            p.extras["staff_email" + str(num + 1)] = staff_email
    except SelectorError:
        pass

    return p
def get_column_div(self, name):
    # lots of places where we have a <div class='col-md-2 font-weight-bold'>
    # followed by a <div class='col'>
    # with interesting content in the latter element
    return XPath(
        f"//div[contains(text(),'{name}')]/following-sibling::div[@class='col']"
    ).match_one(self.root)
class Legislators(HtmlListPage):
    session_num = "116"
    source = (
        "https://leg.mt.gov/legislator-information/?session_select=" + session_num
    )
    selector = XPath("//table[1]/tbody/tr")

    def process_item(self, item):
        tds = item.getchildren()
        email, name, party, seat, phone = tds
        chamber, district = seat.text_content().strip().split()
        url = str(name.xpath("a/@href")[0])

        person = ScrapePerson(
            name=clean_name(name.text_content()),
            state="mt",
            party=party.text_content().strip(),
            chamber=("upper" if chamber == "SD" else "lower"),
            district=district,
        )
        person.add_link(url)
        person.add_source(url)

        phone = phone.text_content().strip()
        if len(phone) == 14:
            person.capitol_office.voice = phone
        elif len(phone) > 30:
            person.capitol_office.voice = phone.split(" ")[0]

        email = email.xpath("./a/@href")
        if email:
            person.email = email[0].split(":", 1)[1]

        return person
class SenDetail(HtmlPage):
    contact_xpath = XPath('//h4[contains(text(), "Office")]')
    input_type = PartialPerson

    def get_source_from_input(self):
        return self.input.url

    def process_page(self):
        email = (
            self.root.xpath('//a[contains(@href, "mailto:")]')[0]
            .get("href")
            .split(":")[-1]
        )

        p = ScrapePerson(
            state="fl",
            chamber="upper",
            name=fix_name(self.input.name),
            party=str(self.input.party),
            district=str(self.input.district),
            email=email,
            image=str(self.root.xpath('//div[@id="sidebar"]//img/@src').pop()),
        )

        for item in self.contact_xpath.match(self.root):
            self.handle_office(item, p)

        return p

    def handle_office(self, office, person):
        (name,) = office.xpath("text()")

        if name == "Tallahassee Office":
            obj_office = person.capitol_office
        else:
            obj_office = person.district_office

        address_lines = [
            x.strip()
            for x in office.xpath("following-sibling::div[1]")[0]
            .text_content()
            .splitlines()
            if x.strip()
        ]

        clean_address_lines = []
        fax = phone = None
        PHONE_RE = r"\(\d{3}\)\s\d{3}\-\d{4}"
        after_phone = False

        for line in address_lines:
            # skip office-hours lines like "Open Monday ..."
            if re.search(r"(?i)open\s+\w+day", line):
                continue
            elif "FAX" in line:
                fax = line.replace("FAX ", "")
                after_phone = True
            elif re.search(PHONE_RE, line):
                phone = line
                after_phone = True
            elif not after_phone:
                clean_address_lines.append(line)

        address = "; ".join(clean_address_lines)
        address = re.sub(r"\s{2,}", " ", address)

        obj_office.address = address
        obj_office.voice = phone
        obj_office.fax = fax
def process_item(self, item):
    committee_name = item.text_content()

    # only scrape joint coms on senate scrape
    if (
        "Joint" in committee_name
        or "Task Force" in committee_name
        or "Conference" in committee_name
    ):
        self.skip()

    committee_name = remove_comm(committee_name)
    committee_name = committee_name.strip()

    if "Subcommittee" in committee_name:
        name = committee_name.replace("Subcommittee on ", "").replace(
            ", Subcommittee", ""
        )
        parent = remove_comm(
            XPath("..//..//preceding-sibling::a").match(item)[0].text_content()
        )
        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
            classification="subcommittee",
            parent=parent,
        )
    else:
        com = ScrapeCommittee(name=committee_name, chamber=self.chamber)

    # We can construct a URL that would make scraping easier, as opposed to the
    # link that is directly given
    comm_link = item.get("href").replace("https://www.house.mo.gov/", "")
    source = f"https://www.house.mo.gov/MemberGridCluster.aspx?filter=compage&category=committee&{comm_link}"

    return HouseCommitteeDetail(com, source=URL(source, timeout=30))
class HouseParties(HtmlListPage):
    source = (
        "https://lrl.texas.gov/legeLeaders/members/membersearch.cfm?leg=87&chamber=H"
    )
    selector = XPath('//table[@id="tableToSort"]/tbody', num_items=1)

    def process_page(self):
        tds = self.root.xpath(
            '//table[@id="tableToSort"]//td[contains(@class, "results")]',
        )
        party_map = {"D": "Democratic", "R": "Republican"}
        parties = {}

        for td_index, td in enumerate(tds):
            # name, district, and party sit in the 0th, 2nd, and 6th columns
            if td_index % 9 == 0:
                name = td.text_content().strip()
            if td_index % 9 == 2:
                district = td.text_content().strip()
            if td_index % 9 == 6:
                party_code = td.text_content().strip()
                if len(party_code) > 1:
                    party_code = re.search(r"[A-Z]", party_code)[0]
                if party_code == "":
                    continue
                party = party_map[party_code]
                parties[district] = {"name": name, "party": party}

        return parties
def process_page(self): com = self.input com.add_source(self.source.url) com.add_link(self.source.url, note="homepage") # a few committees don't have chair positions try: chair_role = ( CSS(".c-chair-block--position") .match_one(self.root) .text_content() .lower() ) chair_name = CSS(".c-chair--title").match_one(self.root).text_content() com.add_member(chair_name, chair_role) except SelectorError: pass try: for p in XPath( "//div[contains(@class, 'c-senators-container')]//div[@class='view-content']/div[contains(@class, 'odd') or contains(@class, 'even')]" ).match(self.root): name = CSS(".nys-senator--name").match_one(p).text_content() role = CSS(".nys-senator--position").match_one(p).text_content().lower() if role == "": role = "member" com.add_member(name, role) except SelectorError: pass return com
class HouseComList(HtmlPage):
    source = "https://www.myfloridahouse.gov/Sections/Committees/committees.aspx"
    selector = XPath("//a[contains(@href, 'committeesdetail.aspx')]")

    def process_page(self):
        # don't use list page because we need to look back at prior element
        parent = None
        chamber = "lower"

        for item in self.selector.match(self.root):
            cssclass = item.attrib.get("class", "")
            name = item.text_content().strip()

            if "parentcommittee" in cssclass:
                parent = None
                chamber = "lower"

            comm = ScrapeCommittee(
                name=name, classification="committee", chamber=chamber, parent=parent
            )
            yield HouseComDetail(comm, source=item.attrib["href"])

            # parent for next time
            if "parentcommittee" in cssclass:
                parent = comm._id
                chamber = None
def process_page(self):
    com = self.input

    roles = XPath("//*[@id='form1']/div/div/div/div/div[1]/text()").match(self.root)

    chair_member = (
        CSS("#form1 div div div div div a").match(self.root)[0].text_content().strip()
    )
    chair_role = roles[0].replace(":", "").strip()
    com.add_member(chair_member, chair_role)

    vice_chair_member = (
        CSS("#form1 div div div div div a").match(self.root)[1].text_content().strip()
    )
    vice_chair_role = roles[1].replace(":", "").strip()
    com.add_member(vice_chair_member, vice_chair_role)

    members = CSS("#form1 div div.card-body div a").match(self.root)[7:]
    for mem in members:
        member = mem.text_content().strip()
        role_mem = "Member"
        com.add_member(member, role_mem)

    return com
class SenateCommitteeList(HtmlListPage):
    source = URL("http://senate.ca.gov/committees")
    selector = XPath("//h2/../following-sibling::div//a")

    def process_item(self, item):
        comm_name = XPath("text()").match_one(item)
        if comm_name in ["Teleconference How-To Information", "Legislative Process"]:
            self.skip()

        comm_url = XPath("@href").match_one(item)

        if comm_name.startswith("Joint"):
            com = ScrapeCommittee(
                name=comm_name, classification="committee", chamber="legislature"
            )
        elif comm_name.startswith("Subcommittee"):
            parent_comm = (
                item.getparent()
                .getparent()
                .getparent()
                .getparent()
                .getchildren()[0]
                .text_content()
            )
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="upper",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(
                name=comm_name, classification="committee", chamber="upper"
            )

        com.add_source(self.source.url)
        com.add_source(comm_url)
        com.add_link(comm_url, note="homepage")

        return ChooseType(com, source=URL(comm_url))
def test_xml_list_page():
    p = XmlListPage(source=SOURCE)
    p.selector = XPath("//item/text()")
    p.response = Response(
        "<resp><item>one</item><item>two</item><item>three</item></resp>"
    )
    p.postprocess_response()
    data = list(p.process_page())
    assert data == ["one", "two", "three"]
def test_html_list_page():
    p = HtmlListPage(source=SOURCE)
    p.selector = XPath("//li/text()")
    p.response = Response("<ul><li>one</li><li>two</li><li>three</li></ul>")
    p.postprocess_response()
    data = list(p.process_page())
    assert len(data) == 3
    assert data == ["one", "two", "three"]
def process_page(self):
    com = self.input

    try:
        chair = (
            XPath("//h5[text()='Chair']")
            .match_one(self.root)
            .getnext()
            .text_content()
            .strip()
        )
        chair = re.search(r"(Senator|Representative)\s(.+)", chair).groups()[1]
        com.add_member(chair, "Chair")
    except SelectorError:
        pass

    try:
        vice_chair = (
            XPath("//h5[text()='Vice-Chair']")
            .match_one(self.root)
            .getnext()
            .text_content()
            .strip()
        )
        vice_chair = re.search(
            r"(Senator|Representative)\s(.+)", vice_chair
        ).groups()[1]
        com.add_member(vice_chair, "Vice-Chair")
    except SelectorError:
        pass

    try:
        additional_members = (
            XPath("//h5[text()='Additional Members']")
            .match_one(self.root)
            .getnext()
            .getchildren()
        )
        for member in additional_members:
            member = member.text_content().strip()
            member = re.search(r"(Senator|Representative)\s(.+)", member).groups()[1]
            com.add_member(member, "member")
    except SelectorError:
        pass

    try:
        extra_info = CSS("section.content strong").match(self.root)
        for title in extra_info:
            position = title.text_content().strip()
            name = title.tail.strip().lstrip(":").strip()
            com.extras[position] = name
    except SelectorError:
        pass

    if not com.members:
        raise SkipItem("empty committee")

    return com
class LegList(JsonListPage):
    source = list_url()
    selector = XPath("//LegislativeMemberSummary/Details")

    def process_item(self, item):
        url = item["Details"]
        return LegDetail(source=url)
def process_page(self):
    image = XPath("//img[contains(@src, '/photo')]").match_one(self.root).get("src")

    p = ScrapePerson(
        name=self.input.name,
        state="ia",
        chamber=self.input.chamber,
        party=self.input.party,
        district=self.input.district,
        email=self.input.email,
        image=image,
    )

    p.add_source(self.source.url)
    p.add_source(self.input.url)

    try:
        for link in CSS(".link_list a").match(self.root):
            url = link.get("href")
            if re.search("leaving?", url):
                url = url.replace("https://www.legis.iowa.gov/leaving?forward=", "")
            # only prepend a scheme if the link has neither http:// nor https://
            if not (re.search("http://", url) or re.search("https://", url)):
                url = "http://" + url
            p.add_link(url)
    except SelectorError:
        pass

    table = XPath("//div[@class='legisIndent divideVert']//td//text()").match(
        self.root
    )

    # field names like "cell phone" sit at the even indices,
    # and the value for each field sits at the following odd index
    fields = list(map(self.get_field, table[0::2]))
    extra = table[1::2]

    num_of_fields = range(len(fields))

    for i in num_of_fields:
        if fields[i] == "Legislative Email":
            continue
        p.extras[fields[i].lower()] = extra[i].strip()

    return p
class HouseSearchPage(HtmlListPage):
    """
    House committee roll calls are not available on the Senate's
    website. Furthermore, the House uses an internal ID system in
    its URLs, making accessing those pages non-trivial.

    This will fetch all the House committee votes for the given bill,
    and add the votes to that object.
    """

    input_type = Bill
    example_input = Bill(
        "HB 1", "2020", "title", chamber="upper", classification="bill"
    )
    selector = XPath('//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')

    def get_source_from_input(self):
        url = "https://www.myfloridahouse.gov/Sections/Bills/bills.aspx"
        # Keep the digits and all following characters in the bill's ID
        bill_number = re.search(r"^\w+\s(\d+\w*)$", self.input.identifier).group(1)
        session_number = {
            "2022D": "96",
            "2022C": "95",
            "2022": "93",
            "2021B": "94",
            "2021A": "92",
            "2021": "90",
            "2020": "89",
            "2019": "87",
            "2018": "86",
            "2017A": "85",
            "2017": "83",
            "2016": "80",
            "2015C": "82",
            "2015B": "81",
            "2015A": "79",
            "2015": "76",
            "2014O": "78",
            "2014A": "77",
            "2016O": "84",
        }[self.input.legislative_session]
        form = {
            "Chamber": "B",
            "SessionId": session_number,
            "BillNumber": bill_number,
        }
        return url + "?" + urlencode(form)

    def process_item(self, item):
        return HouseBillPage(self.input, source=item)