def process_item(self, item):
    """Build a MO House committee (or subcommittee) and queue its detail page."""
    committee_name = item.text_content()

    # only scrape joint coms on senate scrape
    if (
        "Joint" in committee_name
        or "Task Force" in committee_name
        or "Conference" in committee_name
    ):
        self.skip()

    committee_name = remove_comm(committee_name).strip()

    if "Subcommittee" in committee_name:
        sub_name = committee_name.replace("Subcommittee on ", "").replace(
            ", Subcommittee", ""
        )
        # The parent committee is the preceding anchor in the enclosing list.
        parent = remove_comm(
            XPath("..//..//preceding-sibling::a").match(item)[0].text_content()
        )
        com = ScrapeCommittee(
            name=sub_name,
            chamber=self.chamber,
            classification="subcommittee",
            parent=parent,
        )
    else:
        com = ScrapeCommittee(name=committee_name, chamber=self.chamber)

    # We can construct a URL that would make scraping easier, as opposed to
    # the link that is directly given.
    comm_link = item.get("href").replace("https://www.house.mo.gov/", "")
    source = f"https://www.house.mo.gov/MemberGridCluster.aspx?filter=compage&category=committee&{comm_link}"
    return HouseCommitteeDetail(com, source=URL(source, timeout=30))
def process_item(self, item):
    """Emit detail tasks for standing committees and Appropriations subcommittees."""
    # The section heading lives in a preceding <h3>; its depth varies by page.
    try:
        title = XPath("..//preceding-sibling::h3/text()").match(item)
    except SelectorError:
        title = XPath("../../..//preceding-sibling::h3/text()").match(item)

    for comm_name in title:
        if comm_name in ("Standing Committees", "Appropriations Subcommittees"):
            name_link = CSS("a").match_one(item)
            name = name_link.text_content()
            source = name_link.get("href")
            if comm_name == "Standing Committees":
                com = ScrapeCommittee(name=name, chamber=self.chamber)
            else:
                com = ScrapeCommittee(
                    name=name,
                    classification="subcommittee",
                    chamber=self.chamber,
                    parent="Appropriations",
                )
            return SenateCommitteeDetail(com, source=source)
        else:
            # Anything under another heading is not a committee entry.
            self.skip()
def process_item(self, item):
    """Create a lower-chamber committee/subcommittee from an Assembly list entry."""
    anchor = CSS("a").match_one(item)
    comm_name = anchor.text_content()
    comm_url = anchor.get("href")

    # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
    if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
        self.skip()

    # Joint Committees are being skipped to avoid duplicates (they were
    # already grabbed during SenateCommitteeList())
    if comm_name.startswith(("Joint Committee", "Joint Legislative")):
        self.skip()
    elif comm_name.startswith("Subcommittee"):
        # Parent committee name is the first child two levels up.
        parent_comm = item.getparent().getparent().getchildren()[0].text_content()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="lower",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="lower"
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
def process_page(self):
    """Scrape one Senate committee page into a ScrapeCommittee with its members.

    Returns None (skipping the page) for Appropriations Subcommittees, which
    are not currently scraped.
    """
    name = self.root.xpath('//h2[@class="committeeName"]')[1].text
    if name.startswith("Appropriations Subcommittee"):
        return
        # TODO: restore scraping of Appropriations Subcommittees
        # name = name.replace('Appropriations ', '')
        # parent = {'name': 'Appropriations', 'classification': 'upper'}
        # chamber = None
    else:
        if name.startswith("Committee on"):
            name = name.replace("Committee on ", "")
        parent = None
        chamber = "upper"

    # Fix: removed leftover debug `print(name)` that polluted scraper stdout.
    comm = ScrapeCommittee(
        name=name, classification="committee", chamber=chamber, parent=parent
    )

    # <dt>role</dt><dd>member</dd> pairs carry members with explicit roles.
    for dt in self.root.xpath('//div[@id="members"]/dl/dt'):
        role = dt.text.replace(": ", "").strip().lower()
        member = dt.xpath("./following-sibling::dd")[0].text_content()
        member = self.clean_name(member)
        comm.add_member(member, role=role)

    # Plain <li> entries are members with the default role.
    for ul in self.root.xpath('//div[@id="members"]/ul/li'):
        member = self.clean_name(ul.text_content())
        comm.add_member(member)

    comm.add_source(self.source.url)
    return comm
def process_item(self, item):
    """Build a House committee from its list card, attaching every non-member link.

    The members link becomes the detail-page source; the "home" link is
    recorded as the homepage; all other links are added unannotated.
    """
    link = (
        XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        )
        .match(item)[0]
        .get("href")
    )
    name = CSS("h2 a").match(item)[0].text_content()
    com = ScrapeCommittee(name=name, chamber=self.chamber)

    # Fix: `homepage` must be initialized; previously the `if not homepage`
    # check below raised UnboundLocalError whenever no homepage link was seen.
    homepage = False
    for links in XPath(".//div[contains(@class, 'container')]//a").match(item):
        url = links.get("href")
        if url == link:
            continue
        if links == XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
        ).match_one(item):
            com.add_link(url, note="homepage")
            homepage = True
        else:
            com.add_link(url)

    if not homepage:
        self.warn("no homepage found")
    com.add_source(self.source.url)
    return HouseCommitteeDetail(com, source=link)
def process_item(self, item):
    """Create a Senate committee, deriving its grouping from the preceding header.

    Walks backward through siblings to the nearest childless element, which
    holds the section heading ("Senate Committees", "Joint Committees", ...).
    """
    comm_name = CSS("a").match(item)[0].text_content().strip()

    chamber_type = None
    previous_sibs = item.getparent().itersiblings(preceding=True)
    for sib in previous_sibs:
        if len(sib.getchildren()) == 0:
            chamber_type = sib.text_content().strip()
            break

    if chamber_type == "Senate Committees":
        chamber = "upper"
    else:
        # Joint Committees and Task Forces are skipped as before.
        # Fix: an unrecognized (or missing) heading previously fell through
        # the elif chain and crashed with NameError on `chamber`; now every
        # non-Senate heading is skipped explicitly.
        self.skip()

    com = ScrapeCommittee(
        name=comm_name,
        classification="committee",
        chamber=chamber,
    )
    detail_link = CSS("a").match(item)[0].get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return SenDetail(com, source=detail_link)
def process_item(self, item):
    """Wrap each committee link directly in a detail-page task."""
    com = ScrapeCommittee(
        name=item.text_content(),
        chamber=self.chamber,
    )
    return CommitteeDetail(com, source=item.get("href"))
def process_item(self, item):
    """Create a Senate committee from a tile and queue its detail page."""
    tile_title = CSS("div span.bTiles__title").match(item)[0].text_content()
    com = ScrapeCommittee(
        name=tile_title,
        classification="committee",
        chamber=self.chamber,
    )
    return SenateCommitteeDetail(com, source=item.get("href"))
def process_item(self, item):
    """Build an AK committee or Finance subcommittee from a list entry."""
    comm_name = (
        item.text_content().strip().split(" (")[0].title().replace("(Fin Sub)", "")
    )
    if "Conference" in comm_name:
        self.skip()

    # The heading two elements back names the chamber grouping.
    raw_chamber = item.getparent().getprevious().getprevious().text_content().strip()
    chamber = {
        "House": "lower",
        "Senate": "upper",
        "Joint Committee": "legislature",
    }.get(raw_chamber, raw_chamber)

    classification = item.getparent().getprevious().text_content().strip()
    if classification == "Finance Subcommittee":
        # work around duplicate name of Judiciary committees
        # a current limitation in how Open States can handle committees
        # see https://github.com/openstates/issues/issues/598
        if comm_name == "Judiciary":
            comm_name = "Judiciary (Finance)"
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber=chamber,
            parent="Finance",
        )
    else:
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )

    detail_link = CSS("a").match_one(item).get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommiteeDetail(com, source=URL(detail_link, timeout=30))
class CommitteeDetail(HtmlPage):
    """Scrape chairs and member rosters from a PA committee page."""

    example_source = "https://www.legis.state.pa.us/cfdocs/CteeInfo/index.cfm?Code=32&CteeBody=H&SessYear=2021"
    example_name = "Aging & Older Adult Services"
    example_input = ScrapeCommittee(
        name=example_name, classification="committee", chamber="lower"
    )

    def process_page(self):
        com = self.input
        # This section has the chair members (the regular, democratic and
        # minority chairs) and their roles.
        try:
            # main chair
            chair_member = (
                CSS(
                    "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText a"
                )
                .match(self.root)[0]
                .text.strip()
            )
            # main chair role
            chair_member_role = (
                CSS(
                    "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText div"
                )
                .match(self.root)[0]
                .text.strip()
            )
            # Fix: this add_member previously lived in the *next* try block,
            # so a page with no chairs raised an uncaught NameError instead
            # of being skipped by the IndexError handler.
            com.add_member(fix_name(chair_member), chair_member_role)
        except IndexError:
            pass
        try:
            # Democratic Chair member and or the minority chair member
            demo_chair_member = (
                CSS(
                    "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText a"
                )
                .match(self.root)[1]
                .text.strip()
            )
            # Democratic Chair member and or the minority chair member role
            demo_chair_member_role = (
                CSS(
                    "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText div"
                )
                .match(self.root)[1]
                .text.strip()
            )
            com.add_member(fix_name(demo_chair_member), demo_chair_member_role)
        except IndexError:
            pass

        majority_members = CSS(
            ".Widget.CteeInfo-MajorityList .MemberInfoList-MemberWrapper.Member"
        ).match(self.root)
        for mem in majority_members:
            # Fix: the name lookup is now outside the try so a missing name
            # raises SelectorError directly rather than a confusing NameError.
            major_member_name = CSS("div a").match_one(mem).text.strip()
            try:
                major_mem_position = CSS(".position").match_one(mem).text.strip()
            except SelectorError:
                # No explicit position listed: default role.
                major_mem_position = "member"
            com.add_member(fix_name(major_member_name), major_mem_position)

        minority_members = CSS(
            ".Widget.CteeInfo-MinorityList .MemberInfoList-MemberWrapper.Member"
        ).match(self.root)
        for mem in minority_members:
            minor_member_name = CSS("div a").match_one(mem).text.strip()
            try:
                minor_mem_position = CSS(".position").match_one(mem).text.strip()
            except SelectorError:
                minor_mem_position = "member"
            com.add_member(fix_name(minor_member_name), minor_mem_position)
        return com
def process_item(self, item):
    """Build a committee or Finance subcommittee from a list entry."""
    comm_name = (
        item.text_content().strip().split(" (")[0].title().replace("(Fin Sub)", "")
    )
    if "Conference" in comm_name:
        self.skip()

    # The heading two elements back names the chamber grouping.
    section = item.getparent().getprevious().getprevious().text_content().strip()
    if section == "House":
        chamber = "lower"
    elif section == "Senate":
        chamber = "upper"
    elif section == "Joint Committee":
        chamber = "legislature"
    else:
        chamber = section

    classification = item.getparent().getprevious().text_content().strip()
    if classification == "Finance Subcommittee":
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber=chamber,
            parent="Finance",
        )
    else:
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )

    detail_link = CSS("a").match_one(item).get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommiteeDetail(com, source=URL(detail_link, timeout=30))
def process_item(self, item):
    """Build Senate/Joint committees from link text, normalizing committee names."""
    name = item.text_content()
    chamber = "legislature" if "Joint" in name else "upper"

    # Links on the same page that are not committees.
    if name in (
        "2021 Senate Committee Hearing Schedule",
        "Assigned Bills",
        "Committee Minutes",
        "Appointees To Be Considered",
    ):
        self.skip()

    if "Committee" in name:
        comm_name = (
            name.replace("Joint Committee on the ", "")
            .replace("Joint Committee on ", "")
            .replace("Committee on ", "")
            .replace(" Committee", "")
        )
        if "Subcommittee" in name:
            # "Parent – Sub Subcommittee" (en dash separator)
            name_parent = comm_name.split(" – ")
            com = ScrapeCommittee(
                name=name_parent[1].replace("Subcommittee", ""),
                chamber=chamber,
                classification="subcommittee",
                parent=name_parent[0],
            )
        else:
            com = ScrapeCommittee(name=comm_name, chamber=chamber)
    else:
        com = ScrapeCommittee(name=name, chamber=chamber)
    return SenateCommitteeDetail(com, source=URL(item.get("href"), timeout=30))
def process_item(self, item):
    """Create a joint (legislature-wide) committee from a list link."""
    com = ScrapeCommittee(
        name=item.text_content().strip().title(),
        classification="committee",
        chamber="legislature",
    )
    homepage = item.get("href")
    com.add_source(self.source.url)
    com.add_source(homepage)
    com.add_link(homepage, note="homepage")
    return HouseJointDetail(com, source=homepage)
def process_item(self, item):
    """Create a committee from a list link and queue its detail page."""
    name = item.text_content().strip()
    com = ScrapeCommittee(
        name=name, classification="committee", chamber=self.chamber
    )
    detail_link = item.get("href")
    com.add_source(detail_link)
    # Fix: pass `note` by keyword, matching every other add_link call in this
    # file (and guarding against the parameter being keyword-only).
    com.add_link(detail_link, note="homepage")
    return CommitteeDetail(com, source=detail_link)
def process_item(self, item):
    """Build a GA committee from the API listing and queue its detail call."""
    # API encodes the chamber numerically: 2 = Senate, 1 = House.
    if item["chamber"] == 2:
        chamber = "upper"
    elif item["chamber"] == 1:
        chamber = "lower"

    detail_source = URL(
        f"https://www.legis.ga.gov/api/committees/details/{item['id']}/1029",
        headers={"Authorization": get_token()},
    )
    com = ScrapeCommittee(name=item["name"], chamber=chamber)
    com.add_source(
        self.source.url,
        note="Initial list page (requires authorization token)",
    )
    return CommitteeDetail(com, source=detail_source)
def process_item(self, item):
    """Create a committee from its anchor and queue the detail page."""
    anchor = CSS("a").match(item)[0]
    com = ScrapeCommittee(
        name=anchor.text_content(),
        classification="committee",
        chamber=self.chamber,
    )
    detail_link = anchor.get("href")
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommitteeDetail(com, source=detail_link)
class HouseCommitteeDetail(HtmlPage):
    """Scrape members from a MI House committee page."""

    example_source = "https://www.house.mi.gov/Committee/AGRI"
    example_input = ScrapeCommittee(name="Agriculture", chamber="lower")

    def process_page(self):
        com = self.input
        com.add_source(self.source.url)
        com.add_link(self.source.url, note="homepage")

        member_links = CSS(".mb40 li a").match(self.root)
        for link in member_links:
            # Member anchors look like "Rep. Jane Doe (R)"; skip other links.
            if link.text.startswith("Rep."):
                # The element after the anchor carries the role, if any.
                title = link.getnext().text_content().strip()
                # Fix: strip the extracted name — splitting on "(" leaves a
                # trailing space ("Rep. Jane Doe " -> "Jane Doe ").
                name = link.text.split("(")[0].replace("Rep. ", "").strip()
                com.add_member(name, title or "member")
        return com
def process_item(self, item):
    """Create an ID committee; one known page is skipped for member scraping."""
    anchor = CSS("a").match_one(item)
    com = ScrapeCommittee(
        name=anchor.text_content(),
        chamber=self.chamber,
    )
    detail_link = anchor.get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")

    # this link has broken html (not able to grab member info)
    # just returning name, chamber, and link
    if detail_link == "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/":
        return com
    return DetailCommitteePage(com, source=detail_link)
def process_item(self, item):
    """Queue committee detail pages; joint committees expose no usable links."""
    com = ScrapeCommittee(
        name=item.text_content().strip(),
        classification="committee",
        chamber=self.chamber,
    )
    detail_link = item.get("href")
    com.add_source(self.source.url)
    # detail links for Joint Committees are hidden
    # "javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvJICommittees','cmdCommittee$0')"
    if self.chamber == "legislature":
        raise SkipItem("joint committee")
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommDetail(com, source=detail_link)
def process_item(self, item):
    """Create a WA committee, deriving chamber from the ancestor section header."""
    name = item.text_content().strip()
    heading = (
        item.getparent()
        .getparent()
        .getparent()
        .getprevious()
        .text_content()
        .strip()
        .split()[0]
    )
    if heading == "Legislative":
        self.skip()  # skipping Legislative Agencies
    chamber = {"House": "lower", "Senate": "upper", "Joint": "legislature"}.get(
        heading, heading
    )

    com = ScrapeCommittee(
        name=name,
        chamber=chamber,
    )
    com.add_source(self.source.url)

    # Build the members-list endpoint from the tail of the committee href.
    href = item.get("href")
    href_lst = href.split("/")
    new_source = f"https://app.leg.wa.gov/ContentParts/CommitteeMembers/?agency={href_lst[-3]}&committee={href_lst[-1]}"
    com.add_source(new_source)
    com.add_link(href, note="homepage")
    return CommitteeDetail(com, source=new_source)
def process_item(self, item):
    """Create a committee, or a subcommittee for "Parent - Subcommittee on X" names."""
    name = item.text_content().strip()
    # Fix: test for the full separator.  The old `re.search(" - ", name)`
    # guard matched names containing a bare " - " without "Subcommittee on",
    # which then crashed with ValueError on the two-way unpack below.
    if " - Subcommittee on " in name:
        parent, com_name = name.split(" - Subcommittee on ")
        com = ScrapeCommittee(
            name=com_name,
            classification="subcommittee",
            parent=parent,
            chamber=self.chamber,
        )
    else:
        com = ScrapeCommittee(name=name, chamber=self.chamber)
    com.add_source(self.source.url)
    return SenateCommitteeDetail(com, source=item.get("href"))
def process_item(self, item):
    """Create joint budget subcommittees, resolving the parent from an ancestor row."""
    sub_name = item.text_content().strip()
    ancestor = item.getparent().getparent().getparent().getparent()
    parent = ancestor.getchildren()[0].text_content().strip()

    # This heading is not a real parent committee.
    if parent.title() == "Alc-Jbc Budget Hearings":
        self.skip()

    com = ScrapeCommittee(
        name=sub_name.title(),
        classification="subcommittee",
        chamber="legislature",
        parent=parent.title(),
    )
    detail_link = item.get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return HouseJointDetail(com, source=detail_link)
def process_page(self):
    """Yield detail tasks, threading each parent committee's id into the
    subcommittee rows that follow it."""
    # don't use list page because we need to look back at prior element
    parent = None
    chamber = "lower"
    for item in self.selector.match(self.root):
        is_parent = "parentcommittee" in item.attrib.get("class", "")
        name = item.text_content().strip()
        if is_parent:
            parent = None
            chamber = "lower"
        comm = ScrapeCommittee(
            name=name, classification="committee", chamber=chamber, parent=parent
        )
        yield HouseComDetail(comm, source=item.attrib["href"])
        # a parent committee becomes the parent of the rows after it
        if is_parent:
            parent = comm._id
            chamber = None
def process_item(self, item):
    """Create a committee, or a subcommittee when the name is "Parent - Sub"."""
    name = item.text_content()
    # Fix: plain substring test replaces `re.search(" - ", name)` (the pattern
    # has no metacharacters), and maxsplit=1 prevents a ValueError when the
    # separator appears more than once in a name.
    if " - " in name:
        parent, name = name.split(" - ", 1)
        # there is one subcommittee that has a shortened parent called "Approps."
        if parent == "Approps.":
            parent = "Appropriations"
        committee = ScrapeCommittee(
            name=name,
            classification="subcommittee",
            parent=parent,
            chamber=self.chamber,
        )
    else:
        committee = ScrapeCommittee(name=name, chamber=self.chamber)
    committee.add_source(self.source.url)
    return CommitteeDetail(committee, source=item.get("href"))
def process_item(self, item):
    """Create a bare committee record and queue its Senate detail page."""
    committee = ScrapeCommittee(
        name=item.text_content().strip(),
        chamber=self.chamber,
    )
    committee.add_source(self.source.url)
    return SenateCommitteeDetail(committee, source=item.get("href"))
def process_item(self, item):
    """Build a committee from the API payload, adding deduplicated members."""
    name = item["CommitteeName"]
    chamber = item["LegislativeBody"]
    if chamber == "H":
        chamber = "lower"
    elif chamber == "S":
        chamber = "upper"
    else:
        # a few Ad Hoc Committees don't have chambers, but are not included in the Standing Committees Scrape anyway
        self.logger.warning("Committee not assigned to chamber")
        chamber = "lower"

    if item["IsSubCommittee"] is False:
        com = ScrapeCommittee(name=name, chamber=chamber)
    else:
        try:
            parent, name = name.split(" Subcommittee on ")
        except ValueError:
            self.logger.warning(f"No parent listed for {name}")
            # Fix: `parent` was left unbound here, so the ScrapeCommittee
            # call below raised NameError instead of producing a committee.
            parent = None
        com = ScrapeCommittee(
            name=name,
            classification="subcommittee",
            chamber=chamber,
            parent=parent,
        )

    # As of now, the API lists all members twice, so we must check for duplicates
    seen = set()
    for member in item["Members"]:
        member_name = member["FirstName"] + " " + member["LastName"]
        if member["IsChair"]:
            position = "Chair"
        elif member["IsViceChair"]:
            position = "Vice Chair"
        else:
            position = "member"
        key = f"{member_name} {position}"
        if key in seen:
            continue
        seen.add(key)
        com.add_member(member_name, position)

    com.extras["Committee ID"] = item["CommitteeId"]
    com.extras["Committee Short Name"] = item["CommitteeShortName"]
    com.extras["Committee Type"] = item["TypeName"]
    com.add_source(self.source.url)
    return com
def process_item(self, item):
    """Create a committee record and queue its House detail page."""
    committee = ScrapeCommittee(
        name=item.text_content().strip(),
        chamber=self.chamber,
    )
    return HouseCommitteeDetail(committee, source=item.get("href"))
def process_item(self, item):
    """Create a committee and queue its detail page with a generous timeout."""
    committee = ScrapeCommittee(
        name=item.text_content(),
        chamber=self.chamber,
    )
    committee.add_source(self.source.url)
    return HouseCommitteeDetail(
        committee, source=URL(item.get("href"), timeout=30)
    )
def process_item(self, item):
    """Create Senate/Joint committees and subcommittees from sidebar links."""
    comm_name = XPath("text()").match_one(item)

    # Non-committee links that share the same markup.
    if comm_name in ("Teleconference How-To Information", "Legislative Process"):
        self.skip()

    comm_url = XPath("@href").match_one(item)
    if comm_name.startswith("Joint"):
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="legislature"
        )
    elif comm_name.startswith("Subcommittee"):
        # The parent committee name is the first child four levels up.
        container = item.getparent().getparent().getparent().getparent()
        parent_comm = container.getchildren()[0].text_content()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="upper",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name, classification="committee", chamber="upper"
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
def process_item(self, item):
    """Create a committee and capture secretary/email/phone contact extras."""
    name = CSS("strong").match(item)[0].text_content()
    # skip header row
    if name == "Committees":
        self.skip()

    com = ScrapeCommittee(
        name=name,
        chamber=self.chamber,
    )

    # The sibling paragraph holds "Secretary: ... Email: ... Phone: ...".
    contact_text = CSS("p").match(item)[0].text_content().strip()
    contact = re.search(
        r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", contact_text
    )
    secretary, email, phone = contact.groups()
    com.extras["secretary"] = secretary.strip()
    com.extras["email"] = email.strip()
    com.extras["phone"] = phone.strip()

    detail_link = CSS("a").match(item)[0].get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return DetailCommitteePage(com, source=detail_link)