def process_item(self, item):
    """Turn one committee list entry into a lower-chamber ScrapeCommittee.

    The entry's first ``a`` element supplies the committee name and URL.
    Entries starting with "Subcommittee" are classified as subcommittees
    and take their parent name from the first child of the item's
    grandparent element. Returns a ``ChooseType`` page for the committee URL.
    """
    anchor = CSS("a").match_one(item)
    comm_name = anchor.text_content()
    comm_url = anchor.get("href")

    # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
    if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
        self.skip()

    # Joint Committees are being skipped to avoid duplicates
    # (they were already grabbed during SenateCommitteeList())
    if comm_name.startswith(("Joint Committee", "Joint Legislative")):
        self.skip()
    elif comm_name.startswith("Subcommittee"):
        grandparent = item.getparent().getparent()
        parent_comm = grandparent.getchildren()[0].text_content()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="lower",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber="lower",
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
def process_item(self, item):
    """Build a ScrapeCommittee from one list entry and hand off to its members page.

    The href of the first container link containing 'members' becomes the
    source of the returned ``HouseCommitteeDetail`` page. Every other link in
    the container is attached to the committee; the one matching the 'home'
    selector is noted as the homepage. A warning is emitted when no homepage
    link was recorded.
    """
    link = (
        XPath(".//div[contains(@class, 'container')]//a[contains(@href, 'members')]")
        .match(item)[0]
        .get("href")
    )
    name = CSS("h2 a").match(item)[0].text_content()
    com = ScrapeCommittee(name=name, chamber=self.chamber)

    # Must be initialised before the loop: previously it was only assigned
    # when a homepage link was found, so the check below raised
    # UnboundLocalError instead of warning whenever the loop never got there.
    homepage = False
    for anchor in XPath(".//div[contains(@class, 'container')]//a").match(item):
        url = anchor.get("href")
        if url == link:
            # the members link is used as the detail source, not as a link
            continue
        # Kept inside the loop on purpose: the selector is only evaluated
        # when there is at least one non-members link, as before.
        home_anchor = XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
        ).match_one(item)
        if anchor == home_anchor:
            com.add_link(url, note="homepage")
            homepage = True
        else:
            com.add_link(url)

    if not homepage:
        self.warn("no homepage found")

    com.add_source(self.source.url)
    return HouseCommitteeDetail(com, source=link)
def process_item(self, item):
    """Create an upper-chamber committee from an entry under a section heading.

    The section heading is the text of the nearest preceding sibling of the
    item's parent that has no child elements. Entries under
    "Joint Committees" or "Task Forces" are skipped. Returns a ``SenDetail``
    page for the committee's link.
    """
    anchor = CSS("a").match(item)[0]
    comm_name = anchor.text_content().strip()

    # Walk backwards from the item's parent to the closest childless sibling;
    # its text is treated as the heading for this entry.
    for sibling in item.getparent().itersiblings(preceding=True):
        if len(sibling.getchildren()) == 0:
            chamber_type = sibling.text_content().strip()
            break

    # NOTE: any heading other than the three handled here leaves `chamber`
    # unassigned, so the constructor call below would fail.
    if chamber_type == "Senate Committees":
        chamber = "upper"
    elif chamber_type in ("Joint Committees", "Task Forces"):
        self.skip()

    com = ScrapeCommittee(
        name=comm_name,
        classification="committee",
        chamber=chamber,
    )

    detail_link = anchor.get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return SenDetail(com, source=detail_link)
def process_item(self, item):
    """Turn one link into a committee, joint committee, or subcommittee.

    The item's text is the committee name and its ``href`` the committee URL.
    Names starting with "Joint" are assigned to the "legislature" chamber,
    names starting with "Subcommittee" become upper-chamber subcommittees
    whose parent name is read from the first child of the fourth ancestor,
    and everything else is an upper-chamber committee. Returns a
    ``ChooseType`` page for the committee URL.
    """
    non_committee_links = [
        "Teleconference How-To Information",
        "Legislative Process",
    ]
    comm_name = XPath("text()").match_one(item)
    if comm_name in non_committee_links:
        self.skip()

    comm_url = XPath("@href").match_one(item)

    if comm_name.startswith("Joint"):
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber="legislature",
        )
    elif comm_name.startswith("Subcommittee"):
        ancestor = item.getparent().getparent().getparent().getparent()
        parent_comm = ancestor.getchildren()[0].text_content()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="upper",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber="upper",
        )

    com.add_source(self.source.url)
    com.add_source(comm_url)
    com.add_link(comm_url, note="homepage")
    return ChooseType(com, source=URL(comm_url))
def process_item(self, item):
    """Create a committee with secretary contact details from one entry.

    The name comes from the first ``strong`` element. The first ``p``
    element's text is parsed for "Secretary:", "Email:" and "Phone:" values,
    which are stored stripped in ``com.extras``. Returns a
    ``DetailCommitteePage`` for the entry's first link.
    """
    name = CSS("strong").match(item)[0].text_content()

    # skip header row
    if name == "Committees":
        self.skip()

    com = ScrapeCommittee(
        name=name,
        chamber=self.chamber,
    )

    all_text = CSS("p").match(item)[0].text_content().strip()
    contact_match = re.search(
        r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", all_text
    )
    # groups() yields the three captures in order: secretary, email, phone
    contact_fields = contact_match.groups()
    for key, value in zip(("secretary", "email", "phone"), contact_fields):
        com.extras[key] = value.strip()

    detail_link = CSS("a").match(item)[0].get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return DetailCommitteePage(com, source=detail_link)
def process_item(self, item):
    """Create a committee from a link element and return its detail page.

    The item's stripped text is the committee name and its ``href`` is used
    as the source, the homepage link, and the ``CommitteeDetail`` source.
    """
    detail_link = item.get("href")
    com = ScrapeCommittee(
        name=item.text_content().strip(),
        classification="committee",
        chamber=self.chamber,
    )
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommitteeDetail(com, source=detail_link)
def process_item(self, item):
    """Create a committee from the first link inside the item.

    The link's text (not stripped) is the committee name; its ``href`` is
    used as the source, the homepage link, and the ``CommitteeDetail`` source.
    """
    anchor = CSS("a").match(item)[0]
    detail_link = anchor.get("href")
    com = ScrapeCommittee(
        name=anchor.text_content(),
        classification="committee",
        chamber=self.chamber,
    )
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommitteeDetail(com, source=detail_link)
def process_item(self, item):
    """Create a "legislature"-chamber committee from a link element.

    The item's stripped text is title-cased to form the committee name.
    Returns a ``HouseJointDetail`` page for the item's ``href``.
    """
    title_cased_name = item.text_content().strip().title()
    detail_link = item.get("href")

    com = ScrapeCommittee(
        name=title_cased_name,
        classification="committee",
        chamber="legislature",
    )
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return HouseJointDetail(com, source=detail_link)
def process_item(self, item):
    """Create a committee or Finance subcommittee from one list entry.

    The name is the item's text up to the first " (", title-cased, with any
    "(Fin Sub)" removed. The chamber heading is read from two siblings before
    the item's parent and the classification heading from the sibling directly
    before it. Entries whose name contains "Conference" are skipped. Returns
    a ``CommiteeDetail`` page for the entry's link with a 30 second timeout.
    """
    raw_text = item.text_content().strip()
    base_name = raw_text.split(" (")[0]
    comm_name = base_name.title().replace("(Fin Sub)", "")
    if "Conference" in comm_name:
        self.skip()

    classification_heading = item.getparent().getprevious()
    chamber_heading = classification_heading.getprevious()

    # Map the heading text to a chamber code; unrecognised text is kept as is.
    chamber_text = chamber_heading.text_content().strip()
    chamber_codes = {
        "House": "lower",
        "Senate": "upper",
        "Joint Committee": "legislature",
    }
    chamber = chamber_codes.get(chamber_text, chamber_text)

    classification = classification_heading.text_content().strip()
    if classification != "Finance Subcommittee":
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )
    else:
        # work around duplicate name of Judiciary committees
        # a current limitation in how Open States can handle committees
        # see https://github.com/openstates/issues/issues/598
        if comm_name == "Judiciary":
            comm_name = "Judiciary (Finance)"
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber=chamber,
            parent="Finance",
        )

    detail_link = CSS("a").match_one(item).get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommiteeDetail(com, source=URL(detail_link, timeout=30))
def process_item(self, item):
    """Create a committee from the single link inside the item.

    Returns a ``DetailCommitteePage`` for the link, except for one known URL
    for which the bare committee object is returned instead.
    """
    anchor = CSS("a").match_one(item)
    detail_link = anchor.get("href")

    com = ScrapeCommittee(
        name=anchor.text_content(),
        chamber=self.chamber,
    )
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")

    # this link has broken html (not able to grab member info)
    # just returning name, chamber, and link
    broken_page = "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/"
    if detail_link == broken_page:
        return com

    return DetailCommitteePage(com, source=detail_link)
def process_item(self, item):
    """Create a committee from a link element, skipping joint committees.

    Returns a ``CommDetail`` page for the item's ``href``.

    Raises:
        SkipItem: when ``self.chamber`` is "legislature".
    """
    name = item.text_content().strip()
    com = ScrapeCommittee(
        name=name,
        classification="committee",
        chamber=self.chamber,
    )
    detail_link = item.get("href")
    com.add_source(self.source.url)

    # detail links for Joint Committees are hidden
    # "javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvJICommittees','cmdCommittee$0')"
    if self.chamber == "legislature":
        raise SkipItem("joint committee")

    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommDetail(com, source=detail_link)
def process_item(self, item):
    """Create a committee and point its detail page at the members content part.

    The chamber is derived from the first word of the element preceding the
    item's third ancestor. The members URL is built from the third-from-last
    and last "/"-separated segments of the item's ``href``. Returns a
    ``CommitteeDetail`` page for that members URL.
    """
    name = item.text_content().strip()

    section = item.getparent().getparent().getparent()
    heading_text = section.getprevious().text_content().strip()
    chamber = heading_text.split()[0]

    if chamber == "Legislative":
        self.skip()  # skipping Legislative Agencies
    # Any other first word is passed through unchanged.
    chamber = {
        "House": "lower",
        "Senate": "upper",
        "Joint": "legislature",
    }.get(chamber, chamber)

    com = ScrapeCommittee(
        name=name,
        chamber=chamber,
    )
    com.add_source(self.source.url)

    # new source
    href = item.get("href")
    segments = href.split("/")
    agency = segments[-3]
    committee = segments[-1]
    new_source = f"https://app.leg.wa.gov/ContentParts/CommitteeMembers/?agency={agency}&committee={committee}"
    com.add_source(new_source)
    com.add_link(href, note="homepage")
    return CommitteeDetail(com, source=new_source)
def process_item(self, item):
    """Create a committee or Finance subcommittee from one list entry.

    The name is the item's text up to the first " (", title-cased, with any
    "(Fin Sub)" removed. The chamber heading is read from two siblings before
    the item's parent and the classification heading from the sibling directly
    before it. Entries whose name contains "Conference" are skipped. Returns
    a ``CommiteeDetail`` page for the entry's link with a 30 second timeout.
    """
    stripped_text = item.text_content().strip()
    comm_name = stripped_text.split(" (")[0].title().replace("(Fin Sub)", "")
    if "Conference" in comm_name:
        self.skip()

    parent = item.getparent()
    heading = parent.getprevious().getprevious().text_content().strip()
    # Unrecognised heading text is used as the chamber value unchanged.
    chamber = {
        "House": "lower",
        "Senate": "upper",
        "Joint Committee": "legislature",
    }.get(heading, heading)

    committee_kwargs = {
        "name": comm_name,
        "classification": "committee",
        "chamber": chamber,
    }
    classification = parent.getprevious().text_content().strip()
    if classification == "Finance Subcommittee":
        committee_kwargs["classification"] = "subcommittee"
        committee_kwargs["parent"] = "Finance"
    com = ScrapeCommittee(**committee_kwargs)

    detail_link = CSS("a").match_one(item).get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return CommiteeDetail(com, source=URL(detail_link, timeout=30))
def process_item(self, item):
    """Create a "legislature"-chamber subcommittee from a link element.

    The parent committee name is the stripped text of the first child of the
    item's fourth ancestor; both names are title-cased. Entries whose parent
    is "Alc-Jbc Budget Hearings" are skipped. Returns a ``HouseJointDetail``
    page for the item's ``href``.
    """
    sub_name = item.text_content().strip()

    # Climb four levels up to the element whose first child holds the parent name.
    ancestor = item
    for _ in range(4):
        ancestor = ancestor.getparent()
    parent_title = ancestor.getchildren()[0].text_content().strip().title()

    if parent_title == "Alc-Jbc Budget Hearings":
        self.skip()

    com = ScrapeCommittee(
        name=sub_name.title(),
        classification="subcommittee",
        chamber="legislature",
        parent=parent_title,
    )

    detail_link = item.get("href")
    com.add_source(self.source.url)
    com.add_source(detail_link)
    com.add_link(detail_link, note="homepage")
    return HouseJointDetail(com, source=detail_link)