def process_item(self, item):
    """Turn one committee listing element into a lower-chamber ScrapeCommittee.

    The committee name and homepage URL are read from the single ``<a>``
    found inside ``item``. Joint committees and one known member-less page
    are passed to ``self.skip()``. Names starting with "Subcommittee" are
    classified as subcommittees, with the parent name taken from the text
    of the first child of ``item``'s grandparent element.

    Returns ``ChooseType(com, source=URL(comm_url))`` for the committee
    that was built.
    """
    anchor = CSS("a").match_one(item)
    comm_name = anchor.text_content()
    comm_url = anchor.get("href")

    # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
    if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
        self.skip()

    # Joint Committees are being skipped to avoid duplicates
    # (they were already grabbed during SenateCommitteeList())
    if comm_name.startswith(("Joint Committee", "Joint Legislative")):
        self.skip()
    elif comm_name.startswith("Subcommittee"):
        # The parent committee's name is the first child of the element
        # two levels above this item.
        grandparent = item.getparent().getparent()
        parent_comm = grandparent.getchildren()[0].text_content()
        com = ScrapeCommittee(
            name=comm_name,
            classification="subcommittee",
            chamber="lower",
            parent=parent_comm,
        )
    else:
        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber="lower",
        )

    # Record both the listing page and the committee's own page as sources.
    for source_url in (self.source.url, comm_url):
        com.add_source(source_url)
    com.add_link(comm_url, note="homepage")

    return ChooseType(com, source=URL(comm_url))
def process_item(self, item):
    """Build a ScrapePerson from one seven-cell row of the member table.

    The children of ``item`` are unpacked, in order, as website, district,
    name, party, office, phone and email cells. A row whose first cell is a
    ``<th>`` is treated as the header and passed to ``self.skip()``.

    Every key of ``self.office_names`` found in the office text is replaced
    by its mapped value, and leading zeros are stripped from the district.
    Returns the populated ScrapePerson (state "mi", chamber "lower").
    """
    cells = item.getchildren()
    website, district, name, party, office, phone, email = cells

    # skip header row
    if website.tag == "th":
        self.skip()

    # Expand office abbreviations using the mapping on self.office_names.
    address = office.text_content()
    for abbr, full in self.office_names.items():
        address = address.replace(abbr, full)

    p = ScrapePerson(
        name=name.text_content(),
        state="mi",
        chamber="lower",
        district=district.text_content().lstrip("0"),
        party=party.text_content(),
        email=email.text_content(),
    )

    href = CSS("a").match_one(website).get("href")
    # Some links are missing a slash after the scheme ("http:/r...");
    # repair them so the URL is well-formed.
    if href.startswith("http:/r"):
        href = href.replace(":/", "://")
    p.add_link(href)
    p.add_source(self.source.url)

    p.capitol_office.voice = phone.text_content()
    p.capitol_office.address = address

    return p