def process_page(self):
        name = self.root.xpath('//h2[@class="committeeName"]')[1].text
        if name.startswith("Appropriations Subcommittee"):
            return
            # TODO: restore scraping of Appropriations Subcommittees
            # name = name.replace('Appropriations ', '')
            # parent = {'name': 'Appropriations', 'classification': 'upper'}
            # chamber = None
        else:
            if name.startswith("Committee on"):
                name = name.replace("Committee on ", "")
            parent = None
            chamber = "upper"
        self.logger.debug(name)
        comm = ScrapeCommittee(name=name,
                               classification="committee",
                               chamber=chamber,
                               parent=parent)

        for dt in self.root.xpath('//div[@id="members"]/dl/dt'):
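            # each <dt> presumably holds a role label like "Chair: " (hence the ": " strip below),
            # and the adjacent <dd> holds the member's name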
            role = dt.text.replace(": ", "").strip().lower()
            member = dt.xpath("./following-sibling::dd")[0].text_content()
            member = self.clean_name(member)
            comm.add_member(member, role=role)

        for li in self.root.xpath('//div[@id="members"]/ul/li'):
            member = self.clean_name(li.text_content())
            comm.add_member(member)

        comm.add_source(self.source.url)

        return comm
    def process_item(self, item):
        comm_name = CSS("a").match(item)[0].text_content().strip()

        previous_sibs = item.getparent().itersiblings(preceding=True)
        for sib in previous_sibs:
            if len(sib.getchildren()) == 0:
                chamber_type = sib.text_content().strip()
                break

        if chamber_type == "Senate Committees":
            chamber = "upper"
        elif chamber_type == "Joint Committees":
            self.skip()
        elif chamber_type == "Task Forces":
            self.skip()

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )

        detail_link = CSS("a").match(item)[0].get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return SenDetail(com, source=detail_link)
    def process_item(self, item):
        comm_name = CSS("a").match_one(item).text_content()
        comm_url = CSS("a").match_one(item).get("href")

        # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
        if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
            self.skip()

        # Joint Committees are skipped to avoid duplicates (they were already grabbed during SenateCommitteeList())
        if comm_name.startswith(("Joint Committee", "Joint Legislative")):
            self.skip()
        elif comm_name.startswith("Subcommittee"):
            parent_comm = (
                item.getparent().getparent().getchildren()[0].text_content()
            )
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="lower",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(name=comm_name,
                                  classification="committee",
                                  chamber="lower")
        com.add_source(self.source.url)
        com.add_source(comm_url)
        com.add_link(comm_url, note="homepage")
        return ChooseType(com, source=URL(comm_url))
    def process_item(self, item):
        name = CSS("strong").match(item)[0].text_content()

        # skip header row
        if name == "Committees":
            self.skip()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        all_text = CSS("p").match(item)[0].text_content().strip()
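        # the paragraph text is assumed to look roughly like
        # "Secretary: Jane Doe\nEmail: jdoe@example.gov\nPhone: (555) 555-1234"
        # (the name, address, and number here are made-up placeholders)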
        secretary, email, phone = re.search(
            r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", all_text
        ).groups()
        com.extras["secretary"] = secretary.strip()
        com.extras["email"] = email.strip()
        com.extras["phone"] = phone.strip()

        detail_link = CSS("a").match(item)[0].get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return DetailCommitteePage(com, source=detail_link)
    def process_item(self, item):
        link = (XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        ).match(item)[0].get("href"))
        name = CSS("h2 a").match(item)[0].text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)

        # track whether a homepage link was seen; without this initialization
        # the check below can hit an unbound name
        homepage = False
        for links in XPath(".//div[contains(@class, 'container')]//a").match(item):
            url = links.get("href")
            if url == link:
                continue
            if links == XPath(
                ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
            ).match_one(item):
                com.add_link(url, note="homepage")
                homepage = True
            else:
                com.add_link(url)
        if not homepage:
            self.warn("no homepage found")

        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=link)
    def process_item(self, item):
        comm_name = XPath("text()").match_one(item)
        if comm_name in [
                "Teleconference How-To Information", "Legislative Process"
        ]:
            self.skip()

        comm_url = XPath("@href").match_one(item)

        if comm_name.startswith("Joint"):
            com = ScrapeCommittee(name=comm_name,
                                  classification="committee",
                                  chamber="legislature")
        elif comm_name.startswith("Subcommittee"):
            parent_comm = (
                item.getparent().getparent().getparent().getparent()
                .getchildren()[0]
                .text_content()
            )
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="upper",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(name=comm_name,
                                  classification="committee",
                                  chamber="upper")
        com.add_source(self.source.url)
        com.add_source(comm_url)
        com.add_link(comm_url, note="homepage")
        return ChooseType(com, source=URL(comm_url))
    def process_item(self, item):
        name = item.text_content().strip()
        com = ScrapeCommittee(name=name,
                              classification="committee",
                              chamber=self.chamber)
        detail_link = item.get("href")
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return CommitteeDetail(com, source=detail_link)
    def process_item(self, item):

        name = item["CommitteeName"]
        chamber = item["LegislativeBody"]

        if chamber == "H":
            chamber = "lower"
        elif chamber == "S":
            chamber = "upper"
        else:
            # a few Ad Hoc Committees don't have chambers, but are not included in the Standing Committees Scrape anyway
            self.logger.warning("Committee not assigned to chamber")
            chamber = "lower"

        if item["IsSubCommittee"] is False:
            com = ScrapeCommittee(name=name, chamber=chamber)

        else:

            try:
                parent, name = name.split(" Subcommittee on ")
            except ValueError:
                self.logger.warning(f"No parent listed for {name}")

            com = ScrapeCommittee(
                name=name,
                classification="subcommittee",
                chamber=chamber,
                parent=parent,
            )

        members = []
        for member in item["Members"]:
            member_name = member["FirstName"] + " " + member["LastName"]
            if member["IsChair"]:
                position = "Chair"
            elif member["IsViceChair"]:
                position = "Vice Chair"
            else:
                position = "member"

            # As of now, the API lists every member twice, so skip duplicates
            if f"{member_name} {position}" in members:
                continue
            members.append(f"{member_name} {position}")
            com.add_member(member_name, position)

        com.extras["Committee ID"] = item["CommitteeId"]
        com.extras["Committee Short Name"] = item["CommitteeShortName"]
        com.extras["Committee Type"] = item["TypeName"]

        com.add_source(self.source.url)

        return com
    def process_item(self, item):
        com_link = CSS("a").match(item)[0]
        name = com_link.text_content()
        com = ScrapeCommittee(name=name,
                              classification="committee",
                              chamber=self.chamber)
        detail_link = com_link.get("href")
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return CommitteeDetail(com, source=detail_link)
    def process_item(self, item):
        name = item.text_content().strip()
        if " - Subcommittee on " in name:
            parent, com_name = name.split(" - Subcommittee on ")
            com = ScrapeCommittee(
                name=com_name,
                classification="subcommittee",
                parent=parent,
                chamber=self.chamber,
            )
        else:
            com = ScrapeCommittee(name=name, chamber=self.chamber)

        com.add_source(self.source.url)
        return SenateCommitteeDetail(com, source=item.get("href"))
    def process_item(self, item):
        comm_name = item.text_content().strip()

        com = ScrapeCommittee(
            name=comm_name.title(),
            classification="committee",
            chamber="legislature",
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return HouseJointDetail(com, source=detail_link)
    def process_item(self, item):
        comm_name = (
            item.text_content().strip().split(" (")[0].title().replace("(Fin Sub)", "")
        )

        if "Conference" in comm_name:
            self.skip()

        chamber = item.getparent().getprevious().getprevious().text_content().strip()
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        elif chamber == "Joint Committee":
            chamber = "legislature"

        classification = item.getparent().getprevious().text_content().strip()

        if classification == "Finance Subcommittee":
            # work around duplicate name of Judiciary committees
            # a current limitation in how Open States can handle committees
            # see https://github.com/openstates/issues/issues/598
            if comm_name == "Judiciary":
                comm_name = "Judiciary (Finance)"
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber=chamber,
                parent="Finance",
            )
        else:
            com = ScrapeCommittee(
                name=comm_name,
                classification="committee",
                chamber=chamber,
            )

        detail_link = CSS("a").match_one(item).get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return CommiteeDetail(com, source=URL(detail_link, timeout=30))
    def process_item(self, item):
        name = item.text_content()
        if re.search(" - ", name):
            parent, name = name.split(" - ")

            # one subcommittee lists its parent under the shortened name "Approps."
            if parent == "Approps.":
                parent = "Appropriations"
            committee = ScrapeCommittee(
                name=name,
                classification="subcommittee",
                parent=parent,
                chamber=self.chamber,
            )
        else:
            committee = ScrapeCommittee(name=name, chamber=self.chamber)

        committee.add_source(self.source.url)
        return CommitteeDetail(committee, source=item.get("href"))
    def process_item(self, item):
        com_link = CSS("a").match_one(item)
        name = com_link.text_content()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        detail_link = com_link.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        # this link has broken html (not able to grab member info)
        # just returning name, chamber, and link
        if detail_link == "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/":
            return com

        return DetailCommitteePage(com, source=detail_link)
    def process_item(self, item):

        name = item.text_content().strip()

        chamber = (
            item.getparent()
            .getparent()
            .getparent()
            .getprevious()
            .text_content()
            .strip()
            .split()[0]
        )
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        elif chamber == "Joint":
            chamber = "legislature"
        elif chamber == "Legislative":
            self.skip()
            # skipping Legislative Agencies

        com = ScrapeCommittee(
            name=name,
            chamber=chamber,
        )

        com.add_source(self.source.url)

        # new source
        href = item.get("href")
        href_lst = href.split("/")
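        # assumes the href ends in ".../{agency}/committees/{committee}" with no
        # trailing slash, so [-3] is the agency segment and [-1] is the committee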
        new_source = f"https://app.leg.wa.gov/ContentParts/CommitteeMembers/?agency={href_lst[-3]}&committee={href_lst[-1]}"

        com.add_source(new_source)
        com.add_link(href, note="homepage")

        return CommitteeDetail(com, source=new_source)
    def process_item(self, item):
        comm_name = item.text_content().strip()

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=self.chamber,
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)

        # detail links for Joint Committees are hidden
        # "javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvJICommittees','cmdCommittee$0')"
        if self.chamber != "legislature":
            com.add_source(detail_link)
            com.add_link(detail_link, note="homepage")

            return CommDetail(com, source=detail_link)
        else:
            raise SkipItem("joint committee")
    def process_item(self, item):
        comm_name = (
            item.text_content().strip().split(" (")[0].title().replace("(Fin Sub)", "")
        )

        if "Conference" in comm_name:
            self.skip()

        chamber = item.getparent().getprevious().getprevious().text_content().strip()
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        elif chamber == "Joint Committee":
            chamber = "legislature"

        classification = item.getparent().getprevious().text_content().strip()

        if classification == "Finance Subcommittee":
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber=chamber,
                parent="Finance",
            )
        else:
            com = ScrapeCommittee(
                name=comm_name,
                classification="committee",
                chamber=chamber,
            )

        detail_link = CSS("a").match_one(item).get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return CommiteeDetail(com, source=URL(detail_link, timeout=30))
    def process_item(self, item):
        sub_name = item.text_content().strip()

        parent = (
            item.getparent().getparent().getparent().getparent()
            .getchildren()[0]
            .text_content()
            .strip()
        )

        if parent.title() == "Alc-Jbc Budget Hearings":
            self.skip()

        com = ScrapeCommittee(
            name=sub_name.title(),
            classification="subcommittee",
            chamber="legislature",
            parent=parent.title(),
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return HouseJointDetail(com, source=detail_link)
    def process_item(self, item):
        if item["chamber"] == 2:
            chamber = "upper"
        elif item["chamber"] == 1:
            chamber = "lower"

        source = URL(
            f"https://www.legis.ga.gov/api/committees/details/{item['id']}/1029",
            headers={"Authorization": get_token()},
        )

        com = ScrapeCommittee(
            name=item["name"],
            chamber=chamber,
        )

        com.add_source(
            self.source.url, note="Initial list page (requires authorization token)"
        )

        return CommitteeDetail(
            com,
            source=source,
        )
    def process_item(self, item):
        name = item.text_content().strip()
        com = ScrapeCommittee(name=name, chamber=self.chamber)
        com.add_source(self.source.url)
        return SenateCommitteeDetail(com, source=item.get("href"))
    def process_item(self, item):
        name = item.text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)
        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=URL(item.get("href"), timeout=30))
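
# All of the process_item()/process_page() methods above follow the same
# spatula pattern: a list page selects committee entries, process_item()
# builds a ScrapeCommittee, and either the committee or a detail-page object
# is returned for further scraping. A minimal sketch of that wiring is below;
# the URL, CSS selector, chamber, and class name are hypothetical placeholders
# rather than parts of any real scraper.
from spatula import CSS, HtmlListPage, URL
from openstates.models import ScrapeCommittee


class ExampleCommitteeList(HtmlListPage):
    source = URL("https://example.legislature.gov/committees")  # hypothetical
    selector = CSS("ul.committee-list li a")  # hypothetical selector
    chamber = "upper"  # hypothetical chamber

    def process_item(self, item):
        name = item.text_content().strip()
        com = ScrapeCommittee(name=name, chamber=self.chamber)
        com.add_source(self.source.url)
        com.add_link(item.get("href"), note="homepage")
        return com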