Python ScrapeCommittee.add_link Examples, openstates.models.ScrapeCommittee.add_link Python Examples

Example #1

0

Show file

File: committees.py Project: jessemortenson/openstates

    def process_item(self, item):
        comm_name = CSS("a").match_one(item).text_content()
        comm_url = CSS("a").match_one(item).get("href")

        # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
        if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
            self.skip()

        # Joint Committees are being skipped to avoid duplicates (they were already grabbed during SenateCommitteeList())
        if comm_name.startswith("Joint Committee") or comm_name.startswith(
                "Joint Legislative"):
            self.skip()
        elif comm_name.startswith("Subcommittee"):
            parent_comm = item.getparent().getparent().getchildren(
            )[0].text_content()
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="lower",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(name=comm_name,
                                  classification="committee",
                                  chamber="lower")
        com.add_source(self.source.url)
        com.add_source(comm_url)
        com.add_link(comm_url, note="homepage")
        return ChooseType(com, source=URL(comm_url))

Example #2

0

Show file

File: committees.py Project: jessemortenson/openstates

    def process_item(self, item):
        link = (XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        ).match(item)[0].get("href"))
        name = CSS("h2 a").match(item)[0].text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)

        for links in XPath(".//div[contains(@class, 'container')]//a").match(
                item):
            url = links.get("href")
            if url == link:
                continue
            else:
                if links == XPath(
                        ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
                ).match_one(item):
                    com.add_link(url, note="homepage")
                    homepage = True
                else:
                    com.add_link(url)
        if not homepage:
            self.warn("no homepage found")

        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=link)

Example #3

0

Show file

File: committees.py Project: jealob/openstates-scrapers

    def process_item(self, item):
        comm_name = CSS("a").match(item)[0].text_content().strip()

        previous_sibs = item.getparent().itersiblings(preceding=True)
        for sib in previous_sibs:
            if len(sib.getchildren()) == 0:
                chamber_type = sib.text_content().strip()
                break

        if chamber_type == "Senate Committees":
            chamber = "upper"
        elif chamber_type == "Joint Committees":
            self.skip()
        elif chamber_type == "Task Forces":
            self.skip()

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )

        detail_link = CSS("a").match(item)[0].get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return SenDetail(com, source=detail_link)

Example #4

0

Show file

File: committees.py Project: jessemortenson/openstates

    def process_item(self, item):
        comm_name = XPath("text()").match_one(item)
        if comm_name in [
                "Teleconference How-To Information", "Legislative Process"
        ]:
            self.skip()

        comm_url = XPath("@href").match_one(item)

        if comm_name.startswith("Joint"):
            com = ScrapeCommittee(name=comm_name,
                                  classification="committee",
                                  chamber="legislature")
        elif comm_name.startswith("Subcommittee"):
            parent_comm = (item.getparent().getparent().getparent().getparent(
            ).getchildren()[0].text_content())
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="upper",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(name=comm_name,
                                  classification="committee",
                                  chamber="upper")
        com.add_source(self.source.url)
        com.add_source(comm_url)
        com.add_link(comm_url, note="homepage")
        return ChooseType(com, source=URL(comm_url))

Example #5

0

Show file

    def process_item(self, item):
        name = CSS("strong").match(item)[0].text_content()

        # skip header row
        if name == "Committees":
            self.skip()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        all_text = CSS("p").match(item)[0].text_content().strip()
        secretary, email, phone = re.search(
            r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", all_text
        ).groups()
        com.extras["secretary"] = secretary.strip()
        com.extras["email"] = email.strip()
        com.extras["phone"] = phone.strip()

        detail_link = CSS("a").match(item)[0].get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return DetailCommitteePage(com, source=detail_link)

Example #6

0

Show file

 def process_item(self, item):
     name = item.text_content().strip()
     com = ScrapeCommittee(name=name,
                           classification="committee",
                           chamber=self.chamber)
     detail_link = item.get("href")
     com.add_source(detail_link)
     com.add_link(detail_link, "homepage")
     return CommitteeDetail(com, source=detail_link)

Example #7

0

Show file

File: committees.py Project: jealob/openstates-scrapers

 def process_item(self, item):
     com_link = CSS("a").match(item)[0]
     name = com_link.text_content()
     com = ScrapeCommittee(name=name,
                           classification="committee",
                           chamber=self.chamber)
     detail_link = com_link.get("href")
     com.add_source(detail_link)
     com.add_link(detail_link, note="homepage")
     return CommitteeDetail(com, source=detail_link)

Example #8

0

Show file

File: committees.py Project: jealob/openstates-scrapers

    def process_item(self, item):
        comm_name = item.text_content().strip()

        com = ScrapeCommittee(
            name=comm_name.title(),
            classification="committee",
            chamber="legislature",
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return HouseJointDetail(com, source=detail_link)

Example #9

0

Show file

    def process_item(self, item):
        comm_name = (
            item.text_content().strip().split(" (")[0].title().replace(
                "(Fin Sub)", ""))

        if "Conference" in comm_name:
            self.skip()

        chamber = item.getparent().getprevious().getprevious().text_content(
        ).strip()
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        elif chamber == "Joint Committee":
            chamber = "legislature"

        classification = item.getparent().getprevious().text_content().strip()

        if classification == "Finance Subcommittee":
            # work around duplicate name of Judiciary committees
            # a current limitation in how Open States can handle committees
            # see https://github.com/openstates/issues/issues/598
            if comm_name == "Judiciary":
                comm_name = "Judiciary (Finance)"
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber=chamber,
                parent="Finance",
            )
        else:
            com = ScrapeCommittee(
                name=comm_name,
                classification="committee",
                chamber=chamber,
            )

        detail_link = CSS("a").match_one(item).get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return CommiteeDetail(com, source=URL(detail_link, timeout=30))

Example #10

0

Show file

    def process_item(self, item):
        com_link = CSS("a").match_one(item)
        name = com_link.text_content()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        detail_link = com_link.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        # this link has broken html (not able to grab member info)
        # just returning name, chamber, and link
        if detail_link == "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/":
            return com

        return DetailCommitteePage(com, source=detail_link)

Example #11

0

Show file

    def process_item(self, item):
        comm_name = item.text_content().strip()

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=self.chamber,
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)

        # detail links for Joint Committees are hidden
        # "javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvJICommittees','cmdCommittee$0')"
        if self.chamber != "legislature":
            com.add_source(detail_link)
            com.add_link(detail_link, note="homepage")

            return CommDetail(com, source=detail_link)
        else:
            raise SkipItem("joint committee")

Example #12

0

Show file

File: committees.py Project: jealob/openstates-scrapers

    def process_item(self, item):

        name = item.text_content().strip()

        chamber = (
            item.getparent()
            .getparent()
            .getparent()
            .getprevious()
            .text_content()
            .strip()
            .split()[0]
        )
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        elif chamber == "Joint":
            chamber = "legislature"
        elif chamber == "Legislative":
            self.skip()
            # skipping Legislative Agencies

        com = ScrapeCommittee(
            name=name,
            chamber=chamber,
        )

        com.add_source(self.source.url)

        # new source
        href = item.get("href")
        href_lst = href.split("/")
        new_source = f"https://app.leg.wa.gov/ContentParts/CommitteeMembers/?agency={href_lst[-3]}&committee={href_lst[-1]}"

        com.add_source(new_source)
        com.add_link(href, note="homepage")

        return CommitteeDetail(com, source=new_source)

Example #13

0

Show file

File: committees.py Project: csnardi/openstates

    def process_item(self, item):
        comm_name = (
            item.text_content().strip().split(" (")[0].title().replace("(Fin Sub)", "")
        )

        if "Conference" in comm_name:
            self.skip()

        chamber = item.getparent().getprevious().getprevious().text_content().strip()
        if chamber == "House":
            chamber = "lower"
        elif chamber == "Senate":
            chamber = "upper"
        elif chamber == "Joint Committee":
            chamber = "legislature"

        classification = item.getparent().getprevious().text_content().strip()

        if classification == "Finance Subcommittee":
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber=chamber,
                parent="Finance",
            )
        else:
            com = ScrapeCommittee(
                name=comm_name,
                classification="committee",
                chamber=chamber,
            )

        detail_link = CSS("a").match_one(item).get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return CommiteeDetail(com, source=URL(detail_link, timeout=30))

Example #14

0

Show file

File: committees.py Project: jealob/openstates-scrapers

    def process_item(self, item):
        sub_name = item.text_content().strip()

        parent = (item.getparent().getparent().getparent().getparent().
                  getchildren()[0].text_content().strip())

        if parent.title() == "Alc-Jbc Budget Hearings":
            self.skip()

        com = ScrapeCommittee(
            name=sub_name.title(),
            classification="subcommittee",
            chamber="legislature",
            parent=parent.title(),
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return HouseJointDetail(com, source=detail_link)