def process_item(self, item):
    link = (
        XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        )
        .match(item)[0]
        .get("href")
    )
    name = CSS("h2 a").match(item)[0].text_content()
    com = ScrapeCommittee(name=name, chamber=self.chamber)

    # track whether a homepage link was found so the warning below works
    homepage = False
    for links in XPath(".//div[contains(@class, 'container')]//a").match(item):
        url = links.get("href")
        if url == link:
            continue
        if links == XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
        ).match_one(item):
            com.add_link(url, note="homepage")
            homepage = True
        else:
            com.add_link(url)

    if not homepage:
        self.warn("no homepage found")

    com.add_source(self.source.url)
    return HouseCommitteeDetail(com, source=link)
def process_page(self):
    com = self.input
    com.add_source(self.source.url)
    com.add_link(self.source.url, note="homepage")

    room, time = XPath("//div[@class='col-sm-12 pb-2']//p[2]/text()").match(self.root)

    if re.search("On Call", time):
        time = time.split(" -")[0]

    com.extras["room"] = room.strip()
    com.extras["meeting schedule"] = time.strip()

    for link in XPath(
        '//div[contains(@class, "media-body")]//a[contains(@href, "member_bio")]'
    ).match(self.root):
        name = link.text_content().split(",")[0]
        if name:
            try:
                positions = ("chair", "vice chair", "ranking minority member")
                position = XPath("..//preceding-sibling::b/text()").match(link)
                for role in position:
                    position_str = role.lower()
                    if position_str not in positions:
                        raise ValueError("unknown position")
            except SelectorError:
                position_str = "member"
            com.add_member(name, position_str)

    return com
def process_page(self):
    com = self.input
    com.add_source(self.source.url)
    com.add_link(self.source.url, note="homepage")

    time, room = (
        CSS(".border-0 .pl-2").match(self.root)[0].text_content().split("in ")
    )
    time = time.split("Meets:")[1]

    com.extras["room"] = room.strip()
    com.extras["meeting schedule"] = time.strip()

    for p in XPath('//div[@class="media pl-2 py-4"]').match(self.root):
        name = (
            XPath(".//div[@class='media-body']/span/b/text()")
            .match(p)[0]
            .replace("Rep.", "")
            .split("(R)")[0]
            .split("(DFL")[0]
            .strip()
        )

        positions = ["committee chair", "vice chair", "republican lead"]
        if name:
            # default to "member" so role is always defined, even when the
            # position label is present but not one of the known positions
            role = "member"
            try:
                position = CSS("span b u").match(p)[0].text_content().lower()
                if position in positions:
                    role = position
            except SelectorError:
                role = "member"
            com.add_member(name, role)

    return com
def process_item(self, item):
    try:
        title = XPath("..//preceding-sibling::h3/text()").match(item)
    except SelectorError:
        title = XPath("../../..//preceding-sibling::h3/text()").match(item)

    for comm_name in title:
        if (
            comm_name == "Standing Committees"
            or comm_name == "Appropriations Subcommittees"
        ):
            name_link = CSS("a").match_one(item)
            name = name_link.text_content()
            source = name_link.get("href")
            if comm_name == "Standing Committees":
                com = ScrapeCommittee(name=name, chamber=self.chamber)
            else:
                com = ScrapeCommittee(
                    name=name,
                    classification="subcommittee",
                    chamber=self.chamber,
                    parent="Appropriations",
                )
            return SenateCommitteeDetail(com, source=source)
        else:
            self.skip()
def process_item(self, item):
    href = XPath("@href").match_one(item)
    if not href.startswith("http"):
        href = f"https://ultrasignup.com{href}"
    race_id = href.split("=")[-1]
    return RaceResultDetail(
        dict(race_id=race_id, race_results_url=href, **self.input),
        source=href,
    )
def process_item(self, item):
    name = CSS("a").match(item)[2].text_content()
    name = re.sub(r"Contact Assembly Member", "", name).strip()

    party = CSS("td").match(item)[2].text_content().strip()
    if party == "Democrat":
        party = "Democratic"

    district = CSS("td").match(item)[1].text_content().strip().lstrip("0")

    # District 18 has a vacant spot
    if name == "edit":
        self.skip("skipping Vacant seat in District {}".format(district))

    photo_url = CSS("img").match(item, min_items=0)
    if photo_url:
        photo_url = photo_url[0].get("src")

    p = ScrapePerson(
        name=name,
        state="ca",
        chamber="lower",
        district=district,
        party=party,
        image=photo_url,
    )

    capitol_office_header = CSS("h3").match(item)[0].text_content()
    capitol_office_text = (
        XPath(
            "//*[@id='block-views-view-members-block-1']/div/div/div/table/tbody/tr[1]/td[4]/text()"
        )
        .match(item)[1]
        .strip()
    )
    capitol_office_text, capitol_office_phone = capitol_office_text.split("; ")
    capitol_office_address = capitol_office_header + capitol_office_text

    p.capitol_office.address = capitol_office_address
    p.capitol_office.voice = capitol_office_phone

    district_offices = XPath(".//td/p[1]/text()").match(item)
    for office in district_offices:
        district_address, district_phone = office.split("; ")
        p.add_office(
            classification="district",
            address=district_address.strip(),
            voice=district_phone.strip(),
        )

    url = CSS("a").match(item)[0].get("href")
    p.add_link(url)
    p.add_source(self.source.url)

    return p
def process_page(self): p = self.input img = CSS("div.field-person-photo img").match_one(self.root).get("src") p.image = img bio_info = CSS("div.pane-content ul li").match(self.root) if len(bio_info) > 0: p.extras["bio info"] = [] for info in bio_info: p.extras["bio info"] += info try: street = (CSS("div.street-address").match_one( self.root).text_content().strip()) town = CSS("span.locality").match_one( self.root).text_content().strip() zip_code = (CSS("span.postal-code").match_one( self.root).text_content().strip()) address = street + ", " + town + ", ND " + zip_code p.district_office.address = address except SelectorError: pass try: phones = XPath( "//*[@id='block-system-main']//div[contains(text(), 'phone')]" ).match(self.root) for phone in phones: phone_type = phone.text_content().strip() phone_number = phone.getnext().text_content().strip() if phone_type == "Cellphone:": p.extras["Cell phone"] = phone_number elif phone_type == "Home Telephone:": p.extras["Home phone"] = phone_number elif phone_type == "Office Telephone:": p.district_office.voice = phone_number except SelectorError: pass email = (XPath( "//*[@id='block-system-main']//div[contains(text(), 'Email')]"). match_one(self.root).getnext().text_content().strip()) p.email = email try: fax = (XPath( "//*[@id='block-system-main']//div[contains(text(), 'Fax')]"). match_one(self.root).getnext().text_content().strip()) p.district_office.fax = fax except SelectorError: pass return p
def process_page(self): com = self.input com.add_source(self.source.url) com.add_link(self.source.url, note="homepage") try: chairs = CSS(".chair-info").match(self.root) except SelectorError: raise SkipItem("skipping committee without full information") # in case there are co-chairs num_chairs = len(chairs) for chair in chairs: chair_name = CSS(".comm-chair-name").match_one(chair).text_content().strip() chair_role = ( XPath(f"..//preceding-sibling::header[{num_chairs}]") .match_one(chair) .text_content() .strip() .lower() ) com.add_member(chair_name, chair_role) # some committees only have chairs and no members list try: for p in CSS("#comm-membership ul li").match(self.root): name = p.text_content().strip() role = "member" com.add_member(name, role) except SelectorError: pass # some committees have temporary addresses, others have permanent ones try: temp, room, zip = XPath( "//section[@id='comm-addr']/div[@class='mod-inner']//text()" ).match(self.root) com.extras["address"] = f"{temp}: {room}; {zip}" except ValueError: room, zip = XPath( "//section[@id='comm-addr']/div[@class='mod-inner']//text()" ).match(self.root) com.extras["address"] = f"{room}; {zip}" # some committees have press releases try: news_link = CSS("#page-content .read-more").match(self.root)[0].get("href") com.add_link(news_link) except SelectorError: pass return com
def process_page(self): p = self.input img = CSS("div#content p img").match_one(self.root).get("src") p.image = img if self.source.url == "https://legislature.maine.gov/District-22": addr = CSS("div#content p strong").match(self.root)[2].tail.strip() else: addr = ( CSS("div#content p strong") .match(self.root)[1] .tail.strip() .lstrip(":") .strip() ) if addr != p.district_office.address: p.extras["Additional address"] = addr try: state_phone = ( XPath("//*[@id='content']/p/strong[contains(text(), 'State')]") .match_one(self.root) .tail.strip() ) state_phone = state_phone.lstrip(":").strip() p.capitol_office.voice = state_phone except SelectorError: pass try: state_phone = ( XPath("//*[@id='content']/p/b[contains(text(), 'State')]") .match_one(self.root) .tail.strip() ) state_phone = state_phone.lstrip(":").strip() p.capitol_office.voice = state_phone except SelectorError: pass website = ( XPath("//*[@id='content']/p/strong[contains(text(), 'Website')]") .match_one(self.root) .getnext() ) if website.get("href") is None: website = website.getnext().get("href") else: website = website.get("href") p.add_link(website, note="website") return p
def process_item(self, item):
    dd_text = XPath(".//dd/text()").match(item)
    district = dd_text[2].strip().split()[1]
    party = dd_text[4].strip()
    return PersonDetail(
        dict(
            chamber="upper" if "senate" in self.source.url else "lower",
            district=district,
            party=party,
        ),
        source=str(XPath(".//dd/a[1]/@href").match_one(item)),
    )
def process_item(self, item):
    dd_text = XPath(".//dd/text()").match(item)
    district = dd_text[2].strip().split()[1]
    party = dd_text[4].strip()
    url = str(XPath(".//dd/a[1]/@href").match_one(item))
    if "Details" not in url:
        raise SkipItem(f"skipping {url}")
    return PersonDetail(
        dict(
            chamber="upper" if "senate" in self.source.url else "lower",
            district=district,
            party=party,
        ),
        source=url,
    )
def process_page(self):
    com = self.input

    try:
        members = XPath("//*[@id='committeesIntroRoster']/div/div/div/a").match(
            self.root
        )
        for member in members:
            member_dirty = member.text_content().strip().split("\n")
            mem_name = member_dirty[0].strip() + " " + member_dirty[1].strip()
            role = (
                member.getparent().getprevious().getprevious().text_content().strip()
            )
            if role.strip() == "":
                role = "member"
            com.add_member(mem_name, role)
        # many 'ex officio' roles for House Subcommittees, Joint Committees,
        # and Joint Subcommittees
    except SelectorError:
        raise SkipItem("empty committee")

    try:
        extra_info = CSS("div#bodyContent b").match(self.root)
        for title in extra_info:
            position = title.text_content().strip()
            name = title.getnext().tail.strip()
            com.extras[position] = name
    except SelectorError:
        pass

    return com
class LegList(HtmlListPage):
    selector = XPath(".//form/table[1]/tr")

    def process_item(self, item):
        # skip header rows
        if (
            len(CSS("td").match(item)) == 1
            or CSS("td").match(item)[0].get("class") == "header"
        ):
            self.skip()

        first_link = CSS("td a").match(item)[0]
        name = first_link.text_content()
        detail_link = first_link.get("href")

        district = CSS("td").match(item)[3].text_content()
        party_letter = CSS("td").match(item)[4].text_content()
        party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        party = party_dict[party_letter]

        p = ScrapePerson(
            name=name,
            state="il",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=detail_link)
def process_page(self):
    p = self.input

    capitol_addr_lst = XPath(".//*[@id='district']/span[1]/text()").match(self.root)
    capitol_addr = ""
    for line in capitol_addr_lst:
        capitol_addr += line.strip()
        capitol_addr += " "
    p.capitol_office.address = capitol_addr.strip()

    try:
        fax = (
            CSS("span.info.fax")
            .match_one(self.root)
            .text_content()
            .strip()
            .split("\n")
        )
        fax = fax[-1].strip()
        p.capitol_office.fax = fax
    except SelectorError:
        pass

    try:
        staff_spans = CSS("span.info.staff span").match(self.root)
        for num, span in enumerate(grouper(staff_spans[1:], 2)):
            staff_name = span[0].text_content().strip()
            staff_email = span[1].text_content().strip()
            p.extras["staff" + str(num + 1)] = staff_name
            p.extras["staff_email" + str(num + 1)] = staff_email
    except SelectorError:
        pass

    return p
def get_column_div(self, name):
    # lots of places where we have a <div class='col-md-2 font-weight-bold'>
    # followed by a <div class='col'>
    # with interesting content in the latter element
    return XPath(
        f"//div[contains(text(),'{name}')]/following-sibling::div[@class='col']"
    ).match_one(self.root)
class Legislators(HtmlListPage):
    session_num = "116"
    source = (
        "https://leg.mt.gov/legislator-information/?session_select=" + session_num
    )
    selector = XPath("//table[1]/tbody/tr")

    def process_item(self, item):
        tds = item.getchildren()
        email, name, party, seat, phone = tds
        chamber, district = seat.text_content().strip().split()
        url = str(name.xpath("a/@href")[0])

        person = ScrapePerson(
            name=clean_name(name.text_content()),
            state="mt",
            party=party.text_content().strip(),
            chamber=("upper" if chamber == "SD" else "lower"),
            district=district,
        )
        person.add_link(url)
        person.add_source(url)

        phone = phone.text_content().strip()
        if len(phone) == 14:
            person.capitol_office.voice = phone
        elif len(phone) > 30:
            person.capitol_office.voice = phone.split(" ")[0]

        email = email.xpath("./a/@href")
        if email:
            person.email = email[0].split(":", 1)[1]

        return person
class SenDetail(HtmlPage):
    contact_xpath = XPath('//h4[contains(text(), "Office")]')
    input_type = PartialPerson

    def get_source_from_input(self):
        return self.input.url

    def process_page(self):
        email = (
            self.root.xpath('//a[contains(@href, "mailto:")]')[0]
            .get("href")
            .split(":")[-1]
        )

        p = ScrapePerson(
            state="fl",
            chamber="upper",
            name=fix_name(self.input.name),
            party=str(self.input.party),
            district=str(self.input.district),
            email=email,
            image=str(self.root.xpath('//div[@id="sidebar"]//img/@src').pop()),
        )

        for item in self.contact_xpath.match(self.root):
            self.handle_office(item, p)

        return p

    def handle_office(self, office, person):
        (name,) = office.xpath("text()")

        if name == "Tallahassee Office":
            obj_office = person.capitol_office
        else:
            obj_office = person.district_office

        address_lines = [
            x.strip()
            for x in office.xpath("following-sibling::div[1]")[0]
            .text_content()
            .splitlines()
            if x.strip()
        ]

        clean_address_lines = []
        fax = phone = None
        PHONE_RE = r"\(\d{3}\)\s\d{3}\-\d{4}"
        after_phone = False

        for line in address_lines:
            # skip office-hours lines like "Open Monday ..."
            if re.search(r"(?i)open\s+\w+day", line):
                continue
            elif "FAX" in line:
                fax = line.replace("FAX ", "")
                after_phone = True
            elif re.search(PHONE_RE, line):
                phone = line
                after_phone = True
            elif not after_phone:
                clean_address_lines.append(line)

        address = "; ".join(clean_address_lines)
        address = re.sub(r"\s{2,}", " ", address)

        obj_office.address = address
        obj_office.voice = phone
        obj_office.fax = fax
def process_item(self, item):
    committee_name = item.text_content()

    # only scrape joint coms on senate scrape
    if (
        "Joint" in committee_name
        or "Task Force" in committee_name
        or "Conference" in committee_name
    ):
        self.skip()

    committee_name = remove_comm(committee_name)
    committee_name = committee_name.strip()

    if "Subcommittee" in committee_name:
        name = committee_name.replace("Subcommittee on ", "").replace(
            ", Subcommittee", ""
        )
        parent = remove_comm(
            XPath("..//..//preceding-sibling::a").match(item)[0].text_content()
        )
        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
            classification="subcommittee",
            parent=parent,
        )
    else:
        com = ScrapeCommittee(name=committee_name, chamber=self.chamber)

    # We can construct a URL that would make scraping easier, as opposed to the
    # link that is directly given
    comm_link = item.get("href").replace("https://www.house.mo.gov/", "")
    source = f"https://www.house.mo.gov/MemberGridCluster.aspx?filter=compage&category=committee&{comm_link}"

    return HouseCommitteeDetail(com, source=URL(source, timeout=30))
class HouseParties(HtmlListPage):
    source = (
        "https://lrl.texas.gov/legeLeaders/members/membersearch.cfm?leg=87&chamber=H"
    )
    selector = XPath('//table[@id="tableToSort"]/tbody', num_items=1)

    def process_page(self):
        tds = self.root.xpath(
            '//table[@id="tableToSort"]//td[contains(@class, "results")]',
        )
        party_map = {"D": "Democratic", "R": "Republican"}
        parties = {}

        for td_index, td in enumerate(tds):
            # name, district, and party sit in the 0th, 2nd, and 6th columns
            if td_index % 9 == 0:
                name = td.text_content().strip()
            if td_index % 9 == 2:
                district = td.text_content().strip()
            if td_index % 9 == 6:
                party_code = td.text_content().strip()
                if len(party_code) > 1:
                    party_code = re.search(r"[A-Z]", party_code)[0]
                if party_code == "":
                    continue
                party = party_map[party_code]
                parties[district] = {"name": name, "party": party}

        return parties
def process_page(self): com = self.input com.add_source(self.source.url) com.add_link(self.source.url, note="homepage") # a few committees don't have chair positions try: chair_role = ( CSS(".c-chair-block--position") .match_one(self.root) .text_content() .lower() ) chair_name = CSS(".c-chair--title").match_one(self.root).text_content() com.add_member(chair_name, chair_role) except SelectorError: pass try: for p in XPath( "//div[contains(@class, 'c-senators-container')]//div[@class='view-content']/div[contains(@class, 'odd') or contains(@class, 'even')]" ).match(self.root): name = CSS(".nys-senator--name").match_one(p).text_content() role = CSS(".nys-senator--position").match_one(p).text_content().lower() if role == "": role = "member" com.add_member(name, role) except SelectorError: pass return com
class HouseComList(HtmlPage):
    source = "https://www.myfloridahouse.gov/Sections/Committees/committees.aspx"
    selector = XPath("//a[contains(@href, 'committeesdetail.aspx')]")

    def process_page(self):
        # don't use list page because we need to look back at prior element
        parent = None
        chamber = "lower"

        for item in self.selector.match(self.root):
            cssclass = item.attrib.get("class", "")
            name = item.text_content().strip()

            if "parentcommittee" in cssclass:
                parent = None
                chamber = "lower"

            comm = ScrapeCommittee(
                name=name, classification="committee", chamber=chamber, parent=parent
            )
            yield HouseComDetail(comm, source=item.attrib["href"])

            # parent for next time
            if "parentcommittee" in cssclass:
                parent = comm._id
                chamber = None
def process_page(self):
    com = self.input

    roles = XPath("//*[@id='form1']/div/div/div/div/div[1]/text()").match(self.root)

    chair_member = (
        CSS("#form1 div div div div div a").match(self.root)[0].text_content().strip()
    )
    chair_role = roles[0].replace(":", "").strip()
    com.add_member(chair_member, chair_role)

    vice_chair_member = (
        CSS("#form1 div div div div div a").match(self.root)[1].text_content().strip()
    )
    vice_chair_role = roles[1].replace(":", "").strip()
    com.add_member(vice_chair_member, vice_chair_role)

    members = CSS("#form1 div div.card-body div a").match(self.root)[7:]
    for mem in members:
        member = mem.text_content().strip()
        role_mem = "Member"
        com.add_member(member, role_mem)

    return com
class SenateCommitteeList(HtmlListPage):
    source = URL("http://senate.ca.gov/committees")
    selector = XPath("//h2/../following-sibling::div//a")

    def process_item(self, item):
        comm_name = XPath("text()").match_one(item)
        if comm_name in ["Teleconference How-To Information", "Legislative Process"]:
            self.skip()

        comm_url = XPath("@href").match_one(item)

        if comm_name.startswith("Joint"):
            com = ScrapeCommittee(
                name=comm_name, classification="committee", chamber="legislature"
            )
        elif comm_name.startswith("Subcommittee"):
            parent_comm = (
                item.getparent()
                .getparent()
                .getparent()
                .getparent()
                .getchildren()[0]
                .text_content()
            )
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="upper",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(
                name=comm_name, classification="committee", chamber="upper"
            )

        com.add_source(self.source.url)
        com.add_source(comm_url)
        com.add_link(comm_url, note="homepage")

        return ChooseType(com, source=URL(comm_url))
def test_xml_list_page():
    p = XmlListPage(source=SOURCE)
    p.selector = XPath("//item/text()")
    p.response = Response(
        "<resp><item>one</item><item>two</item><item>three</item></resp>"
    )
    p.postprocess_response()
    data = list(p.process_page())
    assert data == ["one", "two", "three"]
def test_html_list_page():
    p = HtmlListPage(source=SOURCE)
    p.selector = XPath("//li/text()")
    p.response = Response("<ul><li>one</li><li>two</li><li>three</li></ul>")
    p.postprocess_response()
    data = list(p.process_page())
    assert len(data) == 3
    assert data == ["one", "two", "three"]
def process_page(self):
    com = self.input

    try:
        chair = (
            XPath("//h5[text()='Chair']")
            .match_one(self.root)
            .getnext()
            .text_content()
            .strip()
        )
        chair = re.search(r"(Senator|Representative)\s(.+)", chair).groups()[1]
        com.add_member(chair, "Chair")
    except SelectorError:
        pass

    try:
        vice_chair = (
            XPath("//h5[text()='Vice-Chair']")
            .match_one(self.root)
            .getnext()
            .text_content()
            .strip()
        )
        vice_chair = re.search(
            r"(Senator|Representative)\s(.+)", vice_chair
        ).groups()[1]
        com.add_member(vice_chair, "Vice-Chair")
    except SelectorError:
        pass

    try:
        additional_members = (
            XPath("//h5[text()='Additional Members']")
            .match_one(self.root)
            .getnext()
            .getchildren()
        )
        for member in additional_members:
            member = member.text_content().strip()
            member = re.search(r"(Senator|Representative)\s(.+)", member).groups()[1]
            com.add_member(member, "member")
    except SelectorError:
        pass

    try:
        extra_info = CSS("section.content strong").match(self.root)
        for title in extra_info:
            position = title.text_content().strip()
            name = title.tail.strip().lstrip(":").strip()
            com.extras[position] = name
    except SelectorError:
        pass

    if not com.members:
        raise SkipItem("empty committee")

    return com
class LegList(JsonListPage):
    source = list_url()
    selector = XPath("//LegislativeMemberSummary/Details")

    def process_item(self, item):
        url = item["Details"]
        return LegDetail(source=url)
def process_page(self):
    image = XPath("//img[contains(@src, '/photo')]").match_one(self.root).get("src")

    p = ScrapePerson(
        name=self.input.name,
        state="ia",
        chamber=self.input.chamber,
        party=self.input.party,
        district=self.input.district,
        email=self.input.email,
        image=image,
    )

    p.add_source(self.source.url)
    p.add_source(self.input.url)

    try:
        for link in CSS(".link_list a").match(self.root):
            url = link.get("href")
            if re.search("leaving?", url):
                url = url.replace("https://www.legis.iowa.gov/leaving?forward=", "")
            # only prepend a scheme if the link has neither http:// nor https://
            if not (re.search("http://", url) or re.search("https://", url)):
                url = "http://" + url
            p.add_link(url)
    except SelectorError:
        pass

    table = XPath("//div[@class='legisIndent divideVert']//td//text()").match(
        self.root
    )

    # field names like "cell phone" sit at the even indices,
    # and the value for each field sits at the following odd index
    fields = list(map(self.get_field, table[0::2]))
    extra = table[1::2]

    num_of_fields = range(len(fields))

    for i in num_of_fields:
        if fields[i] == "Legislative Email":
            continue
        p.extras[fields[i].lower()] = extra[i].strip()

    return p
class HouseSearchPage(HtmlListPage):
    """
    House committee roll calls are not available on the Senate's
    website. Furthermore, the House uses an internal ID system in
    its URLs, making accessing those pages non-trivial.

    This will fetch all the House committee votes for the given bill,
    and add the votes to that object.
    """

    input_type = Bill
    example_input = Bill(
        "HB 1", "2020", "title", chamber="upper", classification="bill"
    )
    selector = XPath('//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')

    def get_source_from_input(self):
        url = "https://www.myfloridahouse.gov/Sections/Bills/bills.aspx"
        # Keep the digits and all following characters in the bill's ID
        bill_number = re.search(r"^\w+\s(\d+\w*)$", self.input.identifier).group(1)
        session_number = {
            "2022D": "96",
            "2022C": "95",
            "2022": "93",
            "2021B": "94",
            "2021A": "92",
            "2021": "90",
            "2020": "89",
            "2019": "87",
            "2018": "86",
            "2017A": "85",
            "2017": "83",
            "2016": "80",
            "2015C": "82",
            "2015B": "81",
            "2015A": "79",
            "2015": "76",
            "2014O": "78",
            "2014A": "77",
            "2016O": "84",
        }[self.input.legislative_session]
        form = {
            "Chamber": "B",
            "SessionId": session_number,
            "BillNumber": bill_number,
        }
        return url + "?" + urlencode(form)

    def process_item(self, item):
        return HouseBillPage(self.input, source=item)