Python CSSの例、spatula.selectors.CSS Pythonの例

コード例 #1

0

ファイルを表示

class SenateDetail(HtmlPage):
    """Detail page for a single upper-chamber member on oksenate.gov.

    The page URL is read from the ``"url"`` key of the input, and
    ``get_data`` turns the fetched page into a ``Person``.
    """

    # selectors for the individual pieces of the biography page
    name_css = CSS(".field--name-title")
    image_css = CSS(".bSenBio__media-btn")
    district_css = CSS(".bDistrict h2")
    address_css = CSS(".bSenBio__address p")
    phone_css = CSS(".bSenBio__tel a")
    contact_link_sel = SimilarLink(
        r"https://oksenate.gov/contact-senator\?sid=")

    def get_source_from_input(self):
        """Return the URL to fetch, taken from the ``"url"`` key of the input."""
        return self.input["url"]

    def get_data(self):
        """Extract the member's details from the page and return a ``Person``.

        Raises:
            ValueError: if no ``.bSenBio__infoIt`` element mentions "Party:".
                Without this check the missing value surfaced as an
                ``UnboundLocalError`` when building the ``Person``.
        """
        party = None
        for bio in CSS(".bSenBio__infoIt").match(self.root):
            bio_text = bio.text_content()
            if "Party:" in bio_text:
                # if several entries mention a party, the last one wins
                party = bio_text.split(":")[1].strip()
        if party is None:
            raise ValueError("no 'Party:' entry found on senator page")

        p = Person(
            name=self.name_css.match_one(self.root).text,
            state="ok",
            chamber="upper",
            party=party,
            image=self.image_css.match_one(self.root).get("href"),
            # second whitespace-separated token of the district heading
            # (presumably "District N" -- verify against the live page)
            district=self.district_css.match_one(
                self.root).text.strip().split()[1],
        )
        p.capitol_office.address = self.address_css.match_one(self.root).text
        # NOTE(review): confirm `phone` is the attribute the office object
        # expects for a telephone number
        p.capitol_office.phone = self.phone_css.match_one(self.root).text
        p.add_link(
            self.contact_link_sel.match_one(self.root).get("href"),
            "Contact Form")

        return p

コード例 #2

0

ファイルを表示

ファイル: hi.py プロジェクト: resistbot/people

    def process_item(self, item):
        """Build a ``Person`` from one element of the legislator listing.

        The second ``<a>`` inside ``item`` supplies the last name and the
        member URL; every other field is read from the element whose id ends
        with the matching ``self.LABELS`` suffix.
        """
        try:
            link = CSS("a").match(item)[1]
        except SelectorError:
            # assumes self.skip() does not return normally; otherwise `link`
            # would be unbound below
            self.skip()

        data = {"last_name": link.text_content(), "url": link.get("href")}
        data.update(
            (key, CSS(f"[id$={label}]").match_one(item).text_content().strip())
            for key, label in self.LABELS.items()
        )

        party_names = {"(D)": "Democratic", "(R)": "Republican"}
        party = party_names[data["party"]]
        address = f"Hawaii State Capitol, Room {data['room']}"
        if data["chamber"] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        p = Person(
            name=f"{data['first_name']} {data['last_name']}",
            state="hi",
            chamber=chamber,
            district=data["district"],
            given_name=data["first_name"],
            family_name=data["last_name"],
            party=party,
            email=data["email"],
        )

        office = p.capitol_office
        office.address = address
        office.voice = data["voice"]
        office.fax = data["fax"]

        # the member URL doubles as both the source and the public link
        member_url = data["url"]
        p.add_source(member_url)
        p.add_link(member_url)
        return p

コード例 #3

0

ファイルを表示

ファイル: mi.py プロジェクト: resistbot/people

    def process_item(self, item):
        """Build a ``Person`` from one listing element with six children.

        The children are unpacked, in order, as member, party, district,
        contact link, phone and office.
        """
        (member_td, party_td, district_td,
         contact_td, phone_td, office_td) = item.getchildren()

        name = member_td.text_content()
        district = district_td.text_content()

        # vacant districts are listed with a placeholder name; skip them
        if "Interim District" in name:
            self.skip()

        # the member cell and the contact cell each hold a single link
        leg_url = CSS("a").match_one(member_td).get("href")
        contact_url = CSS("a").match_one(contact_td).get("href")
        # URL pattern based on observation elsewhere on senate.michigan.gov
        suffix = ord_suffix(district)
        image_url = f"https://senate.michigan.gov/_images/{district}{suffix}.jpg"

        p = Person(
            **split_name(name),
            state="mi",
            chamber="upper",
            district=district,
            party=self.PARTY_MAP[party_td.text],
            image=image_url,
        )

        office = p.capitol_office
        office.voice = str(phone_td.text_content())
        office.address = str(office_td.text_content())

        p.add_source(self.source.url)
        p.add_link(leg_url)
        p.add_link(contact_url, note="Contact")
        return p

コード例 #4

0

ファイルを表示

    def process_page(self):
        """Build a ``Person`` for a House member from an okhouse.gov page.

        Reads name, party and district through the ``*_css`` selectors on
        ``self``, derives the contact-form URL from the page URL and, when
        the page has a "Capitol Address:" heading, fills in the capitol
        office details.

        Returns:
            The populated ``Person``.
        """
        # drop the first whitespace-separated token of the name label
        name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
        p = Person(
            name=name,
            state="ok",
            # okhouse.gov pages describe House members, i.e. the lower
            # chamber; this was previously mislabeled as "upper"
            chamber="lower",
            party=self.party_css.match_one(self.root).text,
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        p.image = self.image_selector.match_one(self.root).get("href")

        # the contact form lives at the same URL with a different page name
        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District=")
        p.add_link(contact_url, note="Contact Form")

        # capitol address: only trusted when the first heading says so
        check_capitol_address = CSS(".districtheadleft").match(
            self.root)[0].text_content().strip()
        if check_capitol_address == "Capitol Address:":
            capitol_address_div = (CSS(".districtheadleft + div").match(
                self.root)[0].text_content().strip().splitlines())
            # every line but the last is address; the last line is the phone
            p.capitol_office.address = "; ".join(
                [ln.strip() for ln in capitol_address_div[:-1]])
            # NOTE(review): confirm `phone` is the attribute the office
            # object expects for a telephone number
            p.capitol_office.phone = capitol_address_div[-1].strip()
        return p

コード例 #5

0

ファイルを表示

ファイル: oh.py プロジェクト: resistbot/people

    def process_page(self):
        """Complete a lower-chamber ``Person`` with capitol contact details.

        The basic fields come from ``self.input``; address, phone and fax are
        read from the last ``.member-info-bar-module`` element on the page.
        """
        partial = self.input
        p = Person(
            state="oh",
            chamber="lower",
            district=partial.district,
            name=partial.name,
            party=partial.party,
            image=partial.image,
        )
        p.add_source(self.input.url)
        p.add_link(self.input.url)

        # the last info-bar module holds the contact details
        modules = CSS(".member-info-bar-module").match(self.root)
        for value_div in CSS(".member-info-bar-value").match(modules[-1]):
            text = value_div.text_content()

            if ", OH" in text:
                # the address is spread over the first child's text and the
                # tails of the first two children; stitch it back together
                kids = value_div.getchildren()
                pieces = (
                    kids[0].text.strip(),
                    kids[0].tail.strip(),
                    kids[1].tail.strip(),
                )
                p.capitol_office.address = "; ".join(pieces)
                continue

            if "Phone:" in text:
                p.capitol_office.voice = text.split(": ")[1]
                continue

            if "Fax:" in text:
                p.capitol_office.fax = text.split(": ")[1]

        return p

コード例 #6

0

ファイルを表示

class LegPage(HtmlPage):
    """Detail page for one member of a single-chamber legislature (state "ne")."""

    name_css = CSS("h1.mt-0")
    district_css = CSS(".col-9 h2")
    image_css = CSS("img#sen-image")
    address_css = CSS("address")

    def get_source_from_input(self):
        """Return the input unchanged; it is used directly as the source."""
        return self.input

    def process_page(self):
        """Parse the page into a ``Person`` with a capitol address and phone."""
        raw_name = self.name_css.match_one(self.root).text
        name = raw_name.replace("Sen. ", "").strip()
        district = self.district_css.match_one(self.root).text.split()[1]
        image = self.image_css.match_one(self.root).get("src")
        block_text = self.address_css.match_one(self.root).text_content()

        # The <address> block looks like:
        #   Room 11th Floor
        #   P.O. Box 94604
        #   Lincoln, NE 68509
        #   (402) 471-2733
        #   Email: (address)
        # Lines are collected as address until the phone line is reached.
        address_lines = []
        phone = None
        email = None
        collecting_address = True
        for raw_line in block_text.splitlines():
            entry = raw_line.strip()
            if not entry:
                continue
            if entry.startswith("(402)"):
                phone = entry
                collecting_address = False
            if entry.startswith("Email:"):
                email = entry.replace("Email: ", "")
            if collecting_address:
                address_lines.append(entry)

        p = Person(
            chamber="legislature",
            party="Nonpartisan",
            state="ne",
            district=district,
            image=image,
            name=name,
            email=email,
        )
        p.capitol_office.address = "; ".join(address_lines)
        p.capitol_office.voice = phone

        page_url = self.source.url
        p.add_source(page_url)
        p.add_link(page_url)
        return p

コード例 #7

0

ファイルを表示

ファイル: mo.py プロジェクト: resistbot/people

    def process_page(self):
        """Build a lower-chamber ``Person`` from ``self.input`` plus the page photo."""
        party_names = {"D": "Democratic", "R": "Republican"}
        party = party_names[self.input.party]

        photo_el = CSS("img#ContentPlaceHolder1_imgPhoto1").match_one(self.root)
        photo = photo_el.get("src")

        p = Person(
            state="mo",
            party=party,
            image=photo,
            chamber="lower",
            district=self.input.district,
            name=f"{self.input.first_name} {self.input.last_name}",
            given_name=self.input.first_name,
            family_name=self.input.last_name,
        )
        # TODO: record self.input.hometown, e.g. p.extras["hometown"]
        p.capitol_office.voice = self.input.voice
        # fixed building address; only the room number varies per member
        p.capitol_office.address = "; ".join([
            "MO House of Representatives",
            "201 West Capitol Avenue",
            f"Room {self.input.room}",
            "Jefferson City MO 65101 ",
        ])
        p.add_link(self.input.url)
        p.add_source(self.input.url)
        return p

コード例 #8

0

ファイルを表示

ファイル: mi.py プロジェクト: resistbot/people

class RepList(HtmlListPage):
    """Listing of lower-chamber members scraped from house.mi.gov."""

    source = "https://www.house.mi.gov/MHRPublic/frmRepListMilenia.aspx?all=true"
    selector = CSS("#grvRepInfo tr", num_items=111)
    # abbreviations used in the office column and their expansions
    office_names = {
        "SHOB": "South House Office Building",
        "NHOB": "North House Office Building",
        "CB": "Capitol Building",
    }

    def process_item(self, item):
        """Turn one table row (seven cells) into a ``Person``."""
        (website_td, district_td, name_td, party_td,
         office_td, phone_td, email_td) = item.getchildren()

        # the header row is made of <th> cells and carries no member
        if website_td.tag == "th":
            self.skip()

        # spell out the building abbreviations
        office_text = office_td.text_content()
        for short, long_name in self.office_names.items():
            office_text = office_text.replace(short, long_name)

        p = Person(
            name=name_td.text_content(),
            state="mi",
            chamber="lower",
            # district numbers are zero-padded in the table
            district=district_td.text_content().lstrip("0"),
            party=party_td.text_content(),
            email=email_td.text_content(),
        )
        website_url = CSS("a").match_one(website_td).get("href")
        p.add_link(website_url)
        p.add_source(self.source.url)
        p.capitol_office.voice = phone_td.text_content()
        p.capitol_office.address = office_text
        return p

コード例 #9

0

ファイルを表示

ファイル: md.py プロジェクト: resistbot/people

    def process_page(self):
        """Build a ``Person`` (state "md") from a member detail page.

        The chamber is left as ``None``; this method does not determine it.
        """
        # TODO: the "Annapolis Info" and "Interim Info" <dd> blocks (the first
        # <dd> following the <dt> with that text) are not extracted yet.

        # the email link is formatted mailto:<addr>?body...
        mailto_href = SimilarLink("mailto:").match_one(self.root).get("href")
        after_scheme = mailto_href.split(":", 1)[1]
        email = after_scheme.partition("?")[0]

        # drop everything up to the first space of the heading text
        name = CSS("h2").match_one(self.root).text.split(" ", 1)[1]
        image = self.image_sel.match_one(self.root).get("src")
        party = self.extract_dd("Party")
        district = self.extract_dd("District")

        p = Person(
            name=name,
            state="md",
            image=image,
            party=party,
            district=district,
            chamber=None,
            email=email,
        )
        p.add_link(self.source.url)
        p.add_source(self.source.url)
        return p

コード例 #10

0

ファイルを表示

ファイル: mi.py プロジェクト: resistbot/people

    def process_item(self, item):
        """Turn one table row (seven cells) into a lower-chamber ``Person``."""
        (website_td, district_td, name_td, party_td,
         office_td, phone_td, email_td) = item.getchildren()

        # the header row is made of <th> cells and carries no member
        if website_td.tag == "th":
            self.skip()

        # expand the abbreviations listed in self.office_names
        office_text = office_td.text_content()
        for short, long_name in self.office_names.items():
            office_text = office_text.replace(short, long_name)

        p = Person(
            name=name_td.text_content(),
            state="mi",
            chamber="lower",
            # district numbers are zero-padded in the table
            district=district_td.text_content().lstrip("0"),
            party=party_td.text_content(),
            email=email_td.text_content(),
        )
        website_url = CSS("a").match_one(website_td).get("href")
        p.add_link(website_url)
        p.add_source(self.source.url)
        p.capitol_office.voice = phone_td.text_content()
        p.capitol_office.address = office_text
        return p

コード例 #11

0

ファイルを表示

ファイル: oh.py プロジェクト: resistbot/people

    def process_item(self, item):
        """Extract name, district, party, URL and image into a ``HousePartial``."""
        name = CSS(".mediaCaptionTitle").match_one(item).text
        subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text

        # the photo URL is embedded in the element's inline style
        style_attr = CSS(".photo").match_one(item).get("style")
        image_url = background_image_re.findall(style_attr)[0]

        # subtitle looks like "District 25 | D"
        district_label, party_code = subtitle.split(" | ")
        district = district_label.split()[1]
        party_names = {"D": "Democratic", "R": "Republican"}
        party = party_names[party_code]

        return HousePartial(
            name=name,
            district=district,
            party=party,
            url=item.get("href"),
            image=image_url,
        )

コード例 #12

0

ファイルを表示

ファイル: mo.py プロジェクト: resistbot/people

 def process_item(self, item):
     """Convert one table row with eight ``<td>`` cells into a ``HousePartial``.

     Rows without any ``<td>`` and rows whose last-name cell reads "Vacant"
     are skipped.
     """
     cells = CSS("td").match(item, min_items=0, max_items=8)
     if not cells:
         self.skip()

     # the first cell is unused
     (_, last_td, first_td, district_td,
      party_td, town_td, phone_td, room_td) = cells

     if last_td.text_content() == "Vacant":
         self.skip()

     return HousePartial(
         last_name=last_td.text_content(),
         first_name=first_td.text_content(),
         district=int(district_td.text_content()),
         party=party_td.text_content(),
         hometown=town_td.text_content().strip(),
         voice=phone_td.text_content(),
         room=room_td.text_content(),
         # the member link sits inside the last-name cell
         url=CSS("a").match_one(last_td).get("href"),
     )

コード例 #13

0

ファイルを表示

ファイル: ny.py プロジェクト: resistbot/people

    def process_addresses(self, item):
        """Return ``(district, capitol)`` parsed addresses for ``item``.

        Between one and three ``.full-addr`` blocks are accepted. The first
        is parsed as the district address and the last as the capitol address.
        """
        blocks = CSS(".full-addr").match(item, min_items=1, max_items=3)
        first_block, last_block = blocks[0], blocks[-1]

        district = parse_address_lines(block_to_text(first_block))
        # the last block is always the Capitol one
        capitol = parse_address_lines(block_to_text(last_block))
        # TODO: handle district address #2 if it exists

        return district, capitol

コード例 #14

0

ファイルを表示

class HouseDetail(HtmlPage):
    """Detail page for a single House member on okhouse.gov.

    The page URL is read from the ``"url"`` key of the input, and
    ``process_page`` turns the fetched page into a ``Person``.
    """

    image_selector = SimilarLink(
        "https://www.okhouse.gov/Members/Pictures/HiRes/")
    # common id prefix of the label elements on the page
    prefix = "#ctl00_ContentPlaceHolder1_lbl"
    name_css = CSS(prefix + "Name")
    district_css = CSS(prefix + "District")
    party_css = CSS(prefix + "Party")

    def get_source_from_input(self):
        """Return the URL to fetch, taken from the ``"url"`` key of the input."""
        return self.input["url"]

    def process_page(self):
        """Build a ``Person`` for the House member shown on the page.

        Derives the contact-form URL from the page URL and, when the page has
        a "Capitol Address:" heading, fills in the capitol office details.

        Returns:
            The populated ``Person``.
        """
        # drop the first whitespace-separated token of the name label
        name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
        p = Person(
            name=name,
            state="ok",
            # okhouse.gov pages describe House members, i.e. the lower
            # chamber; this was previously mislabeled as "upper"
            chamber="lower",
            party=self.party_css.match_one(self.root).text,
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        p.image = self.image_selector.match_one(self.root).get("href")

        # the contact form lives at the same URL with a different page name
        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District=")
        p.add_link(contact_url, note="Contact Form")

        # capitol address: only trusted when the first heading says so
        check_capitol_address = CSS(".districtheadleft").match(
            self.root)[0].text_content().strip()
        if check_capitol_address == "Capitol Address:":
            capitol_address_div = (CSS(".districtheadleft + div").match(
                self.root)[0].text_content().strip().splitlines())
            # every line but the last is address; the last line is the phone
            p.capitol_office.address = "; ".join(
                [ln.strip() for ln in capitol_address_div[:-1]])
            # NOTE(review): confirm `phone` is the attribute the office
            # object expects for a telephone number
            p.capitol_office.phone = capitol_address_div[-1].strip()
        return p

コード例 #15

0

ファイルを表示

ファイル: hi.py プロジェクト: resistbot/people

class HawaiiLegislators(HtmlListPage):
    """Listing of legislators from capitol.hawaii.gov, one ``Person`` per row."""

    source = FormSource(
        "https://www.capitol.hawaii.gov/members/legislators.aspx", "//form", "Show All"
    )
    selector = CSS("#ctl00_ContentPlaceHolderCol1_GridView1 tr")

    # output field -> suffix of the id of the element holding its value
    LABELS = {
        "first_name": "LabelFirst",
        "party": "LabelParty",
        "room": "LabelRoom2",
        "voice": "LabelPhone2",
        "fax": "LabelFAX2",
        "email": "HyperLinkEmail",
        "chamber": "LabelDis",
        "district": "LabelDistrict",
    }

    def process_item(self, item):
        """Build a ``Person`` from one row of the grid.

        The second ``<a>`` in the row supplies the last name and the member
        URL; every other field is looked up through ``LABELS``.
        """
        try:
            link = CSS("a").match(item)[1]
        except SelectorError:
            # assumes self.skip() does not return normally; otherwise `link`
            # would be unbound below
            self.skip()

        data = {"last_name": link.text_content(), "url": link.get("href")}
        data.update(
            (key, CSS(f"[id$={label}]").match_one(item).text_content().strip())
            for key, label in self.LABELS.items()
        )

        party_names = {"(D)": "Democratic", "(R)": "Republican"}
        party = party_names[data["party"]]
        address = f"Hawaii State Capitol, Room {data['room']}"
        if data["chamber"] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        p = Person(
            name=f"{data['first_name']} {data['last_name']}",
            state="hi",
            chamber=chamber,
            district=data["district"],
            given_name=data["first_name"],
            family_name=data["last_name"],
            party=party,
            email=data["email"],
        )

        office = p.capitol_office
        office.address = address
        office.voice = data["voice"]
        office.fax = data["fax"]

        # the member URL doubles as both the source and the public link
        member_url = data["url"]
        p.add_source(member_url)
        p.add_link(member_url)
        return p

コード例 #16

0

ファイルを表示

    def get_data(self):
        """Extract an upper-chamber member's details and return a ``Person``.

        Raises:
            ValueError: if no ``.bSenBio__infoIt`` element mentions "Party:".
                Without this check the missing value surfaced as an
                ``UnboundLocalError`` when building the ``Person``.
        """
        party = None
        for bio in CSS(".bSenBio__infoIt").match(self.root):
            bio_text = bio.text_content()
            if "Party:" in bio_text:
                # if several entries mention a party, the last one wins
                party = bio_text.split(":")[1].strip()
        if party is None:
            raise ValueError("no 'Party:' entry found on senator page")

        p = Person(
            name=self.name_css.match_one(self.root).text,
            state="ok",
            chamber="upper",
            party=party,
            image=self.image_css.match_one(self.root).get("href"),
            # second whitespace-separated token of the district heading
            # (presumably "District N" -- verify against the live page)
            district=self.district_css.match_one(
                self.root).text.strip().split()[1],
        )
        p.capitol_office.address = self.address_css.match_one(self.root).text
        # NOTE(review): confirm `phone` is the attribute the office object
        # expects for a telephone number
        p.capitol_office.phone = self.phone_css.match_one(self.root).text
        p.add_link(
            self.contact_link_sel.match_one(self.root).get("href"),
            "Contact Form")

        return p

コード例 #17

0

ファイルを表示

ファイル: oh.py プロジェクト: resistbot/people

class HouseList(HtmlListPage):
    """House directory on legislature.ohio.gov; yields one ``HousePartial`` per link."""

    source = "https://www.legislature.ohio.gov/legislators/house-directory"
    selector = CSS(".mediaGrid a[target='_blank']", num_items=99)

    def process_item(self, item):
        """Extract name, district, party, URL and image from one directory link."""
        name = CSS(".mediaCaptionTitle").match_one(item).text
        subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text

        # the photo URL is embedded in the element's inline style
        style_attr = CSS(".photo").match_one(item).get("style")
        image_url = background_image_re.findall(style_attr)[0]

        # subtitle looks like "District 25 | D"
        district_label, party_code = subtitle.split(" | ")
        district = district_label.split()[1]
        party_names = {"D": "Democratic", "R": "Republican"}
        party = party_names[party_code]

        return HousePartial(
            name=name,
            district=district,
            party=party,
            url=item.get("href"),
            image=image_url,
        )

コード例 #18

0

ファイルを表示

ファイル: mo.py プロジェクト: resistbot/people

class HouseList(HtmlListPage):
    """Member grid on house.mo.gov; yields one ``HousePartial`` per data row."""

    # note: there is a CSV, but it requires a bunch of ASP.net hoops to actually get
    source = URL(
        "https://house.mo.gov/MemberGridCluster.aspx?year=2021&code=R+&filter=clear"
    )
    selector = CSS("tr")

    def process_item(self, item):
        """Convert one table row with eight ``<td>`` cells into a ``HousePartial``.

        Rows without any ``<td>`` and rows whose last-name cell reads
        "Vacant" are skipped.
        """
        cells = CSS("td").match(item, min_items=0, max_items=8)
        if not cells:
            self.skip()

        # the first cell is unused
        (_, last_td, first_td, district_td,
         party_td, town_td, phone_td, room_td) = cells

        if last_td.text_content() == "Vacant":
            self.skip()

        return HousePartial(
            last_name=last_td.text_content(),
            first_name=first_td.text_content(),
            district=int(district_td.text_content()),
            party=party_td.text_content(),
            hometown=town_td.text_content().strip(),
            voice=phone_td.text_content(),
            room=room_td.text_content(),
            # the member link sits inside the last-name cell
            url=CSS("a").match_one(last_td).get("href"),
        )

コード例 #19

0

ファイルを表示

ファイル: md.py プロジェクト: resistbot/people

class PersonDetail(HtmlPage):
    """Member detail page (state "md") that is turned into a ``Person``."""

    image_sel = CSS("img.details-page-image-padding")

    def get_source_from_input(self):
        """Return the ``"url"`` entry of the input, coerced to ``str``."""
        return str(self.input["url"])

    def parse_address_block(self, block):
        """Split a text block into address lines, phone numbers and fax numbers.

        Lines are treated as address until one starts with "Phone" or "Fax";
        from then on lines belong to that group. Numbers are pulled out of
        the phone and fax groups with a ``ddd-ddd-dddd`` pattern.

        Returns:
            A dict with ``"address"`` (lines joined by "; "), ``"phones"``
            and ``"faxes"`` (lists of matched numbers).
        """
        grouped = {"address": [], "phone": [], "fax": []}
        current = "address"
        for raw_line in block.splitlines():
            text = raw_line.strip()
            if not text:
                continue
            # a label line switches the group and is itself stored in it
            if text.startswith("Phone"):
                current = "phone"
            elif text.startswith("Fax"):
                current = "fax"
            grouped[current].append(text)

        number_pattern = r"\d{3}-\d{3}-\d{4}"
        phones = [
            number
            for text in grouped["phone"]
            for number in re.findall(number_pattern, text)
        ]
        faxes = [
            number
            for text in grouped["fax"]
            for number in re.findall(number_pattern, text)
        ]

        return {
            "address": "; ".join(grouped["address"]),
            "phones": phones,
            "faxes": faxes,
        }

    def extract_dd(self, name):
        """Return the text of the first ``<dd>`` following the ``<dt>`` whose text is ``name``."""
        selector = XPath(f"//dt[text()='{name}']/following-sibling::dd[1]")
        return selector.match_one(self.root).text_content()

    def process_page(self):
        """Build a ``Person`` from the page.

        The chamber is left as ``None``; this method does not determine it.
        """
        # TODO: the "Annapolis Info" and "Interim Info" <dd> blocks (the first
        # <dd> following the <dt> with that text) are not extracted yet.

        # the email link is formatted mailto:<addr>?body...
        mailto_href = SimilarLink("mailto:").match_one(self.root).get("href")
        after_scheme = mailto_href.split(":", 1)[1]
        email = after_scheme.partition("?")[0]

        # drop everything up to the first space of the heading text
        name = CSS("h2").match_one(self.root).text.split(" ", 1)[1]
        image = self.image_sel.match_one(self.root).get("src")
        party = self.extract_dd("Party")
        district = self.extract_dd("District")

        p = Person(
            name=name,
            state="md",
            image=image,
            party=party,
            district=district,
            chamber=None,
            email=email,
        )
        p.add_link(self.source.url)
        p.add_source(self.source.url)
        return p

コード例 #20

0

ファイルを表示

ファイル: ny.py プロジェクト: resistbot/people

class AssemblyList(HtmlListPage):
    """Member listing on assembly.state.ny.us; yields one ``Person`` per section."""

    source = URL("https://assembly.state.ny.us/mem/")
    selector = CSS("section.mem-item", num_items=150)
    dependencies = {"party_mapping": PartyAugmentation()}

    def process_addresses(self, item):
        """Return ``(district, capitol)`` parsed addresses for ``item``.

        Between one and three ``.full-addr`` blocks are accepted. The first
        is parsed as the district address and the last as the capitol address.
        """
        blocks = CSS(".full-addr").match(item, min_items=1, max_items=3)
        first_block, last_block = blocks[0], blocks[-1]

        district = parse_address_lines(block_to_text(first_block))
        # the last block is always the Capitol one
        capitol = parse_address_lines(block_to_text(last_block))
        # TODO: handle district address #2 if it exists

        return district, capitol

    def process_item(self, item):
        """Build a lower-chamber ``Person`` from one member section."""
        # the element id is the zero-padded district number
        district = str(int(item.get("id")))
        image = CSS(".mem-pic a img").match_one(item).get("src")
        name_link = CSS(".mem-name a").match_one(item)

        district_addr, capitol_addr = self.process_addresses(item)

        # email, twitter and facebook are each only sometimes present
        try:
            email = CSS(".mem-email a").match_one(item).text.strip()
        except SelectorError:
            email = ""

        try:
            twitter_icon = CSS(".fa-twitter").match_one(item)
        except SelectorError:
            twitter = ""
        else:
            # the handle is the last path segment of the icon's parent link
            twitter = twitter_icon.getparent().get("href").split("/")[-1]

        try:
            facebook_icon = CSS(".fa-facebook").match_one(item)
        except SelectorError:
            facebook = ""
        else:
            facebook = facebook_icon.getparent().get("href").split("/")[-1]

        party = self.party_mapping[district][1]

        p = Person(
            state="ny",
            chamber="lower",
            image=image,
            party=party,
            district=district,
            name=name_link.text.strip(),
            email=email,
        )
        p.add_link(url=name_link.get("href"))
        p.add_source(url=name_link.get("href"))
        if twitter:
            p.ids["twitter"] = twitter
        if facebook:
            p.ids["facebook"] = facebook

        p.district_office.address = district_addr["address"]
        p.district_office.voice = district_addr["phone"]
        p.district_office.fax = district_addr["fax"]
        p.capitol_office.address = capitol_addr["address"]
        p.capitol_office.voice = capitol_addr["phone"]
        p.capitol_office.fax = capitol_addr["fax"]
        return p

コード例 #21

0

ファイルを表示

ファイル: ny.py プロジェクト: resistbot/people

 def find_rows(self):
     """Return the rows of the first ``table.wikitable`` with at least 150 rows.

     Returns ``None`` when no table on the page is that large.
     """
     for table in CSS("table.wikitable").match(self.root):
         candidate_rows = CSS("tr").match(table)
         if len(candidate_rows) < 150:
             continue
         return candidate_rows
     return None

コード例 #22

0

ファイルを表示

ファイル: ny.py プロジェクト: resistbot/people

    def process_item(self, item):
        """Build a lower-chamber ``Person`` (state "ny") from one member element."""
        # the element id is the zero-padded district number
        district = str(int(item.get("id")))
        image = CSS(".mem-pic a img").match_one(item).get("src")
        name_link = CSS(".mem-name a").match_one(item)

        district_addr, capitol_addr = self.process_addresses(item)

        # email, twitter and facebook are each only sometimes present
        try:
            email = CSS(".mem-email a").match_one(item).text.strip()
        except SelectorError:
            email = ""

        try:
            twitter_icon = CSS(".fa-twitter").match_one(item)
        except SelectorError:
            twitter = ""
        else:
            # the handle is the last path segment of the icon's parent link
            twitter = twitter_icon.getparent().get("href").split("/")[-1]

        try:
            facebook_icon = CSS(".fa-facebook").match_one(item)
        except SelectorError:
            facebook = ""
        else:
            facebook = facebook_icon.getparent().get("href").split("/")[-1]

        party = self.party_mapping[district][1]

        p = Person(
            state="ny",
            chamber="lower",
            image=image,
            party=party,
            district=district,
            name=name_link.text.strip(),
            email=email,
        )
        p.add_link(url=name_link.get("href"))
        p.add_source(url=name_link.get("href"))
        if twitter:
            p.ids["twitter"] = twitter
        if facebook:
            p.ids["facebook"] = facebook

        p.district_office.address = district_addr["address"]
        p.district_office.voice = district_addr["phone"]
        p.district_office.fax = district_addr["fax"]
        p.capitol_office.address = capitol_addr["address"]
        p.capitol_office.voice = capitol_addr["phone"]
        p.capitol_office.fax = capitol_addr["fax"]
        return p