Esempio n. 1
0
    def process_page(self):
        """Add image, district office details and title to ``self.input``.

        Reads the parsed page in ``self.root`` and mutates ``self.input``:

        * ``image`` is set to the ``src`` of the first ``<img valign="top">``.
        * ``district_office.address`` / ``district_office.voice`` are filled
          from the text following the ``District:`` label inside
          ``div#wrapleftcolr``, when that label exists.
        * If no voice number was found, the text after a ``District Phone``
          ``<i>`` label is used instead, when that label exists.
        * ``extras["title"]`` is set from the first ``<b>`` in
          ``div#wrapleftcolr`` unless it is just an office label or empty.

        Returns:
            The same object as ``self.input``, after mutation.
        """
        p = self.input

        p.image = XPath("//img[@valign = 'top']").match(self.root)[0].get("src")

        try:
            district = XPath(
                "//div[@id = 'wrapleftcolr']/b[contains(text(), 'District:')]"
            ).match_one(self.root)
        except SelectorError:
            # No "District:" label on this page; leave the office untouched.
            pass
        else:
            address_parts = []
            for sibling in district.itersiblings():
                # The office details live in the text *after* each sibling
                # element (its tail), not inside the elements themselves.
                text = (sibling.tail or "").strip()
                if not text:
                    continue
                if re.search(r"\(\d{3}\)\s\s?\d{3}-\d{4}", text):
                    p.district_office.voice = text
                else:
                    address_parts.append(text)
            p.district_office.address = " ".join(address_parts)

        if p.district_office.voice == "":
            try:
                distr_phone = XPath(
                    "//*[@id='wrapleftcolr']/i[contains(text(), 'District Phone')]"
                ).match_one(self.root)
            except SelectorError:
                # No fallback phone label either; voice stays empty.
                pass
            else:
                # tail is None when nothing follows the label, so guard it
                # the same way the sibling loop above does.
                p.district_office.voice = (distr_phone.tail or "").strip()

        title = CSS("div#wrapleftcolr b").match(self.root)[0].text_content().strip()
        # The first <b> is only a title when it is not one of the office
        # labels that otherwise occupy that position.
        if title not in ["Capitol Office:", "District:", ""]:
            p.extras["title"] = title

        return p
Esempio n. 2
0
    def process_page(self):
        """Add image, offices, social media ids and education to ``self.input``.

        Reads the parsed page in ``self.root`` and mutates ``self.input``:

        * ``image`` is set from the ``src`` of the seventh ``<img>``.
        * Every ``<address>`` element is parsed into an address, phone and
          fax. The first one (upper chamber) or last one (lower chamber) is
          stored on ``capitol_office``; every other one is stored on
          ``district_office`` (a later one overwrites an earlier one).
        * Links in the member-bio social widget populate ``ids.facebook``,
          ``ids.twitter``, ``ids.instagram``, ``ids.youtube`` or
          ``extras["website"]``.
        * ``extras["Education"]`` is set to a list of non-empty entries when
          the education heading for the member's chamber is present.

        Returns:
            The same object as ``self.input``, after mutation.
        """
        p = self.input

        # NOTE(review): depends on the portrait being the 7th <img> on the
        # page; raises IndexError if the layout has fewer images.
        p.image = CSS("img").match(self.root)[6].get("src")

        addresses = CSS("address").match(self.root)
        last_index = len(addresses) - 1
        for num, address in enumerate(addresses):
            addr_parts = []
            phone = None
            fax = None
            for raw_line in XPath("text()").match(address):
                line = raw_line.strip()
                if re.search(r"(Senator|Hon\.)", line):
                    # Addressee line, not part of the postal address.
                    continue
                elif re.search(r"(FAX|Fax|fax)", line):
                    fax = line
                elif re.search(r"\(\d{3}\)\s\d{3}-\d{4}", line):
                    phone = line
                else:
                    # A single text node may span several physical lines.
                    addr_parts.extend(part.strip() for part in line.split("\n"))

            is_capitol = (p.chamber == "upper" and num == 0) or (
                p.chamber == "lower" and num == last_index
            )
            office = p.capitol_office if is_capitol else p.district_office

            office.address = " ".join(addr_parts).strip()
            if phone:
                office.voice = fix_phone(phone)
            if fax:
                # The loose test above also accepts lines that merely contain
                # "fax"; only use the line when it really has a "Fax: <number>"
                # shape instead of crashing on a failed match.
                fax_match = re.search(r"(FAX|Fax|fax):\s(.+)", fax)
                if fax_match:
                    office.fax = fix_phone(fax_match.group(2))

        social_links = CSS("div.Widget.MemberBio-SocialLinks a").match(self.root)
        for link in social_links:
            href = link.get("href")
            if not href:
                # Anchor without a target: nothing to classify.
                continue
            segments = href.split("/")
            last = segments[-1]

            if re.search(
                r"(enewsletters|library|pacapitol|news|(C|c)ontact|linkedin|vimeo|email|feed|google|RSS|protect|sk=wall)",
                href,
            ):
                continue
            elif re.search(r"(F|f)acebook", href):
                # Skip a trailing slash or a purely numeric/query last segment.
                if last == "" or not re.search(r"[A-Za-z]", last):
                    p.ids.facebook = segments[-2]
                else:
                    p.ids.facebook = last
            elif re.search(r"twitter", href):
                handle = segments[-2] if last == "" else last
                p.ids.twitter = handle.lstrip("@")
            elif re.search(r"instagram", href):
                if last == "" or re.search(r"hl=en", last):
                    p.ids.instagram = segments[-2]
                else:
                    p.ids.instagram = last
            elif re.search(r"youtube", href):
                if last == "" or re.search(r"featured", last):
                    p.ids.youtube = segments[-2]
                else:
                    p.ids.youtube = last
            else:
                p.extras["website"] = href

        try:
            education = []
            if p.chamber == "lower":
                heading = XPath(
                    "/html/body/div/section/div/div[2]/div/div[3]/h4[contains(text(), 'Education')]"
                ).match_one(self.root)
                # Entries are bare text after the heading and after each
                # following sibling; tail is None when no text follows.
                first_entry = (heading.tail or "").strip()
                if first_entry:
                    education.append(first_entry)
                for sib in heading.itersiblings():
                    if sib.text_content().strip() == "Military Service":
                        break
                    entry = (sib.tail or "").strip()
                    if entry:
                        education.append(entry)
            else:
                heading = XPath(
                    "//*[@id='Mbr-Bio']/h4[contains(text(), 'Attended')]"
                ).match_one(self.root)
                for sib in heading.itersiblings():
                    entry = sib.text_content().strip()
                    if entry in ("Career", "Military Service"):
                        break
                    if entry:
                        education.append(entry)
            # Collect first, assign once: avoids appending to a key that was
            # never created when the first entry happens to be blank.
            if education:
                p.extras["Education"] = education
        except SelectorError:
            # No education heading on this page.
            pass

        return p