def test_full_person():
    person = ScrapePerson("Tom Sawyer")
    person.add_identifier("1")
    person.add_name("Tommy", start_date="1880")
    person.add_contact_detail(type="phone",
                              value="555-555-1234",
                              note="this is fake")
    person.add_link("http://example.com/link")
    person.add_source("http://example.com/source")

    # import person
    pd = person.as_dict()
    PersonImporter("jid").import_data([pd])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert "ocd-person" in p.id
    assert p.name == person.name

    assert p.identifiers.all()[0].identifier == "1"
    assert p.identifiers.all()[0].scheme == ""

    assert p.other_names.all()[0].name == "Tommy"
    assert p.other_names.all()[0].start_date == "1880"

    assert p.contact_details.all()[0].type == "phone"
    assert p.contact_details.all()[0].value == "555-555-1234"
    assert p.contact_details.all()[0].note == "this is fake"

    assert p.links.all()[0].url == "http://example.com/link"
    assert p.sources.all()[0].url == "http://example.com/source"
Example #2
    def handle_list_item(self, item):
        name = " ".join(item.xpath(".//text()"))
        name = re.sub(r"\s+", " ", name).replace(" ,", ",").strip()

        if "Vacant" in name:
            return

        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        if party == "Democrat":
            party = "Democratic"

        leg_url = item.get("href")

        name = fix_name(name)
        leg = Person(
            name=name,
            district=district,
            party=party,
            primary_org="upper",
            role="Senator",
        )
        leg.add_link(leg_url)
        leg.add_source(self.url)
        leg.add_source(leg_url)

        self.scrape_page(SenDetail, leg_url, obj=leg)

        return leg
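# `fix_name` is imported from elsewhere and not shown here. Example #24 below
# performs the same normalization inline by flipping a "Last, First" name, so
# a minimal sketch along those lines (an assumption, not the scraper's actual
# helper) might be:
def fix_name(name):
    # Flip "Last, First[, Suffix]" into "First Last[, Suffix]".
    parts = name.split(", ")
    parts[:2] = parts[1::-1]  # reverse the first two components
    return " ".join(parts)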
Example #3
    def scrape_member_page(self, chamber, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
        ):
            img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"]
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            if "Vacant" in full_name:
                continue

            homepage = homepage.attrib["href"]
            party = data.xpath(".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)

            if re.search(r"(Leader|Whip|Speaker)", office_lines[0]):
                office_lines.pop(0)

            office = "\n".join(office_lines)
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District", "").strip()
            else:
                district = re.findall(r"\d+\.png", legislator.attrib["style"])[
                    -1
                ].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()
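            # Build the email address from the zero-padded district number,
            # e.g. district 4 -> rep04@ohiohouse.gov or sd04@ohiosenate.gov.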
            email = (
                "rep{0:0{width}}@ohiohouse.gov"
                if chamber == "lower"
                else "sd{0:0{width}}@ohiosenate.gov"
            ).format(int(district), width=2)

            leg = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=img,
            )

            leg.add_contact_detail(type="address", value=office, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_contact_detail(type="email", value=email, note="Capitol Office")

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
    def scrape_legislator(self, chamber, name, url):
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        district = (page.xpath('//h1[contains(., "DISTRICT")]/text()').pop().
                    split()[1].strip().lstrip("0"))

        party = page.xpath("//h2").pop().text_content()
        party = re.search(r"\((R|D|I)[ \-\]]", party).group(1)

        if party == "D":
            party = "Democratic"
        elif party == "R":
            party = "Republican"
        elif party == "I":
            party = "Independent"

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib["src"]

        leg = Person(name,
                     district=district,
                     party=party,
                     image=photo_url,
                     primary_org=chamber)
        leg.add_link(url)
        leg.add_source(url)
        self.scrape_offices(leg, page)

        yield leg
Example #5
    def scrape_member(self, chamber, link):
        name = link.text.strip()
        leg_url = link.get("href")
        district = link.xpath("string(../../td[3])")
        party = link.xpath("string(../../td[4])")

        # we get email on the next page now
        # email = link.xpath("string(../../td[5])")

        if party == "Democrat":
            party = "Democratic"
        elif party == "No Party Specified":
            party = "Independent"

        pid = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
        photo_url = ("https://www.legis.iowa.gov/photo"
                     "?action=getPhoto&ga=%s&pid=%s" %
                     (self.latest_session(), pid))

        leg = Person(
            name=name,
            primary_org=chamber,
            district=district,
            party=party,
            image=photo_url,
        )

        leg.add_link(leg_url)
        leg.add_source(leg_url)

        leg_page = lxml.html.fromstring(self.get(link.attrib["href"]).text)
        self.scrape_member_page(leg, leg_page)
        yield leg
Example #6
    def scrape_lower(self, chamber):
        url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx"
        table = ["website", "district", "name", "party", "location", "phone", "email"]

        data = self.get(url).text
        doc = lxml.html.fromstring(data)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            tds = row.xpath(".//td")
            if len(tds) == 0:
                continue
            metainf = {}
            for i in range(0, len(table)):
                metainf[table[i]] = tds[i]
            district = str(int(metainf["district"].text_content().strip()))
            party = metainf["party"].text_content().strip()
            phone = metainf["phone"].text_content().strip()
            email = metainf["email"].text_content().strip()
            name = metainf["name"].text_content().strip()
            if name == "Vacant" or re.match(r"^District \d{1,3}$", name):
                self.warning(
                    "District {} appears vacant, and will be skipped".format(district)
                )
                continue
            leg_url = metainf["website"].xpath("./a")[0].attrib["href"]

            office = metainf["location"].text_content().strip()
            office = re.sub(
                " HOB",
                " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933",
                office,
            )
            office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office)

            try:
                photo_url = self.get_photo_url(leg_url)[0]
            except (scrapelib.HTTPError, IndexError):
                photo_url = ""
                self.warning("no photo url for %s", name)

            person = Person(
                name=name,
                district=district,
                party=abbr[party],
                primary_org="lower",
                image=photo_url,
            )

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(
                type="address", value=office, note="Capitol Office"
            )
            person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            person.add_contact_detail(type="email", value=email, note="Capitol Office")

            yield person
    def scrape_chamber(self, chamber):
        client = ApiClient(self)
        session = self.latest_session()
        base_url = "http://iga.in.gov/legislative"
        api_base_url = "https://api.iga.in.gov"
        chamber_name = "senate" if chamber == "upper" else "house"
        r = client.get("chamber_legislators",
                       session=session,
                       chamber=chamber_name)
        all_pages = client.unpaginate(r)
        for leg in all_pages:
            firstname = leg["firstName"]
            lastname = leg["lastName"]
            party = leg["party"]
            link = leg["link"]
            api_link = api_base_url + link
            html_link = base_url + link.replace("legislators/",
                                                "legislators/legislator_")
            try:
                html = get_with_increasing_timeout(self,
                                                   html_link,
                                                   fail=True,
                                                   kwargs={"verify": False})
            except scrapelib.HTTPError:
                self.logger.warning("Legislator's page is not available.")
                continue
            doc = lxml.html.fromstring(html.text)
            doc.make_links_absolute(html_link)
            address, phone = doc.xpath("//address")
            address = address.text_content().strip()
            address = "\n".join([ln.strip() for ln in address.split("\n")])
            phone = phone.text_content().strip()
            try:
                district = (doc.xpath("//span[@class='district-heading']")
                            [0].text.lower().replace("district", "").strip())
            except IndexError:
                self.warning("skipping legislator w/o district")
                continue
            image_link = base_url + link.replace("legislators/",
                                                 "portraits/legislator_")
            legislator = Person(
                primary_org=chamber,
                district=district,
                name=" ".join([firstname, lastname]),
                party=party,
                image=image_link,
            )
            legislator.add_contact_detail(type="address",
                                          note="Capitol Office",
                                          value=address)
            legislator.add_contact_detail(type="voice",
                                          note="Capitol Office",
                                          value=phone)
            legislator.add_link(html_link)
            legislator.add_source(html_link)
            legislator.add_source(api_link)

            yield legislator
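# `get_with_increasing_timeout` is imported from a shared utils module. Its
# call site above suggests a retry wrapper that lengthens the request timeout
# on each attempt and re-raises on exhaustion when fail=True. A rough sketch
# under those assumptions (the backoff schedule is a guess):
import requests
import scrapelib


def get_with_increasing_timeout(scraper, url, fail=False, kwargs=None):
    kwargs = kwargs or {}
    error = None
    for timeout in (10, 30, 90):  # hypothetical schedule
        try:
            return scraper.get(url, timeout=timeout, **kwargs)
        except (requests.Timeout, scrapelib.HTTPError) as exc:
            error = exc
    if fail:
        raise error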
    def scrape_senator_page(self, chamber, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"
        ):
            img = legislator.xpath(
                ".//div[@class='profileThumbnailBoundingBox']/@style"
            )[0]
            img = img[img.find("(") + 1 : img.find(")")]
            full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(
                ".//a[@class='profileImageLink']")[0].attrib["href"]
            district = legislator.xpath(
                ".//div[@class='profileDistrict']/a/text()")[0].split("#")[1]

            if "Vacant" in full_name:
                continue

            homepage = self.get(homepage_url).text
            page = lxml.html.fromstring(homepage)
            phone = page.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = page.xpath("//div[@class='address']/descendant::*/text()")
            address = "\n".join(address_lines)

            party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
            if "Republican" in party_image:
                party = "Republican"
            elif "Democrat" in party_image:
                party = "Democratic"

            email = (
                "rep{0:0{width}}@ohiohouse.gov"
                if chamber == "lower"
                else "sd{0:0{width}}@ohiosenate.gov"
            ).format(int(district), width=2)

            leg = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                image=img,
                party=party,
            )

            leg.add_contact_detail(type="address", value=address, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_contact_detail(type="email", value=email, note="Capitol Office")

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Example #9
    def scrape_rep(self, url):

        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        main = page.xpath('//div[@id="main-info"]')[0]
        if "Resigned" in main.text_content():
            print("Member resigned {}".format(url))
            return  # don't yield anything

        if "Deceased" in main.text_content():
            print("Member is deceased {}".format(url))
            return  # don't yield anything

        name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
        name = re.sub(r"\s+", " ", name)
        district_number = page.xpath(
            '//span[contains(text(), "House District:")]'
            "/following-sibling::span/text()")[0].strip()
        # remove anything after first whitespace
        district_number = re.sub(r"\s.*", "", district_number.strip())

        email = None
        email_content = page.xpath(
            '//a[./i[contains(@class,"fa-envelope")]]/text()')
        if email_content and email_content[0].strip():
            email = email_content[0].strip()

        photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

        party = self.get_rep_table_by_header(page,
                                             "Party Affiliation").text.strip()
        party = _party_map[party[0]]  # standardize

        main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
        address = [t.strip() for t in main_p_text if t.strip()][0]

        person = Person(
            name=name,
            district=district_number,
            primary_org="lower",
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="District Office")
        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Office")

        person.add_link(url)
        person.add_source(url)

        yield person
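# `_party_map` is a module-level lookup keyed on the first letter of the party
# string (note `party[0]` above). Judging from the letter-to-name branches in
# `scrape_legislator` earlier, it is presumably:
_party_map = {
    "D": "Democratic",
    "R": "Republican",
    "I": "Independent",
}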
def test_basic_invalid_person():
    bob = Person("Bob B. Johnson")
    bob.add_source(url="http://example.com")
    bob.validate()

    bob.name = None

    with pytest.raises(ScrapeValueError):
        bob.validate()
Example #11
    def get_member(self, session, chamber, kpid):
        url = "%smembers/%s" % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)["content"]

        party = content["PARTY"]
        if party == "Democrat":
            party = "Democratic"

        slug = {
            "2013-2014": "b2013_14",
            "2015-2016": "b2015_16",
            "2017-2018": "b2017_18",
            "2019-2020": "b2019_20",
        }[session]
        leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug,
                                                                      kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            (photo_url,) = legislator_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content["FULLNAME"]))
            leg_url = ""
            photo_url = ""

        person = Person(
            name=content["FULLNAME"],
            district=str(content["DISTRICT"]),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {"occupation": content["OCCUPATION"]}

        address = "\n".join([
            "Room {}".format(content["OFFICENUM"]),
            "Kansas State Capitol Building",
            "300 SW 10th St.",
            "Topeka, KS 66612",
        ])

        note = "Capitol Office"
        person.add_contact_detail(type="address", value=address, note=note)
        person.add_contact_detail(type="email",
                                  value=content["EMAIL"],
                                  note=note)
        if content["OFFPH"]:
            person.add_contact_detail(type="voice",
                                      value=content["OFFPH"],
                                      note=note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
Example #12
    def handle_list_item(self, row):
        if not row["First Name"]:
            return
        name = "{} {}".format(row["First Name"], row["Last Name"])
        party = PARTIES[row["Party"]]
        leg = Person(
            name=name,
            district=row["District"].lstrip("0"),
            party=party,
            primary_org="upper",
            role="Senator",
            image=self.extra_info[name]["image"],
        )
        leg.add_link(self.extra_info[name]["url"])
        leg.add_contact_detail(type="voice",
                               value=self.extra_info[name]["office_phone"],
                               note="capitol")
        if "email" in self.extra_info[name]:
            leg.add_contact_detail(type="email",
                                   value=self.extra_info[name]["email"],
                                   note="capitol")

        row["Zipcode"] = row["Zipcode"].strip()
        # Accommodate multiple address-column naming conventions.
        address1_fields = [row.get("Address"), row.get("Office Building")]
        address2_fields = [row.get("Address2"), row.get("Office Address")]
        row["Address"] = next((a for a in address1_fields if a is not None),
                              False)
        row["Address2"] = next((a for a in address2_fields if a is not None),
                               False)

        if (a in row["Address2"] for a in
            ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]):
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
                **row)
            if "Rm. Number" in row:
                address = "{0} {1}".format(row["Rm. Number"], address)
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="capitol")
        elif row["Address2"]:
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
                **row)
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="district")
        else:
            address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="district")

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
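# A hypothetical illustration of the next() fallback used above: it selects
# the first address column that is present (not None), or False when every
# candidate is missing.
row = {"Address": None, "Office Building": "100 State Office Building"}
address1_fields = [row.get("Address"), row.get("Office Building")]
first = next((a for a in address1_fields if a is not None), False)
assert first == "100 State Office Building"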
Example #13
    def _scrape_legislator(self, row, chamber):
        name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        name = " ".join([
            line.strip() for line in name_cell.text_content().split("\n")
            if len(line.strip()) > 0
        ])

        party_letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = dict(D="Democratic", R="Republican")[party_letter]

        chamber_abbr = self._chamber_map[chamber]
        district = (row.xpath('./td[@class="rosterCell seatCell"]'
                              "/text()")[0].replace(chamber_abbr, "").strip())
        try:
            email = (row.xpath('./td[@class="rosterCell emailCell"]'
                               "/a/@href")[0].replace("mailto:", "").strip())
        except IndexError:
            email = None

        phone = (row.xpath('./td[@class="rosterCell phoneCell"]'
                           "/text()")[0].strip() or None)

        details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"])
        response = self.get(details_url)
        details_page = lxml.html.fromstring(response.text)

        address_lines = (details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]')[0].text_content().replace(
                "Address", "").split("\n"))
        address = "\n".join(
            [line.strip() for line in address_lines if len(line.strip()) > 0])

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")
        if phone is not None:
            legislator.add_contact_detail(type="voice",
                                          value=phone,
                                          note="Capitol Office")

        if email is not None:
            legislator.add_contact_detail(type="email",
                                          value=email,
                                          note="E-mail")

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
    def scrape_senator(self, district):
        link = "https://legislature.maine.gov/District-{}".format(district)
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
        title = main.xpath("h1")[0].text
        # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
        title_match = re.match(
            r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title)
        _, name, party = title_match.groups()
        name = re.sub(r"\s+", " ", name.strip())
        party = _party_map[party]

        image_url = address = phone = email = None

        for p in main.xpath("p"):
            if p.xpath(".//img") and not image_url:
                image_url = p.xpath(".//img/@src")[0]
                continue
            field, _, value = p.text_content().partition(":")
            value = value.strip()
            if field in ("Address", "Mailing Address"):
                address = value
            elif field in ("Phone", "Home Phone"):
                phone = value
            elif field == "Email":
                email = value

        person = Person(
            name=name,
            district=district,
            image=image_url,
            primary_org="upper",
            party=party,
        )

        person.add_link(link)
        person.add_source(link)

        if address:
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="District Office")

        if phone:
            person.add_contact_detail(type="voice",
                                      value=clean_phone(phone),
                                      note="District Phone")
        person.add_contact_detail(type="email",
                                  value=email,
                                  note="District Email")

        yield person
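# `clean_phone` is defined elsewhere; a minimal sketch, assuming it simply
# strips a raw phone string down to digits and reformats it (the exact
# normalization is a guess):
import re


def clean_phone(phone):
    digits = re.sub(r"\D", "", phone)
    if len(digits) == 10:
        return "{}-{}-{}".format(digits[:3], digits[3:6], digits[6:])
    return phone.strip()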
Example #15
def test_save_object_basics():
    # ensure that save object dumps a file
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    p.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    # ensure object is saved in right place
    filename = "person_" + p._id + ".json"
    assert filename in s.output_names["person"]
    json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY)
Example #16
    def scrape_lower_legislator(self, url, leg_info):
        page = self.lxmlize(url)

        name = page.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip()
        if name.startswith("District ") or name.startswith("Vacant "):
            self.warning("Seat is vacant: {}".format(name))
            return

        photo = page.xpath(
            '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"]
        party_flags = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent",
        }
        party_info = page.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
        )[0].strip()
        party = party_flags[party_info]
        try:
            email = page.xpath(
                '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
            )[0].strip()
        except IndexError:
            email = None
        district = leg_info["dist"].replace("Dist", "").strip()

        person = Person(name=name,
                        party=party,
                        district=district,
                        primary_org="lower",
                        image=photo)

        contacts = [
            (leg_info["office"], "address"),
            (leg_info["phone"], "voice"),
            (email, "email"),
        ]

        for value, key in contacts:
            if value:
                person.add_contact_detail(type=key,
                                          value=value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
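# `self.lxmlize(url)` appears throughout these examples. Judging from the
# inline fetch-and-parse pattern in `scrape_member_page` and
# `scrape_legislator`, it is presumably equivalent to:
import lxml.html


def lxmlize(self, url):
    # Fetch the page, parse the HTML, and absolutize relative links.
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    return page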
def test_person_add_membership_org():
    p = Person("Bob B. Bear")
    p.add_source("http://example.com")
    o = Organization("test org", classification="unknown")
    p.add_membership(o,
                     role="member",
                     start_date="2007",
                     end_date=datetime.date(2015, 5, 8))
    assert len(p._related) == 1
    p._related[0].validate()
    assert p._related[0].person_id == p._id
    assert p._related[0].organization_id == o._id
    assert p._related[0].start_date == "2007"
    assert p._related[0].end_date == datetime.date(2015, 5, 8)
Example #18
def test_save_related():
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    p.add_source("http://example.com")
    o = Organization("Chicago Bulls", classification="committee")
    o.add_source("http://example.com")
    p._related.append(o)

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    assert json_dump.mock_calls == [
        mock.call(p.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(o.as_dict(), mock.ANY, cls=mock.ANY),
    ]
Example #19
    def scrape_chamber(self, chamber):
        leg_list_url = utils.urls["people"][chamber]
        page = self.get(leg_list_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(leg_list_url)

        # email addresses are hidden away on a separate page now, at
        # least for Senators
        contact_url = utils.urls["contacts"][chamber]
        contact_page = self.get(contact_url).text
        contact_page = lxml.html.fromstring(contact_page)

        for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
            full_name = " ".join(link.text.split(", ")[::-1]).strip()
            full_name = re.sub(r"\s+", " ", full_name)
            district = link.getparent().getnext().tail.strip()
            district = re.search(r"District (\d+)", district).group(1)

            party = link.getparent().tail.strip()[-2]
            if party == "R":
                party = "Republican"
            elif party == "D":
                party = "Democratic"
            elif party == "I":
                party = "Independent"

            url = link.get("href")
            leg_id = url.split("?id=")[1]

            person = Person(name=full_name,
                            district=district,
                            party=party,
                            primary_org=chamber)
            person.add_link(leg_list_url)
            person.add_source(leg_list_url)

            # Scrape email, offices, photo.
            page = self.get(url).text
            doc = lxml.html.fromstring(page)
            doc.make_links_absolute(url)

            email = self.scrape_email_address(contact_page, leg_id)
            self.scrape_offices(url, doc, person, email)
            self.scrape_photo_url(url, doc, person)

            yield person
    def scrape_chamber(self, session):
        session_key = SESSION_KEYS[session]
        legislators_response = self.api_client.get("legislators", session=session_key)

        for legislator in legislators_response:
            url_name = legislator["WebSiteUrl"].split("/")[-1]
            chamber_name = "house" if legislator["Chamber"] == "H" else "senate"
            img = "https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg".format(
                chamber_name, url_name
            )

            party = legislator["Party"]
            if party == "Democrat":
                party = "Democratic"

            person = Person(
                name="{} {}".format(legislator["FirstName"], legislator["LastName"]),
                primary_org={"S": "upper", "H": "lower"}[legislator["Chamber"]],
                party=party,
                district=legislator["DistrictNumber"],
                image=img,
            )
            person.add_link(legislator["WebSiteUrl"])
            person.add_source(legislator["WebSiteUrl"])

            if legislator["CapitolAddress"]:
                person.add_contact_detail(
                    type="address",
                    value=legislator["CapitolAddress"],
                    note="Capitol Office",
                )

            if legislator["CapitolPhone"]:
                person.add_contact_detail(
                    type="voice",
                    value=legislator["CapitolPhone"],
                    note="Capitol Office",
                )

            person.add_contact_detail(
                type="email", value=legislator["EmailAddress"], note="Capitol Office"
            )

            yield person
Example #21
    def handle_list_item(self, item):
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip("0").upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip() for x in item.xpath("./div/text()[normalize-space()]")
            if x.strip()
        ]
        address = "\n".join((info_texts[0], info_texts[1]))

        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=photo_url,
        )
        rep.add_link(url)
        rep.add_contact_detail(type="address", value=address, note="capitol")
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
        rep.add_contact_detail(type="email", value=email, note="capitol")
        rep.add_source(self.url)

        yield rep
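# `validate_phone_number` and `validate_email_address` are imported helpers.
# Hedged sketches (the exact patterns they accept are assumptions):
import re


def validate_phone_number(text):
    # Accept NNN-NNN-NNNN style numbers; the real helper may be looser.
    return bool(re.match(r"^\(?\d{3}\)?[ -]?\d{3}-\d{4}$", text.strip()))


def validate_email_address(text):
    # Deliberately rough: something@something.tld
    return bool(re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", text.strip()))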
Example #22
    def scrape_upper_chamber(self, term):
        url = "http://oksenate.gov/Senators/Default.aspx"
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for a in doc.xpath("//table[@summary]")[0].xpath(
                './/td//a[contains(@href, "biographies")]'):
            tail = a.xpath("..")[0].tail
            if tail:
                district = tail.split()[1]
            else:
                district = a.xpath("../../span")[1].text.split()[1]

            if a.text is None or a.text.strip() == "Vacant":
                self.warning(
                    "District {} appears to be empty".format(district))
                continue
            else:
                match = re.match(r"(.+) \(([A-Z])\)", a.text.strip())
                if match:
                    name, party = match.group(1), self._parties[match.group(2)]
                else:
                    self.warning(
                        "District {} appears to have empty Representative name,party"
                        .format(district))
                    continue

            url = a.get("href")

            person = Person(primary_org="upper",
                            district=district,
                            name=name.strip(),
                            party=party)
            person.add_link(url)
            person.add_source(url)
            self.scrape_upper_offices(person, url)
            yield person
    def scrape_chamber(self, chamber):
        url = {
            "upper":
            "https://legis.delaware.gov/json/Senate/GetSenators",
            "lower":
            "https://legis.delaware.gov/json/House/" + "GetRepresentatives",
        }[chamber]
        source_url = {
            "upper": "https://legis.delaware.gov/Senate",
            "lower": "https://legis.delaware.gov/House",
        }[chamber]

        data = self.post(url).json()["Data"]

        for item in data:
            if item["PersonFullName"] is None:
                # Vacant district
                self.warning("District {} was detected as vacant".format(
                    item["DistrictNumber"]))
                continue

            leg_url = ("https://legis.delaware.gov/" +
                       "LegislatorDetail?personId={}".format(item["PersonId"]))

            doc = self.lxmlize(leg_url)
            image_url = doc.xpath("//img/@src")[0]

            leg = Person(
                name=item["PersonFullName"],
                district=str(item["DistrictNumber"]),
                party=PARTY[item["PartyCode"]],
                primary_org=chamber,
                image=image_url,
            )
            self.scrape_contact_info(leg, doc)
            leg.add_link(leg_url, note="legislator page")
            leg.add_source(source_url, note="legislator list page")
            yield leg
Example #24
    def handle_list_item(self, item):
        name = " ".join(item.xpath(".//text()"))
        name = re.sub(r"\s+", " ", name).replace(" ,", ",").strip()

        if "Vacant" in name:
            return

        district = item.xpath("string(../../td[1])")
        party = item.xpath("string(../../td[2])")
        if party == "Democrat":
            party = "Democratic"

        leg_url = item.get("href")

        parts = name.split(", ")
        parts[:2] = parts[1::-1]  # reverse first two
        name = " ".join(parts)
        leg = Person(
            name=name,
            district=district,
            party=party,
            primary_org="upper",
            role="Senator",
        )

        response = requests.head(leg_url)
        if 300 <= response.status_code < 400:
            leg_url = response.headers["Location"]
            if leg_url.startswith("/"):
                leg_url = "https://www.flsenate.gov" + leg_url

        leg.add_link(leg_url)
        leg.add_source(self.url)
        leg.add_source(leg_url)

        self.scrape_page(SenDetail, leg_url, obj=leg)

        return leg
    def handle_list_item(self, item):
        name = item.text

        lname = name.lower()
        if "resigned" in lname or "vacated" in lname or "retired" in lname:
            return
        if name in CHAMBER_MOVES and (self.chamber != CHAMBER_MOVES[name]):
            return

        name, action, date = clean_name(name)

        leg = Person(name=name)
        leg.add_source(self.url)
        leg.add_source(item.get("href"))
        leg.add_link(item.get("href"))
        yield from self.scrape_page(
            self.detail_page,
            item.get("href"),
            session=self.kwargs["session"],
            committees=self.kwargs["committees"],
            obj=leg,
        )
        yield leg
Example #26
    def scrape(self):
        base_url = "http://news.legislature.ne.gov/dist"

        # there are 49 districts
        for district in range(1, 50):
            rep_url = base_url + str(district).zfill(2)

            full_name = None
            address = None
            phone = None
            email = None
            photo_url = None

            try:
                page = self.lxmlize(rep_url)

                info_node = self.get_node(
                    page,
                    '//div[@class="container view-front"]'
                    '//div[@class="col-sm-4 col-md-3 ltc-col-right"]'
                    '/div[@class="block-box"]',
                )

                full_name = self.get_node(info_node,
                                          "./h2/text()[normalize-space()]")
                full_name = re.sub(r"^Sen\.[\s]+", "", full_name).strip()
                if full_name == "Seat Vacant":
                    continue

                address_node = self.get_node(
                    info_node, './address[@class="feature-content"]')

                email = self.get_node(
                    address_node, './a[starts-with(@href, "mailto:")]/text()')

                contact_text_nodes = self.get_nodes(
                    address_node, "./text()[following-sibling::br]")

                address_sections = []
                for text in contact_text_nodes:
                    text = text.strip()

                    if not text:
                        continue

                    phone_match = re.search(r"Phone:", text)

                    if phone_match:
                        phone = re.sub(r"^Phone:[\s]+", "", text)
                        continue

                    # If neither a phone number nor e-mail address.
                    address_sections.append(text)

                address = "\n".join(address_sections)

                photo_url = (
                    "http://www.nebraskalegislature.gov/media/images/blogs"
                    "/dist{:02d}.jpg").format(district)

                # Nebraska is officially nonpartisan.
                party = "Nonpartisan"

                person = Person(
                    name=full_name,
                    district=str(district),
                    party=party,
                    image=photo_url,
                    primary_org="legislature",
                )

                person.add_link(rep_url)
                person.add_source(rep_url)

                note = "Capitol Office"
                person.add_contact_detail(type="address",
                                          value=address,
                                          note=note)
                if phone:
                    person.add_contact_detail(type="voice",
                                              value=phone,
                                              note=note)
                if email:
                    person.add_contact_detail(type="email",
                                              value=email,
                                              note=note)

                yield person
            except scrapelib.HTTPError:
                self.warning("could not retrieve %s" % rep_url)
Example #27
    def scrape_chamber(self, chamber):
        # The URL for each rep is unfindable (by me), and the parts needed to
        # construct it do not appear in the HTML or JS. We can find basic
        # information on the main rep page, and sponsor info on a version of
        # their individual page reached using only their sponsor ID (which we
        # have to scrape from ALISON). We can't get detailed information
        # without another ID, which I have not been able to find.
        if chamber == "upper":
            member_list_url = self._base_url + "Senate/ALSenators.aspx"
            legislator_base_url = self._base_url + "ALSenator.aspx"
        elif chamber == "lower":
            member_list_url = self._base_url + "House/ALRepresentatives.aspx"
            legislator_base_url = self._base_url + "ALRepresentative.aspx"

        page = self.lxmlize(member_list_url)

        legislator_nodes = self.get_nodes(
            page, '//div[@class="container container-main"]/table/tr/td/input')

        legislator_url_template = (legislator_base_url + "?OID_SPONSOR="
                                   "{oid_sponsor}&OID_PERSON={oid_person}")

        for legislator_node in legislator_nodes:
            # Set identifiers internal to AlisonDB.
            # Have to do this to OID_SPONSOR because they don't know
            # how to HTML and I'm making links absolute out of convenience.
            try:
                oid_sponsor = legislator_node.attrib["longdesc"].split("/")[-1]
                oid_person = legislator_node.attrib["alt"]
            except KeyError:
                continue

            legislator_url = legislator_url_template.format(
                oid_sponsor=oid_sponsor, oid_person=oid_person)

            legislator_page = self.lxmlize(legislator_url)

            name_text = self.get_node(
                legislator_page,
                '//span[@id="ContentPlaceHolder1_lblMember"]').text_content()

            # This just makes processing the text easier.
            name_text = name_text.lower()

            # Skip vacant seats.
            if "vacant" in name_text:
                continue

            photo_url = self.get_node(
                legislator_page,
                '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
                "/@src",
            )

            # Another check for vacant seats
            if "VACANT.jpeg" in photo_url or "pending.jpeg" in photo_url:
                continue

            # Removes titles and nicknames.
            name = html.unescape(
                re.sub(r"(?i)(representative|senator|&quot.*&quot)", "",
                       name_text).strip().title())

            # Assemble full name by reversing last name, first name format.
            name_parts = [x.strip() for x in name.split(",")]
            full_name = "{0} {1}".format(name_parts[1], name_parts[0])

            info_node = self.get_node(
                legislator_page,
                '//div[@id="ContentPlaceHolder1_TabSenator_body"]//table',
            )

            district_text = self.get_node(info_node,
                                          "./tr[2]/td[2]").text_content()
            district_text = district_text.replace("&nbsp;", u"")

            if chamber == "upper":
                district = district_text.replace("Senate District", "").strip()
            elif chamber == "lower":
                district = district_text.replace("House District", "").strip()

            party_text = self.get_node(info_node,
                                       "./tr[1]/td[2]").text_content()

            if not full_name.strip() and party_text == "()":
                self.warning(
                    "Found empty seat, for district {}; skipping".format(
                        district))
                continue

            if party_text.strip() in self._parties.keys():
                party = self._parties[party_text.strip()]
            else:
                party = None

            phone_number = (self.get_node(
                info_node, "./tr[4]/td[2]").text_content().strip())

            fax_number = (self.get_node(
                info_node,
                "./tr[5]/td[2]").text_content().strip().replace("\u00a0", ""))

            suite_text = self.get_node(info_node,
                                       "./tr[7]/td[2]").text_content()

            office_address = "{}\n11 S. Union Street\nMontgomery, AL 36130".format(
                suite_text)

            email_address = self.get_node(info_node,
                                          "./tr[11]/td[2]").text_content()

            photo_url = self.get_node(
                legislator_page,
                '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
                "/@src",
            )

            # add basic leg info and main office
            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
                image=photo_url,
            )

            person.add_contact_detail(type="address",
                                      value=office_address,
                                      note="Capitol Office")
            if phone_number:
                person.add_contact_detail(type="voice",
                                          value=phone_number,
                                          note="Capitol Office")
            if fax_number:
                person.add_contact_detail(type="fax",
                                          value=fax_number,
                                          note="Capitol Office")
            if email_address:
                person.add_contact_detail(type="email",
                                          value=email_address,
                                          note="Capitol Office")

            self.add_committees(legislator_page, person, chamber,
                                legislator_url)

            person.add_link(legislator_url)
            person.add_source(legislator_url)
            person.add_source(member_list_url)

            yield person
Example #28
    def scrape_people(self):
        p = Person("Michael Jordan")
        p.add_source("http://example.com")
        yield p
    def scrape_member(self, chamber, member_url):
        member_page = self.get(member_url).text
        doc = lxml.html.fromstring(member_page)
        doc.make_links_absolute(member_url)

        photo_url = doc.xpath('//a[@class="download"]/@href')[0]

        name_pieces = doc.xpath(
            '//div[@class="row profile-top"]/h2/text()')[0].split()

        full_name = " ".join(name_pieces[1:-1]).strip()

        party = name_pieces[-1]
        if party == "(R)":
            party = "Republican"
        elif party == "(D)":
            party = "Democratic"
        elif party == "(I)":
            party = "Independent"

        sidebar = doc.xpath(
            '//div[@class="relativeContent col-sm-4 col-xs-12"]')[0]

        district = sidebar.xpath('//div[@class="circle"]/h3/text()')[0]
        district = district.lstrip("0")

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )
        person.add_source(member_url)
        person.add_link(member_url)

        info = {}
        sidebar_items = iter(sidebar.getchildren())
        for item in sidebar_items:
            if item.tag == "p":
                info[item.text] = next(sidebar_items)

        address = "\n".join(info["Legislative Address"].xpath("./text()"))

        phone = None
        fax = None
        phone_numbers = info["Phone Number(s)"].xpath("./text()")
        for num in phone_numbers:
            kind, num = num.split(": ")
            if kind == "LRC":
                if num.endswith(" (fax)"):
                    fax = num.replace(" (fax)", "")
                else:
                    phone = num

        email = info["Email"].text

        if phone:
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note="Capitol Office")

        if fax:
            person.add_contact_detail(type="fax",
                                      value=fax,
                                      note="Capitol Office")

        if email:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="Capitol Office")

        if address.strip() == "":
            self.warning("Missing Capitol Office!!")
        else:
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Capitol Office")

        yield person
Example #30
    def scrape_chamber(self, chamber=None):
        if chamber == "upper":
            url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Senators.xls"
            rep_type = "Senator"
            contact_url = (
                "http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp"
            )
        elif chamber == "lower":
            url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Representatives.xls"
            rep_type = "Representative"
            contact_url = (
                "http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp"
            )

        contact_page = self.lxmlize(contact_url)
        contact_info_by_district = {}
        for row in contact_page.xpath('//tr[@valign="TOP"]'):
            tds = row.xpath("td")
            (detail_link,) = tds[link_col_ix].xpath(".//a/@href")
            # Ignore the name (2nd col); the spreadsheet name is handled below
            # by a regex I don't want to touch.
            district, _, email, phone = [
                td.text_content().strip() for td in tds[:link_col_ix]
            ]
            contact_info_by_district[district] = {
                "email": email,
                "phone": phone,
                "detail_link": detail_link,
            }

        self.urlretrieve(url, "ri_leg.xls")

        wb = xlrd.open_workbook("ri_leg.xls")
        sh = wb.sheet_by_index(0)

        for rownum in range(1, sh.nrows):
            d = {
                field: sh.cell(rownum, col_num).value
                for field, col_num in excel_mapping.items()
            }

            # Convert float to an int, and then to string, the required format
            district = str(int(d["district"]))
            if d["full_name"].upper() == "VACANT":
                self.warning("District {}'s seat is vacant".format(district))
                continue

            contact_info = contact_info_by_district[district]

            # RI is very fond of First M. Last name formats and
            # they're being misparsed upstream, so fix here
            (first, middle, last) = ("", "", "")
            full_name = re.sub(
                r"^{}(?=\s?[A-Z].*$)".format(rep_type), "", d["full_name"]
            ).strip()
            if re.match(r"^\S+\s[A-Z]\.\s\S+$", full_name):
                (first, middle, last) = full_name.split()

            # Note - if we ever need to speed this up, it looks like photo_url can be mapped
            # from the detail_link a la /senators/Paolino/ -> /senators/pictures/Paolino.jpg
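            # e.g. (hypothetical, per the note above):
            #   photo_url = re.sub(r"/senators/(\w+)/?$",
            #                      r"/senators/pictures/\1.jpg",
            #                      contact_info["detail_link"])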
            detail_page = self.lxmlize(contact_info["detail_link"])
            try:
                (photo_url,) = detail_page.xpath('//div[@class="ms-WPBody"]//img/@src')
            except ValueError:
                photo_url = ""

            person = Person(
                primary_org=chamber,
                district=district,
                name=full_name,
                party=translate[d["party"]],
                image=photo_url,
            )
            person.extras["town_represented"] = d["town_represented"]
            person.extras["name_first"] = first
            person.extras["name_middle"] = middle
            person.extras["name_last"] = last
            person.add_link(contact_info["detail_link"])

            if d["address"]:
                person.add_contact_detail(
                    type="address", value=d["address"], note="District Office"
                )
            if contact_info["phone"]:
                person.add_contact_detail(
                    type="voice", value=contact_info["phone"], note="District Office"
                )
            if contact_info["email"]:
                person.add_contact_detail(
                    type="email", value=contact_info["email"], note="District Office"
                )

            person.add_source(contact_url)
            person.add_source(contact_info["detail_link"])

            yield person
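# `link_col_ix`, `excel_mapping`, and `translate` are module-level tables not
# shown above. The four-way unpacking of tds[:link_col_ix] implies
# link_col_ix == 4; the spreadsheet column positions below are assumptions,
# not taken from the actual file:
link_col_ix = 4
excel_mapping = {
    "district": 0,
    "town_represented": 1,
    "full_name": 2,
    "party": 3,
    "address": 4,
}
translate = {
    "Democrat": "Democratic",
    "Republican": "Republican",
    "Independent": "Independent",
}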