def scrape_member_page(self, chamber, url): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) for legislator in page.xpath( "//div[contains(concat(' ', normalize-space(@class), ' '), " "' memberModule ')]" ): img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"] data = legislator.xpath(".//div[@class='data']")[0] homepage = data.xpath(".//a[@class='black']")[0] full_name = homepage.text_content() if "Vacant" in full_name: continue homepage = homepage.attrib["href"] party = data.xpath(".//span[@class='partyLetter']")[0].text_content() party = {"R": "Republican", "D": "Democratic"}[party] office_lines = data.xpath("child::text()") phone = office_lines.pop(-1) if re.search(r"(Leader|Whip|Speaker)", office_lines[0]): office_lines.pop(0) office = "\n".join(office_lines) h3 = data.xpath("./h3") if len(h3): h3 = h3[0] district = h3.xpath("./br")[0].tail.replace("District", "").strip() else: district = re.findall(r"\d+\.png", legislator.attrib["style"])[ -1 ].split(".", 1)[0] full_name = re.sub(r"\s+", " ", full_name).strip() email = ( "rep{0:0{width}}@ohiohouse.gov" if chamber == "lower" else "sd{0:0{width}}@ohiosenate.gov" ).format(int(district), width=2) leg = Person( name=full_name, district=district, party=party, primary_org=chamber, image=img, ) leg.add_contact_detail(type="address", value=office, note="Capitol Office") leg.add_contact_detail(type="voice", value=phone, note="Capitol Office") leg.add_contact_detail(type="email", value=email, note="Capitol Office") self.scrape_homepage(leg, chamber, homepage) leg.add_source(url) leg.add_link(homepage) yield leg
def test_full_person(): person = ScrapePerson("Tom Sawyer") person.add_identifier("1") person.add_name("Tommy", start_date="1880") person.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") person.add_link("http://example.com/link") person.add_source("http://example.com/source") # import person pd = person.as_dict() PersonImporter("jid").import_data([pd]) # get person from db and assert it imported correctly p = Person.objects.get() assert "ocd-person" in p.id assert p.name == person.name assert p.identifiers.all()[0].identifier == "1" assert p.identifiers.all()[0].scheme == "" assert p.other_names.all()[0].name == "Tommy" assert p.other_names.all()[0].start_date == "1880" assert p.contact_details.all()[0].type == "phone" assert p.contact_details.all()[0].value == "555-555-1234" assert p.contact_details.all()[0].note == "this is fake" assert p.links.all()[0].url == "http://example.com/link" assert p.sources.all()[0].url == "http://example.com/source"
def scrape_lower(self, chamber): url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx" table = ["website", "district", "name", "party", "location", "phone", "email"] data = self.get(url).text doc = lxml.html.fromstring(data) # skip two rows at top for row in doc.xpath('//table[@id="grvRepInfo"]/*'): tds = row.xpath(".//td") if len(tds) == 0: continue metainf = {} for i in range(0, len(table)): metainf[table[i]] = tds[i] district = str(int(metainf["district"].text_content().strip())) party = metainf["party"].text_content().strip() phone = metainf["phone"].text_content().strip() email = metainf["email"].text_content().strip() name = metainf["name"].text_content().strip() if name == "Vacant" or re.match(r"^District \d{1,3}$", name): self.warning( "District {} appears vacant, and will be skipped".format(district) ) continue leg_url = metainf["website"].xpath("./a")[0].attrib["href"] office = metainf["location"].text_content().strip() office = re.sub( " HOB", " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933", office, ) office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office) try: photo_url = self.get_photo_url(leg_url)[0] except (scrapelib.HTTPError, IndexError): photo_url = "" self.warning("no photo url for %s", name) person = Person( name=name, district=district, party=abbr[party], primary_org="lower", image=photo_url, ) person.add_link(leg_url) person.add_source(leg_url) person.add_contact_detail( type="address", value=office, note="Capitol Office" ) person.add_contact_detail(type="voice", value=phone, note="Capitol Office") person.add_contact_detail(type="email", value=email, note="Capitol Office") yield person
def scrape_chamber(self, chamber): client = ApiClient(self) session = self.latest_session() base_url = "http://iga.in.gov/legislative" api_base_url = "https://api.iga.in.gov" chamber_name = "senate" if chamber == "upper" else "house" r = client.get("chamber_legislators", session=session, chamber=chamber_name) all_pages = client.unpaginate(r) for leg in all_pages: firstname = leg["firstName"] lastname = leg["lastName"] party = leg["party"] link = leg["link"] api_link = api_base_url + link html_link = base_url + link.replace("legislators/", "legislators/legislator_") try: html = get_with_increasing_timeout(self, html_link, fail=True, kwargs={"verify": False}) except scrapelib.HTTPError: self.logger.warning("Legislator's page is not available.") continue doc = lxml.html.fromstring(html.text) doc.make_links_absolute(html_link) address, phone = doc.xpath("//address") address = address.text_content().strip() address = "\n".join([ln.strip() for ln in address.split("\n")]) phone = phone.text_content().strip() try: district = (doc.xpath("//span[@class='district-heading']") [0].text.lower().replace("district", "").strip()) except IndexError: self.warning("skipping legislator w/o district") continue image_link = base_url + link.replace("legislators/", "portraits/legislator_") legislator = Person( primary_org=chamber, district=district, name=" ".join([firstname, lastname]), party=party, image=image_link, ) legislator.add_contact_detail(type="address", note="Capitol Office", value=address) legislator.add_contact_detail(type="voice", note="Capitol Office", value=phone) legislator.add_link(html_link) legislator.add_source(html_link) legislator.add_source(api_link) yield legislator
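# The Indiana scraper above calls a get_with_increasing_timeout() helper defined
# elsewhere in the project. A minimal sketch of the retry idea, assuming a
# scrapelib-style scraper whose .get() passes keyword arguments (e.g. verify)
# through to requests; the backoff schedule and the exact meaning of fail= are
# illustrative assumptions, not the project's actual implementation.
import time

import requests


def get_with_increasing_timeout(scraper, url, fail=False, kwargs=None):
    """Retry a GET with progressively longer timeouts before giving up."""
    kwargs = kwargs or {}
    last_error = None
    for timeout in (10, 30, 90):  # assumed backoff schedule
        try:
            return scraper.get(url, timeout=timeout, **kwargs)
        except requests.exceptions.Timeout as exc:
            last_error = exc
            scraper.warning("timed out after %ss on %s, retrying", timeout, url)
            time.sleep(1)
    if fail:
        raise last_error
    return None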
def scrape_senator_page(self, chamber, url): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) for legislator in page.xpath( "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), " "' portraitContainer ')]" ): img = legislator.xpath( ".//div[@class='profileThumbnailBoundingBox']/@style" )[0] img = img[img.find("(") + 1 : img.find(")")] full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0] homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[ 0 ].attrib["href"] district = legislator.xpath(".//div[@class='profileDistrict']" "/a/text()")[ 0 ].split("#")[1] if "Vacant" in full_name: continue homepage = self.get(homepage_url).text page = lxml.html.fromstring(homepage) phone = page.xpath("//div[@class='phone']/span/text()")[0] address_lines = page.xpath("//div[@class='address']/descendant::*/text()") address = "\n".join(address_lines) party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0] if "Republican" in party_image: party = "Republican" elif "Democrat" in party_image: party = "Democratic" email = ( "rep{0:0{width}}@ohiohouse.gov" if chamber == "lower" else "sd{0:0{width}}@ohiosenate.gov" ).format(int(district), width=2) leg = Person( name=full_name, district=district, primary_org=chamber, image=img, party=party, ) leg.add_contact_detail(type="address", value=address, note="Capitol Office") leg.add_contact_detail(type="voice", value=phone, note="Capitol Office") leg.add_contact_detail(type="email", value=email, note="Capitol Office") leg.add_source(url) leg.add_link(homepage_url) yield leg
def scrape_rep(self, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    main = page.xpath('//div[@id="main-info"]')[0]
    if "Resigned" in main.text_content():
        self.info("Member resigned {}".format(url))
        return  # don't yield anything
    if "Deceased" in main.text_content():
        self.info("Member is deceased {}".format(url))
        return  # don't yield anything

    name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
    name = re.sub(r"\s+", " ", name)

    district_number = page.xpath(
        '//span[contains(text(), "House District:")]'
        "/following-sibling::span/text()")[0].strip()
    # remove anything after first whitespace
    district_number = re.sub(r"\s.*", "", district_number.strip())

    email = None
    email_content = page.xpath(
        '//a[./i[contains(@class,"fa-envelope")]]/text()')
    if email_content and email_content[0].strip():
        email = email_content[0].strip()

    photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

    party = self.get_rep_table_by_header(page, "Party Affiliation").text.strip()
    party = _party_map[party[0]]  # standardize

    main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
    address = [t.strip() for t in main_p_text if t.strip()][0]

    person = Person(
        name=name,
        district=district_number,
        primary_org="lower",
        party=party,
        image=photo_url,
    )
    person.add_contact_detail(type="address", value=address, note="District Office")
    if email:
        person.add_contact_detail(type="email", value=email, note="District Office")

    person.add_link(url)
    person.add_source(url)
    yield person
def handle_list_item(self, row):
    if not row["First Name"]:
        return
    name = "{} {}".format(row["First Name"], row["Last Name"])
    party = PARTIES[row["Party"]]
    leg = Person(
        name=name,
        district=row["District"].lstrip("0"),
        party=party,
        primary_org="upper",
        role="Senator",
        image=self.extra_info[name]["image"],
    )
    leg.add_link(self.extra_info[name]["url"])
    leg.add_contact_detail(type="voice",
                           value=self.extra_info[name]["office_phone"],
                           note="capitol")
    if "email" in self.extra_info[name]:
        leg.add_contact_detail(type="email",
                               value=self.extra_info[name]["email"],
                               note="capitol")

    row["Zipcode"] = row["Zipcode"].strip()

    # Accommodate multiple address column naming conventions.
    address1_fields = [row.get("Address"), row.get("Office Building")]
    address2_fields = [row.get("Address2"), row.get("Office Address")]
    row["Address"] = next((a for a in address1_fields if a is not None), False)
    row["Address2"] = next((a for a in address2_fields if a is not None), False)

    # Capitol office if the second address line names one of the capitol-complex
    # buildings; otherwise treat the address as a district office.
    if row["Address2"] and any(
            a in row["Address2"]
            for a in ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]):
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        if "Rm. Number" in row:
            address = "{0} {1}".format(row["Rm. Number"], address)
        leg.add_contact_detail(type="address", value=address, note="capitol")
    elif row["Address2"]:
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")
    else:
        address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")

    leg.add_source(self.url)
    leg.add_source(self._html_url)
    return leg
def scrape_lower_legislator(self, url, leg_info): page = self.lxmlize(url) name = page.xpath( '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip() if name.startswith("District ") or name.startswith("Vacant "): self.warning("Seat is vacant: {}".format(name)) return photo = page.xpath( '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"] party_flags = { "Democrat": "Democratic", "Republican": "Republican", "Independent": "Independent", } party_info = page.xpath( '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()' )[0].strip() party = party_flags[party_info] try: email = page.xpath( '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()' )[0].strip() except IndexError: email = None district = leg_info["dist"].replace("Dist", "").strip() person = Person(name=name, party=party, district=district, primary_org="lower", image=photo) contacts = [ (leg_info["office"], "address"), (leg_info["phone"], "voice"), (email, "email"), ] for value, key in contacts: if value: person.add_contact_detail(type=key, value=value, note="District Office") person.add_source(url) person.add_link(url) yield person
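# Several of these scrapers call self.lxmlize(url), a small convenience method
# on the shared scraper base class. A plausible sketch of what it does,
# assuming a scrapelib-style .get() and lxml; the project's actual helper may
# differ in details.
import lxml.html


def lxmlize(scraper, url):
    """Fetch a URL and return an lxml tree with links made absolute."""
    page = lxml.html.fromstring(scraper.get(url).text)
    page.make_links_absolute(url)
    return page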
def handle_list_item(self, item):
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    address = "\n".join((info_texts[0], info_texts[1]))

    # Initialize to None so a failed validation does not leave these unbound.
    phone = None
    email = None

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)

    yield rep
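# handle_list_item() above depends on validate_phone_number() and
# validate_email_address() helpers defined elsewhere in the module. A minimal
# sketch, assuming simple regex checks; the real validators may be stricter.
import re

PHONE_RE = re.compile(r"^\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$")
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")


def validate_phone_number(text):
    """Return True if text looks like a 10-digit US phone number."""
    return bool(PHONE_RE.match(text.strip()))


def validate_email_address(text):
    """Return True if text looks like a plausible email address."""
    return bool(EMAIL_RE.match(text.strip()))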
def get_member(self, session, chamber, kpid): url = "%smembers/%s" % (ksapi.url, kpid) content = json.loads(self.get(url).text)["content"] party = content["PARTY"] if party == "Democrat": party = "Democratic" slug = { "2013-2014": "b2013_14", "2015-2016": "b2015_16", "2017-2018": "b2017_18", "2019-2020": "b2019_20", }[session] leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug, kpid) try: legislator_page = self.lxmlize(leg_url) (photo_url, ) = legislator_page.xpath('//img[@class="profile-picture"]/@src') except scrapelib.HTTPError: self.warning("{}'s legislator bio page not found".format( content["FULLNAME"])) leg_url = "" photo_url = "" person = Person( name=content["FULLNAME"], district=str(content["DISTRICT"]), primary_org=chamber, party=party, image=photo_url, ) person.extras = {"occupation": content["OCCUPATION"]} address = "\n".join([ "Room {}".format(content["OFFICENUM"]), "Kansas State Capitol Building", "300 SW 10th St.", "Topeka, KS 66612", ]) note = "Capitol Office" person.add_contact_detail(type="address", value=address, note=note) person.add_contact_detail(type="email", value=content["EMAIL"], note=note) if content["OFFPH"]: person.add_contact_detail(type="voice", value=content["OFFPH"], note=note) person.add_source(url) person.add_link(leg_url) yield person
def _scrape_legislator(self, row, chamber): name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0] name = " ".join([ line.strip() for line in name_cell.text_content().split("\n") if len(line.strip()) > 0 ]) party_letter = row.xpath( './td[@class="rosterCell partyCell"]/text()')[0].strip() party = dict(D="Democratic", R="Republican")[party_letter] chamber_abbr = self._chamber_map[chamber] district = (row.xpath('./td[@class="rosterCell seatCell"]' "/text()")[0].replace(chamber_abbr, "").strip()) try: email = (row.xpath('./td[@class="rosterCell emailCell"]' "/a/@href")[0].replace("mailto:", "").strip()) except IndexError: email = None phone = (row.xpath('./td[@class="rosterCell phoneCell"]' "/text()")[0].strip() or None) details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"]) response = self.get(details_url) details_page = lxml.html.fromstring(response.text) address_lines = (details_page.xpath( '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]' '/p[contains(text(), "Address")]')[0].text_content().replace( "Address", "").split("\n")) address = "\n".join( [line.strip() for line in address_lines if len(line.strip()) > 0]) legislator = Person(name=name, district=district, party=party, primary_org=chamber) legislator.add_contact_detail(type="address", value=address, note="Capitol Office") if phone is not None: legislator.add_contact_detail(type="voice", value=phone, note="Capitol Office") if email is not None: legislator.add_contact_detail(type="email", value=email, note="E-mail") legislator.add_link(details_url) legislator.add_source(self._roster_url) yield legislator
def scrape_senator(self, district): link = "https://legislature.maine.gov/District-{}".format(district) page = lxml.html.fromstring(self.get(link).text) page.make_links_absolute(link) main = page.xpath('//div[@id="main"]/div[@id="content"]')[0] title = main.xpath("h1")[0].text # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)... title_match = re.match( r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title) _, name, party = title_match.groups() name = re.sub(r"\s+", " ", name.strip()) party = _party_map[party] image_url = address = phone = email = None for p in main.xpath("p"): if p.xpath(".//img") and not image_url: image_url = p.xpath(".//img/@src")[0] continue field, _, value = p.text_content().partition(":") value = value.strip() if field in ("Address", "Mailing Address"): address = value elif field in ("Phone", "Home Phone"): phone = value elif field == "Email": email = value person = Person( name=name, district=district, image=image_url, primary_org="upper", party=party, ) person.add_link(link) person.add_source(link) if address: person.add_contact_detail(type="address", value=address, note="District Office") if phone: person.add_contact_detail(type="voice", value=clean_phone(phone), note="District Phone") person.add_contact_detail(type="email", value=email, note="District Email") yield person
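# scrape_senator() above relies on a module-level _party_map and a clean_phone()
# helper that are not shown in this section. A minimal sketch of both, assuming
# single-letter party codes and a preference for ###-###-#### phone formatting;
# the exact normalization rules are assumptions.
import re

_party_map = {"D": "Democratic", "R": "Republican", "I": "Independent"}


def clean_phone(raw):
    """Normalize a scraped phone string to ###-###-#### where possible."""
    digits = re.sub(r"\D", "", raw)
    if len(digits) == 10:
        return "{}-{}-{}".format(digits[:3], digits[3:6], digits[6:])
    return raw.strip()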
def table_row_to_legislator_and_profile_url(table_row_element, chamber): """Derive a Legislator from an HTML table row lxml Element, and a link to their profile""" td_elements = table_row_element.xpath("td") ( role_element, name_element, district_element, party_element, phone_element, email_element, ) = td_elements # Name comes in the form Last, First # last_name_first_name = name_element.text_content().strip() # full_name = last_name_first_name_to_full_name(last_name_first_name) full_name = name_element.text_content().strip() if full_name.count(", ") == 1: full_name = " ".join(full_name.split(", ")[::-1]).strip() district = district_element.text_content().strip() party = party_element.text_content().strip() if party == "Democrat": party = "Democratic" elif party == "Unaffiliated": party = "Independent" role = role_element.text_content().strip() address = co_address_from_role(role) phone = phone_element.text_content().strip() email = email_element.text_content().strip() (profile_url, ) = name_element.xpath("a/@href") print(chamber, district, party) legislator = Person(primary_org=chamber, name=full_name, district=district, party=party) legislator.add_contact_detail(type="address", value=address, note="Capitol Office") if phone: legislator.add_contact_detail(type="voice", value=phone, note="Capitol Office") if email: legislator.add_contact_detail(type="email", value=email, note="Capitol Office") return legislator, profile_url
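# table_row_to_legislator_and_profile_url() above calls co_address_from_role(),
# which turns a member's role into a Colorado capitol mailing address. The real
# mapping is not shown here; this is a hypothetical sketch for illustration,
# with placeholder chamber labels rather than actual room assignments.
def co_address_from_role(role):
    """Return a capitol mailing address for the given role (placeholder rooms)."""
    base = "200 E. Colfax Avenue\nDenver, CO 80203"
    if "Senat" in role:
        return "State Capitol, Senate Chamber\n" + base
    return "State Capitol, House Chamber\n" + base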
def scrape_chamber(self, session):
    session_key = SESSION_KEYS[session]
    legislators_response = self.api_client.get("legislators", session=session_key)

    for legislator in legislators_response:
        url_name = legislator["WebSiteUrl"].split("/")[-1]
        chamber_name = "house" if legislator["Chamber"] == "H" else "senate"
        img = "https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg".format(
            chamber_name, url_name
        )

        party = legislator["Party"]
        if party == "Democrat":
            party = "Democratic"

        person = Person(
            name="{} {}".format(legislator["FirstName"], legislator["LastName"]),
            primary_org={"S": "upper", "H": "lower"}[legislator["Chamber"]],
            party=party,
            district=legislator["DistrictNumber"],
            image=img,
        )
        person.add_link(legislator["WebSiteUrl"])
        person.add_source(legislator["WebSiteUrl"])

        if legislator["CapitolAddress"]:
            person.add_contact_detail(
                type="address",
                value=legislator["CapitolAddress"],
                note="Capitol Office",
            )
        if legislator["CapitolPhone"]:
            person.add_contact_detail(
                type="voice",
                value=legislator["CapitolPhone"],
                note="Capitol Office",
            )
        person.add_contact_detail(
            type="email", value=legislator["EmailAddress"], note="Capitol Office"
        )

        yield person
def scrape_chamber(self, chamber=None): if chamber == "upper": url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Senators.xls" rep_type = "Senator" contact_url = ( "http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp" ) elif chamber == "lower": url = "http://www.rilegislature.gov/SiteAssets/MailingLists/Representatives.xls" rep_type = "Representative" contact_url = ( "http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp" ) contact_page = self.lxmlize(contact_url) contact_info_by_district = {} for row in contact_page.xpath('//tr[@valign="TOP"]'): tds = row.xpath("td") (detail_link,) = tds[link_col_ix].xpath(".//a/@href") # Ignore name (2nd col). We have a regex built up below for the spreadsheet name # I don't want to touch district, _, email, phone = [ td.text_content().strip() for td in tds[:link_col_ix] ] contact_info_by_district[district] = { "email": email, "phone": phone, "detail_link": detail_link, } self.urlretrieve(url, "ri_leg.xls") wb = xlrd.open_workbook("ri_leg.xls") sh = wb.sheet_by_index(0) for rownum in range(1, sh.nrows): d = { field: sh.cell(rownum, col_num).value for field, col_num in excel_mapping.items() } # Convert float to an int, and then to string, the required format district = str(int(d["district"])) if d["full_name"].upper() == "VACANT": self.warning("District {}'s seat is vacant".format(district)) continue contact_info = contact_info_by_district[district] # RI is very fond of First M. Last name formats and # they're being misparsed upstream, so fix here (first, middle, last) = ("", "", "") full_name = re.sub( r"^{}(?=\s?[A-Z].*$)".format(rep_type), "", d["full_name"] ).strip() if re.match(r"^\S+\s[A-Z]\.\s\S+$", full_name): (first, middle, last) = full_name.split() # Note - if we ever need to speed this up, it looks like photo_url can be mapped # from the detail_link a la /senators/Paolino/ -> /senators/pictures/Paolino.jpg detail_page = self.lxmlize(contact_info["detail_link"]) try: (photo_url,) = detail_page.xpath('//div[@class="ms-WPBody"]//img/@src') except ValueError: photo_url = "" person = Person( primary_org=chamber, district=district, name=full_name, party=translate[d["party"]], image=photo_url, ) person.extras["town_represented"] = d["town_represented"] person.extras["name_first"] = first person.extras["name_middle"] = middle person.extras["name_last"] = last person.add_link(detail_link) if d["address"]: person.add_contact_detail( type="address", value=d["address"], note="District Office" ) if contact_info["phone"]: person.add_contact_detail( type="voice", value=contact_info["phone"], note="District Office" ) if contact_info["email"]: person.add_contact_detail( type="email", value=contact_info["email"], note="District Office" ) person.add_source(contact_url) person.add_source(contact_info["detail_link"]) yield person
def scrape_member(self, chamber, member_url): member_page = self.get(member_url).text doc = lxml.html.fromstring(member_page) doc.make_links_absolute(member_url) photo_url = doc.xpath('//a[@class="download"]/@href')[0] name_pieces = doc.xpath( '//div[@class="row profile-top"]/h2/text()')[0].split() full_name = " ".join(name_pieces[1:-1]).strip() party = name_pieces[-1] if party == "(R)": party = "Republican" elif party == "(D)": party = "Democratic" elif party == "(I)": party = "Independent" sidebar = doc.xpath( '//div[@class="relativeContent col-sm-4 col-xs-12"]')[0] district = sidebar.xpath('//div[@class="circle"]/h3/text()')[0] district = district.lstrip("0") person = Person( name=full_name, district=district, party=party, primary_org=chamber, image=photo_url, ) person.add_source(member_url) person.add_link(member_url) info = {} sidebar_items = iter(sidebar.getchildren()) for item in sidebar_items: if item.tag == "p": info[item.text] = next(sidebar_items) address = "\n".join(info["Legislative Address"].xpath("./text()")) phone = None fax = None phone_numbers = info["Phone Number(s)"].xpath("./text()") for num in phone_numbers: kind, num = num.split(": ") if kind == "LRC": if num.endswith(" (fax)"): fax = num.replace(" (fax)", "") else: phone = num email = info["Email"].text if phone: person.add_contact_detail(type="voice", value=phone, note="Capitol Office") if fax: person.add_contact_detail(type="fax", value=fax, note="Capitol Office") if email: person.add_contact_detail(type="email", value=email, note="Capitol Office") if address.strip() == "": self.warning("Missing Capitol Office!!") else: person.add_contact_detail(type="address", value=address, note="Capitol Office") yield person
def scrape_chamber(self, chamber):
    # The url for each rep is unfindable (by me), and the parts needed to make
    # it up do not appear in the html or js. We can find basic information on
    # the main rep page, and sponsor info on a version of their individual page
    # called using only their sponsor ID (which we have to scrape from ALISON).
    # We can't get detailed information without another ID, which I have not
    # been able to find.
    if chamber == "upper":
        member_list_url = self._base_url + "Senate/ALSenators.aspx"
        legislator_base_url = self._base_url + "ALSenator.aspx"
    elif chamber == "lower":
        member_list_url = self._base_url + "House/ALRepresentatives.aspx"
        legislator_base_url = self._base_url + "ALRepresentative.aspx"

    page = self.lxmlize(member_list_url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@class="container container-main"]/table/tr/td/input')

    legislator_url_template = (legislator_base_url + "?OID_SPONSOR="
                               "{oid_sponsor}&OID_PERSON={oid_person}")

    html_parser = HTMLParser()

    for legislator_node in legislator_nodes:
        # Set identifiers internal to AlisonDB.
        # Have to do this to OID_SPONSOR because they don't know
        # how to HTML and I'm making links absolute out of convenience.
        try:
            oid_sponsor = legislator_node.attrib["longdesc"].split("/")[-1]
            oid_person = legislator_node.attrib["alt"]
        except KeyError:
            continue

        legislator_url = legislator_url_template.format(
            oid_sponsor=oid_sponsor, oid_person=oid_person)

        legislator_page = self.lxmlize(legislator_url)

        name_text = self.get_node(
            legislator_page,
            '//span[@id="ContentPlaceHolder1_lblMember"]').text_content()

        # This just makes processing the text easier.
        name_text = name_text.lower()

        # Skip vacant seats.
        if "vacant" in name_text:
            continue

        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            "/@src",
        )

        # Another check for vacant seats
        if "VACANT.jpeg" in photo_url or "pending.jpeg" in photo_url:
            continue

        # Removes titles and curly-quoted nicknames.
        name = html_parser.unescape(
            re.sub(r"(?i)(representative|senator|“.*”)", "",
                   name_text).strip().title())

        # Assemble full name by reversing last name, first name format.
        name_parts = [x.strip() for x in name.split(",")]
        full_name = "{0} {1}".format(name_parts[1], name_parts[0])

        info_node = self.get_node(
            legislator_page,
            '//div[@id="ContentPlaceHolder1_TabSenator_body"]//table',
        )

        district_text = self.get_node(info_node, "./tr[2]/td[2]").text_content()
        # Strip non-breaking spaces before parsing the district number.
        district_text = district_text.replace(u"\u00a0", u"")
        if chamber == "upper":
            district = district_text.replace("Senate District", "").strip()
        elif chamber == "lower":
            district = district_text.replace("House District", "").strip()

        party_text = self.get_node(info_node, "./tr[1]/td[2]").text_content()

        if not full_name.strip() and party_text == "()":
            self.warning(
                "Found empty seat for district {}; skipping".format(district))
            continue

        if party_text.strip() in self._parties.keys():
            party = self._parties[party_text.strip()]
        else:
            party = None

        phone_number = (self.get_node(
            info_node, "./tr[4]/td[2]").text_content().strip())

        fax_number = (self.get_node(
            info_node,
            "./tr[5]/td[2]").text_content().strip().replace("\u00a0", ""))

        suite_text = self.get_node(info_node, "./tr[7]/td[2]").text_content()
        office_address = "{}\n11 S. Union Street\nMontgomery, AL 36130".format(
            suite_text)

        email_address = self.get_node(info_node, "./tr[11]/td[2]").text_content()

        photo_url = self.get_node(
            legislator_page,
            '//input[@id="ContentPlaceHolder1_TabSenator_TabLeg_imgLEG"]'
            "/@src",
        )

        # add basic leg info and main office
        person = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type="address",
                                  value=office_address,
                                  note="Capitol Office")
        if phone_number:
            person.add_contact_detail(type="voice",
                                      value=phone_number,
                                      note="Capitol Office")
        if fax_number:
            person.add_contact_detail(type="fax",
                                      value=fax_number,
                                      note="Capitol Office")
        if email_address:
            person.add_contact_detail(type="email",
                                      value=email_address,
                                      note="Capitol Office")

        self.add_committees(legislator_page, person, chamber, legislator_url)

        person.add_link(legislator_url)
        person.add_source(legislator_url)
        person.add_source(member_list_url)

        yield person
def scrape(self): base_url = "http://news.legislature.ne.gov/dist" # there are 49 districts for district in range(1, 50): rep_url = base_url + str(district).zfill(2) full_name = None address = None phone = None email = None photo_url = None try: page = self.lxmlize(rep_url) info_node = self.get_node( page, '//div[@class="container view-front"]' '//div[@class="col-sm-4 col-md-3 ltc-col-right"]' '/div[@class="block-box"]', ) full_name = self.get_node(info_node, "./h2/text()[normalize-space()]") full_name = re.sub(r"^Sen\.[\s]+", "", full_name).strip() if full_name == "Seat Vacant": continue address_node = self.get_node( info_node, './address[@class="feature-content"]') email = self.get_node( address_node, './a[starts-with(@href, "mailto:")]/text()') contact_text_nodes = self.get_nodes( address_node, "./text()[following-sibling::br]") address_sections = [] for text in contact_text_nodes: text = text.strip() if not text: continue phone_match = re.search(r"Phone:", text) if phone_match: phone = re.sub(r"^Phone:[\s]+", "", text) continue # If neither a phone number nor e-mail address. address_sections.append(text) address = "\n".join(address_sections) photo_url = ( "http://www.nebraskalegislature.gov/media/images/blogs" "/dist{:2d}.jpg").format(district) # Nebraska is offically nonpartisan. party = "Nonpartisan" person = Person( name=full_name, district=str(district), party=party, image=photo_url, primary_org="legislature", ) person.add_link(rep_url) person.add_source(rep_url) note = "Capitol Office" person.add_contact_detail(type="address", value=address, note=note) if phone: person.add_contact_detail(type="voice", value=phone, note=note) if email: person.add_contact_detail(type="email", value=email, note=note) yield person except scrapelib.HTTPError: self.warning("could not retrieve %s" % rep_url)
def scrape(self):
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(primary_org=chamber, name=name, district=district, party=party)

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            # Prepend the scheme if the spreadsheet omits it.
            if not legislator_url.startswith("http"):
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"],
            row["room number"],
        )
        # extra_office_fields = dict()

        email = row["email"].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError(
                    "Problematic email found: {}".format(email))

        leg.add_contact_detail(type="address",
                               value=office_address,
                               note="Capitol Office")
        leg.add_contact_detail(type="voice",
                               value=row["capitol phone"],
                               note="Capitol Office")
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type="address",
                                   value=home_address,
                                   note="District Office")
            if row["home phone"].strip():
                leg.add_contact_detail(type="voice",
                                       value=row["home phone"],
                                       note="District Office")

        leg.add_source(leg_url)

        for comm_name in row["committee member1"].split(";"):
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name,
                                       classification="committee",
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
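# The Connecticut scrape() above checks the CSV header row against a
# module-level HEADERS list and then parses the download with open_csv(). A
# minimal sketch of open_csv(), assuming the response body is an ordinary
# comma-delimited file with a header row; the real helper may handle encoding
# or dialect details differently.
import csv
from io import StringIO


def open_csv(response):
    """Wrap an HTTP response in a DictReader keyed by the CSV header row."""
    return csv.DictReader(StringIO(response.text))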
def _parse_person(self, row, chamber, seat_map): # Capture legislator vitals. first_name = row["FirstName"] middle_name = row["MiddleName"] last_name = row["LastName"] full_name = "{} {} {}".format(first_name, middle_name, last_name) full_name = re.sub(r"[\s]{2,}", " ", full_name) if chamber == "lower": district = "{} {}".format(row["County"], int(row["District"])).strip() else: district = str(int(row["District"])).strip() party = self.party_map[row["party"].upper()] email = row["WorkEmail"] if district == "0": self.warning("Skipping {}, district is set to 0".format(full_name)) return person = Person(primary_org=chamber, district=district, name=full_name, party=party) extras = { "first_name": first_name, "middle_name": middle_name, "last_name": last_name, } person.extras = extras if email: office = "Capitol" if email.endswith( "@leg.state.nh.us") else "District" person.add_contact_detail(type="email", value=email, note=office + " Office") # Capture legislator office contact information. district_address = "{}\n{}\n{}, {} {}".format(row["Address"], row["address2"], row["city"], row["State"], row["Zipcode"]).strip() phone = row["Phone"].strip() if not phone: phone = None if district_address: office = "Capitol" if chamber == "upper" else "District" person.add_contact_detail(type="address", value=district_address, note=office + " Office") if phone: office = "Capitol" if "271-" in phone else "District" person.add_contact_detail(type="voice", value=phone, note=office + " Office") # Retrieve legislator portrait. profile_url = None if chamber == "upper": profile_url = self.senate_profile_url.format(row["District"]) elif chamber == "lower": try: seat_number = seat_map[row["seatno"]] profile_url = self.house_profile_url.format(seat_number) except KeyError: pass if profile_url: person.image = self._get_photo(profile_url, chamber) person.add_source(profile_url) return person
def _scrape_senator(self, url, parties): # logger.info(f'Generating senator person object from {url}') """ Returns a Person object representing a member of the upper legislative chamber. """ # Scrape legislator information from roster URL # Example: view-source:https://senate.texas.gov/member.php?d=1 member_page = self.lxmlize(url) photo_url = member_page.xpath('//img[@id="memhead"]/@src')[0] scraped_name_district_text = member_page.xpath( '//div[@class="pgtitle"]/text()')[0] scraped_name, district_text = scraped_name_district_text.split(":") name = " ".join(scraped_name.replace("Senator ", "").split()).strip() district = str(district_text.split()[1]).strip() # Vacant house "members" are named after their district numbers: if re.match(r"^District \d+$", name): return None bio = " ".join(member_page.xpath('//div[@class="bio"]/text()')) party = parties[district] person = Person( name=name, district=district, party=party, primary_org="upper", biography=bio, ) if photo_url is not None: person.image = photo_url person.add_link(url) person.add_source(url) office_ids = [] # Get offices based on table headers for th_tag in member_page.xpath('//table[@class="memdir"]/tr/th'): # logger.warn([th_tag.xpath('text()'),th_tag.xpath('@id')]) id = th_tag.xpath("@id")[0] if th_tag.xpath("@id") else "" label = th_tag.xpath("text()")[0].strip() if th_tag.xpath( "text()") else "" if id != "" and label != "": office_ids.append({"id": id, "label": label}) # logger.warn(office_ids) for office in office_ids: # logger.warn(office) row = member_page.xpath( f'//table[@class="memdir"]/tr/td[@headers="{office["id"]}"]') # A few member pages have broken ids for office listings: if len(row) == 0: row = member_page.xpath( '//table[@class="memdir"]/tr/td[@headers="dDA1"]') if len(row) > 0: details = " ".join(row[0].xpath("text()")).strip() details = details.replace("\r", "").replace("\n", "") # logger.warn(details) # A few member pages have blank office listings: if details == "": continue match = self.address_re.search(details) if match is not None: address = re.sub( " +$", "", match.group().replace("\r", "").replace("\n", ""), flags=re.MULTILINE, ) else: # No valid address found in the details. continue phone_number = extract_phone(details) fax_number = extract_fax(details) if address: person.add_contact_detail(type="address", value=address, note=office["label"]) if phone_number: person.add_contact_detail(type="voice", value=phone_number, note=office["label"]) if fax_number: person.add_contact_detail(type="fax", value=fax_number, note=office["label"]) yield person
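# The Texas senator scraper above (and the representative scraper later in this
# section) leans on extract_phone()/extract_fax() helpers alongside a
# precompiled self.address_re. A minimal sketch of the phone/fax extraction,
# assuming fax numbers are labeled with the word "fax" in the office blurb; the
# project's actual regexes may differ.
import re

PHONE_PATTERN = re.compile(r"\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}")
FAX_PATTERN = re.compile(
    r"fax[:\s]*(\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4})", re.IGNORECASE)


def extract_fax(text):
    """Return the first number explicitly labeled as a fax, if any."""
    match = FAX_PATTERN.search(text)
    return match.group(1) if match else None


def extract_phone(text):
    """Return the first phone-looking number that is not the labeled fax."""
    fax = extract_fax(text)
    for match in PHONE_PATTERN.finditer(text):
        if match.group() != fax:
            return match.group()
    return None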
def scrape_chamber(self, chamber, session): if chamber == "upper": chamber_slug = "Senate" elif chamber == "lower": chamber_slug = "Assembly" session_slug = self.jurisdiction.session_slugs[session] leg_base_url = "http://www.leg.state.nv.us/App/Legislator/A/%s/%s/" % ( chamber_slug, session_slug, ) leg_json_url = ( "http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s" % (session_slug, chamber_slug)) resp = json.loads(self.get(leg_json_url).text) for item in resp: # empty district empty_names = ["District No", "Vacant"] if any(name in item["FullName"] for name in empty_names): continue name_parts = item["FullName"].split(",") last, first = name_parts[:2] item["FullName"] = "{first} {last}".format(last=last.strip(), first=first.strip()) person = Person( name=item["FullName"], district=item["DistrictNbr"], party=item["Party"], primary_org=chamber, image=item["PhotoURL"], ) capitol_phone = item["LCBPhone"] if capitol_phone: person.add_contact_detail(type="voice", value=capitol_phone, note="Capitol Office") leg_url = leg_base_url + item["DistrictNbr"] # hack to get the legislator ID html = self.get(leg_url).text for ln in html.split("\n"): if "GetLegislatorDetails" in ln: leg_id = ln.split(",")[1].split("'")[1] # fetch the json used by the page leg_details_url = ( "https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id=" .format(session_slug) + leg_id) leg_resp = json.loads(self.get(leg_details_url).text) details = leg_resp["legislatorDetails"] address = details["Address1"] address2 = details["Address2"] if address2: address += " " + address2 address += "\n%s, NV %s" % (details["City"], details["Zip"]) phone = details["LCBPhone"] email = details["LCBEmail"] if address: person.add_contact_detail(type="address", value=address, note="District Office") if phone: person.add_contact_detail(type="voice", value=phone, note="District Office") if email: person.add_contact_detail(type="email", value=email, note="District Office") person.add_link(leg_details_url) person.add_source(leg_details_url) yield person
def scrape(self, session=None): if not session: session = self.jurisdiction.legislative_sessions[-1]["name"] self.info("no session specified, using %s", session) year_abr = session[0:4] self._init_mdb(int(year_abr)) roster_csv = self.access_to_csv("Roster") bio_csv = self.access_to_csv("LegBio") photos = {} for rec in bio_csv: photos[rec["Roster Key"]] = rec["URLPicture"] for rec in roster_csv: first_name = rec["Firstname"] middle_name = rec["MidName"] last_name = rec["LastName"] suffix = rec["Suffix"] full_name = first_name + " " + middle_name + " " + last_name + " " + suffix full_name = full_name.replace(" ", " ") full_name = full_name[0:len(full_name) - 1] district = str(int(rec["District"])) party = rec["Party"] if party == "R": party = "Republican" elif party == "D": party = "Democratic" else: party = party chamber = rec["House"] if chamber == "A": chamber = "lower" elif chamber == "S": chamber = "upper" leg_status = rec["LegStatus"] # skip Deceased/Retired members if leg_status != "Active": continue phone = rec["Phone"] or None email = None if rec["Email"]: email = rec["Email"] # Email has been removed from the Access DB, but it's # still [email protected] and [email protected] - many # reps have these emails on their personal pages even if # they're gone from the DB file if not email: email = self._construct_email(chamber, rec["Sex"], last_name) try: photo_url = photos[rec["Roster Key"]] except KeyError: photo_url = "" self.warning("no photo url for %s", rec["Roster Key"]) url = "http://www.njleg.state.nj.us/members/bio.asp?Leg=" + str( int(rec["Roster Key"])) address = "{0}\n{1}, {2} {3}".format(rec["Address"], rec["City"], rec["State"], rec["Zipcode"]) gender = {"M": "Male", "F": "Female"}[rec["Sex"]] person = Person( name=full_name, district=district, primary_org=chamber, party=party, image=photo_url, gender=gender, ) person.add_link(url) person.add_source(url) person.add_source("http://www.njleg.state.nj.us/downloads.asp") person.add_contact_detail(type="address", value=address, note="District Office") if phone is not None: person.add_contact_detail(type="voice", value=phone, note="District Office") if email is not None: person.add_contact_detail(type="email", value=email, note="District Office") yield person
def scrape_legislator(self, name, chamber, url, contact_page): page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) party = page.xpath("string(//span[contains(@id, 'Party')])") party = party.strip() if party == "Democrat": party = "Democratic" district = page.xpath("string(//span[contains(@id, 'District')])") district = district.strip().lstrip("0") occupation = page.xpath("string(//span[contains(@id, 'Occupation')])") occupation = occupation.strip() (photo_url, ) = page.xpath('//img[contains(@id, "_imgMember")]/@src') office_phone = page.xpath( "string(//span[contains(@id, 'CapitolPhone')])").strip() legislator = Person( primary_org=chamber, image=photo_url, name=name, party=party, district=district, ) legislator.extras["occupation"] = occupation if office_phone.strip() != "": legislator.add_contact_detail(type="voice", value=office_phone, note="Capitol Office") # SD removed email from the detail pages but it's still in the # contact page, shared for all congress people member_id = re.search(r"Member=(\d+)", url).group(1) # find the profile block by finding a link inside it to their # detail page profile_link = contact_page.xpath( '//ul[@id="contact-list"]//a[contains(@href, "Member=%s")]' % (member_id, )) if profile_link: # look for the adjacent email mailto link profile_link = profile_link[0] profile_block = profile_link.getparent().getparent().getparent() email_link = profile_block.xpath( './span/span/a[@class="mail-break"]') if email_link: email = email_link[0].text email = email.lstrip() email = email.rstrip() if email: legislator.add_contact_detail(type="email", value=email, note="Capitol Office") home_address = [ x.strip() for x in page.xpath( '//td/span[contains(@id, "HomeAddress")]/text()') if x.strip() ] if home_address: home_address = "\n".join(home_address) home_phone = page.xpath( "string(//span[contains(@id, 'HomePhone')])").strip() legislator.add_contact_detail(type="address", value=home_address, note="District Office") if home_phone: legislator.add_contact_detail(type="voice", value=home_phone, note="District Office") legislator.add_source(url) legislator.add_link(url) committees = page.xpath( '//div[@id="divCommittees"]/span/section/table/tbody/tr/td/a') for committee in committees: self.scrape_committee(legislator, url, committee, chamber) yield legislator
def scrape_chamber(self, chamber):
    if chamber == "lower":
        url = "http://www.scstatehouse.gov/member.php?chamber=H"
    else:
        url = "http://www.scstatehouse.gov/member.php?chamber=S"

    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get("href")

        if full_name.startswith("Senator"):
            full_name = full_name.replace("Senator ", "")
        if full_name.startswith("Representative"):
            full_name = full_name.replace("Representative ", "")

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if "Resigned effective" in leg_html:
            self.info("Resigned")
            continue

        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')

        if "Republican" in party:
            party = "Republican"
        elif "Democrat" in party:
            party = "Democratic"

        # District # - County - Map
        district = district.split()[1]

        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ""

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        # capitol office address
        try:
            capitol_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[1]')
                [0]).decode()
            if capitol_address:
                capitol_address = parse_address(capitol_address)
                person.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol address for {0}".format(full_name))

        # capitol office phone
        try:
            capitol_phone = (
                leg_doc.xpath('//h2[text()="Columbia Address"]/../p[2]')
                [0].text_content().strip())
            label, number = parse_phone(capitol_phone)
            if number:
                person.add_contact_detail(type="voice",
                                          value=number,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol phone for {0}".format(full_name))

        # home address
        try:
            home_address = lxml.etree.tostring(
                leg_doc.xpath('//h2[text()="Home Address"]/../p[1]')
                [0]).decode()
            if home_address:
                home_address = parse_address(home_address)
                person.add_contact_detail(type="address",
                                          value=home_address,
                                          note="District Office")
        except IndexError:
            self.warning("no home address for {0}".format(full_name))

        # home or business phone
        try:
            home_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[2]')
                [0].text_content().strip())
            label, number = parse_phone(home_phone)
            if number:
                label = ("Primary Office"
                         if label == "Business" else "District Office")
                person.add_contact_detail(type="voice", value=number, note=label)
        except IndexError:
            self.warning(
                "no home or business phone for {0}".format(full_name))

        # business or home phone
        try:
            business_phone = (
                leg_doc.xpath('//h2[text()="Home Address"]/../p[3]')
                [0].text_content().strip())
            label, number = parse_phone(business_phone)
            if number:
                label = ("Primary Office"
                         if label == "Business" else "District Office")
                person.add_contact_detail(type="voice", value=number, note=label)
        except IndexError:
            pass

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(", "):
                committee, role = com.text_content().rsplit(", ", 1)

                # known roles
                role = {
                    "Treas.": "treasurer",
                    "Secy.": "secretary",
                    "Secy./Treas.": "secretary/treasurer",
                    "V.C.": "vice-chair",
                    "1st V.C.": "first vice-chair",
                    "Co 1st V.C.": "co-first vice-chair",
                    "2nd V.C.": "second vice-chair",
                    "3rd V.C.": "third vice-chair",
                    "Ex.Officio Member": "ex-officio member",
                    "Chairman": "chairman",
                }[role]
            else:
                committee = com.text
                role = "member"

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification="committee",
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def scrape(self, session=None): if session is None: session = self.latest_session() year_slug = self.jurisdiction.get_year_slug(session) # Load all members via the private API legislator_dump_url = "http://legislature.vermont.gov/people/loadAll/{}".format( year_slug ) json_data = self.get(legislator_dump_url).text legislators = json.loads(json_data)["data"] # Parse the information from each legislator for info in legislators: # Strip whitespace from strings info = {k: v.strip() for k, v in info.items()} # Skip duplicate record for Christopher Mattos (appointed Rep September 2017) if info["PersonID"] == "29034": self.info("skipping first Christopher Mattos record") continue # Gather photo URL from the member's page member_url = "http://legislature.vermont.gov/people/single/{}/{}".format( year_slug, info["PersonID"] ) page = self.lxmlize(member_url) (photo_url,) = page.xpath('//img[@class="profile-photo"]/@src') # Also grab their state email address state_email = page.xpath( '//dl[@class="summary-table profile-summary"]/' 'dt[text()="Email"]/following-sibling::dd[1]/a/text()' ) if state_email: (state_email,) = state_email else: state_email = None district = info["District"].replace(" District", "") leg = Person( primary_org=self.CHAMBERS[info["Title"]], district=district, party=info["Party"].replace("Democrat", "Democratic"), name="{0} {1}".format(info["FirstName"], info["LastName"]), image=photo_url, ) leg.add_contact_detail( note="Capitol Office", type="address", value="Vermont State House\n115 State Street\nMontpelier, VT 05633", ) if state_email: leg.add_contact_detail( note="Capitol Office", type="email", value=state_email ) leg.add_contact_detail( note="District Office", type="address", value="{0}{1}\n{2}, {3} {4}".format( info["MailingAddress1"], ( "\n" + info["MailingAddress2"] if info["MailingAddress2"].strip() else "" ), info["MailingCity"], info["MailingState"], info["MailingZIP"], ), ) if info["HomePhone"]: leg.add_contact_detail( note="District Office", type="voice", value=info["HomePhone"] ) district_email = info["Email"] or info["HomeEmail"] or info["WorkEmail"] if district_email: leg.add_contact_detail( note="District Office", type="email", value=district_email ) leg.add_link(member_url) leg.add_source(legislator_dump_url) leg.add_source(member_url) yield leg
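# The Vermont scrape() above maps the feed's Title field to a chamber through
# self.CHAMBERS. A plausible class-level mapping, assuming the API reports the
# titles "Senator" and "Representative"; sketch only.
CHAMBERS = {
    "Senator": "upper",
    "Representative": "lower",
}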
def _scrape_representative(self, url, parties): # logger.info(f'Generating representative person object from {url}') """ Returns a Person object representing a member of the lower legislative chamber. """ # url = self.get(url).text.replace('<br>', '') member_page = self.lxmlize(url) photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0] if photo_url.endswith("/.jpg"): photo_url = None scraped_name, district_text = member_page.xpath( '//div[@class="member-info"]/h2') scraped_name = scraped_name.text_content().strip().replace("Rep. ", "") scraped_name = " ".join(scraped_name.split()) name = " ".join(scraped_name.split(", ")[::-1]) district_text = district_text.text_content().strip() district = str(self.district_re.search(district_text).group(1)) # Vacant house "members" are named after their district numbers: if re.match(r"^District \d+$", scraped_name): return None party = parties[district] person = Person(name=name, district=district, party=party, primary_org="lower") if photo_url is not None: person.image = photo_url person.add_link(url) person.add_source(url) def office_name(element): """Returns the office address type.""" return element.xpath("preceding-sibling::h4[1]/text()")[0].rstrip( ":") offices_text = [{ "name": office_name(p_tag), "type": office_name(p_tag).replace(" Address", "").lower(), "details": p_tag.text_content(), } for p_tag in member_page.xpath( '//h4/following-sibling::p[@class="double-space"]')] for office_text in offices_text: details = office_text["details"].strip() # A few member pages have blank office listings: if details == "": continue # At the time of writing, this case of multiple district # offices occurs exactly once, for the representative at # District 43: if details.count("Office") > 1: district_offices = [ district_office.strip() for district_office in re.findall( r"(\w+ Office.+?(?=\w+ Office|$))", details, flags=re.DOTALL) ] offices_text += [{ "name": re.match(r"\w+ Office", office).group(), "type": "district", "details": re.search(r"(?<=Office).+(?=\w+ Office|$)?", office, re.DOTALL).group(), } for office in district_offices] match = self.address_re.search(details) if match is not None: address = re.sub( " +$", "", match.group().replace("\r", "").replace("\n\n", "\n"), flags=re.MULTILINE, ) else: # No valid address found in the details. continue phone_number = extract_phone(details) fax_number = extract_fax(details) if address: person.add_contact_detail(type="address", value=address, note=office_text["name"]) if phone_number: person.add_contact_detail(type="voice", value=phone_number, note=office_text["name"]) if fax_number: person.add_contact_detail(type="fax", value=fax_number, note=office_text["name"]) yield person
def handle_list_item(self, item): link = item.xpath('.//div[contains(@class, "rep_style")]/a')[0] name = link.text_content().strip() if "Vacant" in name or "Resigned" in name or "Pending" in name: return party = item.xpath( './/div[contains(@class, "party_style")]/text()')[0].strip() party = {"D": "Democratic", "R": "Republican"}[party] district = item.xpath( './/div[contains(@class, "district_style")]/text()')[0].strip() leg_url = link.get("href") split_url = parse.urlsplit(leg_url) member_id = parse.parse_qs(split_url.query)["MemberId"][0] image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format( member_id) name = fix_name(name) rep = Person( name=name, district=district, party=party, primary_org="lower", role="Representative", image=image, ) rep.add_link(leg_url) rep.add_source(leg_url) rep.add_source(self.url) self.scrape_page(RepDetail, leg_url, obj=rep) # look for email in the list from the PDF directory - ideally # we'd find a way to better index the source data which # wouldn't require guessing the email, but this does at least # confirm that it's correct # deal with some stuff that ends up in name that won't work in # email, spaces, quotes, high latin1 email_name = rep.name.replace('"', "").replace("La ", "La").replace("ñ", "n") (last, *other) = re.split(r"[-\s,]+", email_name) # deal with a missing nickname used in an email address if "Patricia" in other: other.append("Pat") # search through all possible first names and nicknames # present - needed for some of the more elaborate concoctions found_email = False for first in other: email = "*****@*****.**" % (first, last) if email in self.member_emails: # it's bad if we can't uniquely match emails, so throw an error if email in self.claimed_member_emails: raise ValueError( "Email address %s matches multiple reps - %s and %s." % (email, rep.name, self.claimed_member_emails[email])) self.claimed_member_emails[email] = rep.name rep.add_contact_detail(type="email", value=email, note="Capitol Office") rep.add_source(self.directory_pdf_url) found_email = True break if not found_email: log.warning("Rep %s does not have an email in the directory PDF." % (rep.name, )) return rep
def scrape_table(self, chamber): url = self.urls[chamber] html = self.get(url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) seen = set() for row in doc.xpath( '//div[contains(@class, "member-index-cell")]/div/div'): img_cell, text_cell = row.getchildren() if "to be announced" in text_cell.text_content().lower(): continue leg_a = text_cell.xpath('.//a')[0] leg_url = leg_a.attrib['href'] name = leg_a.text district = re.search(r"District (\d{1,2}[ABCD]?)", text_cell.text_content()).group(1) key = name + district if key in seen: # leadership listed twice, skip the 2nd continue seen.add(key) photo_url = img_cell.xpath("a/img/@src")[0] # get details html = self.get(leg_url).text ldoc = lxml.html.fromstring(html) ldoc.make_links_absolute(leg_url) party = _get_table_item(ldoc, "Party").text if party == "Democrat": party = "Democratic" capitol_info = _get_table_item(ldoc, "Annapolis Info") addr_lines, phone_lines = capitol_info.xpath("dl/dd") address = [ s.strip() for s in addr_lines.text_content().split('\n') if s.strip() ] address = "\n".join(address) phone = None fax = None for line in phone_lines.text_content().split('\n'): if "Phone" in line: phone = re.findall(r"Phone (\d{3}-\d{3}-\d{4})", line)[0] elif "Fax" in line: # Number oddities: one has two dashes, one has a dash and then a space. line = line.replace("--", "-").replace("- ", "-") fax = re.findall(r"Fax (\d{3}-\d{3}-\d{4})", line)[0] email_path = ldoc.xpath('//a[contains(@href, "mailto:")]/@href') emails = set() for path in email_path: emails.add(re.match(r"mailto:([^?]+)", path).group(1)) if not emails: email = None elif len(emails) == 1: email = emails.pop() else: raise AssertionError("Multiple email links found on page") img_src = ldoc.xpath('//img[@class="sponimg"]/@src') if img_src: photo_url = img_src[0] names = name.split(", ") name = " ".join([names[1], names[0]] + names[2:]) leg = Person( primary_org=chamber, district=district, name=name, party=party, image=photo_url, ) leg.add_source(url=leg_url) leg.add_link(url=leg_url) if address: leg.add_contact_detail(type="address", value=address, note="Capitol Office") if phone: leg.add_contact_detail(type="voice", value=phone, note="Capitol Office") if fax: leg.add_contact_detail(type="fax", value=fax, note="Capitol Office") if email: leg.add_contact_detail(type="email", value=email, note="Capitol Office") yield leg
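# scrape_table() above pulls labeled cells from the Maryland member detail page
# through a module-level _get_table_item() helper. A minimal sketch, assuming
# each label lives in its own element with the value in the next sibling; the
# real page structure and XPath are not shown in this section.
def _get_table_item(doc, name):
    """Return the element that immediately follows the label matching name."""
    labels = doc.xpath('//*[normalize-space(text()) = "{}"]'.format(name))
    return labels[0].getnext() if labels else None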
def legislators(self, latest_only): legs = {} for member, chamber, term, url in self._memberships(latest_only): name, _, _, district, party = member.xpath("td") district = district.text detail_url = name.xpath("a/@href")[0] if party.text_content().strip() == "": party = "Independent" else: party = {"D": "Democratic", "R": "Republican", "I": "Independent"}[ party.text ] name = name.text_content().strip() # inactive legislator, skip them for now if name.endswith("*"): name = name.strip("*") continue name = AKA.get(name, name) if name in legs: p, terms = legs[name] terms.append((chamber, district, term, party)) else: p = Person(name, party=party) legs[name] = p, [(chamber, district, term, party)] p.add_source(url) p.add_source(detail_url) p.add_link(detail_url) birth_date = BIRTH_DATES.get(name, None) if birth_date: p.birth_date = birth_date leg_html = self.get(detail_url).text leg_doc = lxml.html.fromstring(leg_html) leg_doc.make_links_absolute(detail_url) hotgarbage = ( "Senate Biography Information for the 98th General " "Assembly is not currently available." ) if hotgarbage in leg_html: # The legislator's bio isn't available yet. self.logger.warning("No legislator bio available for " + name) continue photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0] p.image = photo_url p.contact_details = [] # email email = leg_doc.xpath('//b[text()="Email: "]') if email: p.add_contact_detail( type="email", value=email[0].tail.strip(), note="Capitol Office" ) offices = { "Capitol Office": '//table[contains(string(), "Springfield Office")]', "District Office": '//table[contains(string(), "District Office")]', } for location, xpath in offices.items(): table = leg_doc.xpath(xpath) if table: for type, value in self._table_to_office(table[3]): if type in ("fax", "voice") and not validate_phone_number( value ): continue p.add_contact_detail(type=type, value=value, note=location) return legs