def test_person_add_membership_name():
    """A membership added by organization name validates and keeps its fields.

    The organization is given as a plain string, so the resulting
    ``organization_id`` is expected to decode through ``get_pseudo_id`` to a
    dict holding just that name.
    """
    org_name = "Academy of Motion Picture Arts and Sciences"
    person = Person("Leonardo DiCaprio")
    person.add_membership(org_name, role="winner", start_date="2016")

    membership = person._related[0]
    membership.validate()

    assert get_pseudo_id(membership.organization_id) == {"name": org_name}
    assert membership.person_id == person._id
    assert membership.role == "winner"
    assert membership.start_date == "2016"
def test_person_add_membership_org():
    """A membership added with an Organization object points at that object.

    Checks that exactly one related object is created, that it validates, and
    that it carries the person id, the organization id and both dates as
    passed in.
    """
    left_on = datetime.date(2015, 5, 8)

    person = Person("Bob B. Bear")
    person.add_source("http://example.com")
    org = Organization("test org", classification="unknown")
    person.add_membership(org, role="member", start_date="2007", end_date=left_on)

    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()

    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == "2007"
    assert membership.end_date == datetime.date(2015, 5, 8)
    def scrape(self):
        """Yield committees and legislators parsed from the CGA legislator CSV.

        Downloads the legislator database CSV, checks that its header row
        still equals ``HEADERS``, and builds one ``Person`` per data row with
        a link, party, contact details and committee memberships. Each
        committee ``Organization`` is yielded once, the first time its name is
        seen, and always before the legislator row that referenced it.

        Yields:
            ``Organization`` objects for newly seen committees and one
            ``Person`` per CSV row.

        Raises:
            AssertionError: if the header row differs from ``HEADERS`` or a
                district (after stripping leading zeros) is not all digits.
            KeyError: if ``office code`` is neither ``"H"`` nor ``"S"``.
            ValueError: if a non-empty email value contains no ``@`` and is
                not an http(s) URL.
        """
        leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
        page = self.get(leg_url)

        # Committees already yielded, keyed by name, so each is emitted once.
        committees = {}

        # Ensure that the spreadsheet's structure hasn't generally changed
        _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
        assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

        page = open_csv(page)
        for row in page:

            chamber = {"H": "lower", "S": "upper"}[row["office code"]]

            district = row["dist"].lstrip("0")
            assert district.isdigit(), "Invalid district found: {}".format(
                district)

            # Full name: first [middle initial] last [suffix].
            name = row["first name"]
            mid = row["middle initial"].strip()
            if mid:
                name += " %s" % mid
            name += " %s" % row["last name"]
            suffix = row["suffix"].strip()
            if suffix:
                name += " %s" % suffix

            party = row["party"]
            if party == "Democrat":
                party = "Democratic"

            leg = Person(primary_org=chamber,
                         name=name,
                         district=district,
                         party=party)

            legislator_url = row["URL"].replace("\\", "//").strip()
            if legislator_url != "":
                if not legislator_url.startswith("http"):
                    # Prepend the scheme rather than replacing the value, so
                    # the link keeps its host and path.
                    legislator_url = "http://" + legislator_url
                leg.add_link(legislator_url)

            leg.add_party(party=party)

            office_address = "%s\nRoom %s\nHartford, CT 06106" % (
                row["capitol street address"],
                row["room number"],
            )

            email = row["email"].strip()
            if "@" not in email:
                if email and not email.startswith(("http://", "https://")):
                    raise ValueError(
                        "Problematic email found: {}".format(email))
                # Blank, or an http(s) URL instead of an address: there is no
                # usable email for this row.
                email = None

            leg.add_contact_detail(type="address",
                                   value=office_address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=row["capitol phone"],
                                   note="Capitol Office")
            if email:
                leg.add_contact_detail(type="email", value=email)

            home_address = "{}\n{}, {} {}".format(
                row["home street address"],
                row["home city"],
                row["home state"],
                row["home zip code"],
            )
            # A "home" address that names the Legislative Office Building is
            # not recorded as a district office.
            if "Legislative Office Building" not in home_address:
                leg.add_contact_detail(type="address",
                                       value=home_address,
                                       note="District Office")
                if row["home phone"].strip():
                    leg.add_contact_detail(type="voice",
                                           value=row["home phone"],
                                           note="District Office")
            leg.add_source(leg_url)

            # Entries are ";"-separated; an optional trailing " (Role)" gives
            # the role, otherwise the legislator is a plain member.
            for comm_name in row["committee member1"].split(";"):
                if " (" in comm_name:
                    comm_name, role = comm_name.split(" (")
                    role = role.strip(")").lower()
                else:
                    role = "member"
                comm_name = comm_name.strip()
                if comm_name:
                    if comm_name in committees:
                        com = committees[comm_name]
                    else:
                        com = Organization(comm_name,
                                           classification="committee",
                                           chamber=chamber)
                        com.add_source(leg_url)
                        committees[comm_name] = com
                        yield com

                    leg.add_membership(name_or_org=com, role=role)

            yield leg
Example #4
0
    def scrape_chamber(self, chamber):
        """Yield committees and members for one chamber of the SC roster.

        Fetches the roster page for ``chamber`` (``"lower"`` selects the
        ``chamber=H`` page, anything else ``chamber=S``), then fetches every
        page linked from an ``a.membername`` anchor. Members whose page
        contains "Resigned effective" are skipped. Each committee
        ``Organization`` is yielded the first time its name is seen; the
        member's ``Person`` is yielded after its committees.
        """
        url = ("http://www.scstatehouse.gov/member.php?chamber=H"
               if chamber == "lower" else
               "http://www.scstatehouse.gov/member.php?chamber=S")

        # Abbreviated role suffixes as they appear on the member pages.
        known_roles = {
            "Treas.": "treasurer",
            "Secy.": "secretary",
            "Secy./Treas.": "secretary/treasurer",
            "V.C.": "vice-chair",
            "1st V.C.": "first vice-chair",
            "Co 1st V.C.": "co-first vice-chair",
            "2nd V.C.": "second vice-chair",
            "3rd V.C.": "third vice-chair",
            "Ex.Officio Member": "ex-officio member",
            "Chairman": "chairman",
        }

        # Committees already yielded, keyed by name.
        committees_by_name = {}

        roster = lxml.html.fromstring(self.get(url).text)
        roster.make_links_absolute(url)

        for anchor in roster.xpath('//a[@class="membername"]'):
            full_name = anchor.text
            leg_url = anchor.get("href")

            # Drop a leading honorific from the displayed name.
            for title in ("Senator", "Representative"):
                if full_name.startswith(title):
                    full_name = full_name.replace(title + " ", "")

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if "Resigned effective" in leg_html:
                self.info("Resigned")
                continue

            # Exactly three text nodes are expected here; the third is unused.
            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')

            for marker, normalized in (("Republican", "Republican"),
                                       ("Democrat", "Democratic")):
                if marker in party:
                    party = normalized
                    break

            # District # - County - Map
            district = district.split()[1]

            photo_sources = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')
            if photo_sources:
                photo_url = photo_sources[0]
            else:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ""

            person = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=photo_url,
            )

            # capitol office address
            try:
                capitol_nodes = leg_doc.xpath(
                    '//h2[text()="Columbia Address"]/../p[1]')
                capitol_markup = lxml.etree.tostring(capitol_nodes[0]).decode()
                if capitol_markup:
                    person.add_contact_detail(
                        type="address",
                        value=parse_address(capitol_markup),
                        note="Capitol Office",
                    )
            except IndexError:
                self.warning("no capitol address for {0}".format(full_name))

            # capitol office phone
            try:
                capitol_phone_nodes = leg_doc.xpath(
                    '//h2[text()="Columbia Address"]/../p[2]')
                phone_text = capitol_phone_nodes[0].text_content().strip()
                _label, number = parse_phone(phone_text)
                if number:
                    person.add_contact_detail(
                        type="voice",
                        value=number,
                        note="Capitol Office",
                    )
            except IndexError:
                self.warning("no capitol phone for {0}".format(full_name))

            # home address
            try:
                home_nodes = leg_doc.xpath(
                    '//h2[text()="Home Address"]/../p[1]')
                home_markup = lxml.etree.tostring(home_nodes[0]).decode()
                if home_markup:
                    person.add_contact_detail(
                        type="address",
                        value=parse_address(home_markup),
                        note="District Office",
                    )
            except IndexError:
                self.warning("no home address for {0}".format(full_name))

            # home/business phones: the second and third paragraphs under the
            # home address heading. Only a missing second paragraph is
            # reported; a missing third one is ignored.
            for phone_path, missing_message in (
                ('//h2[text()="Home Address"]/../p[2]',
                 "no home or business phone for {0}"),
                ('//h2[text()="Home Address"]/../p[3]', None),
            ):
                try:
                    phone_text = (
                        leg_doc.xpath(phone_path)[0].text_content().strip())
                    label, number = parse_phone(phone_text)
                    if number:
                        if label == "Business":
                            note = "Primary Office"
                        else:
                            note = "District Office"
                        person.add_contact_detail(type="voice",
                                                  value=number,
                                                  note=note)
                except IndexError:
                    if missing_message is not None:
                        self.warning(missing_message.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            committee_links = leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]
            for committee_link in committee_links:
                if committee_link.text.endswith(", "):
                    # The link text ends with ", " and the remaining content
                    # after it is the abbreviated role.
                    committee, role_abbrev = (
                        committee_link.text_content().rsplit(", ", 1))
                    role = known_roles[role_abbrev]
                else:
                    committee = committee_link.text
                    role = "member"

                # only yield each committee once
                if committee in committees_by_name:
                    org = committees_by_name[committee]
                else:
                    org = Organization(name=committee,
                                       classification="committee",
                                       chamber=chamber)
                    org.add_source(url)
                    committees_by_name[committee] = org
                    yield org

                person.add_membership(org, role=role)

            yield person
Example #5
0
    def scrape_chamber(self, chamber):
        """Yield a Person for each non-vacant row of the AZ member roster.

        ``chamber`` must be ``"lower"`` or ``"upper"`` (anything else raises
        ``KeyError``). Every roster row after the first is expected to hold
        exactly six ``td`` cells: name, district, party, email, room, phone.
        Each member's own page is fetched to look for a photo.
        """
        chamber_letter = {"lower": "H", "upper": "S"}[chamber]
        url = "http://www.azleg.gov/MemberRoster/?body=" + chamber_letter

        # there is a bad comment closing tag on this page
        roster_html = self.get(url).text.replace("--!>", "-->")
        roster_root = html.fromstring(roster_html)

        # The first table row is skipped (presumably the header row).
        for row in roster_root.xpath("//table//tr")[1:]:
            (name_td, district_td, party_td,
             email_td, room_td, phone_td) = row.xpath("td")

            if email_td.attrib.get("class") == "vacantmember":
                continue  # Skip any vacant members.

            link = name_td.xpath("string(a/@href)")
            if len(name_td) == 1:
                position = ""
                name = name_td.text_content().strip()
            else:
                # More than one child element in the name cell: take the
                # cell's tail text as the position and the first child as
                # the name.
                position = name_td.tail.strip()
                name = name_td[0].text_content().strip()
            if "--" in name:
                name = name.split("--")[0].strip()

            member_html = self.get(link).text.replace("--!>", "-->")
            member_root = html.fromstring(member_html)
            member_root.make_links_absolute(link)

            photos = member_root.xpath("//img[contains(@src, 'MemberPhoto')]")
            if len(photos) == 1:
                photo_url = photos[0].attrib["src"]
            else:
                self.warning("no photo on " + link)
                photo_url = ""

            district = district_td.text_content().strip()
            party = party_td.text_content().strip()
            email_text = email_td.text_content().strip()

            # The cell holds only the local part, prefixed with "Email: ".
            email = (
                email_text.replace("Email: ", "").lower() + "@azleg.gov"
                if email_text.startswith("Email: ")
                else ""
            )

            party = self.get_party(party)
            room = room_td.text_content().strip()
            chamber_line = (
                "House of Representatives\n" if chamber == "lower" else "Senate\n"
            )
            address = "".join(
                [chamber_line, "1700 West Washington\n Room ", room,
                 "\nPhoenix, AZ 85007"]
            )

            phone = phone_td.text_content().strip()
            digit_groups = re.findall(r"(\d+)", phone)
            if "602" not in digit_groups:
                # No standalone "602" group in the number: prepend it.
                phone = "602-" + phone

            leg = Person(
                primary_org=chamber,
                image=photo_url,
                name=name,
                district=district,
                party=party,
            )
            leg.add_contact_detail(type="address", value=address, note="Capitol Office")
            leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type="email", value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)

            leg.add_source(url)

            yield leg