コード例 #1
0
    def scrape_interim_committee(self, link, name):
        """Scrape one interim (joint) committee page and return its Organization.

        Subcommittees are attached to a parent committee, looked up first in
        the manual map and otherwise derived from the name.

        :param link: anchor element whose ``href`` points at the committee page.
        :param name: display name of the committee.
        :returns: the populated ``Organization``.
        """
        url = re.sub(r"\s+", "", link.attrib["href"])
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        if "Subcommittee" in name:
            # Check whether the parent committee is manually defined first
            # before attempting to automatically resolve it.
            parent = WVCommitteeScraper.subcommittee_parent_map.get(name, None)
            if parent is None:
                parent = name.partition("Subcommittee")[0].strip()

            comm = Organization(
                name=name,
                classification="committee",
                parent_id=self._joint_committees[parent],
            )
        else:
            comm = Organization(
                name=name, classification="committee", chamber="legislature"
            )
            self._joint_committees[name] = comm
        comm.add_source(url)

        # Use distinct loop variables: the original reused `link` and `name`,
        # clobbering the method parameters mid-function.
        xpath = '//a[contains(@href, "?member=")]'
        for member_link in doc.xpath(xpath):
            member_name = member_link.text_content().strip()
            member_name = re.sub(r"^Delegate\s+", "", member_name)
            member_name = re.sub(r"^Senator\s+", "", member_name)
            role = member_link.getnext().text or "member"
            comm.add_member(member_name, role.strip())

        return comm
コード例 #2
0
    def scrape_reps_comm(self):
        """Yield Maine House committees and their members.

        As of 1/27/15, the committee page has the wrong session number
        (126th) at the top, but has newly elected people, so we're rolling
        with it.
        """
        url = "http://legislature.maine.gov/house/hsecoms.htm"
        root = lxml.html.fromstring(self.get(url).text)

        count = 0

        # Committee headings sit in every other <center>; the matching member
        # list is the count-th <ul> on the page.
        for n in range(1, 12, 2):
            comm_name = root.xpath("string(//body/center[%s]/h1/a)" % (n))
            committee = Organization(
                chamber="lower", name=comm_name, classification="committee"
            )
            count += 1

            for el in root.xpath("/html/body/ul[%s]/li/a" % (count)):
                rep = el.text
                mark = rep.find("(")
                if mark != -1:
                    rep = rep[15:mark].strip()
                if "chair" in rep.lower():
                    role = "chair"
                    rep = re.sub(r"(?i)[\s,]*chair\s*$", "", rep).strip()
                else:
                    role = "member"
                committee.add_member(rep, role)
            committee.add_source(url)

            yield committee
コード例 #3
0
    def _scrape_lower_special_committees(self):
        """Yield special committees from the Louisiana House accordion page."""
        url = "http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx"
        page = self.lxmlize(url)

        accordion = page.xpath('//div[@class="accordion"]')[0]

        for header in accordion.xpath("./h3"):
            name = self._normalize_committee_name(header.xpath("string()").strip())

            # "Joint ..." committees span both chambers.
            chamber = "legislature" if name.startswith("Joint") else "lower"

            committee = Organization(
                name, chamber=chamber, classification="committee"
            )
            committee.add_source(url)

            member_rows = header.xpath(
                './following-sibling::div[@class="pane"]//tr[@class="linkStyle2"]'
            )

            for row in member_rows:
                member_name = self._normalize_member_name(
                    row.xpath("normalize-space(string(./th[1]))")
                )
                member_role = self._normalize_member_role(
                    row.xpath("normalize-space(string(./th[2]))")
                )
                committee.add_member(member_name, member_role)

            yield committee
コード例 #4
0
    def scrape_chamber(self, chamber):
        """Yield committees for one chamber from the AZ committee API.

        :param chamber: "upper" or "lower"; mapped to the API's "S"/"H".
        """
        session = self.latest_session()
        # since we are scraping only latest_session
        session_id = session_metadata.session_id_meta_data[session]

        # Hoisted: the same chamber -> body-code mapping was computed twice.
        body = "S" if chamber == "upper" else "H"

        client = AZClient()
        committees = client.list_committees(
            sessionId=session_id,
            includeOnlyCommitteesWithAgendas="false",
            legislativeBody=body,
        )
        for committee in committees.json():
            c = Organization(
                name=committee["CommitteeName"],
                chamber=chamber,
                classification="committee",
            )
            details = client.get_standing_committee(
                sessionId=session_id,
                legislativeBody=body,
                committeeId=committee["CommitteeId"],
                includeMembers="true",
            )
            for member in details.json()[0]["Members"]:
                c.add_member(
                    u"{} {}".format(member["FirstName"], member["LastName"]),
                    role=parse_role(member),
                )

            # Fix: add each source once per committee — previously
            # details.url was appended once per member.
            c.add_source(details.url)
            c.add_source(committees.url)
            yield c
コード例 #5
0
    def scrape_page(self, link, chamber=None):
        """Scrape one committee roster page and yield its Organization.

        :param link: anchor element for the committee (its text is the name).
        :param chamber: chamber classification, or None when unknown.
        """
        page = self.lxmlize(link.attrib["href"])
        comName = link.text
        roles = {
            "Chair": "chair",
            "Vice Chair": "vice-chair",
            "Vice-Chair": "vice-chair",
        }
        committee = Organization(
            comName, chamber=chamber, classification="committee"
        )
        committee.add_source(link.attrib["href"])

        for member in page.xpath(
            '//div[@class="members"]/div[@class="roster-item"]'
        ):
            details = member.xpath('.//div[@class="member-details"]')[0]
            person = details.xpath("./h4")[0].text_content()
            # This page does random weird things with whitespace to names
            person = " ".join(person.strip().split())
            if not person:
                continue
            role = details.xpath('./span[@class="member-role"]')
            if role:
                # Fix: unknown role labels previously raised KeyError and
                # killed the scrape; treat them as plain members instead.
                role = roles.get(role[0].text, "member")
            else:
                role = "member"
            committee.add_member(person, role=role)
        yield committee
コード例 #6
0
    def scrape_lower_committee(self, name, url):
        """Scrape one lower-chamber committee roster and return it.

        :param name: committee name.
        :param url: committee page URL.
        :returns: the populated ``Organization``.
        """
        page = self.lxmlize(url)

        committee = Organization(
            chamber="lower", name=name, classification="committee"
        )
        committee.add_source(url)

        # Tracks member names already added, so duplicates are skipped.
        seen = set()

        member_links = self.get_nodes(
            page, '//div[@class="mod-inner"]//a[contains(@href, "mem")]')

        for member_link in member_links:
            member_name = member_link.text
            if member_name is None:
                continue

            # Figure out if this person is the chair.
            if member_link == member_links[0]:
                member_role = "chair"
            else:
                member_role = "member"

            # BUG FIX: the original tested the committee `name` against
            # `seen`, so the dedup never filtered anything; test the
            # member's name instead.
            if member_name not in seen:
                committee.add_member(member_name, member_role)
                seen.add(member_name)

        return committee
コード例 #7
0
    def _scrape_upper_committee(self, name, url2):
        """Yield one Senate committee from its member-assignments page.

        :param name: committee name.
        :param url2: the committee's ``default.asp`` URL; the roster lives on
            the sibling ``Assignments.asp`` page.
        """
        cat = "Assignments.asp"
        url3 = url2.replace("default.asp", cat)

        committee = Organization(name, chamber="upper", classification="committee")
        committee.add_source(url2)

        page = self.lxmlize(url3)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        # Page convention: first listed member chairs, second is vice-chair.
        # Fix: the original evaluated members[1] for every member, raising
        # IndexError whenever a committee listed only one member.
        for idx, link in enumerate(members):
            if idx == 0:
                role = "Chairman"
            elif idx == 1:
                role = "Vice-Chairman"
            else:
                role = "member"

            member = link.xpath("string()")
            member = member.replace("Senator ", "")
            member = re.sub(r"[\s]{2,}", " ", member).strip()

            committee.add_member(member, role)

        yield committee
コード例 #8
0
    def scrape_senate_comm(self):
        """Yield Maine Senate standing committees and their members."""
        url = ("http://legislature.maine.gov/committee-information/"
               "standing-committees-of-the-senate")
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        headings = doc.xpath("//p/strong")
        for heading in headings:
            committee = Organization(
                chamber="upper",
                name=heading.text.strip(":"),
                classification="committee",
            )
            committee.add_source(url)
            # Members are in the sibling paragraphs that follow the heading;
            # stop at the first link-less paragraph. Fix: also stop when the
            # sibling chain runs out — `while True` previously raised
            # AttributeError on None at the end of the document.
            par = heading.getparent().getnext()
            while par is not None:
                link = par.xpath("a")
                if len(link) == 0:
                    break
                res = self.senate_committee_pattern.search(link[0].text)
                name, chair = res.groups()
                committee.add_member(
                    name, "chair" if chair is not None else "member")
                par = par.getnext()

            yield committee
コード例 #9
0
    def scrape_committee(self, name, url, chamber):
        """Return an Organization for the committee page at *url*.

        :param name: committee name.
        :param url: committee page URL.
        :param chamber: chamber classification.
        """
        org = Organization(name=name, chamber=chamber, classification="committee")
        org.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)

        for leg in doc.xpath(
                '//div[@id="members"]/div[@id="members"]/p/a/text()'):
            leg = (
                leg.replace("Representative ", "")
                .replace("Senator ", "")
                .strip()
            )
            if " (" not in leg:
                role = "member"
            else:
                # A trailing "(...)" carries the member's role.
                leg, role = leg.split(" (")
                if "Vice-Chair" in role:
                    role = "vice-chair"
                elif "Co-Chair" in role:
                    role = "co-chair"
                elif "Chair" in role:
                    role = "chair"
                else:
                    raise Exception("unknown role: %s" % role)
            org.add_member(leg, role)

        return org
コード例 #10
0
    def scrape_senate_committee(self, url):
        """Yield one Senate committee from *url*; remembers the
        Appropriations committee so subcommittees can be parented to it."""
        doc = lxml.html.fromstring(self.get(url).text)

        headers = doc.xpath('(//div[@class="row"])[2]//h1')
        assert len(headers) == 1
        name = " ".join(headers[0].xpath("./text()"))
        name = re.sub(r"\s+Committee.*$", "", name)

        com = Organization(chamber="upper", name=name, classification="committee")

        # Ordered: the first matching phrase in the entry text wins.
        role_map = (
            ("Committee Chair", "chair"),
            ("Minority Vice", "minority vice chair"),
            ("Vice", "majority vice chair"),
        )

        for member in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
            text = member.text_content()
            member_name = member.xpath("./a/text()")[0].replace(
                "Representative ", "")
            for needle, role in role_map:
                if needle in text:
                    break
            else:
                role = "member"

            com.add_member(member_name, role=role)

        com.add_source(url)

        # Subcommittee scraping needs this committee as a parent later.
        if com.name == "Appropriations":
            self._senate_appropriations = com

        yield com
コード例 #11
0
    def scrape_approp_subcommittees(self):
        """Yield Senate Appropriations subcommittees, each parented to the
        previously scraped Appropriations committee."""
        URL = "http://www.senate.michigan.gov/committee/appropssubcommittee.html"
        doc = lxml.html.fromstring(self.get(URL).text)

        # Name suffix -> (role, number of trailing characters to drop,
        # including the separating space).
        suffixes = (
            ("(MVC)", "minority vice chairman", 6),
            ("(VC)", "vice chairman", 5),
            ("(C)", "chairman", 4),
        )

        for strong in doc.xpath("//strong"):
            com = Organization(
                name=strong.text.strip(),
                parent_id=self._senate_appropriations,
                classification="committee",
            )
            com.add_source(URL)

            # Member names follow the heading as tail text:
            # "Senators A, B and C (C)".
            legislators = strong.getnext().tail.replace("Senators", "").strip()
            for leg in re.split(", | and ", legislators):
                role = "member"
                for marker, marker_role, drop in suffixes:
                    if leg.endswith(marker):
                        role = marker_role
                        leg = leg[:-drop]
                        break
                com.add_member(leg, role=role)

            yield com
コード例 #12
0
def test_committee_add_member_person():
    """add_member with a Person creates a membership record linking the
    person, the organization, and the given role."""
    committee = Organization("Defense", classification="committee")
    person = Person("John Adams")
    committee.add_member(person, role="chairman")
    membership = committee._related[0]
    assert membership.person_id == person._id
    assert membership.organization_id == committee._id
    assert membership.role == "chairman"
コード例 #13
0
    def handle_page(self):
        """Yield the upper-chamber committee described on the current page;
        Appropriations Subcommittees are skipped for now."""
        name = self.doc.xpath('//h2[@class="committeeName"]')[0].text
        if name.startswith("Appropriations Subcommittee"):
            return
            # TODO: restore scraping of Appropriations Subcommittees
            # name = name.replace('Appropriations ', '')
            # parent = {'name': 'Appropriations', 'classification': 'upper'}
            # chamber = None
        if name.startswith("Committee on"):
            name = name.replace("Committee on ", "")

        comm = Organization(
            name=name, classification="committee", chamber="upper", parent_id=None
        )

        # Officers: each <dt> holds a role label, the following <dd> the name.
        for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
            role = dt.text.replace(": ", "").strip().lower()
            dd = dt.xpath("./following-sibling::dd")[0]
            comm.add_member(self.clean_name(dd.text_content()), role=role)

        # Remaining members are a plain list without roles.
        for li in self.doc.xpath('//div[@id="members"]/ul/li'):
            comm.add_member(self.clean_name(li.text_content()))

        comm.add_source(self.url)

        yield comm
コード例 #14
0
    def scrape(self, session=None):
        """Yield Wyoming committees for *session* (defaults to the latest)
        from the LSO JSON API, enriched with WY-specific extras."""
        if session is None:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        # com_types = ['J', 'SE', 'O']
        # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
        list_url = "https://wyoleg.gov/LsoService/api/committees/{}".format(session)
        coms_json = json.loads(self.get(list_url).content.decode("utf-8"))

        for row in coms_json:
            detail_url = (
                "https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}".format(
                    session, row["ownerID"]
                )
            )
            com = json.loads(self.get(detail_url).content.decode("utf-8"))

            # WY doesn't seem to have any house/senate only committees that I can find
            committee = Organization(
                name=com["commName"],
                chamber="legislature",
                classification="committee",
            )

            for member in com["commMembers"]:
                is_chair = member["chairman"] == "Chairman"
                committee.add_member(
                    member["name"], "chairman" if is_chair else "member"
                )

            # some WY committees have non-legislators appointed to the member by the Governor
            # but the formatting is super inconsistent
            if com["otherMembers"]:
                committee.extras["other_members"] = com["otherMembers"]

            committee.extras["wy_id"] = com["commID"]
            committee.extras["wy_code"] = com["ownerID"]
            committee.extras["wy_type_code"] = com["type"]
            committee.extras["budget"] = com["budget"]

            if com["statAuthority"]:
                committee.extras["statutory_authority"] = com["statAuthority"]

            if com["number"]:
                committee.extras["seat_distribution"] = com["number"]

            committee.add_identifier(
                scheme="WY Committee ID", identifier=str(com["commID"])
            )
            committee.add_identifier(
                scheme="WY Committee Code", identifier=str(com["ownerID"])
            )

            if com["description"]:
                committee.add_identifier(
                    scheme="Common Name", identifier=com["description"]
                )

            committee.add_source(
                "http://wyoleg.gov/Committees/{}/{}".format(session, com["ownerID"])
            )

            yield committee
コード例 #15
0
    def _scrape_standing_committees(self):
        """Scrapes the Standing Committees page of the Nebraska state
        legislature."""
        main_url = (
            "http://www.nebraskalegislature.gov/committees/standing-committees.php"
        )
        page = self.lxmlize(main_url)

        committee_nodes = self.get_nodes(
            page,
            '//a[@class="accordion-switch"][contains(text(), "Standing Committees")]'
            '/ancestor::div[@class="panel panel-leg"]//div[@class="list-group"]'
            '/a[@class="list-group-item"]',
        )

        for committee_node in committee_nodes:
            committee_page_url = committee_node.attrib["href"]
            committee_page = self.lxmlize(committee_page_url)

            name_text = self.get_node(
                committee_page,
                '//div[@class="container view-front"]/div[@class="row"]/'
                'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]',
            )
            # The heading ends with the word "Committee"; drop it and rejoin
            # the remaining words. (Replaces the original index-based string
            # concatenation loop with an equivalent join.)
            committee_name = " ".join(name_text.split()[:-1])

            org = Organization(
                name=committee_name,
                chamber="legislature",
                classification="committee",
            )

            members = self.get_nodes(
                committee_page,
                '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
                'div[@class="block-box"][1]/ul[@class="list-unstyled '
                'feature-content"]/li/a/text()[normalize-space()]',
            )

            for member in members:
                # NOTE(review): r"\Sen\.\s+" matches any non-space character
                # followed by "en." — it strips "Sen. " as intended but would
                # also mangle other "...en. " prefixes; kept as-is to preserve
                # behavior. Confirm whether r"Sen\.\s+" was meant.
                member_name = re.sub(r"\Sen\.\s+", "", member)
                member_name = re.sub(r", Chairperson", "", member_name).strip()
                if "Chairperson" in member:
                    member_role = "Chairperson"
                else:
                    member_role = "member"
                org.add_member(member_name, member_role)

            org.add_source(main_url)
            org.add_source(committee_page_url)

            yield org
コード例 #16
0
    def scrape(self):
        """Yield DC Council committees (chair + members) from dccouncil.us."""
        com_url = "http://dccouncil.us/committees"
        data = self.get(com_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(com_url)

        comms = set(doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

        for committee in comms:
            url = committee.attrib["href"]
            name = committee.text_content().strip()
            comm_data = self.get(url).text
            comm_page = lxml.html.fromstring(comm_data)
            comm_page.make_links_absolute(url)

            # classify these as belonging to the legislature
            committee = Organization(
                name=name, classification="committee", chamber="legislature"
            )

            # Evaluate the summary xpath once instead of twice.
            summary_nodes = comm_page.xpath('//p[@class="page-summary"]')
            if summary_nodes:
                committee.extras["summary"] = (
                    summary_nodes[0].text_content().strip()
                )

            # Fix: a page without a Chairperson block previously raised
            # IndexError on chair[0], aborting the whole scrape.
            chair = comm_page.xpath("//h4[text()='Chairperson']/following-sibling::p")
            chair_name = None
            if chair:
                chair_name = self.remove_title(chair[0].text_content().strip())
                committee.add_member(chair_name, role="chair")

            members = comm_page.xpath(
                "//h4[text()='Councilmembers']/following-sibling::ul"
            )
            members = members[0].xpath("./li")

            for m in members:
                mem_name = m.text_content().strip()
                mem_name = self.remove_title(mem_name)
                # The chair is also listed among the members; skip the repeat.
                if mem_name != chair_name:
                    committee.add_member(mem_name)

            committee.add_source(url)
            committee.add_link(url, note="Official Website")

            if not committee._related:
                self.warning("empty committee: %s;", name)
            else:
                yield committee
コード例 #17
0
ファイル: committees.py プロジェクト: csnardi/openstates
    def scrape(self, session=None):
        """Yield NJ committees for *session* from the legislature's Access
        database exports ("Committee" and "COMember" tables)."""
        year_abr = ((int(session) - 209) * 2) + 2000
        self._init_mdb(year_abr)
        members_csv = self.access_to_csv("COMember")
        info_csv = self.access_to_csv("Committee")

        org_dictionary = {}

        # Committee Info Database
        for rec in info_csv:
            abrv = rec["Code"]
            comm_name = rec["Description"]

            if abrv[0] == "A":
                chamber = "lower"
            elif abrv[0] == "S":
                chamber = "upper"
            else:
                # Fix: a code starting with neither "A" nor "S" previously
                # reused `chamber` from the prior record (or raised NameError
                # on the first one). Skip such records explicitly.
                self.warning("unknown committee code %s; skipping", abrv)
                continue

            org = Organization(
                name=comm_name, chamber=chamber, classification="committee"
            )
            org.add_source("http://www.njleg.state.nj.us/downloads.asp")
            org_dictionary[abrv] = org

        # Committee Member Database
        # FIXME: E.g. "SCEB" is the Select Commission on Emergency COVID-19 Borrowing.
        # https://www.njleg.state.nj.us/committees/sceb.asp
        # Its members have "O" for their position code. What does that mean? They're
        # only called members on the web page, so I'll go with that.
        POSITIONS = {
            "C": "chair",
            "V": "vice-chair",
            "": "member",
            "O": "member",
        }
        for member_rec in members_csv:
            # assignment=P means they are active, assignment=R means removed
            if member_rec["Assignment_to_Committee"] == "P":
                abr = member_rec["Code"]
                org = org_dictionary.get(abr)
                if org is None:
                    # Member of a committee we skipped (or missing from the
                    # info table); nothing to attach them to.
                    self.warning("member references unknown committee %s", abr)
                    continue

                leg = member_rec["Member"]
                role = POSITIONS[member_rec["Position_on_Committee"]]
                # "Last, First" -> "First Last"
                leg = " ".join(leg.split(", ")[::-1])
                org.add_member(leg, role=role)

        for org in org_dictionary.values():
            yield org
コード例 #18
0
    def _scrape_select_special_committees(self):
        """Scrapes the Select and Special Committees page of the
        Nebraska state legislature."""
        main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php"
        page = self.lxmlize(main_url)

        committee_nodes = self.get_nodes(
            page,
            '//a[contains(@class, "accordion-switch")]'
            '/ancestor::div[@class="panel panel-leg"]',
        )

        for committee_node in committee_nodes:
            committee_name = self.get_node(
                committee_node,
                './/h2[@class="panel-title"]/text()[normalize-space()]',
            )
            if committee_name is None:
                # Some panels wrap the title in a link.
                committee_name = self.get_node(
                    committee_node,
                    './/h2[@class="panel-title"]/a/text()[normalize-space()]',
                )

            org = Organization(
                name=committee_name,
                chamber="legislature",
                classification="committee",
            )
            org.add_source(main_url)

            members = self.get_nodes(
                committee_node,
                './/a[@class="list-group-item"]/text()[normalize-space()]',
            )

            for member in members:
                stripped = re.sub(r"\Sen\.\s+", "", member)
                stripped = re.sub(r", Chairperson", "", stripped).strip()
                role = "Chairperson" if "Chairperson" in member else "member"
                org.add_member(stripped, role)

            if org._related:
                yield org
            else:
                self.warning(
                    "No members found in {} committee.".format(org.name)
                )
コード例 #19
0
    def scrape_committee(self, chamber, url):
        """Return a committee Organization built from *url*, or None when
        the page lists no members."""
        doc = lxml.html.fromstring(self.get(url).text)

        name = doc.xpath("//title/text()")[0]
        com = Organization(name, chamber=chamber, classification="committee")
        com.add_source(url)

        members = doc.xpath('//a[contains(@href, "/Legislators/Profile")]')
        for member in members:
            # A sibling <span>, when present, carries the member's role.
            title_nodes = member.xpath("../span")
            if title_nodes:
                role = title_nodes[0].text.lower()
            else:
                role = "member"
            com.add_member(member.text, role)

        if members:
            return com
コード例 #20
0
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""

        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        if page.xpath('//li/a[text()="Committee"]'):
            # All TN subcommittees are just the name of the parent committee
            # with " Subcommittee" at the end.
            parent_name = re.sub(
                r"\s*(Study )?Subcommittee\s*", "", committee_name
            )
            com = Organization(
                committee_name,
                classification="committee",
                parent_id=self.parents[parent_name],
            )
        else:
            com = Organization(
                committee_name, chamber=chamber, classification="committee"
            )
            self.parents[committee_name] = com._id

        officer_xpath = (
            '//h2[contains(text(), "Committee Officers")]/'
            "following-sibling::div/ul/li/a"
        )
        member_xpath = (
            '//h2[contains(text(), "Committee Members")]/'
            "following-sibling::div/ul/li/a"
        )
        for a in page.xpath(officer_xpath) + page.xpath(member_xpath):
            # Names may be split between the anchor text and a nested span.
            pieces = [
                text.strip()
                for text in a.xpath("text()") + a.xpath("span/text()")
                if text.strip()
            ]
            member_name = " ".join(pieces)

            small = a.xpath("small")
            role = small[0].xpath("text()")[0].strip() if small else "member"
            if "(Vacant)" in role:
                continue

            com.add_member(member_name, role)

        com.add_link(link)
        com.add_source(link)
        return com
コード例 #21
0
    def scrape_house_committees(self):
        """Yield Minnesota House committees and their members.

        Fix: the original reused the name ``com`` for both the heading
        element and the Organization built from it; distinct names make the
        ordering (read ``members_url`` before building the committee) clear.
        """
        url = "http://www.house.leg.state.mn.us/comm/commemlist.asp"

        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        for heading in doc.xpath('//h2[@class="commhighlight"]'):
            members_url = heading.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href')[0]

            com = Organization(heading.text,
                               chamber="lower",
                               classification="committee")
            com.add_source(members_url)

            try:
                member_html = self.get(members_url).text
                mdoc = lxml.html.fromstring(member_html)
            except HTTPError:
                self.warning(
                    "Member list for {} failed to respond; skipping".format(
                        com.name))
                continue

            # each legislator in their own table
            # first row, second column contains all the info
            for ltable in mdoc.xpath("//table/tr[1]/td[2]/p/b[1]"):

                # name is tail string of last element
                name = ltable.text_content()
                text = ltable.text
                if text and name != text:
                    name = name.replace(text, "")

                # role is inside a nested b tag
                role = ltable.xpath("b/*/text()")
                if role:
                    # if there was a role, remove it from name
                    role = role[0]
                    name = name.replace(role, "")
                else:
                    role = "member"
                # drop any trailing parenthetical annotation
                name = name.split(" (")[0]
                com.add_member(name.strip(), role)

            # save
            yield com
コード例 #22
0
    def scrape_upper_committee(self, url):
        """Yield a Puerto Rico Senate committee from *url*, translating
        Spanish committee prefixes and member titles for parity with other
        states."""
        doc = self.lxmlize(url)
        inner_content = self.get_node(doc, '//section[@class="inner-content"]')
        comm_name = self.get_node(inner_content, ".//h2").text.strip()

        # Remove "Committee" from committee names (order matters: longer,
        # more specific prefixes are tried before the bare "Comisión ").
        for prefix in (
            "Comisión de ",
            "Comisión sobre ",
            "Comisión para ",
            "Comisión Especial para el Estudio de ",
            "Comisión Especial para ",
            "Comisión ",
        ):
            comm_name = comm_name.replace(prefix, "")
        comm_name = re.sub(r"(?u)^(las?|el|los)\s", "", comm_name)
        comm_name = comm_name[0].upper() + comm_name[1:]

        comm = Organization(comm_name, chamber="upper", classification="committee")
        comm.add_source(url)

        for member in self.get_nodes(inner_content, ".//li"):
            name_parts = member.text.split("-")
            name = name_parts[0].replace("Hon. ", "").strip()

            if len(name_parts) <= 1:
                comm.add_member(name)
                continue

            title = name_parts[1].strip()

            # Translate titles to English for parity with other states
            if "President" in title:
                title = "chairman"
            elif title.startswith("Vicepresident"):
                title = "vicechairman"
            elif title.startswith("Secretari"):
                title = "secretary"
            else:
                raise AssertionError(
                    "Unknown member type: {}".format(title))

            comm.add_member(name, title)

        yield comm
コード例 #23
0
    def scrape_house_committees(self):
        """Yield Michigan House committees from the committee drop-down and
        their individual pages."""
        base_url = "http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey="
        html = self.get("http://house.mi.gov/mhrpublic/committee.aspx").text
        doc = lxml.html.fromstring(html)

        # get values out of drop down
        for opt in doc.xpath("//option"):
            name = opt.text
            # skip invalid choice
            if opt.text in ("Statutory Committees", "Select One"):
                continue
            if "have not been created" in opt.text:
                self.warning("no committees yet for the house")
                return
            com_url = base_url + opt.get("value")
            com_html = self.get(com_url).text
            cdoc = lxml.html.fromstring(com_html)
            com = Organization(chamber="lower",
                               name=name,
                               classification="committee")
            com.add_source(com_url)

            # Fix: removed a leftover loop that re-read "memberLink" anchors
            # from the drop-down page and clobbered `name`; its result was
            # never used.

            # all links to http:// pages in servicecolumn2 are legislators
            members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
            for mem in members:
                name = mem.xpath("./a")
                if name:
                    name = name[0].text.strip()
                else:
                    # this is a blank row
                    continue
                text = mem.xpath("./span")[0].text
                if "Committee Chair" in text:
                    role = "chair"
                elif "Vice-Chair" in text:
                    role = "vice chair"
                else:
                    role = "member"
                com.add_member(name, role=role)

            yield com
コード例 #24
0
    def scrape(self, session=None):
        """Scrape New Jersey committees from the legislature's Access DB export.

        Builds Organizations from the Committee table, then fills in members
        from the COMember table, and yields each populated Organization.
        """
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["name"]
            self.info("no session specified, using %s", session)

        year_abr = session[0:4]

        self._init_mdb(year_abr)
        members_csv = self.access_to_csv("COMember")
        info_csv = self.access_to_csv("Committee")

        # maps committee code abbreviation -> Organization
        org_dictionary = {}

        # Committee Info Database
        for rec in info_csv:
            abrv = rec["Code"]
            comm_name = rec["Description"]

            # "A..." codes are Assembly (lower), "S..." are Senate (upper)
            if abrv[0] == "A":
                chamber = "lower"
            elif abrv[0] == "S":
                chamber = "upper"
            else:
                # BUG FIX: previously fell through with `chamber` unbound
                # (NameError) or stale from the prior record.
                self.warning("unknown committee code prefix: %s", abrv)
                continue

            org = Organization(name=comm_name,
                               chamber=chamber,
                               classification="committee")
            org.add_source("http://www.njleg.state.nj.us/downloads.asp")
            org_dictionary[abrv] = org

        # Committee Member Database
        POSITIONS = {"C": "chair", "V": "vice-chair", "": "member"}
        for member_rec in members_csv:
            # assignment=P means they are active, assignment=R means removed
            if member_rec["Assignment_to_Committee"] == "P":
                abr = member_rec["Code"]
                org = org_dictionary[abr]

                leg = member_rec["Member"]
                role = POSITIONS[member_rec["Position_on_Committee"]]
                # name is stored "Last, First" -> convert to "First Last"
                leg = " ".join(leg.split(", ")[::-1])
                org.add_member(leg, role=role)

        for org in org_dictionary.values():
            yield org
コード例 #25
0
    def scrape_senate_committee(self, url):
        """Scrape one Senate committee page and yield its Organization.

        Detects subcommittees from a parent link in an h4 heading, then
        collects members with their labeled roles.
        """
        page = lxml.html.fromstring(self.get(url).text)

        com_name = page.xpath('//a[contains(@href, "committee_bio")]/text()')[0]
        parent_names = page.xpath('//h4//a[contains(@href, "committee_bio")]/text()')
        if parent_names:
            self.log("%s is subcommittee of %s", com_name, parent_names[0])
            com = Organization(
                com_name,
                chamber="upper",
                classification="committee",
                parent_id={
                    "name": parent_names[0],
                    "classification": "upper"
                },
            )
        else:
            com = Organization(com_name,
                               chamber="upper",
                               classification="committee")

        # role labels appear as bold text immediately before the member link
        role_labels = {
            "Chair:": "chair",
            "Vice Chair:": "vice chair",
            "Ranking Minority Member:": "ranking minority member",
        }
        member_links = page.xpath(
            '//div[@id="members"]//a[contains(@href, "member_bio")]')
        for member_link in member_links:
            member_name = member_link.text_content().strip()
            if not member_name:
                continue
            labels = member_link.xpath(".//preceding-sibling::b/text()")
            if not labels:
                role = "member"
            elif labels[0] in role_labels:
                role = role_labels[labels[0]]
            else:
                raise ValueError("unknown position: %s" % labels[0])

            # drop any trailing "(...)" annotation from the name
            member_name = member_name.split(" (")[0]
            com.add_member(member_name.strip(), role)

        com.add_source(url)
        yield com
コード例 #26
0
    def scrape_current(self, chamber):
        """Scrape current Kansas committees for one chamber via the KS API.

        Senate scrapes also pick up the special (joint) committees; each
        committee's detail endpoint supplies chairs, vice-chairs, ranking
        members, and plain members.
        """
        if chamber == "upper":
            committee_types = ["special_committees", "senate_committees"]
        else:
            committee_types = ["house_committees"]

        committee_json = json.loads(self.get(ksapi.url + "ctte/").text)

        for com_type in committee_types:
            for committee_data in committee_json["content"][com_type]:
                # special committees are joint bodies of the whole legislature
                if com_type == "special_committees":
                    com_chamber = "legislature"
                else:
                    com_chamber = chamber

                committee = Organization(
                    committee_data["TITLE"],
                    chamber=com_chamber,
                    classification="committee",
                )

                com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
                try:
                    detail_json = self.get(com_url).text
                except scrapelib.HTTPError:
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)["content"]

                for chair in details["CHAIR"]:
                    chair_name = chair.get("FULLNAME", None)
                    if not chair_name:
                        # fall back to deriving a name from the KPID
                        chair_name = self.parse_kpid(chair["KPID"])
                        self.warning("no FULLNAME for %s", chair["KPID"])
                    committee.add_member(chair_name, "chairman")
                for vicechair in details["VICECHAIR"]:
                    committee.add_member(vicechair["FULLNAME"],
                                         "vice-chairman")
                for ranked in details["RMMEM"]:
                    committee.add_member(ranked["FULLNAME"],
                                         "ranking member")
                for member in details["MEMBERS"]:
                    committee.add_member(member["FULLNAME"])

                # only yield committees that actually gained members
                if committee._related:
                    committee.add_source(com_url)
                    yield committee
                else:
                    self.warning("skipping blank committee %s" %
                                 committee_data["TITLE"])
コード例 #27
0
    def scrape_lower_committee(self, link, name):
        """Scrape a WV House committee page and return its Organization.

        *link* is the anchor element pointing at the committee page and
        *name* is the committee's display name.
        """
        url = re.sub(r"\s+", "", link.attrib["href"])
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        committee = Organization(name=name,
                                 chamber="lower",
                                 classification="committee")
        committee.add_source(url)

        # each "?member=" anchor is one committee member
        for member_link in page.xpath('//a[contains(@href, "?member=")]'):
            member = member_link.text_content().strip()
            # drop the honorific prefix
            member = re.sub(r"^Delegate\s+", "", member)
            role = (member_link.getnext().text or "member").strip()
            committee.add_member(member, role)

        return committee
コード例 #28
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a House (or joint) committee roster page.

        When *parent* is given, an Organization is built for the parent and
        a subcommittee Organization is linked to it; otherwise a single
        committee Organization is built. Yields the (parent) committee.

        NOTE(review): the subcommittee Organization is created but never
        yielded — confirm whether that is intentional.
        """
        page = self.curl_lxmlize(url)

        if "Joint" in name or (parent and "Joint" in parent):
            chamber = "joint"
        else:
            chamber = "lower"

        subcomm = None  # only bound when scraping a subcommittee
        if parent:
            comm = Organization(
                name=parent, chamber=chamber, classification="committee"
            )
            subcomm = Organization(
                name=name, parent_id=comm, classification="committee"
            )
        else:
            comm = Organization(name=name, chamber=chamber, classification="committee")
        comm.add_source(url)

        # member links all point at a "District" page
        xpath = "//a[contains(@href, 'District')]"
        for link in page.xpath(xpath):
            member = link.xpath("string()").strip()
            member = re.sub(r"\s+", " ", member)

            if not member or member == "House District Maps":
                continue

            # optional "(Co-)(Vice )Chair" prefix, then "Rep. Name"
            match = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", member)
            member = match.group(4).strip()
            role = match.group(1) or "member"

            member = member.replace("Representative ", "")

            comm.add_member(member, role.lower())

        if not comm._related:
            # BUG FIX: `subcomm` was referenced unconditionally here and
            # raised NameError whenever `parent` was falsy.
            if subcomm is not None and subcomm.name == "test":
                # Whoopsie, prod data.
                return

            sub_label = subcomm.name if subcomm is not None else name
            raise Exception("no members for %s (%s)" % (comm.name, sub_label))

        yield comm
コード例 #29
0
    def scrape_lower_committee(self, committee_name, url):
        """Scrape a lower-chamber commission page and yield its Organization.

        Parses member text nodes out of the commission panel, strips the
        "Hon." honorific, and splits any leadership title off each name.
        Yields nothing when no members are found.
        """
        page = self.lxmlize(url)

        comm = Organization(committee_name.strip(),
                            chamber="lower",
                            classification="committee")
        comm.add_source(url)

        info_node = self.get_node(
            page,
            './/div[@id = "dnn_ctr1109_ViewWebCommission_WebCommission1_'
            'pnlCommission"]',
        )

        # This will likely capture empty text nodes as well.
        raw_members = self.get_nodes(
            info_node,
            './/div[@class="two-cols com"]/div[@class="col"]//text()'
            "[normalize-space() and preceding-sibling::br]",
        )

        added = 0
        for raw_member in raw_members:
            cleaned = re.sub(r"Hon\.\s*", "", raw_member).strip()

            # Skip empty nodes.
            if not cleaned:
                continue

            member, title = self._match_title(cleaned)

            if title is None:
                comm.add_member(member)
            else:
                comm.add_member(member, title)

            added += 1

        if added:
            yield comm
コード例 #30
0
    def scrape(self, chamber=None):
        """Scrape Utah committees from the legislature's JSON feeds.

        Committee membership is keyed by legislator ID, so the legislators
        feed is loaded first to map IDs to display names. Yields one
        Organization per committee.
        """
        committees_url = "http://le.utah.gov/data/committees.json"
        committees = self.get(committees_url).json()["committees"]

        people_url = "http://le.utah.gov/data/legislators.json"
        people = self.get(people_url).json()["legislators"]

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person["id"]] = person["formatName"]

        for committee in committees:
            name = committee["description"]
            # strip generic suffixes, then derive chamber from the prefix
            if name.endswith(" Committee"):
                name = name[: len(name) - len(" Committee")]
            elif name.endswith(" Subcommittee"):
                name = name[: len(name) - len(" Subcommittee")]
            if name.startswith("House "):
                name = name[len("House ") :]
                chamber = "lower"
            elif name.startswith("Senate "):
                name = name[len("Senate ") :]
                chamber = "upper"
            else:
                chamber = "legislature"

            c = Organization(chamber=chamber, name=name, classification="committee")
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee["link"])

            for member in committee["members"]:
                try:
                    member_name = ids_to_names[member["id"]]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " + member["id"]
                    )
                    # BUG FIX: previously fell through and added a stale
                    # (or, on the first miss, undefined) member_name.
                    continue
                c.add_member(member_name, role=member["position"])

            yield c