Example #1
0
    def scrape_committees(self, session):
        """Yield committee Organizations (with members) for *session*.

        Looks up the OLIS session key, fetches committees from the API,
        and resolves member codes to legislator names.
        """
        session_key = SESSION_KEYS[session]
        committees_response = self.api_client.get('committees', session=session_key)

        legislators = index_legislators(self, session_key)

        for committee in committees_response:
            org = Organization(
                chamber={'S': 'upper', 'H': 'lower',
                         'J': 'legislature'}[committee['HouseOfAction']],
                name=committee['CommitteeName'],
                classification='committee')
            org.add_source(
                'https://olis.leg.state.or.us/liz/{session}'
                '/Committees/{committee}/Overview'.format(session=session_key,
                                                          committee=committee['CommitteeName']))
            members_response = self.api_client.get('committee_members',
                                                   session=session_key,
                                                   committee=committee['CommitteeCode'])
            for member in members_response:
                try:
                    member_name = legislators[member['LegislatorCode']]
                except KeyError:
                    # Fall back to the raw code when the roster lookup fails.
                    # logger.warn() is deprecated; use warning().
                    logger.warning('Legislator {} not found in session {}'.format(
                        member['LegislatorCode'], session_key))
                    member_name = member['LegislatorCode']
                # Title may be empty/None; normalize to ''.
                org.add_member(member_name, role=member['Title'] if member['Title'] else '')

            yield org
Example #2
0
    def scrape_reps_comm(self):
        """Yield Maine House committee rosters.

        NOTE: as of 1/27/15 the page shows the wrong session number
        (126th) at the top, but lists newly elected people, so we're
        rolling with it.
        """
        url = 'http://legislature.maine.gov/house/hsecoms.htm'
        root = lxml.html.fromstring(self.get(url).text)

        # Committee headings live at odd-numbered <center> elements;
        # the matching roster is the Nth <ul> on the page.
        for list_index, center_index in enumerate(range(1, 12, 2), start=1):
            heading = root.xpath('string(//body/center[%s]/h1/a)' % (center_index))
            committee = Organization(chamber='lower', name=heading,
                                     classification='committee')

            for anchor in root.xpath('/html/body/ul[%s]/li/a' % (list_index)):
                member = anchor.text
                paren = member.find('(')
                if paren != -1:
                    member = member[15: paren].strip()
                if 'chair' in member.lower():
                    role = 'chair'
                    member = re.sub(r'(?i)[\s,]*chair\s*$', '', member).strip()
                else:
                    role = 'member'
                committee.add_member(member, role)
            committee.add_source(url)

            yield committee
Example #3
0
    def _scrape_upper_committee(self, name, url2):
        """Scrape an upper-chamber committee's Assignments page.

        The first two member links are the Chairman and Vice-Chairman.
        Yields the populated Organization.
        """
        cat = "Assignments.asp"
        url3 = url2.replace("default.asp", cat)

        committee = Organization(name,
                                 chamber="upper",
                                 classification="committee"
                                 )
        committee.add_source(url2)

        page = self.lxmlize(url3)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        for link in members:
            role = "member"
            if link == members[0]:
                role = "Chairman"
            if link == members[1]:
                role = "Vice-Chairman"

            # Use a distinct variable so the `name` parameter isn't shadowed.
            member_name = link.xpath('string()')
            member_name = member_name.replace('Senator ', '')
            # BUG FIX: use a raw string -- '[\s]{2,}' triggered an
            # invalid-escape DeprecationWarning on modern Python.
            member_name = re.sub(r'\s{2,}', ' ', member_name).strip()

            committee.add_member(member_name, role)

        yield committee
Example #4
0
    def _scrape_lower_special_committees(self):
        """Scrape the Louisiana House special committees and yield each one."""
        url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
        page = self.lxmlize(url)

        accordion = page.xpath('//div[@class="accordion"]')[0]

        for header in accordion.xpath('./h3'):
            committee_name = self._normalize_committee_name(
                header.xpath('string()').strip())

            # Joint committees belong to the whole legislature.
            chamber = 'legislature' if committee_name.startswith('Joint') else 'lower'

            committee = Organization(committee_name, chamber=chamber,
                                     classification='committee')
            committee.add_source(url)

            roster_rows = header.xpath('./following-sibling::div[@class="pane"]'
                                       '//tr[@class="linkStyle2"]')

            for row in roster_rows:
                member_name = self._normalize_member_name(
                    row.xpath('normalize-space(string(./th[1]))'))
                member_role = self._normalize_member_role(
                    row.xpath('normalize-space(string(./th[2]))'))
                committee.add_member(member_name, member_role)

            yield committee
def test_committee_add_member_person():
    """Adding a Person via add_member records a membership tying the two."""
    committee = Organization('Defense', classification='committee')
    person = Person('John Adams')
    committee.add_member(person, role='chairman')
    membership = committee._related[0]
    assert membership.person_id == person._id
    assert membership.organization_id == committee._id
    assert membership.role == 'chairman'
Example #6
0
    def scrape_chamber(self, chamber):
        """Yield AZ committees for *chamber* from the latest session."""
        session = self.latest_session()
        # since we are scraping only latest_session
        session_id = session_metadata.session_id_meta_data[session]

        client = AZClient()
        legislative_body = 'S' if chamber == 'upper' else 'H'
        committees = client.list_committees(
            sessionId=session_id,
            includeOnlyCommitteesWithAgendas='false',
            legislativeBody=legislative_body,
        )
        for committee in committees.json():
            c = Organization(name=committee['CommitteeName'],
                             chamber=chamber, classification='committee')
            details = client.get_standing_committee(
                sessionId=session_id,
                legislativeBody=legislative_body,
                committeeId=committee['CommitteeId'],
                includeMembers='true',
            )
            for member in details.json()[0]['Members']:
                c.add_member(
                    u'{} {}'.format(member['FirstName'], member['LastName']),
                    role=parse_role(member),
                )
            # BUG FIX: previously added once per member inside the loop,
            # producing one duplicate source entry per committee member.
            c.add_source(details.url)

            c.add_source(committees.url)
            yield c
Example #7
0
    def handle_page(self):
        """Parse the committee page in self.doc and yield its Organization."""
        name = self.doc.xpath('//h2[@class="committeeName"]')[0].text

        parent = None
        chamber = 'upper'
        if name.startswith('Appropriations Subcommittee'):
            # Subcommittees hang off the Appropriations committee and
            # carry no chamber of their own.
            name = name.replace('Appropriations ', '')
            parent = {'name': 'Appropriations', 'classification': 'upper'}
            chamber = None
        elif name.startswith('Committee on'):
            name = name.replace('Committee on ', '')

        comm = Organization(name=name, classification="committee",
                            chamber=chamber, parent_id=parent,
                            )

        # <dt>role</dt>/<dd>member</dd> pairs hold the titled members.
        for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
            role = dt.text.replace(': ', '').strip().lower()
            member = self.clean_name(
                dt.xpath('./following-sibling::dd')[0].text_content())
            comm.add_member(member, role=role)

        # Plain <li> entries are untitled members.
        for li in self.doc.xpath('//div[@id="members"]/ul/li'):
            comm.add_member(self.clean_name(li.text_content()))

        comm.add_source(self.url)

        yield comm
Example #8
0
    def scrape_interim_committee(self, link, name):
        """Scrape a WV interim (joint) committee page.

        Subcommittees are attached to a parent committee, which may be
        manually mapped or derived from the name. Returns the Organization.
        """
        url = re.sub(r'\s+', '', link.attrib['href'])
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        if 'Subcommittee' in name:
            # Check whether the parent committee is manually defined first
            # before attempting to automatically resolve it.
            parent = WVCommitteeScraper.subcommittee_parent_map.get(name, None)
            if parent is None:
                parent = name.partition('Subcommittee')[0].strip()

            comm = Organization(
                name=name,
                classification='committee',
                parent_id={'name': parent, 'classification': 'joint'}
            )
        else:
            comm = Organization(name=name, classification='committee', chamber='joint')
        comm.add_source(url)

        # BUG FIX: the loop previously rebound both the `link` and `name`
        # parameters; use distinct variables for the member rows.
        xpath = '//a[contains(@href, "?member=")]'
        for member_link in doc.xpath(xpath):
            member_name = member_link.text_content().strip()
            member_name = re.sub(r'^Delegate\s+', '', member_name)
            member_name = re.sub(r'^Senator\s+', '', member_name)
            role = member_link.getnext().text or 'member'
            comm.add_member(member_name, role.strip())

        return comm
Example #9
0
    def scrape_approp_subcommittees(self):
        """Scrape MI Senate Appropriations subcommittees; yield each one."""
        URL = 'http://www.senate.michigan.gov/committee/appropssubcommittee.html'
        doc = lxml.html.fromstring(self.get(URL).text)

        # Role suffix -> title; the slice length also drops the space
        # that precedes the tag.
        role_tags = (('(C)', 'chairman'),
                     ('(VC)', 'vice chairman'),
                     ('(MVC)', 'minority vice chairman'))

        for strong in doc.xpath('//strong'):
            com = Organization(
                name=strong.text.strip(),
                parent_id=self._senate_appropriations,
                classification='committee',
            )
            com.add_source(URL)

            roster = strong.getnext().tail.replace('Senators', '').strip()
            for member in re.split(', | and ', roster):
                role = 'member'
                for tag, title in role_tags:
                    if member.endswith(tag):
                        role = title
                        member = member[:-(len(tag) + 1)]
                        break
                com.add_member(member, role=role)

            yield com
Example #10
0
    def scrape_committee(self, chamber, name, url):
        """Scrape one committee page; yield the Organization unless empty."""
        doc = lxml.html.fromstring(self.get(url).text)

        # The page itself tells us when the committee is actually joint.
        if doc.xpath("//h3[. = 'Joint Committee']"):
            chamber = 'joint'

        heading = doc.xpath("//h3[@align='center']/text()")[0]
        if "Subcommittee" in heading:
            # Subcommittees are parented to the named full committee.
            comm = Organization(
                name=heading, classification='committee',
                parent_id={'classification': chamber, 'name': name})
        else:
            comm = Organization(
                chamber=chamber, name=name, classification='committee')

        comm.add_source(url)

        for anchor in doc.xpath("//a[contains(@href, 'member=')]"):
            member = anchor.text.strip()
            member_type = anchor.xpath("string(../preceding-sibling::td[1])")
            comm.add_member(member, member_type.strip(": \r\n\t").lower())

        if comm._related:
            yield comm
        else:
            self.warning('not saving %s, appears to be empty' % name)
Example #11
0
    def scrape_committee(self, term, href, name):
        """Scrape a committee roster page and yield its Organization.

        Pages whose chamber cannot be determined from the URL are skipped
        (interim committees and others caused duplicate committee issues).
        """
        page = lxml.html.fromstring(self.get(href).text)
        page.make_links_absolute(href)
        member_links = page.xpath("//div[@class='view-content']"
                                  "//a[contains(@href, 'members')]")

        chamber_by_path = {'/joint/': 'legislature',
                           '/senate/': 'upper',
                           '/house/': 'lower'}
        chamber = next((c for part, c in chamber_by_path.items()
                        if part in href), None)
        if chamber is None:
            self.warning('Failed to identify chamber for {}; skipping'.format(href))
            return

        cttie = Organization(name, chamber=chamber, classification='committee')
        role_map = {"Legislative Members": "member",
                    "Chairman": "chair",
                    "Vice Chairman": "member"}
        for anchor in member_links:
            member = anchor.text
            pane_title = anchor.xpath(
                "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
            role = role_map[pane_title]

            # Skip district placeholders and empty anchors.
            if member is None or member.startswith("District"):
                continue

            member = member.replace('Senator ', '').replace('Representative ', '')

            cttie.add_member(member, role=role)

        cttie.add_source(href)
        yield cttie
Example #12
0
    def scrape_approp_subcommittees(self, url):
        """Scrape Appropriations subcommittees listed at *url*; yield each."""
        doc = lxml.html.fromstring(self.get(url).text)

        # Role suffix -> title; the slice length also drops the space
        # that precedes the tag.
        role_tags = (('(C)', 'chairman'),
                     ('(VC)', 'vice chairman'),
                     ('(MVC)', 'minority vice chairman'))

        for strong in doc.xpath('//strong'):
            com = Organization(
                name=strong.text.strip(),
                parent_id={
                    'name': 'Appropriations',
                    'classification': 'committee',
                },
                classification='committee',
            )
            com.add_source(url)

            roster = strong.getnext().tail.replace('Senators', '').strip()
            for member in re.split(', | and ', roster):
                role = 'member'
                for tag, title in role_tags:
                    if member.endswith(tag):
                        role = title
                        member = member[:-(len(tag) + 1)]
                        break
                com.add_member(member, role=role)

            yield com
Example #13
0
    def scrape(self):
        """Search MO campaign-finance committees and yield each candidate
        Person together with their committee Organization.

        The ASP.NET search form is driven by repeated POSTs of scraped
        form data; paging continues until the "next page" link vanishes.
        """
        url = 'http://www.mec.mo.gov/EthicsWeb/CampaignFinance/CF11_SearchComm.aspx'

        # Searching by vowels (plus y) covers effectively every last name.
        for letter in ['a', 'e', 'i', 'o', 'u', 'y']:

            print("Searching '{}'".format(letter))
            initial = self.get(url).text
            parsed = lxml.html.fromstring(initial)

            page_n = 0

            data = get_form_data(parsed, first_time=True)
            data['ctl00$ContentPlaceHolder$txtCandLast'] = letter

            while True:
                page_n += 1

                print("Page: {}".format(page_n))

                # NOTE(review): PageIndex cookie is always "1" -- paging
                # appears to be driven by the posted form data; confirm.
                response = self.post(url, data=data, cookies=dict(PageIndex=str(1)))

                output = lxml.html.fromstring(response.text)

                rows = output.cssselect('#ctl00_ContentPlaceHolder_grvSearch tr')

                # BUG FIX (readability): the response object was previously
                # bound to `r` and then shadowed by the row variable below.
                for row in rows:
                    tds = row.cssselect('td')
                    # Data rows have >3 cells; header/pager rows do not.
                    if len(tds) > 3:

                        name = tds[2].text_content().strip()

                        _registrant = Person(
                            name=name,
                            source_identified=True
                        )

                        committee_name = tds[1].text_content().strip()
                        _office = Organization(
                            name=committee_name,
                            classification='Committee',
                            # parent_id=self.jurisdiction._state,
                            source_identified=True
                        )

                        _office.add_member(
                            _registrant,
                            role='committee candidate',
                            label='candidate for {n}'.format(n=_office.name),
                        )

                        yield _registrant
                        yield _office

                # Stop when the "next page" control is absent.
                if not output.xpath("//*[@id='ctl00_ContentPlaceHolder_grvSearch_ctl28_lbtnNextPage']"):
                    print(output.xpath("//*[@id='ctl00_ContentPlaceHolder_grvSearch_ctl28_lbtnNextPage']"))
                    break

                data = get_form_data(output)
Example #14
0
    def scrape_senate_committee(self, url):
        """Scrape one Senate committee page at *url* and yield it."""
        doc = lxml.html.fromstring(self.get(url).text)

        headers = doc.xpath('(//div[@class="row"])[2]//h1')
        assert len(headers) == 1
        name = re.sub(r'\s+Committee.*$', '',
                      ' '.join(headers[0].xpath('./text()')))

        com = Organization(chamber='upper', name=name, classification='committee')

        for item in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
            text = item.text_content()
            # NOTE(review): strips "Representative " on a Senate page --
            # presumably mirrors a sibling House scraper; confirm intended.
            member_name = item.xpath('./a/text()')[0].replace('Representative ', '')
            if 'Committee Chair' in text:
                role = 'chair'
            elif 'Minority Vice' in text:
                role = 'minority vice chair'
            elif 'Vice' in text:
                role = 'majority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        yield com
Example #15
0
    def scrape_committee(self, name, url, chamber):
        """Build and return a committee Organization from its roster page."""
        org = Organization(name=name, chamber=chamber, classification='committee')
        org.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)

        for entry in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
            entry = entry.replace('Representative ', '')
            entry = entry.replace('Senator ', '')
            entry = entry.strip()
            role = 'member'
            if ' (' in entry:
                entry, title = entry.split(' (')
                # Order matters: "Vice-Chair" also contains "Chair".
                if 'Vice-Chair' in title:
                    role = 'vice-chair'
                elif 'Co-Chair' in title:
                    role = 'co-chair'
                elif 'Chair' in title:
                    role = 'chair'
                else:
                    raise Exception('unknown role: %s' % title)
            org.add_member(entry, role)

        return org
Example #16
0
    def scrape_lower_committee(self, name, url):
        """Return a lower-chamber committee with de-duplicated members.

        The first member link on the page is treated as the chair.
        """
        page = self.lxmlize(url)

        committee = Organization(chamber='lower', name=name,
                                 classification="committee")
        committee.add_source(url)

        seen = set()

        member_links = self.get_nodes(
            page,
            '//div[@class="mod-inner"]//a[contains(@href, "mem")]')

        for member_link in member_links:
            member_name = member_link.text
            if member_name is None:
                continue

            # Figure out if this person is the chair.
            if member_link == member_links[0]:
                member_role = 'chair'
            else:
                member_role = 'member'

            # BUG FIX: previously tested the committee `name` against `seen`
            # (always true), so duplicate members were never skipped.
            if member_name not in seen:
                committee.add_member(member_name, member_role)
                seen.add(member_name)

        return committee
Example #17
0
    def scrape_page(self, link, chamber=None):
        """Scrape the committee roster page that *link* points to; yield it."""
        url = link.attrib['href']
        page = self.lxmlize(url)
        role_map = {
            "Chair": "chair",
            "Vice Chair": "vice-chair",
            "Vice-Chair": "vice-chair",
        }
        committee = Organization(link.text,
                                 chamber=chamber,
                                 classification='committee')
        committee.add_source(url)

        for member in page.xpath('//div[@class="members"]/'
                                 'div[@class="roster-item"]'):
            details = member.xpath('.//div[@class="member-details"]')[0]
            raw_name = details.xpath('./h4')[0].text_content()
            # This page does random weird things with whitespace in names;
            # collapse it all to single spaces.
            person = ' '.join(raw_name.strip().split())
            if not person:
                continue
            role_node = details.xpath('./span[@class="member-role"]')
            role = role_map[role_node[0].text] if role_node else 'member'
            committee.add_member(person, role=role)
        yield committee
Example #18
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Parse the committees PDF and yield committee Organizations.

        2015 House PDFs need a text fix-up before conversion; committee
        names split across lines by the PDF layout are re-joined first.
        """
        if chamber == 'lower' and year == '2015':
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type='text-nolayout').decode()

        # Re-join committee names the PDF wraps across lines.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
                'Judicial Branch, Law Enforcement, and Justice'),

            (r'Natural Resources and\s+Transportation',
                'Natural Resources and Transportation'),

            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
                'Federal Relations, Energy, and Telecommunications')
                ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents
            line = line.replace(" ", " ").replace("‐", "-")

            if 'Subcommittees' in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                # Flush the previous committee before starting a new one.
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(name=committee, chamber=chamber,
                                    classification='committee')

                comm.add_source(url)

            elif is_legislator_name(line):
                name, party = line.rsplit('(', 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                if re.search(' Ch', party):
                    role = 'chair'
                elif ' VCh' in party:
                    role = 'vice chair'
                elif ' MVCh' in party:
                    role = 'minority vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # BUG FIX: guard against `comm` being None when no committee header
        # was ever found (previously raised AttributeError).
        if comm and comm._related:
            yield comm
Example #19
0
    def scrape(self, session=None):
        """Yield WY committees for *session* via the LSO JSON API."""
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        # com_types = ['J', 'SE', 'O']
        # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
        list_url = 'https://wyoleg.gov/LsoService/api/committees/{}'.format(session)
        coms_json = json.loads(self.get(list_url).content.decode('utf-8'))

        for row in coms_json:
            detail_url = 'https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}'.format(
                session, row['ownerID'])
            com = json.loads(self.get(detail_url).content.decode('utf-8'))

            # WY doesn't seem to have any house/senate only committees that I can find
            committee = Organization(
                name=com['commName'], chamber='legislature', classification='committee')

            for member in com['commMembers']:
                is_chair = member['chairman'] == 'Chairman'
                committee.add_member(member['name'],
                                     'chairman' if is_chair else 'member')

            # some WY committees have non-legislators appointed to the member by the Governor
            # but the formatting is super inconsistent
            if com['otherMembers']:
                committee.extras['other_members'] = com['otherMembers']

            committee.extras['wy_id'] = com['commID']
            committee.extras['wy_code'] = com['ownerID']
            committee.extras['wy_type_code'] = com['type']
            committee.extras['budget'] = com['budget']

            if com['statAuthority']:
                committee.extras['statutory_authority'] = com['statAuthority']
            if com['number']:
                committee.extras['seat_distribution'] = com['number']

            committee.add_identifier(
                scheme='WY Committee ID', identifier=str(com['commID']))
            committee.add_identifier(
                scheme='WY Committee Code', identifier=str(com['ownerID']))
            if com['description']:
                committee.add_identifier(
                    scheme='Common Name', identifier=com['description'])

            committee.add_source(
                'http://wyoleg.gov/Committees/{}/{}'.format(session, com['ownerID']))

            yield committee
Example #20
0
    def scrape_chamber(self, chamber=None):
        """Yield Person objects (with committee memberships) for *chamber*."""
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                # Workaround for a bad legislator page, seen while profiles
                # were being filled out (webmaster emailed Jun 23, 2014 --
                # told to wait). Previously reported via print(); use the
                # scraper's warn logger for consistency with the rest of
                # this method.
                self.warn("Bad Legislator page; skipping: %s" %
                          "; ".join(leg['source']))
                continue

            person = Person(name=leg['name'], district=leg['district'],
                            party=leg['party'], primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                for ctty in leg['ctty']:
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    # NOTE(review): `comm` is never yielded or given a
                    # source; only the membership side effect is kept --
                    # confirm this is intended.
                    comm = Organization(name=ctty['name'], classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")

            except KeyError:
                self.warn("%s has no scraped Committees" % leg['name'])

            person.add_link(leg['homepage'])

            if leg['addr']:
                person.add_contact_detail(type='address', value=leg['addr'], note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice', value=leg['phone'], note='Capitol Office')
            if leg['email']:
                person.add_contact_detail(type='email', value=leg['email'], note='Capitol Office')
            if leg['fax']:
                person.add_contact_detail(type='fax', value=leg['fax'], note='Capitol Office')
            yield person
Example #21
0
    def scrape(self, session=None):
        """Yield NJ committees built from the legislature's Access DBs."""
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]['name']
            self.info('no session specified, using %s', session)

        year_abr = session[0:4]

        self._init_mdb(year_abr)
        members_csv = self.access_to_csv('COMember')
        info_csv = self.access_to_csv('Committee')

        org_dictionary = {}

        # Committee Info Database
        for rec in info_csv:
            abrv = rec["Code"]
            comm_name = rec["Description"]

            if abrv[0] == "A":
                chamber = "lower"
            elif abrv[0] == "S":
                chamber = "upper"
            else:
                # BUG FIX: previously fell through with `chamber` unset,
                # silently reusing the previous record's chamber (or
                # raising NameError on the first record). Fail loudly.
                raise ValueError('unexpected committee code: %s' % abrv)

            org = Organization(
                name=comm_name,
                chamber=chamber,
                classification='committee',
            )
            org.add_source('http://www.njleg.state.nj.us/downloads.asp')
            org_dictionary[abrv] = org

        # Committee Member Database
        POSITIONS = {
            'C': 'chair',
            'V': 'vice-chair',
            '': 'member'
        }
        for member_rec in members_csv:
            # assignment=P means they are active, assignment=R means removed
            if member_rec['Assignment_to_Committee'] == 'P':
                abr = member_rec["Code"]
                org = org_dictionary[abr]

                leg = member_rec["Member"]
                role = POSITIONS[member_rec["Position_on_Committee"]]
                # "Last, First" -> "First Last"
                leg = ' '.join(leg.split(', ')[::-1])
                org.add_member(leg, role=role)

        for org in org_dictionary.values():
            yield org
Example #22
0
    def _scrape_committee(self, committee_name, link, chamber):
        """Scrape individual committee page and add members"""
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        if page.xpath('//li/a[text()="Committee"]'):
            # All TN subcommittees are just the name of the parent committee
            # with " Subcommittee" at the end.
            parent_name = re.sub(r'\s*(Study )?Subcommittee\s*', '', committee_name)
            com = Organization(
                    committee_name,
                    classification='committee',
                    parent_id=self.parents[parent_name]
                    )
        else:
            com = Organization(
                committee_name,
                chamber=chamber,
                classification='committee',
            )
            self.parents[committee_name] = com._id

        officer_xpath = ('//h2[contains(text(), "Committee Officers")]/'
                         'following-sibling::div/ul/li/a')
        member_xpath = ('//h2[contains(text(), "Committee Members")]/'
                        'following-sibling::div/ul/li/a')
        for a in page.xpath(officer_xpath) + page.xpath(member_xpath):
            member_name = ' '.join(
                x.strip()
                for x in a.xpath('text()') + a.xpath('span/text()')
                if x.strip())

            # Officers carry their title in a <small> tag.
            small = a.xpath('small')
            role = small[0].xpath('text()')[0].strip() if small else 'member'
            if '(Vacant)' in role:
                continue

            com.add_member(member_name, role)

        com.add_link(link)
        com.add_source(link)
        return com
Example #23
0
    def _scrape_standing_committees(self):
        """Scrapes the Standing Committees page of the Nebraska state
        legislature and yields one Organization per committee."""
        main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
        page = self.lxmlize(main_url)

        committee_nodes = self.get_nodes(
            page,
            '//a[@class="accordion-switch"][contains(text(), "Standing Committees")]'
            '/ancestor::div[@class="panel panel-leg"]//div[@class="list-group"]'
            '/a[@class="list-group-item"]')

        for committee_node in committee_nodes:
            committee_page_url = committee_node.attrib['href']
            committee_page = self.lxmlize(committee_page_url)

            name_text = self.get_node(
                committee_page,
                '//div[@class="container view-front"]/div[@class="row"]/'
                'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]')
            # Drop the trailing word of the heading and rejoin the rest
            # (replaces the old manual concatenation loop).
            committee_name = ' '.join(name_text.split()[0:-1])

            org = Organization(name=committee_name, chamber='legislature',
                               classification='committee')

            members = self.get_nodes(
                committee_page,
                '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
                'div[@class="block-box"][1]/ul[@class="list-unstyled '
                'feature-content"]/li/a/text()[normalize-space()]')

            for member in members:
                # BUG FIX: pattern was r'\Sen\.\s+' -- `\S` matches ANY
                # non-space character, not the literal "S" of "Sen.".
                member_name = re.sub(r'Sen\.\s+', '', member)
                member_name = re.sub(r', Chairperson', '', member_name).strip()
                if 'Chairperson' in member:
                    member_role = 'Chairperson'
                else:
                    member_role = 'member'
                org.add_member(member_name, member_role)

            org.add_source(main_url)
            org.add_source(committee_page_url)

            yield org
Example #24
0
    def scrape(self):
        """Yield one Organization per committee code, merged across sessions.

        Instances of the same committee (keyed by its two-letter code)
        from different sessions are folded into one Organization: the
        first (canonical) instance supplies the name and memberships;
        later instances contribute alternate names, sources, and extra
        decision-body ids.
        """
        sessions = reversed(self.jurisdiction.legislative_sessions)
        committee_term_instances = committees_from_sessions(self, sessions)
        committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

        for code, instances in committees_by_code.items():
            # TODO: Figure out how to edit city council org.
            if code == 'CC':
                continue

            # When there are no meetings scheduled and was no way to deduce committee code.
            if not code:
                continue

            extras = {'tmmis_decision_body_ids': []}
            for i, inst in enumerate(instances):
                # TODO: Ensure this survives addition of new term (2017)
                #       so specific year always creates
                canonical_i = 0
                if i == canonical_i:
                    # Only the first instance creates the Organization;
                    # the remaining instances only annotate it below.
                    o = Organization(name=inst['name'], classification='committee')
                    extras.update({'description': inst['info']})
                    o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

                    # TODO: Scrape non-councillor members
                    meeting_id = self.referenceMeetingId(inst['code'], inst['term'])
                    if meeting_id:
                        seen_posts = []
                        membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                        for councillor in self.councillorMembers(membership_url):
                            o.add_member(councillor['name'], councillor['role'])
                            # Create exactly one post per distinct role.
                            if councillor['role'] not in seen_posts:
                                o.add_post(
                                    role=councillor['role'],
                                    label=councillor['role'],
                                    # TODO: More specific divisions for some committee?
                                    division_id=self.jurisdiction.division_id,
                                )
                                seen_posts.append(councillor['role'])

                # Every instance (canonical or not) contributes its term's
                # decision-body id and its source URL.
                extras['tmmis_decision_body_ids'].append({inst['term']: inst['decision_body_id']})
                o.extras = extras
                o.add_source(inst['source_url'])
                if instances[canonical_i]['name'] != inst['name']:
                    # TODO: Add start_date and end_date
                    o.add_name(inst['name'])

            yield o
Example #25
0
    def scrape(self):
        """Scrape DC Council committee pages and yield non-empty committees."""
        com_url = 'http://dccouncil.us/committees'
        data = self.get(com_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(com_url)

        # A set de-duplicates repeated links to the same committee page.
        comms = set(
            doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

        for committee in comms:
            url = committee.attrib['href']
            name = committee.text_content().strip()
            comm_data = self.get(url).text
            comm_page = lxml.html.fromstring(comm_data)
            comm_page.make_links_absolute(url)

            # classify these as belonging to the legislature
            committee = Organization(name=name, classification='committee',
                                     chamber='legislature')

            if comm_page.xpath('//p[@class="page-summary"]'):
                summary = comm_page.xpath(
                    '//p[@class="page-summary"]')[0].text_content().strip()
                committee.extras['summary'] = summary

            # Not every committee page lists a chairperson; guard against
            # an empty node list instead of raising IndexError.
            chair = comm_page.xpath(
                "//h4[text()='Chairperson']/following-sibling::p")
            chair_name = None
            if chair:
                chair_name = chair[0].text_content().strip()
                chair_name = self.remove_title(chair_name)
                committee.add_member(chair_name, role="chair")
            else:
                self.warning('no chairperson found for %s', name)

            member_lists = comm_page.xpath(
                "//h4[text()='Councilmembers']/following-sibling::ul")
            members = member_lists[0].xpath("./li") if member_lists else []

            for m in members:
                mem_name = m.text_content().strip()
                mem_name = self.remove_title(mem_name)
                # The chair is usually listed again among councilmembers;
                # avoid adding the same person twice.
                if mem_name != chair_name:
                    committee.add_member(mem_name)

            committee.add_source(url)
            committee.add_link(url, note='Official Website')

            if not committee._related:
                self.warning('empty committee: %s;', name)
            else:
                yield committee
Example #26
0
    def scrape_committee(self, chamber, url):
        """Build a committee Organization from its page.

        Returns the Organization only when at least one member link was
        found; otherwise returns None.
        """
        page = lxml.html.fromstring(self.get(url).text)

        committee = Organization(page.xpath('//title/text()')[0],
                                 chamber=chamber,
                                 classification='committee')
        committee.add_source(url)

        member_links = page.xpath('//a[contains(@href, "/Legislators/Profile")]')
        for member_link in member_links:
            spans = member_link.xpath('../span')
            if spans:
                role = spans[0].text.lower()
            else:
                role = 'member'
            committee.add_member(member_link.text, role)

        if not member_links:
            return None
        return committee
Example #27
0
    def scrape(self, chamber=None):
        """Scrape Utah legislative committees from the state JSON feeds.

        The committee feed only carries legislator IDs, so the
        legislator feed is fetched first to map IDs to display names.
        """
        committees_url = 'http://le.utah.gov/data/committees.json'
        committees = self.get(committees_url).json()['committees']

        people_url = 'http://le.utah.gov/data/legislators.json'
        people = self.get(people_url).json()['legislators']

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person['id']] = person['formatName']

        for committee in committees:
            # Trim the generic suffix and infer the chamber from the
            # "House "/"Senate " prefix of the committee description.
            name = committee['description']
            if name.endswith(' Committee'):
                name = name[:len(name) - len(' Committee')]
            elif name.endswith(' Subcommittee'):
                name = name[:len(name) - len(' Subcommittee')]
            if name.startswith('House '):
                name = name[len('House '):]
                chamber = 'lower'
            elif name.startswith('Senate '):
                name = name[len('Senate '):]
                chamber = 'upper'
            else:
                chamber = 'legislature'

            c = Organization(
                chamber=chamber,
                name=name,
                classification='committee'
            )
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee['link'])

            for member in committee['members']:
                try:
                    member_name = ids_to_names[member['id']]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " +
                        member['id']
                    )
                    # Skip the unknown member; previously the stale (or
                    # unbound) member_name from a prior iteration was
                    # added under this member's position.
                    continue
                c.add_member(member_name, role=member['position'])

            yield c
Example #28
0
    def scrape_lower_committee(self, link, name):
        """Scrape one lower-chamber committee page and return its
        Organization with members attached."""
        # The href sometimes contains stray whitespace; strip it all out.
        url = re.sub(r'\s+', '', link.attrib['href'])
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        committee = Organization(name=name, chamber='lower',
                                 classification='committee')
        committee.add_source(url)

        for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
            member_name = re.sub(r'^Delegate\s+', '',
                                 member_link.text_content().strip())
            member_role = member_link.getnext().text or 'member'
            committee.add_member(member_name, member_role.strip())

        return committee
Example #29
0
    def scrape_upper_committee(self, url):
        """Scrape a Puerto Rico Senate committee page and yield it.

        Committee names are normalized by stripping the leading
        "Comisión ..." phrase and any leading article, and member titles
        are translated from Spanish into the English labels used by
        other jurisdictions.
        """
        doc = self.lxmlize(url)
        inner_content = self.get_node(doc, '//section[@class="inner-content"]')
        comm_name = self.get_node(inner_content, './/h2').text.strip()

        # Remove "Committee" from committee names
        # NOTE: the order matters -- the more specific phrases must be
        # stripped before the bare "Comisión " prefix.
        comm_name = (
            comm_name.
            replace(u"Comisión de ", "").
            replace(u"Comisión sobre ", "").
            replace(u"Comisión para ", "").
            replace(u"Comisión Especial para el Estudio de ", "").
            replace(u"Comisión Especial para ", "").
            replace(u"Comisión ", "")
        )
        # Drop a leading article ("la"/"las"/"el"/"los") and re-capitalize.
        comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
        comm_name = comm_name[0].upper() + comm_name[1:]

        comm = Organization(comm_name, chamber='upper',
                            classification='committee')
        comm.add_source(url)

        members = self.get_nodes(inner_content, './/li')
        for member in members:
            # Entries look like "Hon. Name - Title"; anything after the
            # first hyphen is treated as the member's role.
            name_parts = member.text.split("-")
            name = name_parts[0].replace("Hon. ", "").strip()

            if len(name_parts) > 1:
                title = name_parts[1].strip()

                # Translate titles to English for parity with other states
                if "President" in title:
                    title = 'chairman'
                elif title.startswith("Vicepresident"):
                    title = 'vicechairman'
                elif title.startswith("Secretari"):
                    title = 'secretary'
                else:
                    # Fail loudly so new/unexpected titles get a mapping
                    # added instead of passing through untranslated.
                    raise AssertionError("Unknown member type: {}".
                                         format(title))

                comm.add_member(name, title)
            else:
                comm.add_member(name)

        yield comm
Example #30
0
    def scrape(self, session=None):
        """Scrape New Jersey committees from the legislature's Access DB.

        Reads the Committee table for committee metadata and the
        COMember table for memberships, then yields each committee.
        """
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["name"]
            self.info("no session specified, using %s", session)

        year_abr = session[0:4]

        self._init_mdb(year_abr)
        members_csv = self.access_to_csv("COMember")
        info_csv = self.access_to_csv("Committee")

        org_dictionary = {}

        # Committee Info Database
        for rec in info_csv:
            abrv = rec["Code"]
            comm_name = rec["Description"]

            # "A..." codes are Assembly (lower), "S..." are Senate (upper).
            # Anything else is treated as a joint/legislature committee;
            # previously `chamber` leaked from the prior loop iteration
            # (or was unbound on the first one).
            if abrv[0] == "A":
                chamber = "lower"
            elif abrv[0] == "S":
                chamber = "upper"
            else:
                chamber = "legislature"

            org = Organization(
                name=comm_name, chamber=chamber, classification="committee"
            )
            org.add_source("http://www.njleg.state.nj.us/downloads.asp")
            org_dictionary[abrv] = org

        # Committee Member Database
        POSITIONS = {"C": "chair", "V": "vice-chair", "": "member"}
        for member_rec in members_csv:
            # assignment=P means they are active, assignment=R means removed
            if member_rec["Assignment_to_Committee"] == "P":
                abr = member_rec["Code"]
                org = org_dictionary[abr]

                leg = member_rec["Member"]
                role = POSITIONS[member_rec["Position_on_Committee"]]
                # Names are stored "Last, First"; flip to "First Last".
                leg = " ".join(leg.split(", ")[::-1])
                org.add_member(leg, role=role)

        for org in org_dictionary.values():
            yield org
Example #31
0
    def scrape(self, chamber=None):
        """Scrape Utah committees from the state's JSON feeds.

        The committee feed only carries legislator IDs, so the
        legislator feed is used to build an ID-to-name map first.
        """
        committees_url = 'http://le.utah.gov/data/committees.json'
        committees = self.get(committees_url).json()['committees']

        people_url = 'http://le.utah.gov/data/legislators.json'
        people = self.get(people_url).json()['legislators']

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person['id']] = person['formatName']

        for committee in committees:
            # Trim the generic suffix and infer the chamber from the
            # "House "/"Senate " prefix of the committee description.
            name = committee['description']
            if name.endswith(' Committee'):
                name = name[:len(name) - len(' Committee')]
            elif name.endswith(' Subcommittee'):
                name = name[:len(name) - len(' Subcommittee')]
            if name.startswith('House '):
                name = name[len('House '):]
                chamber = 'lower'
            elif name.startswith('Senate '):
                name = name[len('Senate '):]
                chamber = 'upper'
            else:
                chamber = 'legislature'

            c = Organization(chamber=chamber,
                             name=name,
                             classification='committee')
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee['link'])

            for member in committee['members']:
                try:
                    member_name = ids_to_names[member['id']]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " +
                        member['id'])
                    # Skip unknown IDs; previously a stale (or unbound)
                    # member_name from an earlier iteration was used.
                    continue
                c.add_member(member_name, role=member['position'])

            yield c
Example #32
0
    def scrape(self):
        """Scrape DC Council committees and yield the non-empty ones."""
        com_url = 'http://dccouncil.us/committees'
        doc = lxml.html.fromstring(self.get(com_url).text)
        doc.make_links_absolute(com_url)

        # DC spelled "committee" two different ways in their HTML class
        # names, so both spellings have to be matched.
        links = doc.xpath('//li[contains(@class,"node_committee-on")]/a')
        links += doc.xpath('//li[contains(@class,"node_committe-on")]/a')

        for link in links:
            url = link.attrib['href']
            name = link.text_content().strip()

            comm_page = lxml.html.fromstring(self.get(url).text)
            comm_page.make_links_absolute(url)

            committee = Organization(name=name, classification='committee', chamber='upper')

            chair = comm_page.xpath("//h3[text()='Committee Chair']/following-sibling::p")
            chair_name = chair[0].text_content().strip()
            committee.add_member(chair_name, role="chair")

            member_list = comm_page.xpath("//h3[text()='Councilmembers']/following-sibling::ul")
            for item in member_list[0].xpath("./li"):
                member_name = item.text_content().strip()
                # The chair appears in the member list too; don't add twice.
                if member_name != chair_name:
                    committee.add_member(member_name)

            committee.add_source(url)

            if not committee._related:
                self.warning('empty committee: %s;', name)
            else:
                yield committee
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a House (or joint) committee page and yield it.

        When `parent` is given, `name` is a subcommittee of it: the
        Organization is created for the parent and the subcommittee is
        linked to it via parent_id.
        """
        page = self.curl_lxmlize(url)

        if 'Joint' in name or (parent and 'Joint' in parent):
            chamber = 'joint'
        else:
            chamber = 'lower'

        subcomm = None
        if parent:
            comm = Organization(name=parent,
                                chamber=chamber,
                                classification='committee')
            subcomm = Organization(name=name,
                                   parent_id=comm,
                                   classification='committee')
        else:
            comm = Organization(name=name,
                                chamber=chamber,
                                classification='committee')
        comm.add_source(url)

        xpath = "//a[contains(@href, 'District')]"
        for link in page.xpath(xpath):
            member = link.xpath('string()').strip()
            member = re.sub(r'\s+', ' ', member)

            if not member or member == 'House District Maps':
                continue

            match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member)
            member = match.group(4).strip()
            role = match.group(1) or 'member'

            comm.add_member(member, role.lower())

        if not comm._related:
            # A placeholder committee named 'test' exists in prod data;
            # skip it silently.  (Previously `subcomm` was unbound here
            # whenever there was no parent, so this branch raised
            # NameError instead of the intended Exception.)
            if subcomm is not None and subcomm.name == 'test':
                # Whoopsie, prod data.
                return

            raise Exception('no members for %s (%s)' %
                            (comm.name, subcomm.name if subcomm is not None else name))

        yield comm
Example #34
0
    def scrape_house_committees(self):
        """Scrape Michigan House committees from the committee drop-down.

        Each <option> names a committee; its value is the key used to
        build the committee's detail-page URL.
        """
        base_url = "http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey="
        html = self.get("http://house.mi.gov/mhrpublic/committee.aspx").text
        doc = lxml.html.fromstring(html)

        # get values out of drop down
        for opt in doc.xpath("//option"):
            name = opt.text
            # skip invalid choice
            if opt.text in ("Statutory Committees", "Select One"):
                continue
            if "have not been created" in opt.text:
                self.warning("no committees yet for the house")
                return
            com_url = base_url + opt.get("value")
            com_html = self.get(com_url).text
            cdoc = lxml.html.fromstring(com_html)
            com = Organization(chamber="lower",
                               name=name,
                               classification="committee")
            com.add_source(com_url)

            # (A stray loop over '//a[starts-with(@id, "memberLink")]'
            # used to clobber `name` here; it was dead code and has been
            # removed.)

            # all links to http:// pages in servicecolumn2 are legislators
            members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
            for mem in members:
                # Use a distinct variable so the committee name above is
                # not shadowed by the member name.
                member_name = mem.xpath("./a")
                if member_name:
                    member_name = member_name[0].text.strip()
                else:
                    # this is a blank row
                    continue
                text = mem.xpath("./span")[0].text
                if "Committee Chair" in text:
                    role = "chair"
                elif "Vice-Chair" in text:
                    role = "vice chair"
                else:
                    role = "member"
                com.add_member(member_name, role=role)

            yield com
Example #35
0
    def scrape_current(self, chamber):
        """Scrape current Kansas committees for one chamber from the ksapi
        JSON service.

        Upper-chamber scrapes also pick up "special" committees, which
        are recorded with a 'legislature' chamber.  Committees whose
        detail pages list no members are skipped with a warning.
        """
        if chamber == "upper":
            chambers = ["special_committees", "senate_committees"]
        else:
            chambers = ["house_committees"]

        committee_request = self.get(ksapi.url + "ctte/").text
        committee_json = json.loads(committee_request)

        for com_type in chambers:
            committees = committee_json["content"][com_type]

            for committee_data in committees:

                # set to joint if we are using the special_committees
                com_chamber = (
                    "legislature" if com_type == "special_committees" else chamber
                )

                committee = Organization(
                    committee_data["TITLE"],
                    chamber=com_chamber,
                    classification="committee",
                )

                com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
                try:
                    detail_json = self.get(com_url).text
                except scrapelib.HTTPError:
                    # Best-effort: skip committees whose detail page 404s.
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)["content"]
                for chair in details["CHAIR"]:
                    # Some chair records lack FULLNAME; fall back to a
                    # name derived from the KPID.
                    if chair.get("FULLNAME", None):
                        chair_name = chair["FULLNAME"]
                    else:
                        chair_name = self.parse_kpid(chair["KPID"])
                        self.warning("no FULLNAME for %s", chair["KPID"])
                    committee.add_member(chair_name, "chairman")
                for vicechair in details["VICECHAIR"]:
                    committee.add_member(vicechair["FULLNAME"], "vice-chairman")
                for rankedmember in details["RMMEM"]:
                    committee.add_member(rankedmember["FULLNAME"], "ranking member")
                for member in details["MEMBERS"]:
                    committee.add_member(member["FULLNAME"])

                if not committee._related:
                    self.warning(
                        "skipping blank committee %s" % committee_data["TITLE"]
                    )
                else:
                    committee.add_source(com_url)
                    yield committee
Example #36
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a House (or joint) committee page and yield it.

        When `parent` is given, `name` is a subcommittee and the yielded
        Organization is the parent with the subcommittee linked to it
        via parent_id.
        """
        page = self.curl_lxmlize(url)

        if "Joint" in name or (parent and "Joint" in parent):
            chamber = "joint"
        else:
            chamber = "lower"

        subcomm = None
        if parent:
            comm = Organization(
                name=parent, chamber=chamber, classification="committee"
            )
            subcomm = Organization(
                name=name, parent_id=comm, classification="committee"
            )
        else:
            comm = Organization(name=name, chamber=chamber, classification="committee")
        comm.add_source(url)

        xpath = "//a[contains(@href, 'District')]"
        for link in page.xpath(xpath):
            member = link.xpath("string()").strip()
            member = re.sub(r"\s+", " ", member)

            if not member or member == "House District Maps":
                continue

            match = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", member)
            member = match.group(4).strip()
            role = match.group(1) or "member"

            member = member.replace("Representative ", "")

            comm.add_member(member, role.lower())

        if not comm._related:
            # A placeholder committee named "test" exists in prod data;
            # skip it silently.  (Previously `subcomm` was unbound here
            # whenever there was no parent, so this branch raised
            # NameError instead of the intended Exception.)
            if subcomm is not None and subcomm.name == "test":
                # Whoopsie, prod data.
                return

            raise Exception("no members for %s (%s)" %
                            (comm.name, subcomm.name if subcomm is not None else name))

        yield comm
Example #37
0
    def scrape_current(self, chamber):
        """Scrape current Kansas committees for one chamber from the ksapi
        JSON service.

        Upper-chamber scrapes also pick up "special" committees, which
        are recorded with a 'legislature' chamber.  Committees whose
        detail pages list no members are skipped with a warning.
        """
        if chamber == 'upper':
            chambers = ['special_committees', 'senate_committees']
        else:
            chambers = ['house_committees']

        committee_request = self.get(ksapi.url + 'ctte/').text
        committee_json = json.loads(committee_request)

        for com_type in chambers:
            committees = committee_json['content'][com_type]

            for committee_data in committees:

                # set to joint if we are using the special_committees
                com_chamber = ('legislature' if com_type
                               == 'special_committees' else chamber)

                committee = Organization(
                    committee_data['TITLE'],
                    chamber=com_chamber,
                    classification='committee',
                )

                com_url = ksapi.url + 'ctte/%s/' % committee_data['KPID']
                try:
                    detail_json = self.get(com_url).text
                except scrapelib.HTTPError:
                    # Best-effort: skip committees whose detail page 404s.
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)['content']
                for chair in details['CHAIR']:
                    # Some chair records lack FULLNAME; fall back to a
                    # name derived from the KPID.
                    if chair.get('FULLNAME', None):
                        chair_name = chair['FULLNAME']
                    else:
                        chair_name = self.parse_kpid(chair['KPID'])
                        self.warning('no FULLNAME for %s', chair['KPID'])
                    committee.add_member(chair_name, 'chairman')
                for vicechair in details['VICECHAIR']:
                    committee.add_member(vicechair['FULLNAME'],
                                         'vice-chairman')
                for rankedmember in details['RMMEM']:
                    committee.add_member(rankedmember['FULLNAME'],
                                         'ranking member')
                for member in details['MEMBERS']:
                    committee.add_member(member['FULLNAME'])

                if not committee._related:
                    self.warning('skipping blank committee %s' %
                                 committee_data['TITLE'])
                else:
                    committee.add_source(com_url)
                    yield committee
Example #38
0
    def scrape_lower_committee(self, link, name):
        """Return an Organization for one lower-chamber committee,
        populated from the member links on its page."""
        # Collapse any whitespace embedded in the href before fetching.
        url = re.sub(r'\s+', '', link.attrib['href'])
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        org = Organization(name=name,
                           chamber='lower',
                           classification='committee')
        org.add_source(url)

        member_xpath = '//a[contains(@href, "?member=")]'
        for anchor in page.xpath(member_xpath):
            member = re.sub(r'^Delegate\s+', '', anchor.text_content().strip())
            title = anchor.getnext().text or 'member'
            org.add_member(member, title.strip())

        return org
Example #39
0
    def scrape(self, session=None):
        """Scrape Virginia joint/chamber committees for a session.

        Each detail page lists members of both chambers in separate
        tables; one per-chamber Organization is yielded per table, plus
        a single 'joint' Organization covering all members.
        """
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        list_url = self.urls["list"] % (session, )
        committees = {}
        page = self.get(list_url).text
        page = lxml.html.fromstring(page)
        for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
            committees[el.text.strip()] = el.get("href")

        for c in committees:
            self.info(c)
            detail_url = self.urls["detail"] % (committees[c], )
            page = self.get(detail_url).text
            page = lxml.html.fromstring(page)
            # Strip a leading "NN-" ordinal from the committee name.
            # (Raw string: '\d' in a plain literal is an invalid escape
            # sequence on modern Pythons.)
            if re.match(r'\d{1,2}-', c):
                c = c.split('-', 1)[1]
            jcomm = Organization(name=c.strip(),
                                 chamber='joint',
                                 classification='committee')
            for table in page.xpath(
                    ".//table[contains(@id, 'CommitteeMembers')]"):
                rows = table.xpath(".//tr")
                # The header row's first cell reads "Senator"/"Delegate",
                # which identifies the table's chamber.
                chamber = rows[0].xpath('.//td')[0].text_content().strip()
                chamber = 'upper' if chamber == 'Senator' else 'lower'
                comm = Organization(name=c.strip(),
                                    chamber=chamber,
                                    classification='committee')
                for row in rows[1:]:
                    tds = row.xpath('.//td')
                    name = tds[0].text_content().strip()
                    role = 'chairman' if tds[3].text_content().strip(
                    ) == 'Chairman' else 'member'
                    comm.add_member(name, role)
                    jcomm.add_member(name, role)

                comm.add_source(detail_url)
                yield comm

            jcomm.add_source(detail_url)
            yield jcomm
Example #40
0
    def scrape_upper_committee(self, link, name):
        """Build and return the Organization for one upper-chamber
        committee from its member listing page."""
        # Remove any whitespace that crept into the link target.
        url = re.sub(r"\s+", "", link.attrib["href"])
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        committee = Organization(name=name,
                                 chamber="upper",
                                 classification="committee")
        committee.add_source(url)

        for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
            member = member_link.text_content().strip()
            member = re.sub(r"^Delegate\s+", "", member)
            title = member_link.getnext().text or "member"
            committee.add_member(member, title.strip())

        return committee
Example #41
0
    def _scrape_select_special_committees(self):
        """Scrapes the Select and Special Committees page of the
        Nebraska state legislature."""
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        page = self.lxmlize(main_url)

        committee_nodes = self.get_nodes(
            page, '//a[contains(@class, "accordion-switch")]'
            '/ancestor::div[@class="panel panel-leg"]')

        for committee_node in committee_nodes:
            committee_name = self.get_node(
                committee_node,
                './/h2[@class="panel-title"]/text()[normalize-space()]')

            # Some panels wrap the title in a link instead of bare text.
            if committee_name is None:
                committee_name = self.get_node(
                    committee_node,
                    './/h2[@class="panel-title"]/a/text()[normalize-space()]')

            org = Organization(name=committee_name,
                               chamber='legislature',
                               classification='committee')
            org.add_source(main_url)

            members = self.get_nodes(
                committee_node, './/a[@class="list-group-item"]'
                '/text()[normalize-space()]')

            for member in members:
                # Strip the leading "Sen." honorific.  (Was r'\Sen\.\s+':
                # \S matches *any* non-space character, so the pattern
                # was never anchored to the literal honorific.)
                member_name = re.sub(r'^Sen\.\s+', '', member)
                member_name = re.sub(r', Chairperson', '', member_name).strip()
                if 'Chairperson' in member:
                    member_role = 'Chairperson'
                else:
                    member_role = 'member'
                org.add_member(member_name, member_role)

            if not org._related:
                self.warning('No members found in {} committee.'.format(
                    org.name))
            else:
                yield org
Example #42
0
    def scrape(self, chamber=None):
        """Scrape Utah committees from the state's JSON feeds.

        The committee feed only carries legislator IDs, so the
        legislator feed is used to build an ID-to-name map first.
        """
        committees_url = "http://le.utah.gov/data/committees.json"
        committees = self.get(committees_url).json()["committees"]

        people_url = "http://le.utah.gov/data/legislators.json"
        people = self.get(people_url).json()["legislators"]

        # The committee JSON only has legislator IDs, not names
        ids_to_names = {}
        for person in people:
            ids_to_names[person["id"]] = person["formatName"]

        for committee in committees:
            # Trim the generic suffix and infer the chamber from the
            # "House "/"Senate " prefix of the committee description.
            name = committee["description"]
            if name.endswith(" Committee"):
                name = name[: len(name) - len(" Committee")]
            elif name.endswith(" Subcommittee"):
                name = name[: len(name) - len(" Subcommittee")]
            if name.startswith("House "):
                name = name[len("House ") :]
                chamber = "lower"
            elif name.startswith("Senate "):
                name = name[len("Senate ") :]
                chamber = "upper"
            else:
                chamber = "legislature"

            c = Organization(chamber=chamber, name=name, classification="committee")
            c.add_source(committees_url)
            c.add_source(people_url)
            c.add_link(committee["link"])

            for member in committee["members"]:
                try:
                    member_name = ids_to_names[member["id"]]
                except KeyError:
                    self.warning(
                        "Found unknown legislator ID in committee JSON: " + member["id"]
                    )
                    # Skip unknown IDs; previously a stale (or unbound)
                    # member_name from an earlier iteration was used.
                    continue
                c.add_member(member_name, role=member["position"])

            yield c
Example #43
0
    def scrape_committee(self, comm_num):
        """Scrape one committee page by its number and yield the
        Organization, including chair/member posts and a description."""
        url = self.committee_url(comm_num)
        page = self.lxmlize(url)
        # get title
        comm_name = page.xpath("//h1/text()")[0]

        # create object
        comm = Organization(name=comm_name,
                            classification="committee",
                            chamber="legislature")
        comm.add_source(url=url)

        # add posts
        comm.add_post(label="chair", role="chair")
        # FIXME do we need a separate post for each member?
        # FIXME is member an appropriate name?
        comm.add_post(label="member", role="member")

        # helper for finding other nodes
        landmark_node = page.xpath("//h2[text()='Committee Members']")[0]

        # add memberships; the first listed member is treated as chair
        member_names = landmark_node.xpath(
            "following-sibling::div/ul/li/a/text()")
        fl_names = [HumanName.name_firstandlast(name) for name in member_names]
        # (Removed a leftover debug print of the raw/cleaned name pairs.)
        chair_name, *other_names = fl_names
        # NOTE(review): 'Lewis Reed' is deliberately excluded from
        # memberships -- presumably an ex-officio presence on every
        # committee; confirm before changing.
        if chair_name not in {'Lewis Reed'}:
            comm.add_member(chair_name, role="chair")
        for name in other_names:
            if name not in {'Lewis Reed'}:
                comm.add_member(name, role="member")

        # add description
        about_node = page.xpath("//h2[text()='About']")[0]
        (description, ) = about_node.xpath(
            "parent::div//div[@class='content-block']/p[2]/text()")
        description = description.strip()
        comm.extras = {"description": description}

        yield comm
Example #44
0
    def scrape_house_committees(self):
        """Yield an Organization for each House committee in the listing."""
        url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'

        doc = lxml.html.fromstring(self.get(url).text)

        for heading in doc.xpath('//h2[@class="commhighlight"]'):
            members_url = heading.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href')[0]

            committee = Organization(heading.text,
                                     chamber='lower',
                                     classification='committee')
            committee.add_source(members_url)

            mdoc = lxml.html.fromstring(self.get(members_url).text)

            # each legislator in their own table
            # first row, second column contains all the info
            for cell in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):
                # name is tail string of last element
                name = cell.text_content()
                leading = cell.text
                if leading and name != leading:
                    name = name.replace(leading, '')

                # role is inside a nested b tag
                roles = cell.xpath('b/*/text()')
                if roles:
                    # if there was a role, remove it from name
                    role = roles[0]
                    name = name.replace(role, '')
                else:
                    role = 'member'

                # Drop trailing parenthetical (e.g. party/district).
                name = name.split(' (')[0]
                committee.add_member(name, role)

            # save
            yield committee
Example #45
0
    def scrape_senate_committee(self, url):
        """Scrape one Senate committee bio page and yield its Organization."""
        doc = lxml.html.fromstring(self.get(url).text)

        com_name = doc.xpath('//a[contains(@href, "committee_bio")]/text()')[0]
        parent_names = doc.xpath(
            '//h4//a[contains(@href, "committee_bio")]/text()')
        if parent_names:
            # An h4-level committee link marks this page as a subcommittee.
            self.log('%s is subcommittee of %s', com_name, parent_names[0])
            com = Organization(com_name,
                               chamber='upper',
                               classification='committee',
                               parent_id={
                                   'name': parent_names[0],
                                   'classification': 'upper'
                               })
        else:
            com = Organization(com_name,
                               chamber='upper',
                               classification='committee')

        # Bold label preceding each member link encodes the member's role.
        role_map = {
            'Chair:': 'chair',
            'Vice Chair:': 'vice chair',
            'Ranking Minority Member:': 'ranking minority member',
        }
        member_links = doc.xpath(
            '//div[@id="members"]//a[contains(@href, "member_bio")]')
        for link in member_links:
            name = link.text_content().strip()
            if not name:
                continue
            labels = link.xpath('.//preceding-sibling::b/text()')
            if not labels:
                position = 'member'
            elif labels[0] in role_map:
                position = role_map[labels[0]]
            else:
                raise ValueError('unknown position: %s' % labels[0])

            # Drop trailing parenthetical from the name.
            com.add_member(name.split(' (')[0], position)

        com.add_source(url)
        yield com
    def _scrape_lower_standing_committee(self, committee_name, url):
        """Scrape the membership table of one lower-chamber standing committee."""
        page = self.lxmlize(url)

        committee = Organization(committee_name,
                                 chamber="lower",
                                 classification="committee"
                                 )
        committee.add_source(url)

        member_rows = page.xpath(
            '//table[@id="body_ListView1_itemPlaceholderContainer"]'
            '/tr[@class="linkStyle2"]')

        for member_row in member_rows:
            # First cell links to the member; second cell holds their role.
            name = self._normalize_member_name(
                member_row.xpath('normalize-space(string(./td[1]/a))'))
            role = self._normalize_member_role(
                member_row.xpath('normalize-space(string(./td[2]))'))
            committee.add_member(name, role)

        yield committee
Example #47
0
    def scrape_lower_committee(self, name, url):
        """Scrape one lower-chamber committee page and return its Organization.

        :param name: committee name.
        :param url: committee page URL.
        :returns: a populated Organization (note: returned, not yielded).
        """
        page = self.lxmlize(url)

        committee = Organization(chamber='lower',
                                 name=name,
                                 classification="committee")
        committee.add_source(url)

        # Guard against the same member appearing twice on the page.
        seen = set()

        member_links = self.get_nodes(
            page, '//div[@class="commlinks"]//a[contains(@href, "mem")]')

        for member_link in member_links:
            member_name = None
            member_role = None

            member_text = member_link.text
            if member_text is not None:
                member = member_text.strip()
                member = re.sub(r'\s+', ' ', member)
                member_name, member_role = self._parse_name(member)

            if member_name is None:
                continue

            # Figure out if this person is the chair.
            role_type = self.get_node(
                member_link, '../../preceding-sibling::div[1]/text()')

            if role_type in (['Chair'], ['Co-Chair']):
                member_role = 'chair'
            else:
                member_role = 'member'

            # BUG FIX: this previously tested the committee `name` against
            # `seen` (which never matched), so duplicates were never filtered.
            if member_name not in seen:
                committee.add_member(member_name, member_role)
                seen.add(member_name)

        return committee
Example #48
0
    def scrape_committee(self, term, href, name):
        """Scrape a single committee page and yield its Organization.

        Skips pages whose URL path does not identify a chamber.
        """
        doc = lxml.html.fromstring(self.get(href).text)
        doc.make_links_absolute(href)
        member_links = doc.xpath("//div[@class='view-content']"
                                 "//a[contains(@href, 'members')]")

        # Chamber is encoded in the committee URL's path segment.
        chamber = None
        for fragment, cham in (('/joint/', 'legislature'),
                               ('/senate/', 'upper'),
                               ('/house/', 'lower')):
            if fragment in href:
                chamber = cham
                break
        if chamber is None:
            # interim committees and others were causing duplicate committee issues, skipping
            self.warning(
                'Failed to identify chamber for {}; skipping'.format(href))
            return

        cttie = Organization(name, chamber=chamber, classification='committee')
        role_map = {
            "Legislative Members": "member",
            "Chairman": "chair",
            "Vice Chairman": "member",
        }
        for link in member_links:
            member = link.text
            # Pane heading above the link tells us the member's role.
            heading = link.xpath(
                "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
            role = role_map[heading]

            if member is None or member.startswith("District"):
                continue

            member = member.replace('Senator ', '')
            member = member.replace('Representative ', '')
            cttie.add_member(member, role=role)

        cttie.add_source(href)
        yield cttie
Example #49
0
    def scrape_comm(self, url, chamber):
        """Yield committees for one chamber from the JSON committee listing."""
        payload = self.post(url).json()['Data']

        for entry in payload:
            comm_name = entry['CommitteeName']
            committee = Organization(name=comm_name,
                                     chamber=chamber,
                                     classification='committee')
            chair = str(entry['ChairName'])
            vice_chair = str(entry['ViceChairName'])
            comm_url = self.get_comm_url(chamber, entry['CommitteeId'],
                                         comm_name)
            members = self.scrape_member_info(comm_url)

            # Missing officers come through as the string 'None' after str().
            if vice_chair != 'None':
                committee.add_member(vice_chair, role='Vice-Chair')
            if chair != 'None':
                committee.add_member(chair, role='Chairman')

            for member in members:
                # Chair and vice-chair were already added above.
                if chair not in member and vice_chair not in member:
                    member = " ".join(member.split())
                    if member:
                        committee.add_member(member)

            committee.add_source(comm_url)
            committee.add_source(url)
            yield committee
Example #50
0
    def scrape_comm(self, chamber):
        """Scrape committee membership for one chamber from the XML feed.

        :param chamber: 'h' for House or anything else for Senate.
        """
        url = "http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml" % chamber
        response = self.get(url)
        root = lxml.etree.fromstring(response.content)
        chamber = "lower" if chamber == "h" else "upper"

        for node in root.xpath("//COMMITTEE"):
            comm = Organization(node.xpath("string(NAME)"),
                                chamber=chamber,
                                classification="committee")

            # Officer names carry their title as a suffix; strip it off.
            chair = node.xpath("string(CHAIR)").replace(", Chairman", "")
            if chair:
                comm.add_member(chair, role="Chairman")

            vice_chair = node.xpath(
                "string(VICE_CHAIR)").replace(", Vice-Chairman", "")
            if vice_chair:
                comm.add_member(vice_chair, role="Vice-Chairman")

            # Remaining members are semicolon-separated in a single field.
            members = node.xpath("string(MEMBERS)").split(";")
            if "" in members:
                members.remove("")
            for leg in members:
                comm.add_member(leg.strip())

            comm.add_source(url)
            yield comm
    def scrape(self):
        """Scrape DC Council committees; yield each non-empty Organization."""
        com_url = 'http://dccouncil.us/committees'
        doc = lxml.html.fromstring(self.get(com_url).text)
        doc.make_links_absolute(com_url)

        # dc spelled committe(e) two different ways
        # IN THEIR HTML CLASS NAMES!
        comm_links = (
            doc.xpath('//li[contains(@class,"node_committee-on")]/a')
            + doc.xpath('//li[contains(@class,"node_committe-on")]/a'))
        for link in comm_links:
            url = link.attrib['href']
            name = link.text_content().strip()
            comm_page = lxml.html.fromstring(self.get(url).text)
            comm_page.make_links_absolute(url)

            # classify these as belonging to the legislature
            committee = Organization(name=name, classification='committee',
                                     chamber='legislature')

            chair_nodes = comm_page.xpath(
                "//h3[text()='Committee Chair']/following-sibling::p")
            chair_name = chair_nodes[0].text_content().strip()
            committee.add_member(chair_name, role="chair")

            member_list = comm_page.xpath(
                "//h3[text()='Councilmembers']/following-sibling::ul")[0]
            for item in member_list.xpath("./li"):
                mem_name = item.text_content().strip()
                # Chair already added above; don't double-count them.
                if mem_name != chair_name:
                    committee.add_member(mem_name)

            committee.add_source(url)

            if not committee._related:
                self.warning('empty committee: %s;', name)
            else:
                yield committee
Example #52
0
    def scrape_lower_committee(self, committee_name, url):
        """Scrape one lower-chamber commission page; yield it only if any
        members were found."""
        page = self.lxmlize(url)

        comm = Organization(committee_name.strip(),
                            chamber='lower',
                            classification='committee')
        comm.add_source(url)

        info_node = self.get_node(
            page, './/div[@id = "dnn_ctr1109_ViewWebCommission_WebCommission1_'
            'pnlCommission"]')

        # This will likely capture empty text nodes as well.
        raw_members = self.get_nodes(
            info_node,
            './/div[@class="two-cols com"]/div[@class="col"]//text()'
            '[normalize-space() and preceding-sibling::br]')

        member_count = 0
        for raw in raw_members:
            # Drop the honorific prefix.
            member = re.sub(r'Hon\.\s*', '', raw).strip()

            # Skip empty nodes.
            if not member:
                continue

            member, title = self._match_title(member)

            if title is None:
                comm.add_member(member)
            else:
                comm.add_member(member, title)

            member_count += 1

        if member_count > 0:
            yield comm
    def scrape(self, chamber=None):
        """Scrape Utah interim committees and their members."""
        url = "http://le.utah.gov/asp/interim/Main.asp?ComType=All&Year=2015&List=2#Results"
        page = self.lxmlize(url)

        for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
            comm_name = comm_link.text.strip()

            # Chamber is inferred from the committee's own name.
            if "House" in comm_name:
                chamber = "lower"
            elif "Senate" in comm_name:
                chamber = "upper"
            else:
                chamber = "legislature"

            # Drop leading "House" or "Senate" from name
            comm_name = re.sub(r"^(House|Senate) ", "", comm_name)
            comm = Organization(name=comm_name, chamber=chamber,
                                classification='committee')
            committee_page = self.lxmlize(comm_link.attrib['href'])

            member_links = committee_page.xpath(
                "//table[@class='memberstable']//a")
            for mbr_link in member_links:
                # Strip trailing party letter and leading chamber title.
                name = mbr_link.text.strip()
                for pattern in (r' \([A-Z]\)$', r'^Sen. ', r'^Rep. '):
                    name = re.sub(pattern, "", name)

                role = mbr_link.tail.strip().strip(",").strip()
                comm.add_member(name, role if role else "member")

            comm.add_source(url)
            comm.add_source(comm_link.get('href'))

            yield comm
Example #54
0
    def scrape_committees(self, session):
        """Yield an Organization for each committee in the given session.

        :param session: session name; must be a key of SESSION_KEYS.
        """
        session_key = SESSION_KEYS[session]
        committees_response = self.api_client.get("committees",
                                                  session=session_key)

        # Map LegislatorCode -> display name for resolving members.
        legislators = index_legislators(self, session_key)

        for committee in committees_response:
            org = Organization(
                chamber={
                    "S": "upper",
                    "H": "lower",
                    "J": "legislature"
                }[committee["HouseOfAction"]],
                name=committee["CommitteeName"],
                classification="committee",
            )
            org.add_source("https://olis.leg.state.or.us/liz/{session}"
                           "/Committees/{committee}/Overview".format(
                               session=session_key,
                               committee=committee["CommitteeName"]))
            members_response = self.api_client.get(
                "committee_members",
                session=session_key,
                committee=committee["CommitteeCode"],
            )
            for member in members_response:
                try:
                    member_name = legislators[member["LegislatorCode"]]
                except KeyError:
                    # FIX: logger.warn is a deprecated alias of warning.
                    logger.warning(
                        "Legislator {} not found in session {}".format(
                            member["LegislatorCode"], session_key))
                    # Fall back to the raw code so membership is still kept.
                    member_name = member["LegislatorCode"]
                org.add_member(member_name,
                               role=member["Title"] if member["Title"] else "")

            yield org
Example #55
0
    def scrape_senate_committee(self, url):
        """Scrape a Senate committee bio page and yield its Organization."""
        tree = lxml.html.fromstring(self.get(url).text)

        com_name = tree.xpath('//a[contains(@href, "committee_bio")]/text()')[0]
        parent_names = tree.xpath(
            '//h4//a[contains(@href, "committee_bio")]/text()')
        if not parent_names:
            com = Organization(com_name, chamber="upper",
                               classification="committee")
        else:
            # An h4-level committee link marks this page as a subcommittee.
            self.log("%s is subcommittee of %s", com_name, parent_names[0])
            com = Organization(
                com_name,
                chamber="upper",
                classification="committee",
                parent_id={"name": parent_names[0], "classification": "upper"},
            )

        for anchor in tree.xpath(
                '//div[@id="members"]//a[contains(@href, "member_bio")]'):
            name = anchor.text_content().strip()
            if not name:
                continue
            # Bold label before the link encodes the member's position.
            label = anchor.xpath(".//preceding-sibling::b/text()")
            if not label:
                position = "member"
            elif label[0] == "Chair:":
                position = "chair"
            elif label[0] == "Vice Chair:":
                position = "vice chair"
            elif label[0] == "Ranking Minority Member:":
                position = "ranking minority member"
            else:
                raise ValueError("unknown position: %s" % label[0])

            # Drop trailing parenthetical from the name.
            com.add_member(name.split(" (")[0].strip(), position)

        com.add_source(url)
        yield com
Example #56
0
    def scrape_upper_committee(self, name, url):
        """Scrape one Senate committee page and yield its Organization.

        :raises Exception: if no members were parsed — an empty committee
            suggests the page layout changed.
        """
        page = lxml.html.fromstring(self.get(url).text)

        comm = Organization(name=name, chamber="upper", classification="committee")
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'biographies')]"):
            member = link.xpath("string()").strip()
            member = re.sub(r"\s+", " ", member)
            if not member:
                continue
            # Role, if any, is the raw text immediately after the link;
            # unrecognized non-empty tails are kept as-is.
            role = link.tail
            if not role:
                role = "member"
            elif "Vice Chair" in role:
                role = "vice chair"
            elif "Chair" in role:
                role = "chair"
            member = member.replace("Senator ", "")
            comm.add_member(member, role=role)

        if not comm._related:
            # BUG FIX: the message was previously passed printf-style args
            # that Exception never interpolates; format it explicitly.
            raise Exception("no members for %s" % comm.name)
        yield comm
Example #57
0
    def scrape_committee(self, chamber, name, url):
        """Scrape one committee (or subcommittee) page; yield unless empty."""
        doc = lxml.html.fromstring(self.get(url).text)

        # Joint committees are flagged by a heading on the page itself.
        if doc.xpath("//h3[. = 'Joint Committee']"):
            chamber = "joint"

        heading = doc.xpath("//h3[@align='center']/text()")[0]
        if "Subcommittee" in heading:
            comm = Organization(
                name=heading,
                classification="committee",
                parent_id={
                    "classification": chamber,
                    "name": name
                },
            )
        else:
            comm = Organization(chamber=chamber,
                                name=name,
                                classification="committee")

        comm.add_source(url)

        for link in doc.xpath("//a[contains(@href, 'member=')]"):
            member = link.text.strip()
            # Member type (e.g. chair) sits in the preceding table cell.
            mtype = link.xpath("string(../preceding-sibling::td[1])")
            comm.add_member(member, mtype.strip(": \r\n\t").lower())

        if comm._related:
            yield comm
        else:
            self.warning("not saving %s, appears to be empty" % name)
Example #58
0
    def scrape_senate_comm(self):
        """Yield Maine Senate standing committees with their members."""
        url = ('http://legislature.maine.gov/committee-information/'
               'standing-committees-of-the-senate')
        doc = lxml.html.fromstring(self.get(url).text)

        for heading in doc.xpath('//p/strong'):
            # Heading text ends with a colon; strip it for the name.
            committee = Organization(chamber='upper',
                                     name=heading.text.strip(':'),
                                     classification='committee')
            committee.add_source(url)

            # Members follow in consecutive paragraphs, one link each;
            # the first paragraph without a link ends the list.
            par = heading.getparent().getnext()
            while True:
                links = par.xpath('a')
                if not links:
                    break
                name, chair = self.senate_committee_pattern.search(
                    links[0].text).groups()
                role = 'member' if chair is None else 'chair'
                committee.add_member(name, role)
                par = par.getnext()

            yield committee
Example #59
0
    def scrape_committee(self, chamber, com_name, url):
        """Scrape one committee page and return an Organization.

        Handles two page layouts (selected by the URL): a "grid" table
        layout (``stab=04``) and a "noncogrid"/"spco" layout.  In both
        layouts, a subcommittee heading rebinds ``com`` to a new child
        Organization before members are added.

        NOTE(review): the ``return com`` statements sit *inside* the table
        loops, so only the first table is ever processed — confirm this is
        intended.  Also note this returns rather than yields, unlike the
        generator-style scrapers elsewhere in this file.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        com = Organization(chamber=chamber, name=com_name, classification='committee')
        com.add_source(url)

        if 'stab=04' in url:
            # Grid layout: one table per (sub)committee; first row is the
            # (sub)committee title, remaining rows are members.
            for table in doc.xpath('//table[@class="grid"]'):
                rows = table.xpath('tr')
                sub_name = rows[0].getchildren()[0].text.strip()

                # new table - subcommittee
                if sub_name != 'Full Committee':
                    sub_name = sub_name.replace("Subcommittee", "").strip()
                    # Rebind `com` so the members below attach to the sub.
                    com = Organization(
                        name=sub_name, classification='committee',
                        parent_id=self.parents[(chamber, com_name)])
                    com.add_source(url)

                for row in rows[1:]:
                    name = row.getchildren()[0].text_content().strip()
                    name, role = define_role(name)
                    com.add_member(name, role)

                return com
        else:
            table_source = doc.xpath('//table[@class="noncogrid"]')

            if table_source != []:
                # Header (th) may name a subcommittee; member names are the
                # sponsor-page links within the table.
                for table in table_source:
                    row = table.xpath('tr/td/a[contains(@href, "sponpage")]/text()')
                    sub_name_source = table.xpath('tr/th/text()')

                    if "Subcommittee" in sub_name_source[0]:
                        sub_name = sub_name_source[0]
                        sub_name = sub_name.replace("Subcommittee", "").strip()
                        com = Organization(
                            name=sub_name, classification='committee',
                            parent_id=self.parents[(chamber, com_name)])
                        com.add_source(url)

                    for name in row:
                        name, role = define_role(name)
                        com.add_member(name, role)

                    return com
            else:
                # Fallback layout: plain text cells in the "spco" table.
                row = doc.xpath('//table[@class="spco"]/tr[1]/td/text()')
                for name in row:
                    name, role = define_role(name)
                    com.add_member(name, role)

                return com
Example #60
0
 def get_joint_committees_data(self, name, url):
     """Build and return a joint committee Organization from its page.

     The page lays out Senate members in the first column and House
     members in the second; each <strong> holds the member name and,
     optionally, their role as separate text runs.
     """
     html = lxml.html.fromstring(self.get(url).text)
     org = Organization(name=name,
                        chamber='joint',
                        classification="committee")

     def _scrub(text, junk):
         # Replace the chamber title, commas, and non-breaking spaces
         # with spaces, trimming as we go.
         for token in junk:
             text = text.replace(token, ' ').strip()
         return text

     def _add_members(strongs, junk):
         # Add one member per <strong>, with a role when a second text
         # run is present.
         if len(strongs) > 0:
             pieces = list(strongs[0].itertext())
             if len(pieces) > 1:
                 org.add_member(_scrub(pieces[0], junk),
                                role=_scrub(pieces[1], junk))
             else:
                 org.add_member(_scrub(pieces[0], junk))

     sections = html.xpath("//section[@class=' row-equal-height no-padding']")
     for section in sections:
         _add_members(section.xpath('div[1]/div/div/div[2]/div/p/strong'),
                      ['Sen.', ',', u'\u00a0'])
         _add_members(section.xpath('div[2]/div/div/div[2]/div/p/strong'),
                      ['Rep.', ',', u'\u00a0'])

     org.add_source(url)
     return org