Ejemplo n.º 1
0
    def scrape(self):
        """Yield the Temecula City Council organization and its members."""
        urls = Urls(dict(list=legislators_url), self)

        # The council itself.
        council = Organization(
            'Temecula City Council',
            classification='legislature')
        council.add_source(urls.list.url)
        yield council

        # One table row per council member; skip the header row.
        for row in urls.list.xpath('//table[2]//tr')[1:]:
            name, role = row.xpath('td/p[1]//font/text()')
            photo = row.xpath('td/img/@src').pop()

            person = Person(name, image=photo)

            # Record the membership on the council.
            membership = person.add_membership(council, role=role)

            # The first link is a mailto:; drop the 7-char "mailto:" prefix.
            email_href, detail_url = row.xpath('td//a/@href')
            membership.contact_details.append(
                dict(type='email', value=email_href[7:], note='work'))

            person.add_source(urls.list.url)
            person.add_source(detail_url)

            yield person
Ejemplo n.º 2
0
    def scrape_approp_subcommittees(self):
        """Scrape Senate Appropriations subcommittees and yield one
        Organization per subcommittee with members and their roles.
        """
        URL = 'http://www.senate.michigan.gov/committee/appropssubcommittee.html'
        html = self.get(URL).text
        doc = lxml.html.fromstring(html)

        # Role markers appended to member names on the page. Longest first
        # so more specific markers are tried before shorter ones.
        suffix_roles = [
            ('(MVC)', 'minority vice chairman'),
            ('(VC)', 'vice chairman'),
            ('(C)', 'chairman'),
        ]

        for strong in doc.xpath('//strong'):
            com = Organization(
                name=strong.text.strip(),
                parent_id=self._senate_appropriations,
                classification='committee',
            )
            com.add_source(URL)

            # The member list is the loose text following each <strong>.
            legislators = strong.getnext().tail.replace('Senators', '').strip()
            for leg in re.split(r', | and ', legislators):
                role = 'member'
                for marker, marker_role in suffix_roles:
                    if leg.endswith(marker):
                        role = marker_role
                        # Strip only the marker itself, then any leftover
                        # whitespace. The original sliced one extra character
                        # (e.g. leg[:-4] for the 3-char '(C)'), which eats a
                        # name character when no space precedes the marker.
                        leg = leg[:-len(marker)].strip()
                        break
                com.add_member(leg, role=role)

            yield com
Ejemplo n.º 3
0
    def scrape(self):
        """Scrape the council page, yielding an Organization per municipal
        council and a Person (with membership + contacts) per councillor.

        NOTE(review): ``'–'.decode('utf-8')`` only works on Python 2, where
        bare string literals are byte strings — this block is not Python 3
        compatible as written.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="entry-content"]//p/strong')
        for councillor in councillors:
            # District comes from the nearest preceding <h2> heading,
            # keeping everything before the en dash.
            district = councillor.xpath('./ancestor::p/preceding-sibling::h2')[-1].text_content().split('–'.decode('utf-8'))[0]
            # The last two whitespace-separated tokens form the name; also
            # strip a "-Â" mojibake artifact seen in the source markup.
            name = ' '.join(councillor.text_content().split()[-2:]).replace('-Â'.decode('utf-8'), '')
            role = councillor.text_content().replace(name, '').split('-')[0]
            # Skip administrative officers (SAO) and entries with no role.
            if 'SAO' in role or not role:
                continue

            org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org

            p = Person(primary_org='legislature', name=name, district=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role=role, district=district)

            # Contact details live in the loose text nodes of the same <p>.
            info = councillor.xpath('./ancestor::p/text()')
            for contact in info:
                # 'NT' marks the address line — presumably the Northwest
                # Territories abbreviation; TODO confirm against the page.
                if 'NT' in contact:
                    membership.add_contact_detail('address', contact.strip(), 'legislature')
                if 'Tel' in contact:
                    # Normalize "(867) 555-1234"-style numbers to 867-555-1234.
                    contact = contact.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('voice', contact, 'legislature')
                if 'Fax' in contact:
                    contact = contact.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('fax', contact, 'legislature')
            email = self.get_email(councillor, './parent::p')
            membership.add_contact_detail('email', email)

            if 'Website' in councillor.xpath('./parent::p')[0].text_content():
                p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
            yield p
Ejemplo n.º 4
0
    def scrape_comm(self, chamber):
        """Scrape the MS committee-membership XML for one chamber
        ('h' or 's') and yield an Organization per committee.
        """
        url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
        comm_page = self.get(url)
        root = lxml.etree.fromstring(comm_page.content)
        chamber = 'lower' if chamber == 'h' else 'upper'

        for mr in root.xpath('//COMMITTEE'):
            name = mr.xpath('string(NAME)')
            comm = Organization(name,
                                chamber=chamber,
                                classification='committee')

            # Chair and vice-chair are listed separately from the roster,
            # with their title appended to the name.
            chair = mr.xpath('string(CHAIR)').replace(", Chairman", "")
            if chair:
                comm.add_member(chair, role="Chairman")

            vice_chair = mr.xpath('string(VICE_CHAIR)').replace(", Vice-Chairman", "")
            if vice_chair:
                comm.add_member(vice_chair, role="Vice-Chairman")

            # Remaining members are semicolon-delimited. Skip all empty
            # entries — the original removed only the first "" and could
            # still add whitespace-only names.
            for leg in mr.xpath('string(MEMBERS)').split(";"):
                leg = leg.strip()
                if leg:
                    comm.add_member(leg)

            comm.add_source(url)
            yield comm
Ejemplo n.º 5
0
    def scrape_chamber(self, chamber):
        """Scrape AZ committees for one chamber of the latest session."""
        session = self.latest_session()
        # since we are scraping only latest_session
        session_id = session_metadata.session_id_meta_data[session]

        client = AZClient()
        # Hoist the invariant chamber code used by both API calls.
        body = 'S' if chamber == 'upper' else 'H'
        committees = client.list_committees(
            sessionId=session_id,
            includeOnlyCommitteesWithAgendas='false',
            legislativeBody=body,
        )
        for committee in committees.json():
            c = Organization(name=committee['CommitteeName'],
                             chamber=chamber, classification='committee')
            details = client.get_standing_committee(
                sessionId=session_id,
                legislativeBody=body,
                committeeId=committee['CommitteeId'],
                includeMembers='true',
            )
            for member in details.json()[0]['Members']:
                c.add_member(
                    u'{} {}'.format(member['FirstName'], member['LastName']),
                    role=parse_role(member),
                )
            # Add each source exactly once; the original re-added the
            # details URL on every member iteration, duplicating sources.
            c.add_source(details.url)
            c.add_source(committees.url)
            yield c
Ejemplo n.º 6
0
    def scrape_committee(self, chamber, name, url):
        """Scrape one committee page; yields it only if it has members."""
        page = lxml.html.fromstring(self.get(url).text)

        # Joint committees are flagged by a page heading.
        if page.xpath("//h3[. = 'Joint Committee']"):
            chamber = 'joint'

        heading = page.xpath("//h3[@align='center']/text()")[0]
        if "Subcommittee" in heading:
            # Subcommittees hang off the named parent committee.
            comm = Organization(
                name=heading, classification='committee',
                parent_id={'classification': chamber, 'name': name})
        else:
            comm = Organization(
                chamber=chamber, name=name, classification='committee')

        comm.add_source(url)

        for anchor in page.xpath("//a[contains(@href, 'member=')]"):
            member_name = anchor.text.strip()

            # The member's role sits in the preceding table cell.
            role = anchor.xpath("string(../preceding-sibling::td[1])")
            comm.add_member(member_name, role.strip(": \r\n\t").lower())

        if comm._related:
            yield comm
        else:
            self.warning('not saving %s, appears to be empty' % name)
Ejemplo n.º 7
0
    def scrape_committees(self, session):
        """Yield an Organization for each committee in the given session,
        resolving member codes to names via the legislator index.
        """
        session_key = SESSION_KEYS[session]
        committees_response = self.api_client.get('committees', session=session_key)

        # Map legislator codes to display names for member resolution.
        legislators = index_legislators(self, session_key)

        for committee in committees_response:
            org = Organization(
                chamber={'S': 'upper', 'H': 'lower',
                         'J': 'legislature'}[committee['HouseOfAction']],
                name=committee['CommitteeName'],
                classification='committee')
            org.add_source(
                'https://olis.leg.state.or.us/liz/{session}'
                '/Committees/{committee}/Overview'.format(session=session_key,
                                                          committee=committee['CommitteeName']))
            members_response = self.api_client.get('committee_members',
                                                   session=session_key,
                                                   committee=committee['CommitteeCode'])
            for member in members_response:
                try:
                    member_name = legislators[member['LegislatorCode']]
                except KeyError:
                    # Fall back to the raw code when the roster lookup fails.
                    # logger.warn is a deprecated alias; use warning() with
                    # lazy %-style args.
                    logger.warning('Legislator %s not found in session %s',
                                   member['LegislatorCode'], session_key)
                    member_name = member['LegislatorCode']
                org.add_member(member_name, role=member['Title'] or '')

            yield org
Ejemplo n.º 8
0
    def _scrape_upper_committee(self, name, url2):
        """Scrape an upper-chamber committee's assignments page.

        The first two listed members are the chairman and vice-chairman;
        everyone else is a plain member.
        """
        url3 = url2.replace("default.asp", "Assignments.asp")

        committee = Organization(name,
                                 chamber="upper",
                                 classification="committee")
        committee.add_source(url2)

        page = self.lxmlize(url3)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        for position, link in enumerate(members):
            # Determine the role from the list position. The original
            # compared against members[0] and members[1] unconditionally,
            # which raised IndexError on committees with fewer than two
            # members.
            if position == 0:
                role = "Chairman"
            elif position == 1:
                role = "Vice-Chairman"
            else:
                role = "member"

            member_name = link.xpath('string()').replace('Senator ', '')
            # Collapse internal runs of whitespace (raw string: the original
            # '[\s]{2,}' relied on an invalid string escape).
            member_name = re.sub(r'\s{2,}', ' ', member_name).strip()

            committee.add_member(member_name, role)

        yield committee
Ejemplo n.º 9
0
    def scrape_chamber(self, chamber, session):
        """Yield non-empty committees for one WA chamber in *session*."""
        url = "%s/GetActiveCommittees?biennium=%s" % (self._base_url, session)
        response = self.get(url)
        page = lxml.etree.fromstring(response.content)

        # Use a distinct loop-variable name so the Organization below does
        # not shadow the XML node (the original reused `comm` for both).
        for node in xpath(page, "//wa:Committee"):
            agency = xpath(node, "string(wa:Agency)")
            if {'House': 'lower', 'Senate': 'upper'}[agency] != chamber:
                continue

            name = xpath(node, "string(wa:Name)")
            # comm_id = xpath(node, "string(wa:Id)")
            # acronym = xpath(node, "string(wa:Acronym)")
            phone = xpath(node, "string(wa:Phone)")

            org = Organization(name, chamber=chamber, classification='committee')
            org.extras['phone'] = phone
            self.scrape_members(org, agency)
            org.add_source(url)
            if org._related:
                yield org
            else:
                self.warning('empty committee: %s', name)
Ejemplo n.º 10
0
    def scrape_lower_committee(self, name, url):
        """Scrape a lower-chamber committee roster and return it.

        The first member link on the page is the chair; duplicate member
        entries are filtered out.
        """
        page = self.lxmlize(url)

        committee = Organization(chamber='lower', name=name,
                                 classification="committee")
        committee.add_source(url)

        # Member names already added, used to skip duplicates.
        seen = set()

        member_links = self.get_nodes(
            page,
            '//div[@class="mod-inner"]//a[contains(@href, "mem")]')

        for member_link in member_links:
            member_name = member_link.text
            if member_name is None:
                continue

            # The first listed person chairs the committee.
            if member_link == member_links[0]:
                member_role = 'chair'
            else:
                member_role = 'member'

            # BUG FIX: the original tested the committee `name` against
            # `seen`, so the duplicate filter never fired.
            if member_name not in seen:
                committee.add_member(member_name, member_role)
                seen.add(member_name)

        return committee
Ejemplo n.º 11
0
    def _scrape_lower_special_committees(self):
        """Yield the House special (and joint) committees with members."""
        url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
        page = self.lxmlize(url)

        accordion = page.xpath('//div[@class="accordion"]')[0]

        # Each <h3> heading names one committee; its member table follows in
        # the sibling "pane" div.
        for header in accordion.xpath('./h3'):
            committee_name = self._normalize_committee_name(
                header.xpath('string()').strip())

            # Joint committees belong to the whole legislature.
            if committee_name.startswith('Joint'):
                chamber = 'legislature'
            else:
                chamber = 'lower'

            committee = Organization(committee_name, chamber=chamber,
                                     classification='committee')
            committee.add_source(url)

            rows = header.xpath('./following-sibling::div[@class="pane"]'
                                '//tr[@class="linkStyle2"]')

            for row in rows:
                member_name = self._normalize_member_name(
                    row.xpath('normalize-space(string(./th[1]))'))
                member_role = self._normalize_member_role(
                    row.xpath('normalize-space(string(./th[2]))'))
                committee.add_member(member_name, member_role)

            yield committee
Ejemplo n.º 12
0
    def add_committees(self, legislator_page, legislator, chamber, url):
        """Record the legislator's committee memberships from their page."""
        # as of today, both chambers do committees the same way! Yay!
        rows = self.get_nodes(
            legislator_page,
            '//div[@id="ContentPlaceHolder1_TabSenator_TabCommittees"]//table/'
            'tr')

        if not rows:
            return

        # Skip the header row.
        for row in rows[1:]:
            committee_name = self.get_node(row, './td[2]').text_content().strip()
            if not committee_name:
                continue

            role = self.get_node(row, './td[3]').text_content().strip()

            # Create each committee once and cache it for later members.
            if committee_name not in self.committees:
                org = Organization(
                    name=committee_name, chamber=chamber, classification='committee')
                org.add_source(url)
                self.committees[committee_name] = org

            self.committees[committee_name].add_member(
                legislator.name,
                role=role,
            )
Ejemplo n.º 13
0
def test_full_organization():
    """Round-trip an Organization through the importer and verify that every
    attached sub-record (identifier, name, contact detail, link, source)
    survives the import.
    """
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # Import the scraped organization into the database.
    od = org.as_dict()
    OrganizationImporter('jurisdiction-id').import_data([od])

    # Fetch the organization from the db and assert it imported correctly.
    o = Organization.objects.get()
    assert 'ocd-organization' in o.id
    assert o.name == org.name

    assert o.identifiers.all()[0].identifier == 'un'
    # No scheme was supplied above, so it defaults to the empty string.
    assert o.identifiers.all()[0].scheme == ''

    assert o.other_names.all()[0].name == 'UN'
    assert o.other_names.all()[0].start_date == '1945'

    assert o.contact_details.all()[0].type == 'phone'
    assert o.contact_details.all()[0].value == '555-555-1234'
    assert o.contact_details.all()[0].note == 'this is fake'

    assert o.links.all()[0].url == 'http://example.com/link'
    assert o.sources.all()[0].url == 'http://example.com/source'
Ejemplo n.º 14
0
    def scrape(self, chamber=None):
        """Yield committees for the requested chamber(s) of the latest
        session. With no chamber given, both chambers are scraped.
        """
        chambers = [chamber] if chamber else ['upper', 'lower']

        # The committee-list page is the same for both chambers, so fetch
        # and parse it once — the original re-downloaded it per chamber.
        insert = self.jurisdiction.session_slugs[self.latest_session()]
        list_url = '%s/%s/HomeCommittee/LoadCommitteeListTab' % (nelis_root, insert)
        html = self.get(list_url).text
        doc = lxml.html.fromstring(html)

        chamber_names = {'lower': 'Assembly', 'upper': 'Senate'}

        for chamber in chambers:
            # Each chamber's committees live in their own <ul> panel.
            sel = 'panel%sCommittees' % chamber_names[chamber]

            ul = doc.xpath('//ul[@id="%s"]' % sel)[0]
            coms = ul.xpath('li/div/div/div[@class="col-md-4"]/a')

            for com in coms:
                name = com.text.strip()
                # The numeric committee id is embedded in the overview href.
                com_id = (re.match(r'.*/Committee/(?P<id>[0-9]+)/Overview', com.attrib['href'])
                          .group('id'))
                com_url = '%s/%s/Committee/FillSelectedCommitteeTab?committeeOrSubCommitteeKey=%s'\
                          '&selectedTab=Overview' % (nelis_root, insert, com_id)
                org = Organization(name=name, chamber=chamber, classification="committee")
                org.add_source(com_url)
                self.scrape_comm_members(chamber, org, com_url)
                yield org
Ejemplo n.º 15
0
    def scrape_approp_subcommittees(self, url):
        """Scrape Appropriations subcommittees from *url*, yielding one
        Organization per subcommittee with members and their roles.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        # Role markers appended to member names on the page. Longest first
        # so more specific markers are tried before shorter ones.
        suffix_roles = [
            ('(MVC)', 'minority vice chairman'),
            ('(VC)', 'vice chairman'),
            ('(C)', 'chairman'),
        ]

        for strong in doc.xpath('//strong'):
            com = Organization(
                name=strong.text.strip(),
                parent_id={
                    'name': 'Appropriations',
                    'classification': 'committee',
                },
                classification='committee',
            )
            com.add_source(url)

            # The member list is the loose text following each <strong>.
            legislators = strong.getnext().tail.replace('Senators', '').strip()
            for leg in re.split(r', | and ', legislators):
                role = 'member'
                for marker, marker_role in suffix_roles:
                    if leg.endswith(marker):
                        role = marker_role
                        # Strip only the marker, then any leftover whitespace.
                        # The original sliced one extra character and could
                        # truncate a name with no space before the marker.
                        leg = leg[:-len(marker)].strip()
                        break
                com.add_member(leg, role=role)

            yield com
Ejemplo n.º 16
0
    def scrape_senate_committee(self, url):
        """Scrape a single Senate committee page and yield the Organization."""
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        # Exactly one page heading is expected; fail loudly otherwise.
        headers = doc.xpath('(//div[@class="row"])[2]//h1')
        assert len(headers) == 1
        name = ' '.join(headers[0].xpath('./text()'))
        # Drop the trailing "... Committee" suffix from the heading.
        name = re.sub(r'\s+Committee.*$', '', name)

        com = Organization(chamber='upper', name=name, classification='committee')

        for member in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
            text = member.text_content()
            # NOTE(review): stripping 'Representative ' on a Senate scraper
            # looks odd — presumably the site uses that title in the link
            # text; confirm against the live page.
            member_name = member.xpath('./a/text()')[0].replace('Representative ', '')
            # Order matters: 'Minority Vice' must be tested before 'Vice',
            # since the latter also matches minority vice chairs.
            if 'Committee Chair' in text:
                role = 'chair'
            elif 'Minority Vice' in text:
                role = 'minority vice chair'
            elif 'Vice' in text:
                role = 'majority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        yield com
Ejemplo n.º 17
0
    def scrape_comm(self, url, chamber):
        """Scrape a JSON committee listing and yield each committee."""
        data = self.post(url).json()['Data']

        for item in data:
            comm_name = item['CommitteeName']
            committee = Organization(name=comm_name, chamber=chamber, classification='committee')

            # The API reports vacant seats as null; str() turns those into
            # the literal text 'None', which is checked below.
            chair_man = str(item['ChairName'])
            vice_chair = str(item['ViceChairName'])
            comm_id = item['CommitteeId']
            comm_url = self.get_comm_url(chamber, comm_id, comm_name)
            members = self.scrape_member_info(comm_url)

            if vice_chair != 'None':
                committee.add_member(vice_chair, role='Vice-Chair')
            if chair_man != 'None':
                committee.add_member(chair_man, role='Chairman')

            for member in members:
                # Skip the chair and vice-chair, which were already added.
                if chair_man in member or vice_chair in member:
                    continue
                member = " ".join(member.split())
                if member:
                    committee.add_member(member)

            committee.add_source(comm_url)
            committee.add_source(url)
            yield committee
Ejemplo n.º 18
0
    def scrape_reps_comm(self):
        """Scrape House committees from the static rosters page.

        The page pairs content positionally: the Nth committee's name lives
        in //body/center[2N-1]/h1/a and its roster in /html/body/ul[N].
        """
        # As of 1/27/15, the committee page has the wrong
        # session number (126th) at the top, but
        # has newly elected people, so we're rolling with it.

        url = 'http://legislature.maine.gov/house/hsecoms.htm'
        page = self.get(url).text
        root = lxml.html.fromstring(page)

        count = 0

        # Committee headings appear at odd center[] indices 1, 3, ..., 11.
        for n in range(1, 12, 2):
            path = 'string(//body/center[%s]/h1/a)' % (n)
            comm_name = root.xpath(path)
            committee = Organization(chamber='lower', name=comm_name, classification='committee')
            # count selects which <ul> roster pairs with this heading.
            count = count + 1

            path2 = '/html/body/ul[%s]/li/a' % (count)

            for el in root.xpath(path2):
                rep = el.text
                if rep.find('(') != -1:
                    # Entries look like "Representative Jane Doe (District...)":
                    # slice off the 15-char "Representative " prefix and
                    # everything from the parenthesis onward.
                    # NOTE(review): the fixed offset 15 assumes exactly that
                    # prefix — confirm against the live page.
                    mark = rep.find('(')
                    rep = rep[15: mark].strip()
                if 'chair' in rep.lower():
                    role = 'chair'
                    # Remove the trailing ", Chair"-style suffix from the name.
                    rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
                else:
                    role = 'member'
                committee.add_member(rep, role)
            committee.add_source(url)

            yield committee
Ejemplo n.º 19
0
    def handle_page(self):
        """Build and yield the committee described by the current page."""
        name = self.doc.xpath('//h2[@class="committeeName"]')[0].text

        # Appropriations subcommittees hang off the Appropriations committee
        # instead of directly off the chamber.
        if name.startswith('Appropriations Subcommittee'):
            name = name.replace('Appropriations ', '')
            parent = {'name': 'Appropriations', 'classification': 'upper'}
            chamber = None
        else:
            if name.startswith('Committee on'):
                name = name.replace('Committee on ', '')
            parent = None
            chamber = 'upper'

        comm = Organization(name=name, classification="committee",
                            chamber=chamber, parent_id=parent,
                            )

        # <dt>Role:</dt><dd>Name</dd> pairs carry the officers...
        for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
            role = dt.text.replace(': ', '').strip().lower()
            officer = self.clean_name(
                dt.xpath('./following-sibling::dd')[0].text_content())
            comm.add_member(officer, role=role)

        # ...while plain <li> entries are rank-and-file members.
        for li in self.doc.xpath('//div[@id="members"]/ul/li'):
            comm.add_member(self.clean_name(li.text_content()))

        comm.add_source(self.url)

        yield comm
Ejemplo n.º 20
0
    def scrape_committees(self, repos):
        """Scrape committee YAML files from the unitedstates
        congress-legislators repos and yield an Organization per committee
        and subcommittee.
        """
        for repo in repos:
            source = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}".format(repo)
            committees = self.fetch_yaml(source)
            for committee in committees:
                org = Organization(committee["name"], classification="committee")
                org.add_source(source)

                for key in committee.keys() & {"url", "rss_url"}:
                    org.add_link(committee[key])

                # Phone numbers map to the 'voice' contact type; other keys
                # map directly. (Rewritten from the original side-effecting
                # conditional expression.)
                for key in committee.keys() & {"phone", "address"}:
                    ctype = "voice" if key == "phone" else key
                    org.add_contact_detail(type=ctype, value=committee[key])

                for key in committee.keys() & {"senate_committee_id", "house_committee_id", "thomas_id"}:
                    org.add_identifier(committee[key], scheme=key)

                for subcommittee in committee.get("subcommittees", []):
                    sub_org = Organization(subcommittee["name"], classification="committee", parent_id=org._id)

                    sub_org.add_identifier(subcommittee["thomas_id"], scheme="thomas")
                    sub_org.add_source(source)

                    # BUG FIX: the original read committee[key] here, so the
                    # parent committee's phone/address was attached to the
                    # subcommittee instead of the subcommittee's own.
                    for key in subcommittee.keys() & {"phone", "address"}:
                        ctype = "voice" if key == "phone" else key
                        sub_org.add_contact_detail(type=ctype, value=subcommittee[key])

                    yield sub_org

                yield org
Ejemplo n.º 21
0
    def scrape(self):
        """Yield one Organization per committee code, merging data from
        every session (term) in which the committee appears.
        """
        sessions = reversed(self.jurisdiction.legislative_sessions)
        committee_term_instances = committees_from_sessions(self, sessions)
        committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

        for code, instances in committees_by_code.items():
            # TODO: Figure out how to edit city council org.
            if code == 'CC':
                continue
            # Guard against an empty instance list, which would otherwise
            # leave `o` unbound at the yield below (NameError).
            if not instances:
                continue

            extras = {'tmmis_decision_body_ids': []}
            # The first (canonical) instance creates the Organization; later
            # instances contribute extras, sources and alternate names.
            canonical_i = 0
            for i, inst in enumerate(instances):
                # TODO: Ensure this survives addition of new term (2017)
                #       so specific year always creates
                if i == canonical_i:
                    o = Organization(name=inst['name'], classification='committee')
                    extras.update({'description': inst['info']})
                    o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)
                extras['tmmis_decision_body_ids'].append({inst['term']: inst['decision_body_id']})
                o.extras = extras
                o.add_source(inst['source_url'])
                if instances[canonical_i]['name'] != inst['name']:
                    # TODO: Add start_date and end_date
                    o.add_name(inst['name'])

            yield o
Ejemplo n.º 22
0
    def scrape_interim_committee(self, link, name):
        """Scrape an interim (joint) committee page; returns the committee."""
        url = re.sub(r'\s+', '', link.attrib['href'])
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        if 'Subcommittee' in name:
            # Check whether the parent committee is manually defined first
            # before attempting to automatically resolve it.
            parent = WVCommitteeScraper.subcommittee_parent_map.get(name, None)
            if parent is None:
                parent = name.partition('Subcommittee')[0].strip()

            comm = Organization(
                name=name,
                classification='committee',
                parent_id={'name': parent, 'classification': 'joint'}
            )
        else:
            comm = Organization(name=name, classification='committee', chamber='joint')
        comm.add_source(url)

        # Member links carry a "?member=" query; strip chamber titles from
        # the link text. (Distinct variable names avoid shadowing the
        # `link`/`name` parameters as the original did.)
        for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
            member = member_link.text_content().strip()
            member = re.sub(r'^Delegate\s+', '', member)
            member = re.sub(r'^Senator\s+', '', member)
            role = member_link.getnext().text or 'member'
            comm.add_member(member, role.strip())

        return comm
Ejemplo n.º 23
0
    def scrape_committee(self, term, href, name):
        """Scrape one committee page; yields the populated committee.

        Pages whose URL does not identify a chamber (e.g. interim
        committees) are skipped.
        """
        page = lxml.html.fromstring(self.get(href).text)
        page.make_links_absolute(href)
        member_links = page.xpath("//div[@class='view-content']"
                                  "//a[contains(@href, 'members')]")

        if '/joint/' in href:
            chamber = 'legislature'
        elif '/senate/' in href:
            chamber = 'upper'
        elif '/house/' in href:
            chamber = 'lower'
        else:
            # interim committees and others were causing duplicate committee
            # issues, skipping
            self.warning('Failed to identify chamber for {}; skipping'.format(href))
            return

        # Map the roster pane heading to a membership role.
        role_map = {"Legislative Members": "member",
                    "Chairman": "chair",
                    "Vice Chairman": "member"}

        committee = Organization(name, chamber=chamber, classification='committee')
        for anchor in member_links:
            member = anchor.text
            heading = anchor.xpath("ancestor::div/h2[@class='pane-title']/text()")[0].strip()
            role = role_map[heading]

            # Skip empty links and district header rows.
            if member is None or member.startswith("District"):
                continue

            member = member.replace('Senator ', '').replace('Representative ', '')
            committee.add_member(member, role=role)

        committee.add_source(href)
        yield committee
Ejemplo n.º 24
0
    def scrape_committee(self, name, url, chamber):
        """Return the committee at *url* with its members attached.

        Member entries look like "Jane Doe (Chair)"; the parenthetical,
        when present, determines the role.
        """
        org = Organization(name=name, chamber=chamber, classification='committee')
        org.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)

        for entry in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
            entry = entry.replace('Representative ', '')
            entry = entry.replace('Senator ', '')
            entry = entry.strip()
            if ' (' not in entry:
                role = 'member'
            else:
                entry, role = entry.split(' (')
                # Check the most specific titles first, since plain 'Chair'
                # is a substring of the others.
                if 'Vice-Chair' in role:
                    role = 'vice-chair'
                elif 'Co-Chair' in role:
                    role = 'co-chair'
                elif 'Chair' in role:
                    role = 'chair'
                else:
                    raise Exception('unknown role: %s' % role)
            org.add_member(entry, role)

        return org
Ejemplo n.º 25
0
    def scrape_page(self, link, chamber=None):
        """Scrape the committee roster page behind *link*; yields it."""
        url = link.attrib['href']
        page = self.lxmlize(url)
        role_map = {
            "Chair": "chair",
            "Vice Chair": "vice-chair",
            "Vice-Chair": "vice-chair",
        }
        committee = Organization(link.text,
                                 chamber=chamber,
                                 classification='committee')
        committee.add_source(url)

        for roster_item in page.xpath('//div[@class="members"]/'
                                      'div[@class="roster-item"]'):
            details = roster_item.xpath('.//div[@class="member-details"]')[0]
            # This page does random weird things with whitespace to names,
            # so collapse every run of whitespace to a single space.
            person = ' '.join(details.xpath('./h4')[0].text_content().split())
            if not person:
                continue
            role_node = details.xpath('./span[@class="member-role"]')
            role = role_map[role_node[0].text] if role_node else 'member'
            committee.add_member(person, role=role)
        yield committee
Ejemplo n.º 26
0
    def scrape(self, chamber=None):
        """Yield NC standing and select committees for the chamber(s)."""
        base_url = ('http://www.ncga.state.nc.us/gascripts/Committees/'
                    'Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails=')

        chamber_slugs = {'upper': ['Senate%20Standing', 'Senate%20Select'],
                         'lower': ['House%20Standing', 'House%20Select']}

        chambers = [chamber] if chamber else ['upper', 'lower']

        for chamber in chambers:
            for ctype in chamber_slugs[chamber]:
                list_url = base_url + ctype
                doc = lxml.html.fromstring(self.get(list_url).text)
                doc.make_links_absolute(list_url)
                for anchor in doc.xpath('//ul/li/a'):
                    name = anchor.text
                    # skip committee of whole Senate
                    if 'Whole Senate' in name:
                        continue
                    url = anchor.get('href')
                    committee = Organization(name=name, chamber=chamber,
                                             classification="committee")
                    self.scrape_committee(committee, url)
                    committee.add_source(url)
                    if committee._related:
                        yield committee
                    else:
                        self.warning('empty committee: %s', name)
Ejemplo n.º 27
0
    def scrape_homepage(self, leg, chamber, homepage):
        """Augment legislator *leg* with data scraped from *homepage*.

        Attaches a biography (if present), a Capitol Office fax number,
        and one membership per committee link found on the page.
        """
        page = self.get(homepage).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(homepage)
        # First paragraph of the biography section, if the page has one.
        bio = page.xpath(
            "//div[@class='biography']//div[@class='right']//p/text()")
        if bio != []:
            bio = bio[0]
            leg.extras['biography'] = bio

        # Contact lines are unlabeled text nodes; keep only the one
        # mentioning "Fax".
        fax_line = [
            x.strip() for x in
            page.xpath(
                "//div[@class='contactModule']/div[@class='data']/text()"
            )
            if "Fax" in x
        ]
        if fax_line:
            # Expects "(123) 456-7890"; .group(1) raises AttributeError
            # if the line doesn't match that format.
            fax_number = re.search(
                r'(\(\d{3}\)\s\d{3}\-\d{4})', fax_line[0]
            ).group(1)
            leg.add_contact_detail(type='fax', value=fax_number, note='Capitol Office')

        ctties = page.xpath("//div[@class='committeeList']//a")
        for a in ctties:
            entry = a.text_content()

            # Cache position lookups so each committee page is fetched once
            # (committee_cache is presumably module-level — defined elsewhere).
            if entry in committee_cache:
                committee_positions = committee_cache[entry]
            else:
                committee_positions = self.fetch_committee_positions(a)
                committee_cache[entry] = committee_positions

            # Joint committees belong to the whole legislature, not one chamber.
            chmbr = "legislature" if "joint" in entry.lower() else chamber
            if entry in JOINT_COMMITTEE_OVERRIDE:
                chmbr = "legislature"

            # NOTE(review): kwargs (and committee_positions) are built but
            # never used within this method — possible leftover code.
            kwargs = {}

            if "subcommittee" in entry.lower():
                # Map the subcommittee onto its known parent committee name;
                # an unknown subcommittee is a scraper bug, so fail loudly.
                if entry in SUBCOMMITTEES:
                    kwargs['subcommittee'] = entry
                    entry = SUBCOMMITTEES[entry]
                else:
                    self.warning("No subcommittee known: '%s'" % (entry))
                    raise Exception
            # Reuse one Organization per (chamber, name) across legislators.
            if (chmbr, entry) not in self.committees:
                org = Organization(
                    name=entry,
                    chamber=chmbr,
                    classification='committee',
                )
                self.committees[(chmbr, entry)] = org
            else:
                org = self.committees[(chmbr, entry)]
            org.add_source(homepage)
            leg.add_membership(org)
Ejemplo n.º 28
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Parse a committee-roster PDF and yield Organization objects.

        Only committees that actually picked up at least one member are
        yielded. Subcommittees are currently skipped entirely.

        Fixes: the original dereferenced ``comm._related`` (and called
        ``comm.add_member``) even when no committee header had been seen
        yet, raising AttributeError on ``None``.
        """
        # The 2015 House PDF is malformed and needs special repair first.
        if chamber == 'lower' and year == '2015':
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type='text-nolayout').decode()

        # Re-join committee names that the PDF wraps across lines.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
                'Judicial Branch, Law Enforcement, and Justice'),

            (r'Natural Resources and\s+Transportation',
                'Natural Resources and Transportation'),

            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
                'Federal Relations, Energy, and Telecommunications')
                ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents
            line = line.replace(" ", " ").replace("‐", "-")

            if 'Subcommittees' in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                # Emit the previous committee (if any) before starting the
                # next one, but only when it actually has members.
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(name=committee, chamber=chamber,
                                    classification='committee')

                comm.add_source(url)

            elif comm is not None and is_legislator_name(line):
                name, party = line.rsplit('(', 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                # Role suffixes: (Ch) chair, (VCh) vice chair, (MVCh)
                # minority vice chair; anything else is a plain member.
                if re.search(' Ch', party):
                    role = 'chair'
                elif ' VCh' in party:
                    role = 'vice chair'
                elif ' MVCh' in party:
                    role = 'minority vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # Guard against a PDF with no committees at all: comm may be None.
        if comm and comm._related:
            yield comm
Ejemplo n.º 29
0
 def scrape_comms(self, chamber, ctype):
     """Yield an Organization for every committee of the given type."""
     for anchor in self.scrape_comm_list(ctype):
         url = anchor.attrib['href']
         name = clean(anchor.text_content())
         self.info("url " + url)
         committee = Organization(chamber=chamber, name=name,
                                  classification='committee')
         self.add_members(committee, url)
         committee.add_source(url)
         yield committee
Ejemplo n.º 30
0
    def scrape(self, session=None):
        """Yield Wyoming committees for *session* from the LSO API.

        Defaults to the latest session when none is given. Each committee
        is enriched with WY-specific extras and identifiers.
        """
        if session is None:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        # com_types = ['J', 'SE', 'O']
        # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
        list_url = 'https://wyoleg.gov/LsoService/api/committees/{}'.format(session)
        coms_json = json.loads(self.get(list_url).content.decode('utf-8'))

        for row in coms_json:
            detail_url = 'https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}'.format(
                session, row['ownerID'])
            com = json.loads(self.get(detail_url).content.decode('utf-8'))

            # WY doesn't seem to have any house/senate only committees that I can find
            committee = Organization(
                name=com['commName'], chamber='legislature', classification='committee')

            for member in com['commMembers']:
                is_chair = member['chairman'] == 'Chairman'
                committee.add_member(member['name'],
                                     'chairman' if is_chair else 'member')

            # some WY committees have non-legislators appointed to the member by the Governor
            # but the formatting is super inconsistent
            if com['otherMembers']:
                committee.extras['other_members'] = com['otherMembers']

            committee.extras['wy_id'] = com['commID']
            committee.extras['wy_code'] = com['ownerID']
            committee.extras['wy_type_code'] = com['type']
            committee.extras['budget'] = com['budget']

            if com['statAuthority']:
                committee.extras['statutory_authority'] = com['statAuthority']
            if com['number']:
                committee.extras['seat_distribution'] = com['number']

            committee.add_identifier(
                scheme='WY Committee ID', identifier=str(com['commID']))
            committee.add_identifier(
                scheme='WY Committee Code', identifier=str(com['ownerID']))
            if com['description']:
                committee.add_identifier(
                    scheme='Common Name', identifier=com['description'])

            committee.add_source(
                'http://wyoleg.gov/Committees/{}/{}'.format(
                    session, com['ownerID']))

            yield committee
Ejemplo n.º 31
0
    def scrape_upper_committee(self, committee_name, url):
        """Build an upper-chamber committee Organization from its page.

        Records the chair (with their explicit role text) and every
        regular member found at *url*. Missing pieces are logged as
        warnings rather than raised. Returns the populated Organization.
        """
        page = self.lxmlize(url)

        committee = Organization(chamber="upper",
                                 name=committee_name,
                                 classification="committee")
        committee.add_source(url)

        # Committee member attributes.
        member_name = None
        member_role = None

        # Attempt to record the committee chair.
        committee_chair = self.get_node(
            page,
            '//div[@class="nys-senator" and div[@class="nys-senator--info"'
            ' and p[@class="nys-senator--title" and'
            ' normalize-space(text())="Chair"]]]',
        )
        if committee_chair is not None:
            info_node = self.get_node(
                committee_chair,
                'div[@class="nys-senator--info" and p[@class='
                '"nys-senator--title" and contains(text(), "Chair")]]',
            )
            if info_node is not None:
                # Attempt to retrieve committee chair's name.
                member_name_text = self.get_node(
                    info_node,
                    './h4[@class="nys-senator--name"][1]/a[1]/text()')

                if member_name_text is not None:
                    member_name = member_name_text.strip()
                else:
                    warning = (
                        "Could not find the name of the chair for the {} committee"
                    )
                    self.logger.warning(warning.format(committee_name))

                # Attempt to retrieve committee chair's role (explicitly).
                member_role_text = self.get_node(
                    info_node,
                    './p[@class="nys-senator--title" and contains(text(), '
                    '"Chair")][1]/text()',
                )

                if member_role_text is not None:
                    member_role = member_role_text.strip()
                else:
                    # This seems like a silly case, but could still be useful
                    # to check for.
                    warning = (
                        "Could not find the role of the chair for the {} committee"
                    )
                    self.logger.warning(warning.format(committee_name))

                # Only add the chair when both name and role were found.
                if member_name is not None and member_role is not None:
                    committee.add_member(member_name, member_role)
            else:
                warning = (
                    "Could not find information for the chair of the {} committee."
                )
                self.logger.warning(warning.format(committee_name))
        else:
            warning = "Missing chairperson for the {} committee."
            self.logger.warning(warning.format(committee_name))

        # Get list of regular committee members.
        member_nodes = self.get_nodes(
            page,
            '//div[contains(concat(" ", @class, " "), '
            '" c-senators-container ")]//div[@class="view-content"]/'
            " div/a",
        )

        # Attempt to record each committee member.
        for member_node in member_nodes:
            member_name = None

            member_name_text = self.get_node(
                member_node,
                './/div[@class="nys-senator--info"][1]/h4[@class='
                '"nys-senator--name"][1]/text()',
            )

            if member_name_text is not None:
                member_name = member_name_text.strip()

            if member_name is not None:
                committee.add_member(member_name, "member")
            else:
                warning = "Could not find the name of a member in the {} committee"
                self.logger.warning(warning.format(committee_name))

        return committee
Ejemplo n.º 32
0
    def get_organizations(self):
        """Yield the Board of Directors (with all its posts) and the two
        related corporations (Crenshaw Project Corporation, LA SAFE)."""
        board = Organization(name="Board of Directors",
                             classification="legislature")

        la_city = 'ocd-division/country:us/state:ca/place:los_angeles'

        board.add_post(
            'Mayor of the City of Los Angeles',
            'Board Member',
            division_id=la_city)

        # One seat per LA County supervisorial district.
        for district in range(1, 6):
            board.add_post(
                'Los Angeles County Board Supervisor, District {}'.format(
                    district),
                'Board Member',
                division_id=(
                    'ocd-division/country:us/state:ca/county:los_angeles'
                    '/council_district:{}'.format(district)))

        board.add_post(
            'Appointee of Mayor of the City of Los Angeles',
            'Board Member',
            division_id=la_city)

        board.add_post('Appointee of Governor of California',
                       'Nonvoting Board Member',
                       division_id='ocd-division/country:us/state:ca')

        # One appointee per City Selection Committee sector.
        sector_division = ('ocd-division/country:us/state:ca/county:'
                           'los_angeles/la_metro_sector:{}')
        for sector_label, sector_slug in (
                ('North County/San Fernando Valley',
                 'north_county_san_fernando_valley'),
                ('Southwest Corridor', 'southwest_corridor'),
                ('San Gabriel Valley', 'san_gabriel_valley'),
                ('Southeast Long Beach', 'southeast_long_beach')):
            board.add_post(
                'Appointee of Los Angeles County City Selection Committee,'
                ' {} sector'.format(sector_label),
                'Board Member',
                division_id=sector_division.format(sector_slug))

        # Board leadership posts.
        for leadership_title in ('Chair', '1st Vice Chair', '2nd Vice Chair'):
            board.add_post(leadership_title, leadership_title)

        yield board

        for corporation_name in ("Crenshaw Project Corporation", "LA SAFE"):
            corporation = Organization(name=corporation_name,
                                       classification="corporation")
            corporation.add_source('foo')
            yield corporation
Ejemplo n.º 33
0
    def scrape(self):
        """Yield Chicago council members, then the committees they sit on."""
        committees = {}
        non_committees = ('City Council', 'Office of the Mayor')

        for councilman, memberships in self.councilMembers():
            ward = councilman['Ward/Office']
            if ward == "":
                continue

            # Mayor and Clerk are citywide; everyone else gets a ward label.
            if ward not in ["Mayor", "Clerk"]:
                ward = "Ward {}".format(int(ward))

            person = Person(councilman['Person Name']['label'],
                            district=ward,
                            primary_org="legislature")

            if councilman['Photo']:
                person.image = councilman['Photo']

            # Map Legistar column name -> (contact type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }
            for column, (kind, note) in contact_types.items():
                value = councilman[column]
                if value:
                    person.add_contact_detail(type=kind, value=value,
                                              note=note)

            if councilman["E-mail"]:
                person.add_contact_detail(type="email",
                                          value=councilman['E-mail']['label'],
                                          note='E-mail')

            if councilman['Website']:
                person.add_link(councilman['Website']['url'])
            person.add_source(MEMBERLIST)

            for committee, _, _ in memberships:
                committee_name = committee['Legislative Body']['label']
                if not committee_name or committee_name in non_committees:
                    continue
                # One shared Organization per committee name.
                org = committees.get(committee_name)
                if org is None:
                    org = Organization(committee_name,
                                       classification='committee')
                    org.add_source(
                        "https://chicago.legistar.com/Departments.aspx")
                    committees[committee_name] = org
                org.add_member(person, role=committee["Title"])
            yield person

        for org in committees.values():
            yield org
Ejemplo n.º 34
0
    def _scrape_lower_chamber(self, session):
        """Yield Missouri House committees (joint ones as 'legislature').

        Walks the committee hierarchy page, then each committee's member
        grid, recording every member with their role and member code.

        Fixes: removed a duplicated ``.replace('Special', '')`` — since
        ``str.replace`` already replaces every occurrence, the second
        call was a no-op.
        """
        self.info('Scraping lower chamber for committees.')

        chamber = 'lower'

        url = '{base}CommitteeHierarchy.aspx'.format(base=self._reps_url_base)
        page_string = self.get(url).text
        page = lxml.html.fromstring(page_string)
        # Last tr has the date
        committee_links = page.xpath('//li//a')
        for committee_link in committee_links:
            committee_name = committee_link.text_content().strip()
            committee_url = committee_link.attrib.get('href')

            committee_url = '{base}{members}{url}'.format(
                base=self._reps_url_base,
                members=
                "MemberGridCluster.aspx?filter=compage&category=committee&",
                url=committee_url)
            # Joint committees span both chambers.
            actual_chamber = chamber
            if 'joint' in committee_name.lower():
                actual_chamber = 'legislature'

            # Strip boilerplate words so only the core name remains.
            committee_name = committee_name.replace('Committee On ', '')
            committee_name = committee_name.replace('Special', '')
            committee_name = committee_name.replace('Select', '')
            committee_name = committee_name.replace('Joint', '')
            committee_name = committee_name.replace(' Committee', '')
            committee_name = committee_name.strip()

            committee = Organization(
                committee_name,
                chamber=actual_chamber,
                classification='committee',
            )

            committee_page_string = self.get(committee_url).text
            committee_page = lxml.html.fromstring(committee_page_string)
            # First tr has the title (sigh)
            mem_trs = committee_page.xpath(
                "//table[@id='gvMembers_DXMainTable']//tr[contains(@class, 'dxgvDataRow')]"
            )
            for mem_tr in mem_trs:
                mem_code = None
                mem_links = mem_tr.xpath('td/a[1]')

                mem_role_string = mem_tr.xpath(
                    'td[4]')[0].text_content().strip()

                if len(mem_links):
                    mem_code = mem_links[0].attrib.get('href')
                # Output is "Rubble, Barney, Neighbor"

                mem_parts = mem_tr.xpath(
                    'td[2]')[0].text_content().strip().split(',')
                if self._no_members_text in mem_parts:
                    continue
                mem_name = (mem_parts[1].strip() + ' ' + mem_parts[0].strip())
                # Sometimes Senator abbreviation is in the name
                mem_name = mem_name.replace('Sen. ', '')
                mem_name = mem_name.replace('Rep. ', '')

                mem_role = 'member'

                # Anything longer than a 2-char placeholder is a real role.
                if len(mem_role_string) > 2:
                    mem_role = mem_role_string.lower()

                membership = committee.add_member(mem_name, role=mem_role)
                membership.extras = {'code': mem_code}

            committee.add_source(url)
            committee.add_source(committee_url)

            yield committee
Ejemplo n.º 35
0
    def _scrape_upper_chamber(self, session):
        """Yield Missouri Senate standing committees for *session*.

        URL layout and page markup differ between pre- and post-2015
        sessions (``self._is_post_2015``), so both the committee list
        location and the member parsing branch on that flag.
        """
        self.info('Scraping upper chamber for committees.')

        chamber = 'upper'

        # Three URL schemes: archived post-2015, current, and pre-2015.
        if self._is_post_2015 and self.latest_session() != session:
            url = '{base}{year}web/standing-committees'.format(
                base=self._senate_url_base, year=session[2:])
            comm_container_id = 'primary'
        elif session == self.latest_session():
            url = '{base}standing-committees'.format(
                base=self._senate_url_base)
            comm_container_id = 'primary'
        else:
            url = '{base}{year}info/com-standing.htm'.format(
                base=self._senate_url_base, year=session[2:])
            comm_container_id = 'mainContent'

        page = self.lxmlize(url)

        comm_links = self.get_nodes(
            page, '//div[@id = "{}"]//p/a'.format(comm_container_id))

        for comm_link in comm_links:
            # Normalize to uppercase - varies between "Assigned bills" and "Assigned Bills"
            if "ASSIGNED BILLS" in comm_link.text_content().upper():
                continue

            comm_link = comm_link.attrib['href']

            # Skip links that don't match the era's committee-URL pattern.
            if self._is_post_2015:
                if "web" not in comm_link:
                    continue
            else:
                if "comm" not in comm_link:
                    continue

            comm_page = self.lxmlize(comm_link)

            if self._is_post_2015:
                comm_name = self.get_node(comm_page,
                                          '//h1[@class="entry-title"]/text()')
                members = self.get_nodes(
                    comm_page, '//div[@id="bwg_standart_thumbnails_0"]/a')
            else:
                comm_name = self.get_node(comm_page,
                                          '//div[@id="mainContent"]/p/text()')
                members = self.get_nodes(comm_page,
                                         '//div[@id="mainContent"]//td/a')

            comm_name = comm_name.replace(' Committee', '')
            comm_name = comm_name.strip()

            committee = Organization(comm_name,
                                     chamber=chamber,
                                     classification='committee')

            for member in members:
                mem_link = member.attrib.get("href", '')
                if "mem" not in mem_link:
                    continue

                if self._is_post_2015:
                    # NOTE(review): this node lookup is immediately
                    # overwritten by the text_content() split below,
                    # so its result is never used.
                    mem_parts = self.get_node(
                        member, './/span[@class="bwg_title_spun2_0"]')

                mem_parts = member.text_content().strip().split(',')
                # Senator title stripping mainly for post-2015.
                mem_name = re.sub(r'^Senator[\s]+', '', mem_parts[0])

                # this one time, MO forgot the comma between
                # the member and his district. Very rarely relevant
                try:
                    int(mem_name[-4:-2]
                        )  # the district's # is in this position
                except ValueError:
                    pass
                else:
                    mem_name = " ".join(
                        mem_name.split(" ")[0:-1])  # member name fixed

                    # ok, so this next line. We don't care about
                    # the first 2 elements of mem_parts anymore
                    # so whatever. But if the member as a role, we want
                    # to make sure there are 3 elements in mem_parts and
                    # the last one is actually the role. This sucks, sorry.
                    mem_parts.append(mem_parts[-1])

                # A third comma-separated part, if present, is the role.
                mem_role = 'member'
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower().split('    ')[0].strip()

                if mem_name == "":
                    continue

                committee.add_member(mem_name, role=mem_role)

            committee.add_source(url)
            committee.add_source(comm_link)

            yield committee
Ejemplo n.º 36
0
    def scrape(self):
        """Create people, posts, and organizations from a member CSV.

        Reads ``self.csv_url``, applies per-column ``self.corrections``,
        and yields one Organization per distinct ``organization`` value,
        one Post per row, and one Person per row.
        """
        organizations = {}
        # seat_numbers[role][district] counts members sharing a district so
        # each gets a distinct "(seat N)" suffix when the role isn't unique.
        seat_numbers = defaultdict(lambda: defaultdict(int))

        reader = self.csv_reader(self.csv_url,
                                 delimiter=self.delimiter,
                                 header=True,
                                 encoding=self.encoding,
                                 skip_rows=self.skip_rows)
        # Normalize header names before reading rows.
        reader.fieldnames = [
            self.header_converter(field) for field in reader.fieldnames
        ]
        for row in reader:

            try:
                if self.is_valid_row(row):
                    # A correction is either a callable or a value-mapping dict.
                    for key, corrections in self.corrections.items():
                        if not isinstance(corrections, dict):
                            row[key] = corrections(row[key])
                        elif row[key] in corrections:
                            row[key] = corrections[row[key]]

                    organization_classification = 'legislature'

                    # One shared Organization per name (case-insensitive).
                    organization_name = row['organization']
                    organization_key = organization_name.lower()
                    if organization_key in organizations:
                        organization = organizations[organization_key]
                    else:
                        organization = Organization(
                            organization_name,
                            classification=organization_classification)
                        organization.add_source(self.csv_url)
                        yield organization
                        organizations[organization_key] = organization

                    if not row['primary role']:
                        row['primary role'] = 'Councillor'

                    role = row['primary role']

                    post = Post(role=role,
                                label=organization_name,
                                organization_id=organization._id)
                    yield post

                    name = row['name'].strip(' .,')

                    district = row['district name']

                    if self.many_posts_per_area and role not in self.unique_roles:
                        seat_numbers[role][district] += 1
                        district = '{} (seat {})'.format(
                            district, seat_numbers[role][district])

                    p = Person(primary_org=organization_classification,
                               name=name,
                               district=district,
                               role=role,
                               party=row.get('party name'))
                    p.add_source(self.csv_url)

                    if row.get('gender'):
                        p.gender = row['gender']
                    if row.get('photo url'):
                        p.image = row['photo url']

                    if row.get('source url'):
                        p.add_source(row['source url'].strip(' .,'))

                    if row.get('website'):
                        p.add_link(row['website'], note='web site')
                    if row.get('facebook'):
                        # Drop query strings / fragments from the URL.
                        p.add_link(re.sub(r'[#?].+', '', row['facebook']))
                    if row.get('twitter'):
                        p.add_link(row['twitter'])

                    # NOTE(review): 'email' and 'address' are accessed with
                    # [] (KeyError if absent) while the others use .get() —
                    # confirm those two columns are guaranteed present.
                    if row['email']:
                        p.add_contact('email', row['email'].strip(' .,'))
                    if row['address']:
                        p.add_contact('address', row['address'], 'legislature')
                    if row.get('phone'):
                        p.add_contact('voice', row['phone'], 'legislature')
                    if row.get('fax'):
                        p.add_contact('fax', row['fax'], 'legislature')
                    if row.get('cell'):
                        p.add_contact('cell', row['cell'], 'legislature')
                    if row.get('birth date'):
                        p.birth_date = row['birth date']

                    if row.get('incumbent'):
                        p.extras['incumbent'] = row['incumbent']

                    if name in self.other_names:
                        for other_name in self.other_names[name]:
                            p.add_name(other_name)

                    # Validate person entity so that we can catch the exception if needed.
                    p.validate()

                    yield p
            except Exception as e:
                # NOTE(review): broad catch that only prints and skips the
                # row; consider real logging so failures stay visible.
                print(repr(e))
                continue
Ejemplo n.º 37
0
    def scrape(self):
        '''
        Scrape the web to create a dict with all active organizations.
        Then, we can access the correct URL for the organization detail page.

        Yields committee Organizations and People, merging each person's
        board term(s) with any committee memberships they hold.
        '''
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        # Map organization name -> Legistar web metadata (including URL).
        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_name = organization['Department Name'][
                    'label'].strip()
                organization_info = organization['Department Name']

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        # Exactly one body must match; the single-element unpack raises
        # otherwise.
        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        # Group every board office record by the member's full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                # Leadership titles become appointed terms.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']),
                        appointment=True)
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    # Flag members only acting until a given date.
                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # NOTE(review): `term` carries over from the loop above, so the
            # member's last office record determines their source URLs.
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] in (
                    body_types['Committee'],
                    body_types['Independent Taxpayer Oversight Committee']):
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                organization_info = web_info.get(organization_name, {})
                # NOTE(review): this fallback concatenates WEB_URL with an
                # already-absolute URL — confirm it's only a placeholder.
                organization_url = organization_info.get(
                    'url', self.WEB_URL +
                    'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    # Normalize non-leadership committee roles.
                    if role not in ("Chair", "Vice Chair",
                                    "Chief Executive Officer"):
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member not on the board: create them.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p
Ejemplo n.º 38
0
    def scrape(self):
        """Yield Metro's Board of Directors members and its committees.

        Builds one Person per board member (with terms derived from Legistar
        office records), then one Organization per committee body, attaching
        committee memberships to the same Person objects. Organizations are
        yielded as they are built; all Persons are yielded at the end.
        """
        body_types = self.body_types()

        # The "Regular Board Meeting" body stands in for the board itself;
        # rename it so terms and memberships attach to the canonical name.
        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        # Group each member's office (term) records by full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                # Officer roles (e.g. Chair) become their own terms; plain
                # "Board Member" / "non-voting member" roles are skipped here
                # and normalized below.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']))
                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    p.add_term(
                        member_type,
                        'legislature',
                        district=post,
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']))

            # NOTE(review): 'term' is the last office left over from the loop
            # above — presumably every term for a person yields the same
            # source URLs, but confirm that is intended.
            source_urls = self._person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # Anything that is not a committee officer is a Member.
                    role = office['OfficeRecordTitle']
                    if role not in ("Chair", "Vice Chair"):
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    if person in members:
                        p = members[person]
                    else:
                        # Committee member not on the board: create the
                        # Person here and source it from this office record.
                        p = Person(person)

                        source_urls = self._person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(body['BodyName'],
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
Ejemplo n.º 39
0
    def scrape(self):
        """Scrape Yukon municipal councils from the territory's PDF directory.

        Downloads the PDF at COUNCIL_PAGE, converts it to column-aligned text
        with ``pdftotext -layout``, then parses each municipality block into a
        council Organization plus Person records for the mayor and each
        councillor, with contact details found in the block.
        """
        response = urlopen(COUNCIL_PAGE).read()
        pdf_path = '/tmp/yt.pdf'
        # urlopen().read() returns bytes, so the temp file must be opened in
        # binary mode (text mode 'w' raises TypeError for bytes on Python 3);
        # the context manager guarantees it is flushed/closed before pdftotext
        # reads it.
        with open(pdf_path, 'wb') as pdf:
            pdf.write(response)

        try:
            # check_output returns bytes; decode before any text processing.
            data = subprocess.check_output(
                ['pdftotext', '-layout', pdf_path, '-']).decode('utf-8')
            # Blank lines separate one municipality's block from the next.
            for municipality in re.split(r'\n\s*\n', data):

                if 'Councillors' not in municipality:
                    continue
                lines = municipality.split('\n')
                # Drop a page header line (and any blank line after it).
                if 'Page' in lines[0]:
                    lines.pop(0)
                    if not lines[0].strip():
                        lines.pop(0)
                # Column boundaries are inferred from whitespace runs on the
                # block's first line.
                col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
                col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()

                # The district name may wrap onto a second line when it
                # contains 'Council'.
                if 'Council' in lines[1]:
                    address = (lines[2][:col1end - 1].strip() + ' ' +
                               lines[3][:col1end - 1].strip())
                    district = (lines[0][:col1end - 1].strip() + ' ' +
                                lines[1][:col1end - 1].strip())
                else:
                    address = (lines[1][:col1end - 1].strip() + ' ' +
                               lines[2][:col1end - 1].strip())
                    district = lines[0][:col1end - 1].strip()

                organization = Organization(
                    name=district + ' Council',
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                organization.add_source(COUNCIL_PAGE)
                yield organization

                # Contact details are shared by every member of the council.
                phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                   municipality)[0].replace(') ', '-')
                email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
                fax = None
                if 'Fax' in municipality:
                    fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                     municipality)[0].replace(') ', '-')
                website = None
                if 'Website' in municipality:
                    website = re.findall(r'((http:\/\/|www.)(\S*))',
                                         municipality)[0][0]

                # Lines after a 'Mayor:' or 'Councillors' heading carry member
                # names in the second column.
                councillor_or_mayor = False
                for line in lines:
                    if 'Mayor:' in line:
                        councillor_or_mayor = True
                        role = 'Mayor'
                        continue
                    if 'Councillors' in line:
                        councillor_or_mayor = True
                        role = 'Councillor'
                        continue
                    if councillor_or_mayor:
                        councillor = line[col1end - 1:col2end - 1].strip()
                        if not councillor:
                            continue
                        p = Person(primary_org='legislature',
                                   name=councillor,
                                   district=district)
                        p.add_source(COUNCIL_PAGE)
                        membership = p.add_membership(organization,
                                                      role=role,
                                                      district=district)
                        membership.add_contact_detail('address', address,
                                                      'legislature')
                        membership.add_contact_detail('voice', phone,
                                                      'legislature')
                        membership.add_contact_detail('email', email)
                        if fax:
                            membership.add_contact_detail('fax', fax,
                                                          'legislature')
                        if website:
                            p.add_link(website)
                        yield p
        finally:
            # Always remove the temp file, even on a parse failure (replaces
            # the old `os.system('rm /tmp/yt.pdf')` shell-out).
            os.remove(pdf_path)
Ejemplo n.º 40
0
    def get_organizations(self):
        """Yield the federal legislature hierarchy.

        Produces the United States Congress, its two chambers, and the
        disclosure-related office under each chamber, caching each on
        ``self`` for later use by the scraper.
        """
        congress = Organization("United States Congress",
                                classification='legislature')
        self._legislature = congress
        yield congress

        upper = Organization(
            name="United States Senate",
            classification='upper',
            parent_id=congress._id,
        )
        self._senate = upper
        yield upper

        lower = Organization(
            name="United States House",
            classification='lower',
            parent_id=congress._id,
        )
        self._house = lower
        yield lower

        # Senate Office of Public Record: handles lobbying disclosures.
        record_office = Organization(
            name="Office of Public Record, US Senate",
            classification="office",
            parent_id=upper._id,
        )
        record_office.add_contact_detail(type="voice", value="202-224-0322")
        record_office.add_source(
            url="http://www.senate.gov/pagelayout/legislative/"
                "one_item_and_teasers/opr.htm",
            note="Profile page")
        record_office.add_source(
            url="http://www.senate.gov/pagelayout/legislative/"
                "g_three_sections_with_teasers/lobbyingdisc.htm"
                "#lobbyingdisc=lda",
            note="Disclosure Home")
        record_office.add_link(
            url="http://soprweb.senate.gov/index.cfm?event=selectfields",
            note="Disclosure Search Portal")
        record_office.add_link(
            url="http://soprweb.senate.gov/",
            note="Disclosure Electronic Filing System")
        self._sopr = record_office
        yield record_office

        clerk = Organization(
            name="Office of the Clerk, US House",
            classification="office",
            parent_id=lower._id,
        )
        clerk.add_contact_detail(type="voice", value="202-225-7000")
        clerk.add_source(url="http://clerk.house.gov/", note="Home page")
        self._house_clerk = clerk
        yield clerk

        # NOTE(review): the legislature is deliberately yielded a second
        # time here, matching the original flow — confirm downstream
        # deduplicates it.
        yield congress
Ejemplo n.º 41
0
    def get_organizations(self):
        """Yield Saint Paul government bodies and the mayor.

        Produces the executive (with Mayor and City Clerk posts), the city
        council with one post per ward (1-7), Mayor Melvin Carter, and a
        committee Organization for each distinct non-council meeting found on
        the city calendar over the module-level ``date_range``.
        """
        city = Organization('City of Saint Paul', classification='executive')
        city.add_post(
            'Mayor',
            'Mayor',
            division_id='ocd-division/country:us/state:mn/place:st_paul')
        city.add_post(
            'City Clerk',
            'City Clerk',
            division_id='ocd-division/country:us/state:mn/place:st_paul')
        yield city

        council = Organization(name="Saint Paul City Council",
                               classification="legislature",
                               parent_id=city)
        # Saint Paul has seven wards, numbered 1 through 7.
        for ward in range(1, 8):
            council.add_post(
                "Ward {}".format(ward),
                "Councilmember",
                division_id='ocd-division/country:us/state:mn/place:st_paul'
                            '/ward:{}'.format(ward))

        yield council

        carter = Person(name="Melvin Carter")
        carter.add_term('Mayor',
                        'executive',
                        start_date=dtdate(2018, 1, 19),
                        appointment=True)
        carter.add_source('http://www.google.com')
        yield carter

        # Walk the public calendar day by day, collecting distinct meetings
        # that are neither council sessions, legislative hearings, nor
        # holidays.
        new_meetings = []
        seen_names = []
        for date in date_range:
            print('Checking date:', date)
            root = requests.get("https://www.stpaul.gov/calendar/" + date)
            base = html.fromstring(root.text)
            items = base.xpath('.//*/div[@class="view-content"]/div')

            meetings = []
            for item in items:
                # Evaluate the date xpath once; skip items without a date.
                dates = item.xpath(
                    './/*/span[@class="date-display-single"]/text()')
                if not dates:
                    continue
                meetings.append({
                    'date': dates[0],
                    'info': item.xpath(
                        './/*/span[@class="field-content"]/a/text()')[0],
                    'link': "https://www.stpaul.gov" + item.xpath(
                        './/*/span[@class="field-content"]/a/@href')[0],
                })

            for m in meetings:
                r = requests.get(m['link'])
                b = html.fromstring(r.text)
                # Only keep meetings whose detail page actually has content.
                if not b.xpath('.//div[@class="node-content clearfix"]'):
                    continue
                # Skip council sessions, legislative hearings and holidays.
                if any(tag in m['info']
                       for tag in ('City Council', 'Legislative', 'Holiday')):
                    continue
                name = m['info'].replace('Meeting', '').replace(
                    ' - Cancelled', '').replace('Events', '').strip()
                if name not in seen_names:
                    seen_names.append(name)
                    m['name'] = name
                    new_meetings.append(m)

        print('Creating organizations')
        for m in new_meetings:
            print(m)
            cmt = Organization(name=m['name'],
                               classification='committee',
                               parent_id=city)
            cmt.add_source(m['link'])
            yield cmt
Ejemplo n.º 42
0
    def scrape(self):
        '''
        Scrape the web to create a dict with all active organizations.
        Then, we can access the correct URL for the organization detail page.
        '''
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'
        web_info = {}

        # Index web-scraped department info by organization name for the
        # committee pass below.
        for _, organizations in web_scraper.councilMembers():
            for organization, _, _ in organizations:
                organization_name = organization['Department Name'][
                    'label'].strip()
                organization_info = organization['Department Name']

                web_info[organization_name] = organization_info

        body_types = self.body_types()

        # The "Regular Board Meeting" body stands in for the board itself;
        # rename it so terms and memberships attach to the canonical name.
        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
        ]
        board_of_directors["BodyName"] = "Board of Directors"

        # Group each member's office (term) records by full name.
        terms = collections.defaultdict(list)
        for office in self.body_offices(board_of_directors):
            terms[office['OfficeRecordFullName']].append(office)

        members = {}
        for member, offices in terms.items():
            p = Person(member)

            for term in offices:
                role = term['OfficeRecordTitle']

                # Officer roles (e.g. Chair) become their own appointed
                # terms; plain member roles are normalized below.
                if role not in {'Board Member', 'non-voting member'}:
                    p.add_term(
                        role,
                        'legislature',
                        start_date=self.toDate(term['OfficeRecordStartDate']),
                        end_date=self.toDate(term['OfficeRecordEndDate']),
                        appointment=True)

                if role != 'Chief Executive Officer':
                    if role == 'non-voting member':
                        member_type = 'Nonvoting Board Member'
                        post = NONVOTING_POSTS.get(member)
                    else:
                        member_type = 'Board Member'
                        post = VOTING_POSTS.get(member)

                    start_date = self.toDate(term['OfficeRecordStartDate'])
                    end_date = self.toDate(term['OfficeRecordEndDate'])
                    board_membership = p.add_term(member_type,
                                                  'legislature',
                                                  district=post,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    # Flag acting members whose designation covers this term.
                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)

                    if acting_member_end_date and acting_member_end_date <= end_date:
                        board_membership.extras = {'acting': 'true'}

            # Each term contains first and last names. This should be the same
            # across all of a person's terms, so go ahead and grab them from the
            # last term in the array.
            p.family_name = term['OfficeRecordLastName']
            p.given_name = term['OfficeRecordFirstName']

            # Defensively assert that the given and family names match the
            # expected value.
            if member == 'Hilda L. Solis':
                # Given/family name does not contain middle initial.
                assert p.given_name == 'Hilda' and p.family_name == 'Solis'
            else:
                assert member == ' '.join([p.given_name, p.family_name])

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls

            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] in (
                    body_types['Committee'],
                    body_types['Independent Taxpayer Oversight Committee']):
                organization_name = body['BodyName'].strip()
                o = Organization(organization_name,
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                organization_info = web_info.get(organization_name, {})
                # NOTE(review): the fallback concatenates WEB_URL with an
                # absolute URL, which would produce a malformed
                # 'https://...https://...' string — confirm the intended
                # default is WEB_URL + '/Departments.aspx'.
                organization_url = organization_info.get(
                    'url', self.WEB_URL +
                    'https://metro.legistar.com/Departments.aspx')

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(organization_url, note='web')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']

                    # Anyone who is not a recognized committee officer is a
                    # (possibly nonvoting) Member.
                    if role not in BOARD_OFFICE_ROLES:
                        if role == 'non-voting member':
                            role = 'Nonvoting Member'
                        else:
                            role = 'Member'

                    person = office['OfficeRecordFullName']

                    # Temporarily skip committee memberships, e.g., for
                    # new board members. The content of this array is provided
                    # by Metro.
                    if person in PENDING_COMMITTEE_MEMBERS:
                        self.warning('Skipping {0} membership for {1}'.format(
                            organization_name, person))
                        continue

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member not on the board: create and
                        # source the Person from this office record.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    start_date = self.toDate(office['OfficeRecordStartDate'])
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                    membership = p.add_membership(organization_name,
                                                  role=role,
                                                  start_date=start_date,
                                                  end_date=end_date)

                    # Flag acting members on committee memberships too.
                    acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                        p.name)
                    if acting_member_end_date and acting_member_end_date <= end_date:
                        membership.extras = {'acting': 'true'}

                yield o

        for p in members.values():
            yield p
Ejemplo n.º 43
0
    def scrape(self):
        """Yield NYC council members and committee Organizations.

        Merges Legistar API office records with member details scraped from
        the council's Legistar web site (district, party, photo, contacts),
        then builds committee Organizations and their memberships.
        """
        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        # Index the web-scraped member details by cleaned-up name.
        web_info = {}

        for member, _ in web_scraper.councilMembers():
            name = member['Person Name']['label'].strip()
            web_info[name] = member

        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'City Council'
        ]

        terms = collections.defaultdict(list)

        public_advocates = {  # Match casing to Bill De Blasio as council member
            'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
            'The Public Advocate (Ms. James)': 'Letitia James',
        }

        for office in self.body_offices(city_council):
            name = office['OfficeRecordFullName']
            name = public_advocates.get(name, name).strip()

            terms[name].append(office)

            # Add past members (and public advocates) missing from the web
            # member list; the defaultdict makes every detail lookup None.
            if name not in web_info:
                web_info[name] = collections.defaultdict(lambda: None)

        # Check that we have everyone we expect, formatted consistently, in
        # both information arrays. For instance, this will fail if we forget to
        # strip trailing spaces from names on one side or the other (which has
        # the effect of omitting information, such as post, from the scrape).
        assert set(web_info.keys()) == set(terms.keys())

        members = {}

        for member, offices in terms.items():

            p = Person(member)

            web = web_info[member]

            for term in offices:
                role = term['OfficeRecordTitle']

                if role == 'Public Advocate':
                    role = 'Non-Voting Council Member'
                else:
                    role = 'Council Member'

                # Districts are zero-padded on the web site ("District 01").
                district = web.get('District', '').replace(' 0', ' ')

                p.add_term(role,
                           'legislature',
                           district=district,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

                party = web.get('Political Party')

                if party == 'Democrat':
                    party = 'Democratic'

                if party:
                    p.add_party(party)

                if web.get('Photo'):
                    p.image = web['Photo']

                # Web column label -> (contact type, note) for contact details.
                contact_types = {
                    "City Hall Office": ("address", "City Hall Office"),
                    "City Hall Phone": ("voice", "City Hall Phone"),
                    "Ward Office Phone": ("voice", "Ward Office Phone"),
                    "Ward Office Address": ("address", "Ward Office Address"),
                    "Fax": ("fax", "Fax")
                }

                for contact_type, (type_, _note) in contact_types.items():
                    # Bug fix: the original wrote `web(contact_type)`, calling
                    # the dict (a TypeError) instead of indexing it — compare
                    # the working `web[contact_type] != 'N/A'` form used by
                    # the sibling Chicago scraper.
                    if web.get(contact_type) and web[contact_type] != 'N/A':
                        p.add_contact_detail(type=type_,
                                             value=web[contact_type],
                                             note=_note)

                if web.get('E-mail'):
                    p.add_contact_detail(type="email",
                                         value=web['E-mail']['url'],
                                         note='E-mail')

                if web.get('Web site'):
                    p.add_link(web['Web site']['url'], note='web site')

                if web.get('Notes'):
                    p.extras = {'Notes': web['Notes']}

                if not p.sources:  # Only add sources once
                    source_urls = self.person_sources_from_office(term)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

            members[member] = p

        committee_types = [
            'Committee', 'Inactive Committee', 'Select Committee',
            'Subcommittee', 'Task Force', 'Land Use'
        ]  # Committee on Land Use

        body_types = {
            k: v
            for k, v in self.body_types().items() if k in committee_types
        }

        for body in self.bodies():
            if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

                # Skip typo in API data
                if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                    continue

                parent_org = PARENT_ORGS.get(body['BodyName'],
                                             'New York City Council')

                body_name = body['BodyName']

                o = Organization(body_name,
                                 classification='committee',
                                 parent_id={'name': parent_org})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                    # 'Committee Member', None, 'CHAIRPERSON'

                    role = office['OfficeRecordTitle']

                    if role and role.lower() == 'chairperson':
                        role = 'Chairperson'
                    else:
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    person = public_advocates.get(person, person).strip()

                    if person in members:
                        p = members[person]
                    else:
                        # Committee member not on the council: create and
                        # source the Person from this office record.
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(o,
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in members.values():
            yield p
Ejemplo n.º 44
0
    def scrape(self):
        body_types = self.body_types()

        city_council, = [
            body for body in self.bodies()
            if body['BodyName'] == 'City Council'
        ]

        terms = collections.defaultdict(list)
        for office in self.body_offices(city_council):
            if 'VACAN' not in office['OfficeRecordFullName']:
                terms[office['OfficeRecordFullName'].strip()].append(office)

        web_scraper = LegistarPersonScraper(
            requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}
        for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
            web_info[member['Person Name']['label']] = member

        web_info['Balcer, James'] = collections.defaultdict(lambda: None)
        web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)
        web_info['Balcer, James']['Ward/Office'] = 11
        web_info['Fioretti, Bob']['Ward/Office'] = 2

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']
                p.add_term('Alderman',
                           'legislature',
                           district="Ward {}".format(int(web['Ward/Office'])),
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

            if web.get('Photo'):
                p.image = web['Photo']

            contact_types = {
                "City Hall Address": ("address", "City Hall Address"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if web[contact_type] and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web["E-mail"] and web["E-mail"][
                    "label"] and web["E-mail"]["label"] != 'N/A':
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['label'],
                                     note='E-mail')

            if web['Website']:
                p.add_link(web['Website']['url'])

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

            members[member] = p

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                for office in self.body_offices(body):
                    # messed up record for joanna thompson
                    if office['OfficeRecordId'] == 1055:
                        continue

                    role = office['OfficeRecordTitle']
                    if role not in ("Vice Chair", "Chairman"):
                        role = 'Member'

                    person = office['OfficeRecordFullName'].strip()
                    if person in members:
                        p = members[person]
                    else:
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(body['BodyName'],
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Joint Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                             note='api')
                o.add_source(
                    self.WEB_URL +
                    '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                    format(**body),
                    note='web')

                yield o

        for p in members.values():
            yield p
Ejemplo n.º 45
0
    def scrape_committee(self, committee_id):
        """Migrate one legacy Open States committee record into a pupa
        Organization.

        Every field is popped off the API response as it is consumed so the
        final ``assert not old`` proves nothing was silently dropped.
        Returns the new Organization (also indexed in ``self._committees``
        under each of its known ids).
        """
        old = self.api('committees/' + committee_id + '?')
        # Renamed from `id` to stop shadowing the builtin.  NOTE: the
        # all_ids loop below deliberately rebinds this name, so the member
        # roles recorded at the bottom use whichever id that loop leaves
        # last — this preserves the original behavior exactly.
        ost_id = old.pop('id')
        # Bookkeeping fields with no pupa equivalent.
        old.pop('created_at')
        old.pop('updated_at')
        old.pop('country', None)
        old.pop('level', None)
        old.pop('state')
        old.pop('votesmart_id', None)
        old.pop('+short_name', None)
        old.pop('+session', None)
        old.pop('+az_committee_id', None)

        com = old.pop('committee')
        sub = old.pop('subcommittee')
        parent_id = old.pop('parent_id')
        chamber = old.pop('chamber')
        if chamber == 'joint':
            chamber = ''
        # Unicameral jurisdictions use a single 'legislature' chamber.
        if self.state in ('ne', 'dc'):
            chamber = 'legislature'

        if sub:
            if parent_id:
                # Attach the subcommittee beneath its already-migrated parent.
                parent = self._committees[parent_id]._id
                new = Organization(sub,
                                   parent_id=parent,
                                   classification='committee')
            else:
                # Parent unknown: fold the parent name into the title.
                new = Organization(com + ': ' + sub,
                                   chamber=chamber,
                                   classification='committee')
        else:
            new = Organization(com,
                               chamber=chamber,
                               classification='committee')
            assert parent_id is None

        # all_ids: register every alias id and index the committee by each.
        for ost_id in old.pop('all_ids'):
            new.add_identifier(ost_id, scheme='openstates')
            self._committees[ost_id] = new

        # sources
        for source in old.pop('sources'):
            new.add_source(**source)

        # members: stash (leg_id, com_id, role, start, end) tuples for
        # resolution once people have been scraped.
        start, end = self.get_term_years()
        for role in old.pop('members'):
            if role['leg_id']:
                self._roles.add((role['leg_id'], ost_id, role['role'], start, end))

        # Everything else is preserved verbatim in extras (the '+' prefix
        # marked ad-hoc billy fields; it is stripped from the extras key).
        to_extras = [
            '+twitter',
            '+description',
            '+code',
            '+secretary',
            '+office_hours',
            '+office_phone',
            '+meetings_info',
            '+status',
            '+aide',
            '+contact_info',
            '+comm_type',
            'comm_type',
            'aide',
            'contact_info',
            '+town_represented',
            '+action_code',
        ]
        for k in to_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        assert not old, old.keys()

        return new
Ejemplo n.º 46
0
    def scrape_chamber(self, chamber):
        """Scrape committee rosters for one Texas legislative chamber.

        Walks the chamber's committee listing page, then each committee's
        profile page, yielding an Organization per committee with its
        members attached.
        """
        listing_urls = {
            "lower":
            "https://capitol.texas.gov/Committees/"
            "CommitteesMbrs.aspx?Chamber=H",
            "upper":
            "https://capitol.texas.gov/Committees/"
            "CommitteesMbrs.aspx?Chamber=S",
        }

        listing_url = listing_urls[chamber]
        listing_page = self.lxmlize(listing_url)

        committee_links = self.get_nodes(
            listing_page, '//form[@id="ctl00"]//a[@id="CmteList"]')

        for link in committee_links:
            committee = Organization(name=link.text.strip(),
                                     chamber=chamber,
                                     classification="committee")

            # Follow through to the committee's own profile page.
            profile_url = link.get("href")
            profile_page = self.lxmlize(profile_url)

            # The second table under #content holds the membership rows.
            membership_table = self.get_node(profile_page,
                                             '//div[@id="content"]//table[2]')
            if membership_table is not None:
                # First row carries only headers, so it is skipped.
                for row in self.get_nodes(membership_table, "./tr")[1:]:
                    label = self.get_node(row, "./td[1]//text()")

                    if label:
                        label = label.strip().rstrip(":")

                    # Both kinds of chair rows are recorded as "chair".
                    if label in ("Chair", "Vice Chair"):
                        role = "chair"
                    else:
                        role = "member"

                    raw_name = self.get_node(row, "./td[2]/a/text()")

                    # Strip the chamber-specific honorific prefix.
                    if chamber == "upper":
                        name = re.sub(r"^Sen\.[\s]*", "", raw_name)
                    elif chamber == "lower":
                        name = re.sub(r"^Rep\.[\s]*", "", raw_name)

                    # Collapse internal runs of whitespace.
                    name = re.sub(r"[\s]{2,}", " ", name).strip()

                    committee.add_member(name, role)

            committee.add_source(listing_url)
            committee.add_source(profile_url)

            yield committee
Ejemplo n.º 47
0
    def scrape(self):
        """Scrape New York City Council members and their committees.

        Members may appear in the Legistar listing multiple times (one row
        per term); consecutive terms in the same district are merged into a
        single span. Committees are accumulated as a side effect of member
        rows and yielded after all people.
        """
        noncommittees = {'Committee of the Whole'}
        committee_d = {}

        people_d = {}

        # Group the listing rows by the member's detail-page URL; rows
        # without a URL are skipped entirely.
        for councilman, committees in self.councilMembers():
            if 'url' in councilman['Person Name']:
                councilman_url = councilman['Person Name']['url']

                if councilman_url in people_d:
                    people_d[councilman_url][0].append(councilman)
                else:
                    people_d[councilman_url] = [councilman], committees

        for person_entries, committees in people_d.values():
            # Most-recent row wins for singleton attributes (photo, party,
            # email, notes).
            councilman = person_entries[-1]

            p = Person(councilman['Person Name']['label'])

            # Disambiguate from another person with the same name.
            if p.name == 'Letitia James':
                p.name = 'Letitia Ms. James'
                p.add_name('Letitia James')

            # One (start, end, district) span per listing row.
            spans = [(self.toTime(entry['Start Date']).date(),
                      self.toTime(entry['End Date']).date(),
                      entry['District'])
                     for entry in person_entries]

            # Merge back-to-back spans in the same district into one term.
            merged_spans = []
            last_end_date = None
            last_district = None
            for start_date, end_date, district in sorted(spans):
                if last_end_date is None:
                    span = [start_date, end_date, district]
                elif (start_date - last_end_date) == datetime.timedelta(1) and district == last_district:
                    span[1] = end_date
                else:
                    merged_spans.append(span)
                    span = [start_date, end_date, district]

                last_end_date = end_date
                last_district = district

            merged_spans.append(span)

            for start_date, end_date, district in merged_spans:
                # Normalize districts like "District 05" -> "District 5".
                # BUG FIX: previously this line clobbered the span's own
                # district with the latest row's district, losing per-term
                # district history that the merge above exists to keep.
                district = district.replace(' 0', ' ')
                # 2017-12-31 is Legistar's "current term" sentinel.
                if end_date == datetime.date(2017, 12, 31):
                    end_date = ''
                else:
                    end_date = end_date.isoformat()
                p.add_term('Council Member', 'legislature',
                           district=district,
                           start_date=start_date.isoformat(),
                           end_date=end_date)

            party = councilman['Political Party']
            if party == 'Democrat':
                party = 'Democratic'

            if party:
                p.add_party(party)

            if councilman['Photo']:
                p.image = councilman['Photo']

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['url'],
                                     note='E-mail')

            if councilman['Web site']:
                p.add_link(councilman['Web site']['url'], note='web site')

            p.extras = {'Notes': councilman['Notes']}

            p.add_source(councilman['Person Name']['url'], note='web')

            # Attach committee memberships, creating each committee once.
            for committee, _, _ in committees:
                committee_name = committee['Department Name']['label']
                if committee_name not in noncommittees and 'committee' in committee_name.lower():
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        parent_id = PARENT_ORGS.get(committee_name,
                                                    'New York City Council')
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name': parent_id})
                        o.add_source(committee['Department Name']['url'])
                        committee_d[committee_name] = o

                    membership = o.add_member(p, role=committee["Title"])
                    membership.start_date = self.mdY2Ymd(committee["Start Date"])
            yield p

        # Yield committees before subcommittees so parents exist first.
        for o in committee_d.values():
            if 'Committee' in o.name:
                yield o

        for o in committee_d.values():
            if 'Subcommittee' in o.name:
                yield o

        # These two bodies never show up in member rows, so add them
        # explicitly.
        o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                         classification='committee',
                         parent_id={'name': 'New York City Council'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o

        o = Organization('Subcommittee on Drug Abuse',
                         classification='committee',
                         parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
        o.add_source("http://legistar.council.nyc.gov/Departments.aspx")

        yield o
Ejemplo n.º 48
0
    def scrape(self):
        """Scrape the Board of Directors and its committees.

        Board members get district posts from the VOTING_POSTS /
        NONVOTING_POSTS lookup tables. Committee participants who are not
        board members ("adjunct" members) are tracked separately and
        yielded at the end.
        """
        body_types = self.body_types()

        board_of_directors, = [
            body for body in self.bodies()
            if body['BodyName'] == 'Board of Directors'
        ]

        # Group each member's office records by full name.
        members = {}
        for office in self.body_offices(board_of_directors):
            members.setdefault(office['OfficeRecordFullName'],
                               []).append(office)

        for member, offices in members.items():
            p = Person(member)
            for term in offices:
                role = term['OfficeRecordTitle']

                # Only two roles exist; the post lookup differs by role.
                if role != 'non-voting member':
                    role = 'Board Member'
                    post = VOTING_POSTS.get(member)
                else:
                    role = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)

                # BUG FIX: dates must come from the current `term`, not
                # from `office` (a stale loop variable left over from the
                # grouping loop above, which gave every term the dates of
                # the last record scanned).
                p.add_term(role,
                           'legislature',
                           district=post,
                           start_date=self.toDate(
                               term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']))

            legistar_api = self.BASE_URL + '/OfficeRecords/'

            p.add_source(legistar_api, note='api')

            yield p

        adjunct_members = {}

        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name': 'Board of Directors'})

                o.add_source(self.BASE_URL + '/Bodies/')

                for office in self.body_offices(body):
                    role = office['OfficeRecordTitle']
                    if role not in ("Chair", "Vice Chair"):
                        role = 'Member'

                    person = office['OfficeRecordFullName']
                    if person not in members:
                        # Committee participant who is not a board member.
                        if person not in adjunct_members:
                            p = Person(person)
                            # FIXME: 'foo' is a placeholder source left
                            # from development; it should point at a real
                            # Legistar URL for this person.
                            p.add_source('foo')

                        else:
                            p = adjunct_members[person]

                        p.add_membership(body['BodyName'],
                                         role=role,
                                         start_date=self.toDate(
                                             office['OfficeRecordStartDate']),
                                         end_date=self.toDate(
                                             office['OfficeRecordEndDate']))
                        adjunct_members[person] = p
                    else:
                        o.add_member(office['OfficeRecordFullName'],
                                     role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))

                yield o

        for p in adjunct_members.values():
            yield p
Ejemplo n.º 49
0
    def scrape_session(self, session, chambers):
        """Scrape committees (and their subcommittees) for one session.

        Committees are cached across sessions in ``self.ctty_cache`` keyed
        by (type, code); subcommittees are cached per parent committee.
        """
        sid = SESSION_SITE_IDS[session]
        committees = backoff(self.cservice.GetCommitteesBySession, sid)

        # if committees.strip() == "":
        #    return  # If we get here, it's a problem.
        # Commenting this out for future debugging. - PRT

        if str(committees).strip() == "":
            raise ValueError("Error: No committee data for sid: %s" % (sid))

        committees = committees['CommitteeListing']
        for committee in committees:
            cid = committee['Id']
            committee = backoff(self.cservice.GetCommittee, cid)
            subctty_cache = {}

            comname, typ, guid, code, description = [
                committee[x]
                for x in ['Name', 'Type', 'Id', 'Code', 'Description']
            ]
            comchamber = {
                "House": "lower",
                "Senate": "upper",
                "Joint": "joint"
            }[typ]
            ctty_key = '{}-{}'.format(typ, code)
            if ctty_key not in self.ctty_cache:
                ctty = Organization(chamber=comchamber,
                                    name=comname,
                                    classification='committee')
                ctty.extras = {
                    'code': code,
                    'guid': guid,
                    'description': description,
                }
                self.ctty_cache[ctty_key] = ctty
            else:
                # BUG FIX: on a cache hit, `ctty` was never assigned, so
                # members were attached to the previous iteration's
                # committee (or a NameError was raised if the very first
                # committee was already cached).
                ctty = self.ctty_cache[ctty_key]

            members = committee['Members']['CommitteeMember']
            for member in members:
                name = "{First} {Last}".format(
                    **dict(member['Member']['Name']))
                role = member['Role']
                membership = ctty.add_member(name, role)
                membership.extras = {'guid': member['Member']['Id']}
                subcoms = member['SubCommittees'] or []
                for subcom in subcoms:
                    subcom = subcom[1][0]
                    subguid = subcom['Id']
                    subcommittee = subcom['Name']
                    if subcommittee in subctty_cache:
                        # Add member to existing subcommittee.
                        subctty = subctty_cache[subcommittee]
                    else:
                        # Create subcommittee.
                        subctty = Organization(name=subcommittee,
                                               classification='committee',
                                               parent_id={
                                                   'classification':
                                                   comchamber,
                                                   'name': comname
                                               })
                        subctty.extras = {
                            'guid': subguid,
                        }
                        subctty.add_source(self.csource)
                        subctty.add_source(
                            CTTIE_URL.format(**{
                                "sid": sid,
                                "cttie": guid,
                            }))
                        subctty_cache[subcommittee] = subctty
                    membership = subctty.add_member(name, role)
                    membership.extras = {'guid': member['Member']['Id']}

            for subctty in subctty_cache.values():
                yield subctty

            ctty.add_source(self.csource)
            ctty.add_source(CTTIE_URL.format(**{
                "sid": sid,
                "cttie": guid,
            }))
            yield ctty
Ejemplo n.º 50
0
    def scrape(self, session=None):
        """Scrape Vermont committees for *session* (default: the latest).

        Pulls the full committee list from the legislature's private JSON
        endpoint and yields one Organization per committee, with members
        and their roles attached.
        """
        if session is None:
            session = self.latest_session()
        year_slug = session[5:]

        # The private API dumps every committee for the session at once.
        committee_dump_url = (
            'http://legislature.vermont.gov/committee/loadList/{}/'
            .format(year_slug))
        committees = json.loads(self.get(committee_dump_url).text)['data']

        for raw in committees:
            # Strings arrive with stray whitespace; normalize them all.
            info = {key: value.strip() for key, value in raw.items()}

            committee_type = info['CommitteeType']
            committee_name = info['CommitteeName']

            # Map the committee type (and, for study bodies and
            # commissions, the name prefix) onto a chamber.
            if committee_type == 'House Standing':
                chamber = 'lower'
            elif committee_type == 'Senate Standing':
                chamber = 'upper'
            elif committee_type == 'Joint Committee':
                chamber = 'joint'
            elif committee_type in ('Study Committee', 'Commissions'):
                if committee_name.startswith("House"):
                    chamber = 'lower'
                elif committee_name.startswith("Senate"):
                    chamber = 'upper'
                else:
                    chamber = 'joint'
            else:
                raise AssertionError(
                    "Unknown committee type found: '{}'".format(
                        committee_type))

            comm = Organization(name=committee_name,
                                chamber=chamber,
                                classification='committee')

            # The member list arrives as HTML; strip tags and drop blanks.
            tag_pattern = r'<.*?>'
            raw_members = (re.sub(tag_pattern, '', chunk)
                           for chunk in info['Members'].split('</br>'))
            members = [m.strip() for m in raw_members if m.strip()]

            for member in members:
                # Remove legislative titles; committee assistants lack them.
                if member.startswith("Rep. "):
                    member = member[len("Rep. "):]
                elif member.startswith("Sen. "):
                    member = member[len("Sen. "):]
                else:
                    self.info("Non-legislator member found: {}".format(member))

                # A trailing ", <role>" marks chairs etc.; everyone else
                # is a plain member.
                if ',' in member:
                    (member, role) = [part.strip()
                                      for part in member.split(',')]
                    if 'jr' in role.lower() or 'sr' in role.lower():
                        raise AssertionError(
                            "Name suffix confused for a committee role")
                else:
                    role = 'member'

                comm.add_member(name_or_person=member, role=role)

            comm.add_source(committee_dump_url)

            yield comm
Ejemplo n.º 51
0
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        """Scrape one committee page, recursing into its subcommittees.

        Already-seen (chamber, name, subcommittee) triples are deduped via
        ``self._seen``; empty committees are logged and not yielded.
        """
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            subcommittee = page.xpath(xpath)
            if subcommittee:
                subcommittee = page.xpath(xpath).pop(0)
                subcommittee = self._fix_committee_name(
                    subcommittee, parent=name, subcommittee=True)
                subcommittee = self._fix_committee_case(subcommittee)
            else:
                subcommittee = None

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Organization(chamber=chamber, name=name, classification='committee')
        comm.add_source(url)

        member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')

        for member_node in member_nodes:
            # Skip empty rows.
            if member_node.attrib['class'] == 'dxgvEmptyDataRow':
                continue

            mtype = member_node.xpath('string(td[1])').strip()

            if not mtype:
                mtype = 'member'

            member = member_node.xpath('string(td[3])').split()

            # Drop the leading title token (e.g. "Rep."/"Sen.").
            member = ' '.join(member[1:])

            comm.add_member(member, role=mtype)

        for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                            '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = a.get('href').replace('../', '/')
            # BUG FIX: this method is a generator, so the recursive call
            # must be delegated with `yield from`; the previous bare call
            # produced a generator object that was discarded, meaning
            # subcommittees were never scraped or yielded.
            yield from self.scrape_committee(chamber, name, sub_url,
                                             subcommittee=sub_name)

        if not comm._related:
            if subcommittee:
                self.warning('Not saving empty subcommittee {}.'.format(
                    subcommittee))
            else:
                self.warning('Not saving empty committee {}.'.format(name))
        else:
            yield comm
Ejemplo n.º 52
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Parse committee assignments out of a PDF roster.

        Yields one Organization per committee that has at least one member.
        Subcommittee sections are currently skipped.
        """
        # The 2015 House PDF is malformed and needs special handling.
        if chamber == "lower" and year == "2015":
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type="text-nolayout").decode()

        # Re-join committee names the PDF wraps across lines so they match
        # on a single line.
        for hotgarbage, replacement in (
            (
                r"Judicial Branch, Law Enforcement,\s+and\s+Justice",
                "Judicial Branch, Law Enforcement, and Justice",
            ),
            (
                r"Natural Resources and\s+Transportation",
                "Natural Resources and Transportation",
            ),
            (
                r"(?u)Federal Relations, Energy,?\s+and\s+Telecommunications",
                "Federal Relations, Energy, and Telecommunications",
            ),
        ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: "Agriculture" not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents
            line = line.replace(" ", " ").replace("‐", "-")

            if "Subcommittees" in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                # Flush the previous committee if it gathered any members.
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(
                    name=committee, chamber=chamber, classification="committee"
                )

                comm.add_source(url)

            elif is_legislator_name(line):
                name, party = line.rsplit("(", 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                # Role abbreviations trail the party in parentheses.
                if re.search(" Ch", party):
                    role = "chair"
                elif " VCh" in party:
                    role = "vice chair"
                elif " MVCh" in party:
                    role = "minority vice chair"
                else:
                    role = "member"
                comm.add_member(name, role)

        # BUG FIX: guard against no committee having been found at all
        # (`comm` may still be None), which previously raised
        # AttributeError here.
        if comm is not None and comm._related:
            yield comm
Ejemplo n.º 53
0
    def scrape(self):
        """Scrape Pittsburgh City Council members and committees.

        Term data comes from the Legistar API; email addresses come from a
        parallel web scrape (the API does not expose them). Committees are
        yielded first, then every person.
        """
        body_types = self.body_types()
        city_council, = [body for body in self.bodies()
                         if body["BodyName"] == "City Council"]
        terms = collections.defaultdict(list)

        # Collect each member's term records, skipping vacancy placeholders.
        for office in self.body_offices(city_council):
            if "VACAN" not in office["OfficeRecordFullName"]:
                terms[office["OfficeRecordFullName"].strip()].append(office)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
        web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        web_info = {}
        for member in web_scraper.councilMembers():
            web_info[member["Person Name"]] = member

        members = {}
        for member, offices in terms.items():
            person = Person(member)
            for term in offices:
                person.add_term("Councilmember",
                                "legislature",
                                start_date = self.toDate(term["OfficeRecordStartDate"]),
                                end_date = self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                        value=web["E-mail"]["label"],
                                        note="E-mail")

            # `term` is the member's last office record; its detail API
            # response carries the address/phone/website fields.
            person_source_data = self.person_sources_from_office(term)
            person_api_url, person_api_response = person_source_data
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", " + person_api_response["PersonCity1"]
                          + ", " + person_api_response["PersonState1"] + " " + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                    value=address,
                                    note="Office address")

            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                    value=person_api_response["PersonPhone"],
                                    note="Office phone")

            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                    value=person_api_response["PersonWWW"],
                                    note="District website")

            members[member] = person


        for body in self.bodies():
            if body["BodyTypeId"] == body_types["Committee"]:
                body_name_clean = body["BodyName"].strip()
                organization = Organization(body_name_clean,
                             classification="committee",
                             parent_id={"name" : "Pittsburgh City Council"})

                organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

                for office in self.body_offices(body):
                    role = office["OfficeRecordMemberType"]
                    # Everything besides chairs is a plain member.  (The
                    # former `or role == "Councilmember"` clause was dead
                    # code: "Councilmember" is never in the tuple.)
                    if role not in ("Vice Chair", "Chair"):
                        role = "Member"

                    member_name = office["OfficeRecordFullName"].strip()
                    if member_name in members:
                        person = members[member_name]
                    else:
                        # BUG FIX: register committee-only people so their
                        # memberships are actually emitted; previously
                        # these Person objects were built and then
                        # silently discarded.
                        person = Person(member_name)
                        person_api_url, _ = self.person_sources_from_office(office)
                        person.add_source(person_api_url, note="api")
                        members[member_name] = person

                    person.add_membership(body_name_clean,
                                     role=role,
                                     start_date = self.toDate(office["OfficeRecordStartDate"]),
                                     end_date = self.toDate(office["OfficeRecordEndDate"]))

                yield organization

        for person in members.values():
            yield person
Ejemplo n.º 54
0
    def scrape(self):
        """Scrape mayors of Newfoundland and Labrador municipalities.

        Downloads the Municipal Directory PDF linked from the council
        page, converts it to text with ``pdftotext -layout``, and parses
        the fixed-width columns whose offsets are discovered from the
        header row of each page.  Yields one Organization per municipal
        council and one Person (its mayor) per municipality.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        url = page.xpath(
            '//a[contains(text(),"Municipal Directory")]/@href')[0]

        # The PDF is binary data, so the temp file must be opened in
        # binary mode ('w' corrupts the payload on some platforms and
        # raises TypeError for a bytes response); a with-block also
        # guarantees the handle is closed.
        response = urlopen(url).read()
        with open('/tmp/nl.pdf', 'wb') as pdf:
            pdf.write(response)

        try:
            # NOTE(review): check_output returns bytes on Python 3; the
            # str split below implies this runs under Python 2 — confirm.
            data = subprocess.check_output(
                ['pdftotext', '-layout', '/tmp/nl.pdf', '-'])
            pages = data.split('Municipal Directory')[1:]
            for page in pages:
                page = page.splitlines(True)
                # The directory is a fixed-width table: derive each
                # column's slice offsets from where the header labels
                # start on this page's header line.
                column_index = {}
                for line in page:
                    if 'Official Name' in line:
                        column_index['dist_end'] = re.search('Region',
                                                             line).start()
                        column_index['name_start'] = re.search('Mayor',
                                                               line).start() + 1
                        column_index['name_end'] = re.search('Clerk',
                                                             line).start() - 1
                        column_index['phone_start'] = re.search('Line 1',
                                                                line).start()
                        column_index['phone_end'] = re.search('Line 2',
                                                              line).start() - 1
                        column_index['fax_start'] = re.search('Fax', line).start()
                        column_index['fax_end'] = re.search('E-mail',
                                                            line).start() - 2
                        column_index['email_start'] = column_index['fax_end'] + 1
                        column_index['email_end'] = re.search('Address',
                                                              line).start() - 1
                        column_index[
                            'address_start'] = column_index['email_end'] + 1
                        column_index['address_end'] = re.search('Days',
                                                                line).start() - 1
                        break
                for line in page:
                    # Skip the header row and blank lines.
                    if 'Official Name' in line or not line.strip():
                        continue
                    district = line[:column_index['dist_end']]
                    name = line[column_index['name_start']:
                                column_index['name_end']].strip()
                    phone = line[column_index['phone_start']:
                                 column_index['phone_end']].strip().replace(
                                     '(', '').replace(') ', '-')
                    # fax = line[column_index['fax_start']:column_index['fax_end']].strip().replace('(', '').replace(') ', '-')
                    email = line[column_index['email_start']:
                                 column_index['email_end']].strip()
                    address = line[column_index['address_start']:
                                   column_index['address_end']].strip()
                    # Collapse runs of padding spaces into ", " separators.
                    address = re.sub(r'\s{2,}', ', ', address)
                    if not name or not district:
                        continue

                    org = Organization(
                        name=district + ' Municipal Council',
                        classification='legislature',
                        jurisdiction_id=self.jurisdiction.jurisdiction_id)
                    org.add_source(COUNCIL_PAGE)
                    org.add_source(url)
                    yield org

                    p = Person(primary_org='legislature',
                               name=name,
                               district=district)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(url)
                    membership = p.add_membership(org,
                                                  role='Mayor',
                                                  district=district)
                    if phone:
                        membership.add_contact_detail('voice', phone,
                                                      'legislature')
                    # I'm excluding fax because that column isn't properly aligned
                    # if fax:
                    #   membership.add_contact_detail('fax', fax)
                    if email:
                        membership.add_contact_detail('email', email)
                    if address:
                        membership.add_contact_detail('address', address,
                                                      'legislature')
                    yield p
        finally:
            # Always clean up the temp file, even if parsing fails;
            # os.remove avoids shelling out (was os.system('rm ...')).
            os.remove('/tmp/nl.pdf')
Ejemplo n.º 55
0
    def scrape(self, chamber=None):
        """Scrape New Mexico legislative committees.

        :param chamber: optional; one of 'upper', 'lower' or
            'legislature'.  When omitted, all three are scraped.

        Yields one Organization per committee with its members; interim
        (joint) committee member roles are prefixed with 'interim'.
        """
        if chamber:
            chambers = [chamber]
        else:
            chambers = ['upper', 'lower', 'legislature']

        # Xpath query string format for legislative chamber committee urls
        base_xpath = (
            '//table[@id="MainContent_gridView{0}Committees"]//a'
            '[contains(@id, "MainContent_gridView{1}Committees_link'
            '{2}Committee")]/@href')
        chamber_paths = {
            'upper': {'url': '{}Senate_Standing'.format(base_url),
                      'chamber_xpath': base_xpath.format('Senate',
                                                         'Senate',
                                                         'Senate')},
            'lower': {'url': '{}House_Standing'.format(base_url),
                      'chamber_xpath': base_xpath.format('House',
                                                         'House',
                                                         'House')},
            'legislature': {'url': '{}Interim'.format(base_url),
                            'chamber_xpath': base_xpath.format('', '', '')}
        }

        for chamber in chambers:
            page = self.lxmlize(chamber_paths[chamber]['url'])

            committee_urls = self.get_nodes(page, chamber_paths[chamber]['chamber_xpath'])

            for committee_url in committee_urls:
                committee_page = self.lxmlize(committee_url)

                # The committee name is the last breadcrumb on the page.
                c_name = committee_page.xpath(
                    '//li/a[contains(@id, "siteMapBreadcrumbs_lnkPage_")]')[
                    -1].text_content().strip()

                if c_name:
                    members_xpath = ('//table[@id="MainContent_formView'
                                     'CommitteeInformation_grid'
                                     'ViewCommitteeMembers"]/tbody/tr')
                    member_nodes = self.get_nodes(committee_page, members_xpath)

                    # Fixed cell positions within each member row.
                    tds = {
                        'title': 0,
                        'name': 1,
                        'role': 3
                    }

                    members = []

                    for member_node in member_nodes:
                        m_title = member_node[tds['title']].text_content()
                        m_name = \
                            self.get_node(
                                member_node[tds['name']],
                                './/a[contains(@href, '
                                '"/Members/Legislator?SponCode=")]'
                            ).text_content()

                        role = member_node[tds['role']].text_content()

                        if m_title == 'Senator':
                            m_chamber = 'upper'
                        elif m_title == 'Representative':
                            m_chamber = 'lower'
                        else:
                            # Unknown titles (e.g. public members) carry
                            # no chamber.
                            m_chamber = None

                        if role in ('Chair', 'Co-Chair', 'Vice Chair',
                                    'Member', 'Advisory', 'Ranking Member'):
                            if chamber == 'legislature':
                                m_role = 'interim {}'.format(role.lower())
                            else:
                                m_role = role.lower()
                        else:
                            m_role = None

                        if m_role:
                            members.append(Member(name=m_name, role=m_role, chamber=m_chamber))

                    # Interim committees are collected during the scraping
                    # for joint committees, and most interim committees
                    # have members from both chambers. However, a small
                    # number of interim committees (right now, just 1) have
                    # only members from one chamber, so the chamber is set
                    # to their chamber instead of 'legislature' for those
                    # committees.
                    # BUG FIX: use a per-committee variable rather than
                    # overwriting the loop variable `chamber`, which
                    # previously corrupted the chamber (and the 'interim'
                    # role prefix) for every later committee in the pass.
                    org_chamber = chamber
                    if chamber == 'legislature':
                        m_chambers = set(
                            [mem.chamber for mem in members])
                        if len(m_chambers) == 1:
                            org_chamber = m_chambers.pop()
                    committee = Organization(name=clean_committee_name(c_name),
                                             chamber=org_chamber,
                                             classification='committee')
                    for member in members:
                        committee.add_member(member.name, member.role)
                    committee.add_source(committee_url)
                    if not committee._related:
                        self.warning(
                            'skipping blank committee {0} '
                            'at {1}'.format(c_name, committee_url))
                    else:
                        yield committee

                else:
                    self.warning('No legislative committee found at '
                                 '{}'.format(committee_url))
    def scrape_chamber(self, chamber):
        """Scrape South Carolina legislators for one chamber.

        :param chamber: 'lower' (House) or anything else (Senate).

        Yields each committee Organization once (on first sighting) and
        a Person for every non-resigned member, with capitol/district
        contact details and committee memberships attached.
        """
        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/member.php?chamber=H'
        else:
            url = 'http://www.scstatehouse.gov/member.php?chamber=S'

        # Committees are shared across members: cache each Organization
        # by name so it is yielded only once and reused for memberships.
        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[@class="membername"]'):
            full_name = a.text
            leg_url = a.get('href')

            # Strip the title prefix so only the bare name remains.
            if full_name.startswith('Senator'):
                full_name = full_name.replace('Senator ', '')
            if full_name.startswith('Representative'):
                full_name = full_name.replace('Representative ', '')

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            # Skip members whose profile page notes a resignation.
            if 'Resigned effective' in leg_html:
                self.info('Resigned')
                continue

            # The profile page carries party / district / counties in
            # identically-styled <p> tags; exactly three text nodes are
            # expected here (unpacking raises ValueError otherwise).
            party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px;'
                                               ' margin: 0 0 0 0; padding: 0;"]/text()')

            # NOTE(review): if neither substring matches, `party` keeps
            # the raw page text — confirm that is intended.
            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]
            try:
                photo_url = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')[0]
            except IndexError:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ''
            person = Person(name=full_name, district=district,
                            party=party, primary_org=chamber,
                            image=photo_url)

            # office address / phone
            # The page has no semantic markup, so both office blocks are
            # located by their exact inline style attributes.
            try:
                addr_div = leg_doc.xpath('//div[@style="float: left; width: 225px;'
                                         ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
                capitol_address = addr_div.xpath('p[@style="font-size: 13px;'
                                                 ' margin: 0 0 10px 0; padding: 0;"]'
                                                 )[0].text_content()

                phone = addr_div.xpath('p[@style="font-size: 13px;'
                                       ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                capitol_phone = phone.strip()

                if capitol_address:
                    person.add_contact_detail(type='address', value=capitol_address,
                                              note='Capitol Office')

                if capitol_phone:
                    person.add_contact_detail(type='voice', value=capitol_phone,
                                              note='Capitol Office')
            except IndexError:
                # Missing block is common enough to warrant only a warning.
                self.warning('no capitol address for {0}'.format(full_name))

            # home address / phone
            try:
                addr_div = leg_doc.xpath('//div[@style="float: left;'
                                         ' width: 225px; margin: 10px 0 0 20px;"]')[0]
                addr = addr_div.xpath('p[@style="font-size: 13px;'
                                      ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath('p[@style="font-size: 13px;'
                                       ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                if addr:
                    person.add_contact_detail(type='address', value=addr,
                                              note='District Office')

                if phone:
                    person.add_contact_detail(type='voice', value=phone,
                                              note='District Office')
            except IndexError:
                self.warning('no district address for {0}'.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
                # A trailing ", " on the link text signals a role suffix,
                # e.g. "Finance, Chairman".
                if com.text.endswith(', '):
                    committee, role = com.text_content().rsplit(', ', 1)

                    # known roles
                    # NOTE(review): an unrecognized abbreviation raises
                    # KeyError here — presumably deliberate so new roles
                    # surface loudly; confirm.
                    role = {'Treas.': 'treasurer',
                            'Secy.': 'secretary',
                            'Secy./Treas.': 'secretary/treasurer',
                            'V.C.': 'vice-chair',
                            '1st V.C.': 'first vice-chair',
                            'Co 1st V.C.': 'co-first vice-chair',
                            '2nd V.C.': 'second vice-chair',
                            '3rd V.C.': 'third vice-chair',
                            'Ex.Officio Member': 'ex-officio member',
                            'Chairman': 'chairman'}[role]
                else:
                    committee = com.text
                    role = 'member'

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee, classification='committee',
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person
Ejemplo n.º 57
0
    def scrape(self):
        """Scrape Chicago City Council members and committees.

        Yields a Person for every current member (aldermen, mayor and
        clerk), a Person per former alderman, and an Organization per
        current, former and joint committee.
        """
        committee_d = {}
        # Legistar "legislative bodies" that are not committees.
        non_committees = {
            'City Council', 'Office of the Mayor', 'Office of the City Clerk'
        }

        for councilman, committees in self.councilMembers():
            if councilman['Ward/Office'] == "":
                continue

            ward = councilman['Ward/Office']
            if ward not in {"Mayor", "Clerk"}:

                ward = "Ward {}".format(int(ward))
                role = "Alderman"
                p = Person(councilman['Person Name']['label'],
                           district=ward,
                           primary_org="legislature",
                           role=role)
            else:
                # BUG FIX: "Mayor"/"Clerk" rows previously fell through
                # without creating a Person, leaving `p` unbound on the
                # first such row or stale from the previous member, so
                # this row's photo/contacts were attached to the wrong
                # person.  These offices have no ward district.
                p = Person(councilman['Person Name']['label'],
                           primary_org="legislature",
                           role=ward)

            if councilman['Photo']:
                p.image = councilman['Photo']

            # Map Legistar field names to (contact type, note) pairs.
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if councilman[contact_type]:
                    p.add_contact_detail(type=type_,
                                         value=councilman[contact_type],
                                         note=_note)

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['label'],
                                     note='E-mail')

            if councilman['Website']:
                p.add_link(councilman['Website']['url'])
            p.add_source(councilman['Person Name']['url'], note='web')

            # Attach committee memberships, creating each committee
            # Organization the first time it is seen.
            for committee, _, _ in committees:
                committee_name = committee['Legislative Body']['label']
                if committee_name and committee_name not in non_committees:
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        o = Organization(
                            committee_name,
                            classification='committee',
                            parent_id={'name': 'Chicago City Council'})
                        o.add_source(committee['Legislative Body']['url'],
                                     note='web')
                        committee_d[committee_name] = o

                    o.add_member(p, role=committee["Title"])

            yield p

        for name, term in FORMER_ALDERMEN.items():
            p = Person(name=name,
                       primary_org="legislature",
                       start_date=term['term'][0],
                       end_date=term['term'][1],
                       district="Ward {}".format(term['ward']),
                       role='Alderman')
            # Chandler served a second, separate term.
            if name == 'Chandler, Michael D.':
                p.add_term('Alderman',
                           "legislature",
                           district="Ward {}".format(term['ward']),
                           start_date=datetime.date(2011, 5, 16),
                           end_date=datetime.date(2015, 5, 18))

            p.add_source(term['source'], note='web')
            yield p

        # Committees are yielded after members so memberships are complete.
        for o in committee_d.values():
            yield o

        for committee_name in FORMER_COMMITTEES:
            o = Organization(committee_name,
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o

        for joint_committee in JOINT_COMMITTEES:

            o = Organization(joint_committee,
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o
Ejemplo n.º 58
0
    def scrape_upper(self):
        """Scrape California Senate committees (standing, sub, joint,
        other) from the Senate committees index page.

        Yields one Organization per committee with members and roles.
        """
        # Retrieve index list of committees.
        url = "http://senate.ca.gov/committees"
        doc = self.lxmlize(url)

        standing_committees = doc.xpath(
            '//h2[text()="Standing Committees"]/../following-sibling::div//a'
        )
        sub_committees = doc.xpath(
            '//h2[text()="Sub Committees"]/../following-sibling::div//a'
        )
        joint_committees = doc.xpath(
            '//h2[text()="Joint Committees"]/../following-sibling::div//a'
        )
        other_committees = doc.xpath(
            '//h2[text()="Other"]/../following-sibling::div//a'
        )

        # Iterates over each committee [link] found.
        for committee in (
            standing_committees + sub_committees + joint_committees + other_committees
        ):
            # Get the text of the committee link, which should be the name of
            # the committee.
            (comm_name,) = committee.xpath("text()")

            org = Organization(
                chamber="upper", name=comm_name, classification="committee"
            )

            (comm_url,) = committee.xpath("@href")
            org.add_source(comm_url)
            comm_doc = self.lxmlize(comm_url)

            # NOTE(review): item assignment writes straight into the
            # model's underlying data dict — confirm the Organization
            # model supports this for 'chamber'/'committee'/'subcommittee'.
            if comm_name.startswith("Joint"):
                org["chamber"] = "legislature"
                org["committee"] = (
                    comm_name.replace("Joint ", "")
                    .replace("Committee on ", "")
                    .replace(" Committee", "")
                )

            if comm_name.startswith("Subcommittee"):
                # Parent committee name comes from the site banner,
                # e.g. "Senate Budget Committee" -> "Budget".
                (full_comm_name,) = comm_doc.xpath(
                    '//div[@class="banner-sitename"]/a/text()'
                )
                full_comm_name = re.search(
                    r"^Senate (.*) Committee$", full_comm_name
                ).group(1)
                org["committee"] = full_comm_name

                comm_name = re.search(r"^Subcommittee.*?on (.*)$", comm_name).group(1)
                org["subcommittee"] = comm_name

            # Special case of members list being presented in text blob.
            member_blob = comm_doc.xpath(
                'string(//div[contains(@class, "field-item") and '
                'starts-with(text(), "Senate Membership:")][1]/text()[1])'
            )

            if member_blob:
                # Separate senate membership from assembly membership.
                # This should strip the header from assembly membership
                # string automatically.
                delimiter = "Assembly Membership:\n"
                senate_members, delimiter, assembly_members = member_blob.partition(
                    delimiter
                )

                # Strip header from senate membership string.
                senate_members = senate_members.replace("Senate Membership:\n", "")

                # Clean membership strings.
                senate_members = senate_members.strip()
                assembly_members = assembly_members.strip()

                # Parse membership strings into lists.
                senate_members = senate_members.split("\n")
                assembly_members = assembly_members.split("\n")

                members = senate_members + assembly_members
            # Typical membership list format.
            else:
                members = comm_doc.xpath(
                    '//a[(contains(@href, "/sd") or '
                    'contains(@href, "assembly.ca.gov/a")) and '
                    '(starts-with(text(), "Senator") or '
                    'starts-with(text(), "Assembly Member"))]/text()'
                )

            for member in members:
                if not member.strip():
                    continue

                # Split "Senator Jane Doe (Chair) (D)" into name and
                # optional role, discarding the party suffix.
                (mem_name, mem_role) = re.search(
                    r"""(?ux)
                        ^(?:Senator|Assembly\sMember)\s  # Legislator title
                        (.+?)  # Capture the senator's full name
                        (?:\s\((.{2,}?)\))?  # There may be role in parentheses
                        (?:\s\([RD]\))?  # There may be a party affiliation
                        \s*$
                        """,
                    member,
                ).groups()
                org.add_member(mem_name, role=mem_role if mem_role else "member")

            if not org._related:
                self.warning("No members found for committee {}".format(comm_name))

            yield org
Ejemplo n.º 59
0
    def get_organizations(self):
        """Yield the LA Metro Board of Directors (with one post per
        board seat) and its two affiliated corporations."""
        board = Organization(name="Board of Directors",
                             classification="legislature")

        la_city_division = 'ocd-division/country:us/state:ca/place:los_angeles'

        board.add_post(
            'Mayor of the City of Los Angeles',
            'Board Member',
            division_id=la_city_division)

        # One seat per county supervisorial district.
        supervisor_division = ('ocd-division/country:us/state:ca/'
                               'county:los_angeles/council_district:{}')
        for district in range(1, 6):
            board.add_post(
                'Los Angeles County Board Supervisor, District {}'.format(district),
                'Board Member',
                division_id=supervisor_division.format(district))

        board.add_post(
            'Appointee of Mayor of the City of Los Angeles',
            'Board Member',
            division_id=la_city_division)

        board.add_post('Appointee of Governor of California',
                       'Nonvoting Board Member',
                       division_id='ocd-division/country:us/state:ca')

        # One seat per City Selection Committee sector appointee.
        sector_division = ('ocd-division/country:us/state:ca/'
                           'county:los_angeles/la_metro_sector:{}')
        sectors = (
            ('North County/San Fernando Valley', 'north_county_san_fernando_valley'),
            ('Southwest Corridor', 'southwest_corridor'),
            ('San Gabriel Valley', 'san_gabriel_valley'),
            ('Southeast Long Beach', 'southeast_long_beach'),
        )
        for sector_label, sector_slug in sectors:
            board.add_post(
                'Appointee of Los Angeles County City Selection Committee, '
                '{} sector'.format(sector_label),
                'Board Member',
                division_id=sector_division.format(sector_slug))

        # Leadership and executive seats carry no division.
        for seat in ('Chair', '1st Vice Chair', '2nd Vice Chair'):
            board.add_post(seat, seat)
        board.add_post("Chief Executive Officer", "Chief Executive Officer")

        yield board

        crenshaw = Organization(name="Crenshaw Project Corporation",
                                classification="corporation")
        crenshaw.add_source(
            'https://metro.legistar.com/DepartmentDetail.aspx?ID=32216&GUID=D790CC05-ACCB-451C-B576-2952090769F1'
        )
        yield crenshaw

        la_safe = Organization(name="LA SAFE", classification="corporation")
        la_safe.add_source(
            'https://metro.legistar.com/DepartmentDetail.aspx?ID=30222&GUID=5F27DA83-633F-4FEA-A4B0-0477551061B6&R=aef57793-1826-4cfa-b6e3-d6b42cf77527'
        )
        yield la_safe
Ejemplo n.º 60
0
    def scrape(self):
        """Scrape Indiana committees from the IGA API.

        Walks the paginated committee list for the latest session,
        skipping withdrawn and conference committees, and yields one
        Organization per committee (subcommittees are parented to the
        committee named in ``subcomms``).
        """
        session = self.latest_session()

        subcomms = self.get_subcommittee_info(session)

        api_base_url = "https://api.iga.in.gov"
        html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(
            session)
        client = ApiClient(self)
        r = client.get("committees", session=session)
        all_pages = client.unpaginate(r)
        for comm_info in all_pages:
            # this is kind of roundabout, but needed in order
            # to take advantage of all of our machinery to make
            # sure we're not overloading their api
            comm_link = comm_info["link"]
            comm_name = comm_link.split("/")[-1]
            if "withdrawn" in comm_name or "conference" in comm_name:
                continue
            try:
                comm_json = client.get("committee",
                                       committee_link=comm_link[1:])
            except HTTPError:
                self.logger.warning("Page does not exist")
                continue
            try:
                chamber = comm_json["chamber"]["name"]
            except KeyError:
                chamber = 'joint'
            else:
                if chamber == "Senate":
                    chamber = "upper"
                elif chamber == "House":
                    chamber = "lower"
                else:
                    raise AssertionError(
                        "Unknown committee chamber {}".format(chamber))

            name = comm_json["name"]
            try:
                owning_comm = subcomms[name]
            except KeyError:
                # Top-level committee.
                name = name.replace("Statutory Committee on", "").strip()
                comm = Organization(name=name,
                                    chamber=chamber,
                                    classification='committee')
                if name in subcomms.values():
                    # Avoid identification issues, if committee names are re-used
                    # between upper and lower chambers
                    assert self._parent_committees.get(name) is None
                    self._parent_committees[name] = comm
            else:
                # Subcommittee: parent to the already-scraped committee.
                name = name.replace("Statutory Committee on",
                                    "").replace("Subcommittee", "").strip()
                comm = Organization(
                    name=name,
                    parent_id=self._parent_committees[owning_comm],
                    classification='committee')

            chair = self.process_special_members(comm, comm_json, "chair")
            vicechair = self.process_special_members(comm, comm_json,
                                                     "viceChair")
            ranking = self.process_special_members(comm, comm_json,
                                                   "rankingMinMember")

            # leadership is also listed in membership
            # so we have to make sure we haven't seen them yet
            comm_members = [m for m in [chair, vicechair, ranking] if m]

            for mem in comm_json["members"]:
                mem_name = mem["firstName"] + " " + mem["lastName"]
                if mem_name not in comm_members:
                    comm_members.append(mem_name)
                    comm.add_member(mem_name)

            api_source = api_base_url + comm_link

            # BUG FIX: html_source was previously left over from the
            # prior iteration (or unbound on the first) whenever the
            # name lacked the "committee_" prefix; add it only when it
            # can actually be derived from this committee's name.
            if comm_name[:10] == "committee_":
                comm.add_source(html_base_url + comm_name[10:])
            comm.add_source(api_source)
            yield comm