Esempio n. 1
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees and save each one.

        The two original branches were identical except that the committee
        name may sit directly in the <h2> or inside a nested <a>, and the
        chair role was capitalized inconsistently; both are unified here.
        """
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"]'):
                # name is either the <h2> text or, failing that, the text
                # of the anchor nested inside it
                name = comm_names.xpath('h2')[0].text
                if name is None:
                    name = comm_names.xpath('h2/a')[0].text

                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        # lowercase for consistency across scrapers
                        role = 'chairperson'
                        # drop leading 'Sen. ' and trailing ', Chairperson'
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
Esempio n. 2
0
    def scrape_committee(self, name, url, chamber):
        """Fetch a committee page, collect its members, and save it."""
        com = Committee(chamber, name)
        com.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)

        for text in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
            member = text.replace('Representative ', '').replace('Senator ', '').strip()

            role = 'member'
            if ' (' in member:
                member, raw_role = member.split(' (')
                # most specific substring first so 'Vice-Chair' is not
                # swallowed by the plain 'Chair' test
                if 'Vice-Chair' in raw_role:
                    role = 'vice-chair'
                elif 'Co-Chair' in raw_role:
                    role = 'co-chair'
                elif 'Chair' in raw_role:
                    role = 'chair'
                else:
                    raise Exception('unknown role: %s' % raw_role)

            com.add_member(member, role)

        self.save_committee(com)
Esempio n. 3
0
    def scrape_reps_comm(self):
        """Scrape Maine House standing committees.

        Committee titles live in //body/center[n]/h1/a for odd n; the
        count-th top-level <ul> carries the matching member roster.
        Fixes the original's inconsistent 3/4-space indentation and strips
        whitespace left behind after truncating the member string.
        """
        url = 'http://www.maine.gov/legis/house/hsecoms.htm'

        with self.urlopen(url) as page:
            root = lxml.html.fromstring(page)

            count = 0

            for n in range(1, 12, 2):
                path = 'string(//body/center[%s]/h1/a)' % (n)
                comm_name = root.xpath(path)
                committee = Committee('lower', comm_name)
                count = count + 1

                path2 = '/html/body/ul[%s]/li/a' % (count)

                for el in root.xpath(path2):
                    rep = el.text
                    if rep.find('(') != -1:
                        # drop the 15-char leading title and the trailing
                        # parenthesized district, then tidy whitespace
                        mark = rep.find('(')
                        rep = rep[15:mark].strip()
                    committee.add_member(rep)
                committee.add_source(url)

                self.save_committee(committee)
Esempio n. 4
0
    def scrape_senate_committee(self, term, link):
        """Scrape one Minnesota Senate committee page and save it.

        Rows are either 'Position: Name' (which switches the role used for
        subsequent plain-name rows) or just a name.
        """
        with self.urlopen(link) as html:
            doc = lxml.html.fromstring(html)

            # strip first 30 and last 10
            # Minnesota Senate Committees - __________ Committee
            committee_name = doc.xpath('//title/text()')[0][30:-10]

            com = Committee('upper', committee_name)

            # default role; previously `role` was unbound (NameError) when
            # the first row carried no 'Position:' prefix
            role = 'member'

            # first id=bio table is members
            for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
                row = fix_whitespace(row.text_content())

                # switch role; split once so a ': ' inside the name part
                # cannot blow up the unpacking
                if ':' in row:
                    position, name = row.split(': ', 1)
                    role = position.lower().strip()
                else:
                    name = row

                # add the member
                com.add_member(name.strip(), role)

            com.add_source(link)
            self.save_committee(com)
Esempio n. 5
0
    def scrape_upper_committee(self, name, url):
        """Scrape a NY Senate committee page and save the membership."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            comm = Committee('upper', name)
            comm.add_source(url)

            member_div = page.xpath("//div[@class = 'committee-members']")[0]

            seen = set()
            for link in member_div.xpath(".//a"):
                if not link.text:
                    continue

                member = link.text.strip()

                # Sometimes NY is cool and splits names across a couple of
                # links pointing at the same href; stitch them back together.
                sibling = link.getnext()
                same_target = (
                    sibling is not None
                    and sibling.tag == 'a'
                    and sibling.attrib['href'] == link.attrib['href']
                )
                if same_target:
                    member = "%s %s" % (member, sibling.text.strip())

                member = re.sub(r'\s+', ' ', member)

                if not member or member in seen:
                    continue
                seen.add(member)

                member_name, role = parse_name(member)
                comm.add_member(member_name, role)

            self.save_committee(comm)
Esempio n. 6
0
    def scrape_house_committee(self, committee_name, link):
        """Scrape individual committee page and add members"""
        doc = lxml.html.fromstring(self.urlopen(link))

        # any <h1> mentioning "subcommittee" marks this page as one
        is_sub = any('subcommittee' in heading.lower()
                     for heading in doc.xpath('//h1/text()'))

        if is_sub:
            committee_name = committee_name.replace(' Subcommittee', '')
            com = Committee('lower', committee_name, 'Subcommittee')
        else:
            com = Committee('lower', committee_name, None)

        # members sit in the first two lists of the left column; the
        # anchor tail (e.g. ', Chair') carries the role when present
        for anchor in doc.xpath("//div[@class='col1']/ul[position()<3]/li/a"):
            member = anchor.text
            role = (anchor.tail or '').strip(', ') or 'member'
            if member:
                com.add_member(member, role)

        com.add_source(link)
        if com['members']:
            self.save_committee(com)
Esempio n. 7
0
    def scrape_upper_committee(self, url):
        """Extract Senate committees from the PDF roster at *url*.

        The PDF is converted to pdf2xml; a page carries a bold committee
        header (text starting with 'Comisi') followed by member lines of
        the form 'Hon. <name> - <title>'.  Previously a page without a
        header (or member lines preceding it) crashed on a None committee.
        Assumes at most one committee per PDF page -- TODO confirm.
        """
        filename, resp = self.urlretrieve(url)
        root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))
        for page in root.xpath('/pdf2xml/page'):
            comm = None
            for line in page.findall('text'):
                text = line.findtext('b')
                if text is not None and text.startswith('Comisi'):
                    comm = Committee('upper', text)
                    comm.add_source(url)
                elif line.text and line.text.startswith('Hon.'):
                    if comm is None:
                        # member line before any committee header; there
                        # is nothing to attach it to
                        continue
                    line_text = line.text.replace(u'–', '-')
                    name_split = line_text.split(u'-', 1)
                    title = 'member'
                    if len(name_split) >= 2:
                        name_split[1] = name_split[1].strip()
                        if name_split[1] in ('Presidenta', 'Presidente'):
                            title = 'chairman'
                        elif name_split[1] in ('Vicepresidente', 'Vicepresidenta'):
                            title = 'vicechairman'
                        elif name_split[1] in ('Secretaria', 'Secretario'):
                            title = 'secretary'
                    if name_split[0] != 'VACANTE':
                        comm.add_member(name_split[0].replace('Hon.', ''), title)
            if comm is not None:
                self.save_committee(comm)

        os.remove(filename)
Esempio n. 8
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees and save each one.

        The two original branches were identical except for where the
        committee name lives (<h2> text vs nested <a>) and an inconsistent
        capitalization of the chair role; both are unified here.
        """
        main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php"
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath('//div[@class="content_box"]'):
                # name is either the <h2> text or the nested anchor's text
                name = comm_names.xpath("h2")[0].text
                if name is None:
                    name = comm_names.xpath("h2/a")[0].text

                committee = Committee("upper", name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if "Chairperson" in senator:
                        # lowercase for consistency across scrapers
                        role = "chairperson"
                        # drop leading 'Sen. ' and trailing ', Chairperson'
                        senator = senator[5:-13].strip()
                    else:
                        role = "member"
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
Esempio n. 9
0
    def scrape(self, chamber, term):
        """Scrape Nevada committees for *chamber* in *term*.

        Builds the session-specific URL component (e.g. '76th2011') and
        saves one Committee per linked committee page.
        """
        session = None
        for t in self.metadata['terms']:
            if t['name'] == term:
                session = t['sessions'][-1]
        if session is None:
            # previously fell through with an unbound local and died with
            # a NameError below; fail with a clear message instead
            raise ValueError('no session found for term %s' % term)

        # ordinal suffix for the session number; 11, 12 and 13 take 'th',
        # not 'st'/'nd'/'rd' (the original produced '11st' etc.)
        session_str = str(session)
        if session_str[-2:] in ('11', '12', '13'):
            sessionsuffix = 'th'
        elif session_str[-1] == '1':
            sessionsuffix = 'st'
        elif session_str[-1] == '2':
            sessionsuffix = 'nd'
        elif session_str[-1] == '3':
            sessionsuffix = 'rd'
        else:
            sessionsuffix = 'th'
        insert = session_str + sessionsuffix + str(term[0:4])

        chamber_letter = {'lower': 'A', 'upper': 'S'}[chamber]

        url = 'http://www.leg.state.nv.us/Session/%s/Committees/%s_Committees/' % (
            insert, chamber_letter)

        page = self.urlopen(url)
        root = lxml.html.fromstring(page)
        for com_a in root.xpath('//strong/a'):
            com_url = url + com_a.get('href')
            if com_a.text == 'Committee of the Whole':
                continue
            com = Committee(chamber, com_a.text)
            com.add_source(com_url)
            self.scrape_comm_members(chamber, com, com_url)
            self.save_committee(com)
Esempio n. 10
0
    def scrape_committee(self, term, chambers, href, name):
        """Scrape a single committee roster page and save it.

        The chamber is inferred from the committee URL path.
        """
        page = self.get(href).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(href)
        members = page.xpath("//div[@class='view-content']" "//a[contains(@href, 'members')]")

        if "/joint/" in href:
            chamber = "joint"
        elif "/senate/" in href:
            chamber = "upper"
        elif "/house/" in href:
            chamber = "lower"
        else:
            # log through the scraper (consistent with the other scrapers)
            # instead of a Python-2-only bare print statement
            self.warning("XXX: Fail! %s" % (href))
            return

        cttie = Committee(chamber, name)

        for a in members:
            member = a.text
            # the pane heading above the link names the role; note that
            # 'Vice Chairman' maps to plain member here -- verify intended
            role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0]
            role = {"Legislative Members": "member", "Chairman": "chair", "Vice Chairman": "member"}[role]

            if member is None or member.startswith("District"):
                continue

            cttie.add_member(member, role=role)

        cttie.add_source(href)
        self.save_committee(cttie)
Esempio n. 11
0
    def scrape_upper(self):
        """Scrape the PR Senate committee roster table."""
        url = "http://senadopr.us/Lists/Listado%20de%20Comisiones/Comisiones%20del%20Senado.aspx"
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
            table = doc.xpath(
                '//table[@id="{C05AFE0D-D977-4033-8D7B-C43ABF948A4A}-{3E52C91B-AFC8-4493-967A-C8A47AC4E7B6}"]'
            )

            for row in table[0].iterchildren("tr"):
                cells = list(row)
                anchor = cells[0].find("a")
                if anchor is None:
                    continue

                com_source = anchor.get("href")
                # committees without their own page link back to the site
                # root; fall back to this roster page as the source
                if com_source == "http://senadopr.us/":
                    com_source = url

                # the second column flags joint committees
                is_joint = cells[1].text == "Comisi\xf3n Conjunta"
                com = Committee("joint" if is_joint else "upper", anchor.text)
                com.add_source(com_source)
                com.add_member(clean_spaces(cells[2].find("a").text), "chairman")
                self.save_committee(com)
Esempio n. 12
0
    def scrape_senate_committee(self, url):
        """Scrape one Senate committee page and save its membership."""
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//h6/text()')[0]

        com = Committee(chamber='upper', committee=name)

        for member in doc.xpath('//div[@id="committeelist"]//a'):
            # anchors with no text (e.g. image links) would crash .strip()
            if member.text is None:
                continue
            member_name = member.text.strip()

            # don't add clerks
            if member_name == 'Committee Clerk':
                continue

            # member.tail is None when the anchor has no trailing text;
            # the original raised TypeError on the 'in' tests below
            tail = member.tail or ''
            if 'Committee Chair' in tail:
                role = 'chair'
            elif 'Majority Vice' in tail:
                role = 'majority vice chair'
            elif 'Minority Vice' in tail:
                role = 'minority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        self.save_committee(com)
Esempio n. 13
0
    def scrape_approp_subcommittees(self, url):
        """Scrape Appropriations subcommittees and their member lists."""
        doc = lxml.html.fromstring(self.urlopen(url))

        # role suffixes in check order; each is preceded by a space in the
        # source text, hence the extra character removed below
        suffixes = (
            ('(C)', 'chairman'),
            ('(VC)', 'vice chairman'),
            ('(MVC)', 'minority vice chairman'),
        )

        for strong in doc.xpath('//strong'):
            com = Committee(chamber='upper', committee='Appropriations',
                            subcommittee=strong.text.strip())
            com.add_source(url)

            legislators = strong.getnext().tail.replace('Senators', '').strip()
            for leg in re.split(', | and ', legislators):
                role = 'member'
                for suffix, suffix_role in suffixes:
                    if leg.endswith(suffix):
                        role = suffix_role
                        leg = leg[:-(len(suffix) + 1)]
                        break
                com.add_member(leg, role=role)

            self.save_committee(com)
Esempio n. 14
0
    def scrape(self, chamber, term):
        """Scrape Utah standing committees for *chamber* in *term*."""
        self.validate_term(term, latest_only=True)

        chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

        url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
                comm_name = comm_link.text.strip()

                # Drop leading "House" or "Senate" from name
                comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

                comm = Committee(chamber, comm_name)

                for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):

                    name = mbr_link.text.strip()

                    # an immediately following <i> element carries the
                    # member's role; renamed from `type`, which shadowed
                    # the builtin
                    next_el = mbr_link.getnext()
                    if next_el is not None and next_el.tag == 'i':
                        role = next_el.text.strip()
                    else:
                        role = 'member'

                    comm.add_member(name, role)

                self.save_committee(comm)
Esempio n. 15
0
    def scrape_senate_committee(self, url):
        """Scrape one Senate committee page and save its membership."""
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//h3/text()')[0]
        name = name.replace(' Committee', '')

        com = Committee(chamber='upper', committee=name)

        for member in doc.xpath('//div[@id="committeeright"]//a'):
            # anchors with no text (e.g. image links) would crash .strip()
            if member.text is None:
                continue
            member_name = member.text.strip()

            # don't add clerks
            if member_name == 'Committee Clerk':
                continue

            # skip phone links; anchors can lack an href entirely, so
            # guard against None before startswith
            if (member.get("href") or "").startswith("tel:"):
                continue

            # member.tail is None for anchors without trailing text; the
            # original raised TypeError on the 'in' tests below
            tail = member.tail or ''
            if 'Committee Chair' in tail:
                role = 'chair'
            elif 'Majority Vice' in tail:
                role = 'majority vice chair'
            elif 'Minority Vice' in tail:
                role = 'minority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        self.save_committee(com)
Esempio n. 16
0
    def _scrape_upper_committee(self, name, url2):
        """Scrape a Senate committee assignment page and save it.

        The first listed member is the Chairman and the second the
        Vice-Chairman; everyone else is a plain member.
        """
        cat = "Assignments.asp"
        url3 = "".join((url2, cat))

        committee = Committee('upper', name)
        committee.add_source(url2)

        page = self.lxmlize(url3)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        for i, link in enumerate(members):
            # roles are positional; the original indexed members[1]
            # unconditionally, raising IndexError for single-member lists
            if i == 0:
                role = "Chairman"
            elif i == 1:
                role = "Vice-Chairman"
            else:
                role = "member"

            member_name = link.xpath('string()')
            member_name = member_name.replace('Senator ', '')
            member_name = re.sub(r'\s{2,}', ' ', member_name).strip()

            committee.add_member(member_name, role)

        self.save_committee(committee)
Esempio n. 17
0
    def _scrape_lower_special_committees(self):
        """Scrape the House special-committees accordion page."""
        url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
        page = self.lxmlize(url)

        committee_list = page.xpath('//table[@id="table106"]//div[@class='
            '"exBody1A"]/div[@class="accordion"]')[0]

        # each <h3> is a committee header; the following pane div holds
        # that committee's member rows
        for header in committee_list.xpath('./h3'):
            raw_name = header.xpath('string()').strip()
            committee_name = self._normalize_committee_name(raw_name)

            # "Joint ..." committees span both chambers
            if committee_name.startswith('Joint'):
                chamber = 'joint'
            else:
                chamber = 'lower'

            committee = Committee(chamber, committee_name)
            committee.add_source(url)

            member_rows = header.xpath('./following-sibling::div['
                '@class="pane"]//tr[@class="linkStyle2"]')

            for row in member_rows:
                raw_member = row.xpath('normalize-space(string(./td[1]))')
                member_name = ' '.join(filter(None, name_tools.split(raw_member)))
                raw_role = row.xpath('normalize-space(string(./td[2]))')
                member_role = self._normalize_member_role(raw_role)

                committee.add_member(member_name, member_role)

            self.save_committee(committee)
Esempio n. 18
0
    def scrape_senate_committee(self, name, url2):
        """Scrape a Senate committee assignment page and save it.

        The first listed member is the Chairman and the second the
        Vice-Chairman; everyone else is a plain member.
        """
        cat = "Assignments.asp"
        url3 = "".join((url2, cat))

        committee = Committee("upper", name)
        committee.add_source(url2)

        text = self.get(url3).text
        page = lxml.html.fromstring(text)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        for i, link in enumerate(members):
            # roles are positional; the original indexed members[1]
            # unconditionally, raising IndexError for single-member lists
            if i == 0:
                role = "Chairman"
            elif i == 1:
                role = "Vice-Chairman"
            else:
                role = "member"

            member_name = link.xpath("string()")
            member_name = member_name.replace("Senator ", "")
            member_name = re.sub(r"\s{2,}", " ", member_name).strip()

            committee.add_member(member_name, role)

        self.save_committee(committee)
Esempio n. 19
0
    def scrape_joint_committee(self, url):
        """Scrape a joint committee page; save only if it has members."""
        doc = lxml.html.fromstring(self.urlopen(url))

        # page title is in <h1>, or <h2> on some pages
        headings = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
        name = headings[0].strip()

        comm = Committee('joint', name)
        comm.add_source(url)

        members = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                        doc.xpath('//a[contains(@href, "Senators")]'))

        seen = set()
        for a in members:
            # a 'Title:' prefix in the surrounding element names the role
            parent_content = a.getparent().text_content()
            title = 'member'
            if ':' in parent_content:
                title = parent_content.split(':')[0].strip()

            member_name = a.text.split(' (')[0].strip()
            key = (member_name, title)
            if key not in seen:
                comm.add_member(member_name, title)
                seen.add(key)

        if comm['members']:
            self.save_committee(comm)
Esempio n. 20
0
    def scrape_senate_comm(self):
        """Scrape Maine Senate standing committees."""
        url = 'http://www.maine.gov/legis/senate/Senate-Standing-Committees.html'

        doc = lxml.html.fromstring(self.urlopen(url))

        # committee titles
        for item in doc.xpath('//span[@style="FONT-SIZE: 11pt"]'):
            text = item.text_content().strip()
            # some contain COMMITTEE ON & some are blank, drop those
            if not text or text.startswith('COMMITTEE'):
                continue

            # titlecase committee name
            com = Committee('upper', text.title())
            com.add_source(url)

            # up two and get ul sibling
            for leg in item.xpath('../../following-sibling::ul[1]/li'):
                full_text = leg.text_content().strip()
                # role is detected on the full entry text, before the
                # trailing ' of <hometown>' part is dropped
                role = 'chair' if 'Chair' in full_text else 'member'
                lname = full_text.split(' of ')[0].strip()
                com.add_member(lname, role)

            self.save_committee(com)
Esempio n. 21
0
    def scrape_committee(self, chamber, name, url):
        """Scrape a committee page; detects joint status and subcommittees."""
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        if page.xpath("//h3[. = 'Joint Committee']"):
            chamber = 'joint'

        # the centered <h3>, when present, names the subcommittee; the
        # original indexed [0] unconditionally and could raise IndexError
        headings = page.xpath("//h3[@align='center']/text()")
        subcommittee = headings[0] if headings else None
        if subcommittee is not None and "Subcommittee" not in subcommittee:
            subcommittee = None

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'member=')]"):
            member = link.text.strip()

            # role text lives in the preceding table cell, e.g. 'Chair:'
            mtype = link.xpath("string(../preceding-sibling::td[1])")
            mtype = mtype.strip(": \r\n\t").lower()

            comm.add_member(member, mtype)

        if not comm['members']:
            self.warning('not saving %s, appears to be empty' % name)
        else:
            self.save_committee(comm)
Esempio n. 22
0
    def scrape(self, chamber, term):
        """Scrape PA committee assignments for *chamber* during *term*.

        Assignments are read off the per-legislator listing page, so each
        committee is encountered once per member; a dict keyed by
        (chamber, committee name, subcommittee name) accumulates members
        and everything is saved in one pass at the end.
        """
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/senators_ca.cfm')
        else:
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/representatives_ca.cfm')

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            committees = {}

            for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
                name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
                # drop the last four characters of the link text
                # (presumably a fixed trailing suffix -- TODO confirm)
                name = name[0:-4]

                for link in li.xpath("a"):
                    if not link.tail:
                        continue

                    committee_name = link.tail.strip()
                    committee_name = re.sub(r"\s+", " ", committee_name)
                    subcommittee_name = None
                    role = 'member'

                    # the element after the link may carry subcommittee
                    # and/or role text, e.g. ', Subcommittee on X - Chair'
                    rest = link.getnext().text
                    if rest:
                        match = re.match(r',\s+(Subcommittee on .*)\s+-',
                                         rest)

                        if match:
                            subcommittee_name = match.group(1)
                            role = rest.split('-')[1].strip().lower()
                        else:
                            role = rest.replace(', ', '').strip().lower()

                        if role == 'chairman':
                            role = 'chair'

                    # fetch (or lazily create) the committee object for
                    # this (chamber, committee, subcommittee) combination
                    try:
                        committee = committees[(chamber, committee_name,
                                                subcommittee_name)]
                    except KeyError:
                        committee = Committee(chamber, committee_name)
                        committee.add_source(url)

                        if subcommittee_name:
                            committee['subcommittee'] = subcommittee_name

                        committees[(chamber, committee_name,
                                    subcommittee_name)] = committee

                    committee.add_member(name, role)

            for committee in committees.values():
                self.save_committee(committee)
Esempio n. 23
0
    def scrape_house(self, url):
        """Scrape House committees linked from the index page at *url*."""
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        for a in doc.xpath('//td/a'):
            com_name = a.text.strip()
            # blank entries in table
            if not com_name:
                continue
            if 'Reapportionment' in com_name or 'Horse Racing' in com_name:
                self.warning('skipping %s, known to be problematic' % com_name)
                continue

            com_url = a.get('href')
            com_doc = lxml.html.fromstring(self.urlopen(com_url))

            com = Committee('lower', com_name)

            # the second id=commtable table holds the member grid
            member_table = com_doc.xpath('//table[@id="commtable"]')[1]
            for td in member_table.xpath('.//td'):
                links = td.xpath('.//a/text()')
                if not links:
                    continue
                leg = links[0]
                # a second line in the cell (if any) names the role
                pieces = td.text_content().split('\n')
                role = pieces[1].lower() if len(pieces) == 2 else 'member'
                com.add_member(leg, role)

            com.add_source(com_url)
            self.save_committee(com)
Esempio n. 24
0
    def scrape_house(self):
        """Scrape LA House members' committee assignments.

        The page lists one row per representative with a cell of committee
        names; committees are cached by name so members from many rows
        accumulate on the same Committee object before saving.
        """
        url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
        comm_cache = {}
        text = self.urlopen(url)
        page = lxml.html.fromstring(text)

        for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
            cells = row.xpath('td')

            name = cells[0].xpath('string()').strip()

            if name.startswith('Vacant'):
                continue

            font = cells[1].xpath('font')[0]
            committees = []

            # committee names appear as the font text plus the text/tail
            # of each <br> inside it
            if font.text:
                committees.append(font.text.strip())
            for br in font.xpath('br'):
                if br.text:
                    committees.append(br.text.strip())
                if br.tail:
                    # strip here too; unstripped tails defeated the
                    # endswith() role matching below and produced
                    # duplicate committee names
                    committees.append(br.tail.strip())

            for comm_name in committees:
                mtype = 'member'
                if comm_name.endswith(', Chairman'):
                    mtype = 'chairman'
                    comm_name = comm_name.replace(', Chairman', '')
                elif comm_name.endswith(', Co-Chairmain'):
                    # spelling presumably mirrors the site's own text --
                    # verify against the live page before changing
                    mtype = 'co-chairmain'
                    comm_name = comm_name.replace(', Co-Chairmain', '')
                elif comm_name.endswith(', Vice Chair'):
                    mtype = 'vice chair'
                    comm_name = comm_name.replace(', Vice Chair', '')
                elif comm_name.endswith(', Ex Officio'):
                    mtype = 'ex officio'
                    comm_name = comm_name.replace(', Ex Officio', '')
                elif comm_name.endswith(", Interim Member"):
                    mtype = 'interim'
                    comm_name = comm_name.replace(", Interim Member", "")

                if comm_name.startswith('Joint'):
                    chamber = 'joint'
                else:
                    chamber = 'lower'

                # reuse a cached committee when the name was seen before
                try:
                    committee = comm_cache[comm_name]
                except KeyError:
                    committee = Committee(chamber, comm_name)
                    committee.add_source(url)
                    comm_cache[comm_name] = committee

                committee.add_member(name, mtype)

        for committee in comm_cache.values():
            self.save_committee(committee)
Esempio n. 25
0
    def scrape_lower_committee(self, name, url):
        """Scrape an Assembly committee page; save only if it has members."""
        page = lxml.html.fromstring(self.get(url).text)

        comm = Committee('lower', name)
        comm.add_source(url)

        seen = set()
        for link in page.xpath("//div[@class='commlinks']//a[contains(@href, 'mem')]"):
            member = re.sub(r'\s+', ' ', link.text.strip())

            member_name, role = parse_name(member)
            if member_name is None:
                continue

            # Figure out if this person is the chair: the heading div just
            # before this link block carries the label.  The role from
            # parse_name is overridden either way.
            role_type = link.xpath('../../preceding-sibling::div[1]/text()')
            if role_type in (['Chair'], ['Co-Chair']):
                role = 'chair'
            else:
                role = 'member'

            if member_name not in seen:
                comm.add_member(member_name, role)
                seen.add(member_name)

        if comm['members']:
            self.save_committee(comm)
Esempio n. 26
0
    def scrape(self, chamber, term):
        """Scrape Illinois committees (and subcommittees) for *chamber*."""
        chamber_name = 'senate' if chamber == 'upper' else 'house'

        url = 'http://ilga.gov/{0}/committees/default.asp'.format(chamber_name)
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # subcommittees appear after their parent in document order, so
        # remember the most recent top-level committee name
        top_level_com = None

        for a in doc.xpath('//a[contains(@href, "members.asp")]'):
            name = a.text.strip()
            code = a.getparent().getnext().text_content().strip()
            if 'Sub' in name:
                com = Committee(chamber, top_level_com, name, code=code)
            else:
                top_level_com = name
                com = Committee(chamber, name, code=code)

            com_url = a.get('href')
            self.scrape_members(com, com_url)
            com.add_source(com_url)
            if com['members']:
                self.save_committee(com)
            else:
                self.log('skipping empty committee on {0}'.format(com_url))
Esempio n. 27
0
    def scrape_lower_committees(self):
        """Scrape Ohio House committees by iterating their numeric ids."""
        # id range for senate committees on their website
        for comm_id in range(87, 124):
            comm_url = ('http://www.house.state.oh.us/index.php?option='
                        'com_displaycommittees&task=2&type=Regular&'
                        'committeeId=%d' % comm_id)

            with self.urlopen(comm_url) as page:
                page = lxml.html.fromstring(page)

                comm_name = page.xpath(
                    'string(//table/tr[@class="committeeHeader"]/td)')
                comm_name = comm_name.replace("/", " ").strip()

                if not comm_name:
                    continue

                # ids below 92 are joint committees; the original never
                # reset `chamber`, so 'joint' leaked onto every later id
                if comm_id < 92:
                    chamber = "joint"
                else:
                    chamber = "lower"

                committee = Committee(chamber, comm_name)
                committee.add_source(comm_url)

                for link in page.xpath("//a[contains(@href, 'district')]"):
                    name = link.text
                    if name and name.strip():
                        committee.add_member(name.strip())

                self.save_committee(committee)
Esempio n. 28
0
    def scrape_reps_comm(self):
        """Scrape Maine House committees from the static roster page.

        Committee headings sit in every other <center> element; the
        matching member list is the Nth <ul> on the page (1-based).
        """
        url = 'http://www.maine.gov/legis/house/hsecoms.htm'

        root = lxml.html.fromstring(self.urlopen(url))

        for index, heading in enumerate(range(1, 12, 2), start=1):
            comm_name = root.xpath('string(//body/center[%s]/h1/a)' % (heading))
            committee = Committee('lower', comm_name)

            for anchor in root.xpath('/html/body/ul[%s]/li/a' % (index)):
                member = anchor.text
                paren = member.find('(')
                if paren != -1:
                    # Drop the leading title and the trailing parenthetical.
                    member = member[15: paren].strip()
                if 'chair' in member.lower():
                    role = 'chair'
                    member = re.sub(r'(?i)[\s,]*chair\s*$', '', member).strip()
                else:
                    role = 'member'
                committee.add_member(member, role)
            committee.add_source(url)

            self.save_committee(committee)
Esempio n. 29
0
    def scrape_house_committee(self, committee_name, link):
        """Scrape one House committee page and add its members.

        When the page links a "General Sub" committee, that
        subcommittee is scraped separately first.
        """
        with self.urlopen(link) as html:
            doc = lxml.html.fromstring(html)

            # Check for a linked general subcommittee.
            sub_links = doc.xpath("//div[@class='col2']/h3[3]/a")
            if sub_links:
                sub_url = (self.base_href + '/house/committees/'
                           + sub_links[0].attrib['href'])
                sub_name = "General Sub of " + committee_name
                self.scrape_house_sub_committee(sub_name, sub_url)
            else:
                sub_name = None

            com = Committee('lower', committee_name, subcommittee=sub_name)

            # Members live in the first two lists of the left column,
            # formatted either as "Name, Role" or just "Name".
            for item in doc.xpath("//div[@class='col1']/ul[position()<3]/li"):
                pieces = [part.strip()
                          for part in item.text_content().split(',', 1)]
                if len(pieces) > 1:
                    member_name, role = pieces
                else:
                    member_name, role = pieces[0], 'member'

                if member_name:
                    com.add_member(member_name, role)

        com.add_source(link)
        self.save_committee(com)
Esempio n. 30
0
    def scrape_house_committees(self):
        """Scrape Michigan House committees from the committee drop-down.

        Each <option> value keys a committee detail page whose
        'servicecolumn2' links are the legislator members.
        """
        base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
        with self.urlopen('http://house.mi.gov/mhrpublic/committee.aspx') as html:
            doc = lxml.html.fromstring(html)

            # get values out of drop down
            for opt in doc.xpath('//option'):
                name = opt.text
                # skip invalid choice
                if opt.text in ('Statutory Committees', 'Select One'):
                    continue
                com_url = base_url + opt.get('value')
                with self.urlopen(com_url) as com_html:
                    cdoc = lxml.html.fromstring(com_html)
                    com = Committee(chamber='lower', committee=name)
                    com.add_source(com_url)

                    # BUG FIX: a stray loop here iterated "memberLink"
                    # anchors on the *listing* page (`doc`, not `cdoc`)
                    # and clobbered `name` without using the result;
                    # the dead loop has been removed.

                    # all links to http:// pages in servicecolumn2 are legislators
                    for a in cdoc.xpath('//div[@class="servicecolumn2"]//a[starts-with(@href, "http")]'):
                        member = a.text.strip()
                        text = a.xpath('following-sibling::span/text()')[0]
                        if 'Committee Chair' in text:
                            role = 'chair'
                        elif 'Vice-Chair' in text:
                            role = 'vice chair'
                        else:
                            role = 'member'
                        com.add_member(member, role=role)

                    self.save_committee(com)
Esempio n. 31
0
    def scrape(self, chamber, term):
        """Scrape committees (and subcommittees) for *chamber*.

        Standing/Select committees are scraped for both chambers; Joint
        committees only for 'upper'. When a committee page yields no
        members, the '/membersstaff' page is tried as a fallback; a
        committee that is still empty raises ValueError.
        """
        url = self.urls[chamber]
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.base_urls[chamber])

        committee_types = {
            'upper': ['Standing', 'Select', 'Joint'],
            'lower': ['Standing', 'Select']
        }

        for type_ in committee_types[chamber]:

            # Joint committees are stored under chamber 'joint'.
            if type_ == 'Joint':
                _chamber = type_.lower()
            else:
                _chamber = chamber

            div = doc.xpath(
                '//div[contains(@class, "view-view-%sCommittee")]' % type_)[0]
            committees = div.xpath(
                'descendant::span[@class="field-content"]/a/text()')
            committees = map(strip, committees)
            urls = div.xpath(
                'descendant::span[@class="field-content"]/a/@href')

            for c, _url in zip(committees, urls):
                # Normalize the display name into
                # "<Type> Committee on <name>" / "<Type> <name>" form.
                if c.endswith('Committee'):
                    if type_ not in c:
                        c = '%s %s' % (type_, c)
                elif ('Subcommittee' not in c):
                    c = '%s Committee on %s' % (type_, c)
                else:
                    if type_ not in c:
                        c = '%s %s' % (type_, c)

                c = Committee(_chamber, c)
                c.add_source(_url)
                c.add_source(url)
                for member, role, kw in self.scrape_membernames(
                        c, _url, chamber, term):
                    c.add_member(member, role, **kw)

                # Fallback: some committees list members only on a
                # separate '/membersstaff' page.
                _found = False
                if len(c['members']) == 0:
                    for member, role, kw in self.scrape_membernames(
                            c, _url + '/membersstaff', chamber, term):
                        _found = True
                        c.add_member(member, role, **kw)
                    if _found:
                        source = _url + '/membersstaff'
                        c.add_source(source)

                if len(c['members']) == 0:
                    cname = c['committee']
                    msg = '%r must have at least one member.'
                    raise ValueError(msg % cname)

                # Attach the chamber-specific action code, if any.
                code = codes[chamber].get(c['committee'].lower())
                c['action_code'] = code

                self.save_committee(c)

        # Subcommittees
        div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
        for subcom in div.xpath('div/div[@class="item-list"]'):
            committee = subcom.xpath('h4/text()')[0]
            names = subcom.xpath('descendant::a/text()')
            names = map(strip, names)
            urls = subcom.xpath('descendant::a/@href')
            committee = 'Standing Committee on ' + committee
            for n, _url in zip(names, urls):
                c = Committee(chamber, committee, subcommittee=n)
                c.add_source(_url)
                c.add_source(url)

                for member, role, kw in self.scrape_membernames(
                        c, _url, chamber, term):
                    c.add_member(member, role, **kw)

                # Same '/membersstaff' fallback as above.
                _found = False
                if len(c['members']) == 0:
                    for member, role, kw in self.scrape_membernames(
                            c, _url + '/membersstaff', chamber, term):
                        _found = True
                        c.add_member(member, role, **kw)
                    if _found:
                        source = _url + '/membersstaff'
                        c.add_source(source)

                if len(c['members']) == 0:
                    cname = c['committee']
                    msg = '%r must have at least one member.'
                    raise ValueError(msg % cname)

                self.save_committee(c)
Esempio n. 32
0
    def scrape_comm(self, chamber, term_name):
        """Scrape Mississippi committee membership from the per-chamber
        XML feed.

        `chamber` arrives as the site's code ('h' or 's') and is
        normalized to 'lower'/'upper' before committees are saved.
        """
        url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
        with self.urlopen(url) as comm_page:
            # The feed is XML but parses fine with the lenient HTML parser.
            root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())
            if chamber == 'h':
                chamber = "lower"
            else:
                chamber = "upper"
            for mr in root.xpath('//committee'):
                name = mr.xpath('string(name)')
                comm = Committee(chamber, name)

                # Chair and vice-chair are listed apart from the members,
                # with their role appended to the name.
                chair = mr.xpath('string(chair)')
                chair = chair.replace(", Chairman", "")
                if chair:
                    comm.add_member(chair, role="Chairman")
                vice_chair = mr.xpath('string(vice_chair)')
                vice_chair = vice_chair.replace(", Vice-Chairman", "")
                if vice_chair:
                    comm.add_member(vice_chair, role="Vice-Chairman")

                # BUG FIX: the ';'-separated member list can contain
                # empty entries (trailing ';' or an empty element), and
                # `leg[0]` raised IndexError on those; strip whitespace
                # and skip empties instead.
                for leg in mr.xpath('string(members)').split(";"):
                    leg = leg.strip()
                    if leg:
                        comm.add_member(leg)
                comm.add_source(url)
                self.save_committee(comm)
Esempio n. 33
0
    def scrape_committee(self, chamber, url):
        """Scrape one New Mexico committee detail page.

        Pages without a committee name are skipped with a warning, as
        are members whose role is not recognized. A 'joint' committee
        whose members all come from one chamber is reassigned to that
        chamber before saving.
        """

        committee_page = self.lxmlize(url)

        name_node = self.get_node(
            committee_page,
            '//table[@id="MainContent_formViewCommitteeInformation"]/tr//h3')

        c_name = (
            name_node.text_content().strip()
            if name_node is not None and name_node.text_content() else None)

        if c_name:
            committee = Committee(chamber, clean_committee_name(c_name))

            members_xpath = (
                '//table[@id="MainContent_formViewCommitteeInformation_grid'
                'ViewCommitteeMembers"]/tbody/tr'
            )
            members = self.get_nodes(committee_page, members_xpath)

            # Column indices within each member row.
            tds = {
                'title': 0,
                'name': 1,
                'role': 3
            }

            for member in members:
                m_title = member[tds['title']].text_content()
                m_name = self.get_node(
                    member[tds['name']],
                    './/a[contains(@href, "/Members/Legislator?SponCode=")]'
                ).text_content()

                role = member[tds['role']].text_content()

                # Derive the member's own chamber from their title.
                if m_title == 'Senator':
                    m_chamber = 'upper'
                elif m_title == 'Representative':
                    m_chamber = 'lower'
                else:
                    m_chamber = None

                # Joint committees are interim committees, so the role
                # is prefixed with 'interim'; unknown roles are dropped.
                if role in ('Chair', 'Co-Chair', 'Vice Chair',
                            'Member', 'Advisory'):
                    if chamber == 'joint':
                        m_role = 'interim {}'.format(role.lower())
                    else:
                        m_role = role.lower()
                else:
                    m_role = None

                if m_role:
                    committee.add_member(m_name, m_role, chamber=m_chamber)

            if not committee['members']:
                self.warning(
                    'skipping blank committee {0} at {1}'.format(c_name, url))
            else:
                committee.add_source(url)
                # Interim committees are collected during the scraping
                # for joint committees, and most interim committees
                # have members from both chambers. However, a small
                # number of interim committees (right now, just 1) have
                # only members from one chamber, so the chamber is set
                # to their chamber instead of 'joint' for those
                # committees.
                if chamber == 'joint':
                    m_chambers = set(
                        [mem['chamber'] for mem in committee['members']])
                    if len(m_chambers) == 1:
                        committee['chamber'] = m_chambers.pop()

                self.save_committee(committee)

        else:
            self.warning('No legislative committee found at {}'.format(url))
Esempio n. 34
0
    def scrape(self, chamber, term):
        """Scrape committee membership for *term*.

        Members from both houses are listed on the same pages, so the
        scrape runs only on the 'upper' pass. Each membership table
        yields a per-chamber committee, and the combined roster is also
        saved as a joint committee.

        Raises NoDataForPeriod if *term* is not in the metadata.
        """
        if chamber == 'lower':
            # Committee members from both houses are listed
            # together. So, we'll only scrape once.
            return None

        session = None

        # Even though each term spans two years, committee memberships
        # don't appear to change, so only the term's last session is
        # scraped.
        for t in self.metadata["terms"]:
            if term == t["name"]:
                session = t['sessions'][-1]
                break
        else:
            raise NoDataForPeriod(term)

        list_url = self.urls["list"] % (session, )
        committees = {}
        page = self.get(list_url).text
        page = lxml.html.fromstring(page)
        for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
            committees[el.text.strip()] = el.get("href")

        for c in committees:
            self.log(c)
            detail_url = self.urls["detail"] % (committees[c], )
            page = self.get(detail_url).text
            page = lxml.html.fromstring(page)
            # Drop a leading "NN-" ordinal from the committee name.
            # BUG FIX: raw string repairs the invalid '\d' escape
            # sequence (a SyntaxWarning on modern Python).
            if re.match(r'\d{1,2}-', c):
                c = c.split('-', 1)[1]
            jcomm = Committee('joint', c.strip())
            for table in page.xpath(
                    ".//table[contains(@id, 'CommitteeMembers')]"):
                rows = table.xpath(".//tr")
                # The header row's first cell ("Senator" or
                # "Representative") identifies this table's chamber.
                chamber = rows[0].xpath('.//td')[0].text_content().strip()
                chamber = 'upper' if chamber == 'Senator' else 'lower'
                comm = Committee(chamber, c.strip())
                for row in rows[1:]:
                    tds = row.xpath('.//td')
                    name = tds[0].text_content().strip()
                    role = 'chairman' if tds[3].text_content().strip(
                    ) == 'Chairman' else 'member'
                    comm.add_member(name, role, chamber=chamber)
                    jcomm.add_member(name, role, chamber=chamber)

                comm.add_source(detail_url)
                self.save_committee(comm)

            jcomm.add_source(detail_url)
            self.save_committee(jcomm)
Esempio n. 35
0
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        """Scrape a committee roster page, recursing into subcommittees.

        Committees already seen (keyed by chamber/name/subcommittee)
        are skipped via `self._seen`; empty committees are warned about
        rather than saved.
        """
        name = self._fix_committee_name(name)
        name = self._fix_committee_case(name)

        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Get the subcommittee name.
        xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

        if subcommittee:
            subcommittee = page.xpath(xpath)
            if subcommittee:
                subcommittee = page.xpath(xpath).pop(0)
                subcommittee = self._fix_committee_name(subcommittee,
                                                        parent=name,
                                                        subcommittee=True)
                subcommittee = self._fix_committee_case(subcommittee)
            else:
                subcommittee = None

        # Dedupe.
        if (chamber, name, subcommittee) in self._seen:
            return
        self._seen.add((chamber, name, subcommittee))

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')

        for member_node in member_nodes:
            # Skip empty rows.
            if member_node.attrib['class'] == 'dxgvEmptyDataRow':
                continue

            # First cell holds the membership type (defaults to 'member').
            mtype = member_node.xpath('string(td[1])').strip()

            if not mtype:
                mtype = 'member'

            # Third cell is "<Title> <Full Name>".
            member = member_node.xpath('string(td[3])').split()

            title = member[0]
            member = ' '.join(member[1:])

            if title == 'Senator':
                mchamber = 'upper'
            elif title == 'Representative':
                mchamber = 'lower'
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        # Recurse into any subcommittees linked from this page.
        for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                            '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib['href'])
            self.scrape_committee(chamber,
                                  name,
                                  sub_url,
                                  subcommittee=sub_name)

        if not comm['members']:
            if subcommittee:
                self.warning(
                    'Not saving empty subcommittee {}.'.format(subcommittee))
            else:
                self.warning('Not saving empty committee {}.'.format(name))
        else:
            self.save_committee(comm)
Esempio n. 36
0
    def scrape_committee(self, chamber, com_name, url):
        """Scrape a Maryland committee page, including any subcommittee
        rosters it contains.

        Pages with 'stab=04' in the URL list members in "grid" tables;
        other pages use "noncogrid" tables or, failing that, a plain
        "spco" table. Note `com` is rebound each time a subcommittee
        table is encountered, so subsequent members attach to it.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        com = Committee(chamber, com_name)
        com.add_source(url)

        if 'stab=04' in url:
            for table in doc.xpath('//table[@class="grid"]'):
                rows = table.xpath('tr')
                sub_name = rows[0].getchildren()[0].text.strip()

                # new table - subcommittee
                if sub_name != 'Full Committee':
                    sub_name = sub_name.replace("Subcommittee", "").strip()
                    com = Committee(chamber, com_name, subcommittee=sub_name)
                    com.add_source(url)

                for row in rows[1:]:
                    name = row.getchildren()[0].text_content().strip()
                    name, role = define_role(name)
                    com.add_member(name, role)

                self.save_committee(com)
        else:
            table_source = doc.xpath('//table[@class="noncogrid"]')

            if table_source != []:
                for table in table_source:
                    row = table.xpath(
                        'tr/td/a[contains(@href, "sponpage")]/text()')
                    sub_name_source = table.xpath('tr/th/text()')

                    # Subcommittee tables carry their name in the header.
                    if "Subcommittee" in sub_name_source[0]:
                        sub_name = sub_name_source[0]
                        sub_name = sub_name.replace("Subcommittee", "").strip()
                        com = Committee(chamber,
                                        com_name,
                                        subcommittee=sub_name)
                        com.add_source(url)

                    for name in row:
                        name, role = define_role(name)
                        com.add_member(name, role)

                    self.save_committee(com)
            else:
                # Fallback: a single member list in the "spco" table.
                row = doc.xpath('//table[@class="spco"]/tr[1]/td/text()')
                for name in row:
                    name, role = define_role(name)
                    com.add_member(name, role)

                self.save_committee(com)
Esempio n. 37
0
    def scrape_upper_committee(self, url):
        """Parse the Senate committee roster PDF at *url*.

        The PDF is converted to text and scanned line by line: committee
        headers start a new committee (flushing the previous one) and
        "Hon." lines add members. Joint ("Conjunta") committees are
        skipped because their rosters are incomplete. The downloaded
        file is removed when done.
        """
        filename, resp = self.urlretrieve(url)
        lines = convert_pdf(filename, 'text').split('\n')
        comm = None
        comm_name = ''
        title = ''
        MINIMUM_NAME_LENGTH = len('Hon _ _')

        for line in (x.decode('utf8') for x in lines):
            line = line.strip()
            if not line.strip():
                continue

            # A committee (or section) header: flush the previous
            # committee and, for real committee lines, start a new name.
            if (line.startswith('Comisi') or line.startswith('COMISIONES')
                    or line.startswith('SECRETAR')):

                if comm:
                    # Joint committee rosters are not complete, unfortunately
                    if "Conjunta" not in comm_name:
                        self.save_committee(comm)
                    comm = None
                    comm_name = ''

                if not (line.startswith('COMISIONES')
                        or line.startswith('SECRETAR')):
                    comm_name = line

                    # Remove "Committee" from committee names
                    comm_name = (comm_name.replace(
                        u"Comisión de ",
                        "").replace(u"Comisión Especial para el Estudio de ",
                                    "").replace(u"Comisión Especial para ",
                                                ""))
                    comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
                    comm_name = comm_name[0].upper() + comm_name[1:]

            # Committee president is always listed right after committee name
            elif (not comm and comm_name
                  and not re.search(r'^(?:Co.)?President', line)
                  and not line.startswith('Miembr')):
                # Name wrapped onto a continuation line; glue it on.
                comm_name = comm_name + " " + line

            elif (not comm and (re.search(r'^(?:Co.)?President', line)
                                or line.startswith('Miembr'))
                  and len(line) > len('Presidente ') + MINIMUM_NAME_LENGTH):
                comm = Committee('upper', comm_name)
                comm.add_source(url)

            if comm:
                # Every member line contains "Hon. <name>".
                assert re.search(r'(?u)Hon\.?\s\w', line)
                (temp_title, name) = line.split("Hon")
                name = name.strip(". ")

                # A title, when present, carries over to later lines.
                if temp_title.strip():
                    title = temp_title

                    # Translate titles to English for parity with other states
                    if "President" in title:
                        title = 'chairman'
                    elif title.startswith("Vicepresident"):
                        title = 'vicechairman'
                    elif title.startswith("Secretari"):
                        title = 'secretary'
                    elif "Miembr" in title:
                        title = 'member'
                    else:
                        raise AssertionError(
                            "Unknown member type: {}".format(title))

                # Many of the ex-officio members have appended titles
                if ", " in name:
                    name = name.split(", ")[0]

                if name.lower() != 'vacante':
                    comm.add_member(name, title)

        # Flush the final committee, then clean up the downloaded PDF.
        if comm and "Conjunta" not in comm_name:
            self.save_committee(comm)

        os.remove(filename)
Esempio n. 38
0
    def get_jfac(self, name, url):
        """Collect membership of the Joint Finance and Appropriations
        Committee from its roster table.

        Each data row holds a Senate member and a House member; an entry
        of the form "Name, Role" carries the member's role after the
        comma.
        """
        doc = lxml.html.fromstring(self.urlopen(url))
        roster = doc.xpath('body/table/tr/td[2]/table')[0]
        committee = Committee('joint', name)

        for row in roster.xpath('tr')[1:]:
            senate_cell, house_cell = row.xpath('td/strong')
            for cell, member_chamber in ((senate_cell, 'upper'),
                                         (house_cell, 'lower')):
                entry = cell.text.replace(u'\xa0', ' ')
                if ',' in entry:
                    committee.add_member(*entry.split(','),
                                         chamber=member_chamber)
                else:
                    committee.add_member(entry, chamber=member_chamber)

        committee.add_source(url)
        self.save_committee(committee)
Esempio n. 39
0
    def scrape_upper_committee(self, committee_name, url):
        """Build and return a NY Senate Committee from its page at *url*.

        The chair is located via the "nys-senator" block whose title
        reads "Chair"; regular members come from the senators container.
        Missing pieces are logged as warnings rather than raised.
        Returns the Committee (the caller is responsible for saving it).
        """
        page = self.lxmlize(url)

        committee = Committee('upper', committee_name)
        committee.add_source(url)

        # Committee member attributes.
        member_name = None
        member_role = None

        # Attempt to record the committee chair.
        committee_chair = self.get_node(
            page,
            '//div[@class="nys-senator" and div[@class="nys-senator--info"'
            ' and p[@class="nys-senator--title" and'
            ' normalize-space(text())="Chair"]]]')
        if committee_chair is not None:
            info_node = self.get_node(
                committee_chair, 'div[@class="nys-senator--info" and p[@class='
                '"nys-senator--title" and contains(text(), "Chair")]]')
            if info_node is not None:
                # Attempt to retrieve committee chair's name.
                member_name_text = self.get_node(
                    info_node,
                    './h4[@class="nys-senator--name"][1]/a[1]/text()')

                if member_name_text is not None:
                    member_name = member_name_text.strip()
                else:
                    warning = ('Could not find the name of the chair for the'
                               ' {} committee')
                    self.logger.warning(warning.format(committee_name))

                # Attempt to retrieve committee chair's role (explicitly).
                member_role_text = self.get_node(
                    info_node,
                    './p[@class="nys-senator--title" and contains(text(), '
                    '"Chair")][1]/text()')

                if member_role_text is not None:
                    member_role = member_role_text.strip()
                else:
                    # This seems like a silly case, but could still be useful
                    # to check for.
                    warning = ('Could not find the role of the chair for the'
                               ' {} committee')
                    self.logger.warning(warning.format(committee_name))

                # Only add the chair when both name and role were found.
                if member_name is not None and member_role is not None:
                    committee.add_member(member_name, member_role)
            else:
                warning = ('Could not find information for the chair of the'
                           ' {} committee.')
                self.logger.warning(warning.format(committee_name))
        else:
            warning = 'Missing chairperson for the {} committee.'
            self.logger.warning(warning.format(committee_name))

        # Get list of regular committee members.
        member_nodes = self.get_nodes(
            page, '//div[contains(concat(" ", @class, " "), '
            '" c-senators-container ")]//div[@class="view-content"]/'
            ' div/a')

        # Attempt to record each committee member.
        for member_node in member_nodes:
            member_name = None

            member_name_text = self.get_node(
                member_node, './/div[@class="nys-senator--info"][1]/h4[@class='
                '"nys-senator--name"][1]/text()')

            if member_name_text is not None:
                member_name = member_name_text.strip()

            if member_name is not None:
                committee.add_member(member_name, 'member')
            else:
                warning = ('Could not find the name of a member in the {}'
                           ' committee')
                self.logger.warning(warning.format(committee_name))

        return committee
Esempio n. 40
0
    def scrape_lower_committee(self, name, url):
        """Scrape a Puerto Rico House committee page and save it when at
        least one member was found."""
        com = Committee('lower', name)
        com.add_source(url)

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')

            # all members are tails of images (they use img tags for bullets)

            # first three members are in the directiva div
            chair = directiva.xpath(
                'b[text()="Presidente:"]/following-sibling::img[1]')
            vchair = directiva.xpath(
                'b[text()="Vice Presidente:"]/following-sibling::img[1]')
            sec = directiva.xpath(
                'b[text()="Secretario(a):"]/following-sibling::img[1]')

            # BUG FIX: the original used `++member`, which in Python is
            # just a double unary plus (a no-op), so the counter never
            # advanced and the committee was never saved.
            member_count = 0
            if chair:
                com.add_member(clean_spaces(chair[0].tail), 'chairman')
                member_count += 1
            if vchair:
                com.add_member(clean_spaces(vchair[0].tail), 'vice chairman')
                member_count += 1
            if sec:
                com.add_member(clean_spaces(sec[0].tail), 'secretary')
                member_count += 1

            # remaining members are bullets in the reps div
            for img in reps.xpath('.//img'):
                com.add_member(clean_spaces(img.tail))
                member_count += 1

            if member_count > 0:
                self.save_committee(com)
Esempio n. 41
0
    def scrape_lower(self, chamber, term):
        """Scrape committees (and subcommittees) for *chamber*, trying a
        sequence of xpaths to cope with site layout changes.

        Committee names are normalized by stripping "Committee on" /
        " Committee"; empty committees fall back to the committee's
        '/membersstaff' page and are only saved when members are found.
        """
        url = self.urls[chamber]
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.base_urls[chamber])

        committee_types = {
            'upper': ['Standing', 'Select', 'Joint'],
            'lower': ['Standing', 'Select']
        }

        for type_ in committee_types[chamber]:

            # Joint committees are stored under chamber 'joint'.
            if type_ == 'Joint':
                _chamber = type_.lower()
            else:
                _chamber = chamber

            # Try each known page layout in turn; the first xpath that
            # matches wins.
            for xpath in [
                    '//div[contains(@class, "view-view-%sCommittee")]' % type_,
                    '//div[contains(@id, "block-views-view_StandingCommittee-block_1")]',
                    '//div[contains(@class, "views-field-title")]',
            ]:
                div = doc.xpath(xpath)
                if div:
                    break

            div = div[0]
            committees = div.xpath(
                'descendant::span[@class="field-content"]/a/text()')
            committees = map(strip, committees)
            urls = div.xpath(
                'descendant::span[@class="field-content"]/a/@href')

            for c, _url in zip(committees, urls):

                if 'autism' in _url:
                    # The autism page takes a stunning 10 minutes to respond
                    # with a 403. Skip it.
                    continue

                c = c.replace("Committee on ", "").replace(" Committee", "")
                c = Committee(_chamber, c)
                c.add_source(_url)
                c.add_source(url)
                for member, role in self.scrape_lower_members(_url):
                    c.add_member(member, role)

                # Fallback: some committees list members only on a
                # separate '/membersstaff' page.
                _found = False
                if not c['members']:
                    for member, role in self.scrape_lower_members(
                            _url + '/membersstaff'):
                        _found = True
                        c.add_member(member, role)
                    if _found:
                        source = _url + '/membersstaff'
                        c.add_source(source)

                if c['members']:
                    self.save_committee(c)
                else:
                    self.warning("No members found: {}".format(c))

        # Subcommittees
        div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
        for subcom in div.xpath('div/div[@class="item-list"]'):
            committee = subcom.xpath('h4/text()')[0]
            names = subcom.xpath('descendant::a/text()')
            names = map(strip, names)
            urls = subcom.xpath('descendant::a/@href')
            for n, _url in zip(names, urls):
                # Keep only the part after "Subcommittee ... on ".
                n = re.search(r'^Subcommittee.*?on (.*)$', n).group(1)
                c = Committee(chamber, committee, subcommittee=n)
                c.add_source(_url)
                c.add_source(url)

                for member, role in self.scrape_lower_members(_url):
                    c.add_member(member, role)

                # Same '/membersstaff' fallback as above.
                _found = False
                if not c['members']:
                    for member, role in self.scrape_lower_members(
                            _url + '/membersstaff'):
                        _found = True
                        c.add_member(member, role)
                    if _found:
                        source = _url + '/membersstaff'
                        c.add_source(source)

                if c['members']:
                    self.save_committee(c)
                else:
                    self.warning("No members found: {}".format(c))
Esempio n. 42
0
    def scrape(self, chamber, term):
        """Scrape standing/select (and, for the upper chamber, joint)
        committees plus subcommittees for *chamber*.

        Committees whose main page lists no members are retried at the
        page's '/membersstaff' URL; if still empty, scraping stops
        (some committees weren't staffed in early 2013).
        """
        url = self.urls[chamber]
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.base_urls[chamber])

        committee_types = {
            'upper': ['Standing', 'Select', 'Joint'],
            'lower': ['Standing', 'Select']
        }

        for type_ in committee_types[chamber]:

            # Joint committees are filed under the 'joint' chamber.
            if type_ == 'Joint':
                _chamber = type_.lower()
            else:
                _chamber = chamber

            # The site has used two markup variants; take the first
            # xpath that matches anything.
            for xpath in [
                    '//div[contains(@class, "view-view-%sCommittee")]' % type_,
                    '//div[contains(@id, "block-views-view_StandingCommittee-block_1")]',
            ]:
                div = doc.xpath(xpath)
                if div:
                    break

            div = div[0]
            committees = div.xpath(
                'descendant::span[@class="field-content"]/a/text()')
            committees = map(strip, committees)
            urls = div.xpath(
                'descendant::span[@class="field-content"]/a/@href')

            for c, _url in zip(committees, urls):

                if 'autism' in _url:
                    # The autism page takes a stunning 10 minutes to respond
                    # with a 403. Skip it.
                    continue

                # Normalize the committee name so it always carries the
                # committee type, e.g. "Standing Committee on Rules".
                if c.endswith('Committee'):
                    if type_ not in c:
                        c = '%s %s' % (type_, c)
                elif ('Subcommittee' not in c):
                    c = '%s Committee on %s' % (type_, c)
                else:
                    if type_ not in c:
                        c = '%s %s' % (type_, c)

                c = Committee(_chamber, c)
                c.add_source(_url)
                c.add_source(url)
                for member, role, kw in self.scrape_membernames(
                        c, _url, chamber, term):
                    c.add_member(member, role, **kw)

                # Retry the members/staff page if the main page was empty.
                _found = False
                if len(c['members']) == 0:
                    for member, role, kw in self.scrape_membernames(
                            c, _url + '/membersstaff', chamber, term):
                        _found = True
                        c.add_member(member, role, **kw)
                    if _found:
                        source = _url + '/membersstaff'
                        c.add_source(source)

                if len(c['members']) == 0:
                    # Some committees weren't staffed in early 2013;
                    # opting to skip rather than blow up the whole scrape.
                    return

                self.save_committee(c)

        # Subcommittees
        div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
        for subcom in div.xpath('div/div[@class="item-list"]'):
            committee = subcom.xpath('h4/text()')[0]
            names = subcom.xpath('descendant::a/text()')
            names = map(strip, names)
            urls = subcom.xpath('descendant::a/@href')
            committee = 'Standing Committee on ' + committee
            for n, _url in zip(names, urls):
                c = Committee(chamber, committee, subcommittee=n)
                c.add_source(_url)
                c.add_source(url)

                for member, role, kw in self.scrape_membernames(
                        c, _url, chamber, term):
                    c.add_member(member, role, **kw)

                # Same members/staff fallback as the main committee loop.
                _found = False
                if len(c['members']) == 0:
                    for member, role, kw in self.scrape_membernames(
                            c, _url + '/membersstaff', chamber, term):
                        _found = True
                        c.add_member(member, role, **kw)
                    if _found:
                        source = _url + '/membersstaff'
                        c.add_source(source)

                if len(c['members']) == 0:
                    # Skip rather than blow up the whole scrape.
                    return

                self.save_committee(c)
Esempio n. 43
0
    def scrape(self, session, chambers):
        """Scrape Vermont committees from the legislature's JSON feed."""
        year_slug = session[5:]

        # All committees are exposed through one private JSON endpoint.
        dump_url = (
            'http://legislature.vermont.gov/committee/loadList/{}/'
            .format(year_slug))
        committees = json.loads(self.get(dump_url).text)['data']

        for info in committees:
            # Strip stray whitespace from every field.
            info = {key: value.strip() for key, value in info.iteritems()}

            comm_type = info['CommitteeType']
            comm_name = info['CommitteeName']

            # Map the feed's committee type onto a chamber; study
            # committees and commissions are disambiguated by name.
            if comm_type == 'House Standing':
                chamber = 'lower'
            elif comm_type == 'Senate Standing':
                chamber = 'upper'
            elif comm_type == 'Joint Committee':
                chamber = 'joint'
            elif comm_type in ('Study Committee', 'Commissions'):
                if comm_name.startswith("House"):
                    chamber = 'lower'
                elif comm_name.startswith("Senate"):
                    chamber = 'upper'
                else:
                    chamber = 'joint'
            else:
                raise AssertionError(
                    "Unknown committee type found: '{}'".format(comm_type))

            comm = Committee(chamber=chamber, committee=comm_name)

            # The member list is HTML separated by </br>; strip the tags
            # and drop empty entries.
            raw_entries = info['Members'].split('</br>')
            members = [re.sub(r'<.*?>', '', entry) for entry in raw_entries]
            members = [entry.strip() for entry in members if entry.strip()]

            for member in members:
                # Strip legislator titles; anything else is flagged as a
                # non-legislator (e.g. committee assistants).
                for title in ("Rep. ", "Sen. "):
                    if member.startswith(title):
                        member = member[len(title):]
                        break
                else:
                    self.info("Non-legislator member found: {}".format(member))

                # A trailing ", Role" marks the member's committee role.
                if ',' in member:
                    (member, role) = [x.strip() for x in member.split(',')]
                    if 'jr' in role.lower() or 'sr' in role.lower():
                        raise AssertionError(
                            "Name suffix confused for a committee role")
                else:
                    role = 'member'

                comm.add_member(legislator=member, role=role)

            comm.add_source(dump_url)

            self.save_committee(comm)
Esempio n. 44
0
 def __missing__(self, key):
     """Create, cache, and return a joint Committee for *key*."""
     committee = Committee('joint', key)
     self[key] = committee
     return committee
Esempio n. 45
0
    def scrape_page(self, a, chamber, term):
        """Scrape one committee's membership from the page linked by *a*.

        Tries linked member-detail anchors first; if none are found,
        falls back to plain "committee members" tables.
        """
        page, text = self.lxmlize(a.attrib['href'])
        comm_name = a.text_content()

        # The page embeds a Twitter widget configured via setUser(...).
        twitter_ids = re.findall("setUser\('(.*)'\)", text)
        twitter_id = twitter_ids[0] if twitter_ids else None

        # NOTE(review): ", Vice-Chair" maps to plain "member" here —
        # confirm this is intentional.
        roles = {
            ", Chair": "chair",
            ", Vice-Chair": "member"
        }

        committee = Committee(chamber, comm_name, twitter=twitter_id)
        committee.add_source(a.attrib['href'])

        # First pass: anchors pointing at member-detail pages.
        seen = set()
        added = False
        for table in page.xpath("//table"):
            links = table.xpath(".//a[contains(@href, 'MemberDetailPage')]")
            for link in links:
                person = link.text_content().strip()
                role = "member"
                for suffix in roles:
                    if person.endswith(suffix):
                        role = roles[suffix]
                        person = person[:-len(suffix)].strip()
                if person in seen or not person:
                    continue
                seen.add(person)
                committee.add_member(person, role)
                added = True

        if added:
            self.save_committee(committee)
            return

        # Second pass: plain tables labeled "committee members".
        seen = set()
        added = False
        for table in page.xpath("//table"):
            if "committee members" not in table.text_content().lower():
                continue
            for cell_text in table.xpath(".//td/text()"):
                person = cell_text.strip()
                if not person or person in seen:
                    continue
                seen.add(person)
                committee.add_member(person, "member")
                added = True

        if added:
            self.save_committee(committee)
            return

        self.warning("Unable to scrape!")
Esempio n. 46
0
    def scrape_upper(self, chamber, term):
        """Scrape Senate committees from senate.ca.gov.

        Gathers the standing, sub, joint, and "other" committee lists
        from the index page, then visits each committee page to collect
        member names and roles.
        """
        url = 'http://senate.ca.gov/committees'
        doc = self.lxmlize(url)

        standing_committees = doc.xpath(
            '//h2[text()="Standing Committees"]/../following-sibling::div//a')
        sub_committees = doc.xpath(
            '//h2[text()="Sub Committees"]/../following-sibling::div//a')
        joint_committees = doc.xpath(
            '//h2[text()="Joint Committees"]/../following-sibling::div//a')
        other_committees = doc.xpath(
            '//h2[text()="Other"]/../following-sibling::div//a')

        for committee in (standing_committees + sub_committees +
                          joint_committees + other_committees):
            (comm_name, ) = committee.xpath('text()')
            comm = Committee(chamber=chamber, committee=comm_name)

            (comm_url, ) = committee.xpath('@href')
            comm.add_source(comm_url)
            comm_doc = self.lxmlize(comm_url)

            # Joint committees are re-chambered and have the boilerplate
            # ("Joint ", "Committee on ", " Committee") stripped.
            if comm_name.startswith("Joint"):
                comm['chamber'] = 'joint'
                comm['committee'] = (comm_name.replace("Joint ", "").replace(
                    "Committee on ", "").replace(" Committee", ""))

            # Subcommittees take their parent's name from the page banner
            # and keep only their own topic as the 'subcommittee' field.
            if comm_name.startswith("Subcommittee"):
                (full_comm_name,
                 ) = comm_doc.xpath('//div[@class="banner-sitename"]/a/text()')
                full_comm_name = re.search(r'^Senate (.*) Committee$',
                                           full_comm_name).group(1)
                comm['committee'] = full_comm_name

                comm_name = re.search(r'^Subcommittee.*?on (.*)$',
                                      comm_name).group(1)
                comm['subcommittee'] = comm_name

            # Member links point at senate (/sd...) or assembly pages and
            # begin with the legislator's title.
            members = comm_doc.xpath(
                '//a[(contains(@href, "/sd") or '
                'contains(@href, "assembly.ca.gov/a")) and '
                '(starts-with(text(), "Senator") or '
                'starts-with(text(), "Assembly Member"))]/text()')
            for member in members:
                if not member.strip():
                    continue

                # E.g. "Senator Jane Doe (Chair) (D)" -> name + optional role.
                (mem_name, mem_role) = re.search(
                    r'''(?ux)
                        ^(?:Senator|Assembly\sMember)\s  # Legislator title
                        (.+?)  # Capture the senator's full name
                        (?:\s\((.{2,}?)\))?  # There may be role in parentheses
                        (?:\s\([RD]\))?  # There may be a party affiliation
                        \s*$
                        ''', member).groups()
                comm.add_member(legislator=mem_name,
                                role=mem_role if mem_role else 'member')

            assert comm['members'], "No members found for committee {}".format(
                comm_name)
            self.save_committee(comm)
Esempio n. 47
0
    def select_special_comm(self):
        """Scrape Nebraska's select/special committees page.

        The two original branches were identical except that the
        chairperson role was capitalized inconsistently ('Chairperson'
        vs 'chairperson'); they are merged here and the role normalized
        to lowercase.
        """
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath('//div[@class="content_box"]'):
                # The committee name is either plain <h2> text or an
                # <h2><a> link; fall back to the link text when needed.
                name = comm_names.xpath('h2')[0].text
                if name is None:
                    name = comm_names.xpath('h2/a')[0].text

                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath(
                        'ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'chairperson'
                        # Drop the leading "Sen. " prefix (5 chars) and
                        # the trailing ", Chairperson" suffix (13 chars).
                        senator = senator[5:-13].strip()
                    else:
                        role = 'member'
                        # Drop the leading "Sen. " prefix only.
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
Esempio n. 48
0
    def _scrape_upper_chamber(self, session, chamber):
        """Scrape Senate standing committees for *session*.

        The Senate site was redesigned after 2015, so the listing URL,
        container id, and member markup all differ by era.  A dead
        ``self.get_node`` call whose result was immediately overwritten
        has been removed from the member loop.
        """
        self.log('Scraping upper chamber for committees.')

        if self._is_post_2015:
            url = '{base}{year}web/standing-committees'.format(
                base=self._senate_url_base, year=session[2:])
            comm_container_id = 'primary'
        else:
            url = '{base}{year}info/com-standing.htm'.format(
                base=self._senate_url_base, year=session[2:])
            comm_container_id = 'mainContent'

        page = self.lxmlize(url)

        comm_links = self.get_nodes(
            page, '//div[@id = "{}"]//p/a'.format(comm_container_id))

        for comm_link in comm_links:
            # Normalize to uppercase - varies between "Assigned bills" and "Assigned Bills"
            if "ASSIGNED BILLS" in comm_link.text_content().upper():
                continue

            comm_link = comm_link.attrib['href']

            # Committee pages live under different URL fragments per era.
            if self._is_post_2015:
                if "web" not in comm_link:
                    continue
            else:
                if "comm" not in comm_link:
                    continue

            comm_page = self.lxmlize(comm_link)

            if self._is_post_2015:
                comm_name = self.get_node(comm_page,
                                          '//h1[@class="entry-title"]/text()')
                members = self.get_nodes(
                    comm_page, '//div[@id="bwg_standart_thumbnails_0"]/a')
            else:
                comm_name = self.get_node(comm_page,
                                          '//div[@id="mainContent"]/p/text()')
                members = self.get_nodes(comm_page,
                                         '//div[@id="mainContent"]//td/a')

            comm_name = comm_name.replace(' Committee', '')
            comm_name = comm_name.strip()

            committee = Committee(chamber, comm_name)

            for member in members:
                mem_link = member.attrib["href"]
                # Only member-detail links contain "mem".
                if "mem" not in mem_link:
                    continue

                # Entries look like "Name, District[, Role]".
                mem_parts = member.text_content().strip().split(',')
                # Senator title stripping mainly for post-2015.
                mem_name = re.sub(r'^Senator[\s]+', '', mem_parts[0])

                #this one time, MO forgot the comma between
                #the member and his district. Very rarely relevant
                try:
                    int(mem_name[-4:-2])  #the district's # is in this position
                except ValueError:
                    pass
                else:
                    mem_name = " ".join(
                        mem_name.split(" ")[0:-1])  #member name fixed

                    #ok, so this next line. We don't care about
                    #the first 2 elements of mem_parts anymore
                    #so whatever. But if the member as a role, we want
                    #to make sure there are 3 elements in mem_parts and
                    #the last one is actually the role. This sucks, sorry.
                    mem_parts.append(mem_parts[-1])

                mem_role = 'member'
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower()

                if mem_name == "":
                    continue

                committee.add_member(mem_name, role=mem_role)
            committee.add_source(url)
            committee.add_source(comm_link)
            self.save_committee(committee)
Esempio n. 49
0
    def scrape_senate_committees(self, term_name, chamber):
        """Scrape Senate standing committees for each year of *term_name*.

        *term_name* looks like "2011-2012"; each year has its own
        committee listing page under senate_url_base.
        """
        # Two-digit year slugs, e.g. "2011-2012" -> ['11', '12'].
        years = [t[2:] for t in term_name.split('-')]
        for year in years:
            # Future years have no committee data yet; skip them.
            if int(year) > int(str(dt.datetime.now().year)[2:]):
                self.log("Not running session %s, it's in the future." %
                         (term_name))
                continue
            url = '{base}{year}info/com-standing.htm'.format(
                base=self.senate_url_base, year=year)
            page_string = self.get(url).text
            page = lxml.html.fromstring(page_string)
            comm_links = page.xpath('//div[@id = "mainContent"]//p/a')

            for comm_link in comm_links:
                # "Assigned bills" links are not committees.
                if "Assigned bills" in comm_link.text_content():
                    continue

                comm_link = comm_link.attrib['href']

                # Only committee pages contain "comm" in the URL.
                if not "comm" in comm_link:
                    continue

                comm_page = lxml.html.fromstring(self.get(comm_link).text)
                comm_name = comm_page.xpath(
                    "//div[@id='mainContent']/p/text()")[0].strip()
                comm_name = comm_name.replace(' Committee', '')
                comm_name = comm_name.strip()

                committee = Committee(chamber, comm_name)

                members = comm_page.xpath("//div[@id='mainContent']//li/a")
                for member in members:
                    mem_link = member.attrib["href"]
                    # Member links contain "members"; skip anything else.
                    if not "members" in mem_link:
                        continue
                    # Entries look like "Name, District[, Role]".
                    mem_parts = member.text_content().strip().split(',')
                    mem_name = mem_parts[0]

                    #this one time, MO forgot the comma between
                    #the member and his district. Very rarely relevant
                    try:
                        int(mem_name[-4:-2]
                            )  #the district's # is in this position
                    except ValueError:
                        pass
                    else:
                        mem_name = " ".join(
                            mem_name.split(" ")[0:-1])  #member name fixed

                        #ok, so this next line. We don't care about
                        #the first 2 elements of mem_parts anymore
                        #so whatever. But if the member as a role, we want
                        #to make sure there are 3 elements in mem_parts and
                        #the last one is actually the role. This sucks, sorry.
                        mem_parts.append(mem_parts[-1])

                    mem_role = 'member'
                    if len(mem_parts) > 2:
                        mem_role = mem_parts[2].lower()

                    if mem_name == "":
                        continue

                    committee.add_member(mem_name, role=mem_role)
                committee.add_source(url)
                committee.add_source(comm_link)
                self.save_committee(committee)
Esempio n. 50
0
    def scrape_house(self):
        """Scrape House members' committee assignments.

        Each table row pairs a member's name with a cell listing their
        committees (separated by <br> tags); committees are cached by
        name so every member lands in the same Committee object.
        """
        url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
        comm_cache = {}
        page = lxml.html.fromstring(self.get(url).text)

        # Role suffixes as they appear on the site, checked in order.
        # ("Co-Chairmain" is the site's own spelling.)
        suffix_roles = [
            (', Chairman', 'chairman'),
            (', Co-Chairmain', 'co-chairmain'),
            (', Vice Chair', 'vice chair'),
            (', Ex Officio', 'ex officio'),
            (", Interim Member", 'interim'),
        ]

        for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
            cells = row.xpath('td')
            name = cells[0].xpath('string()').strip()
            if name.startswith('Vacant'):
                continue

            # Committee names live in the second cell, split by <br>s.
            font = cells[1]
            committees = []
            if font is not None and font.text:
                committees.append(font.text.strip())
            for br in font.xpath('br'):
                if br.text:
                    committees.append(br.text.strip())
                if br.tail:
                    committees.append(br.tail)

            for comm_name in committees:
                mtype = 'member'
                for suffix, role in suffix_roles:
                    if comm_name.endswith(suffix):
                        mtype = role
                        comm_name = comm_name.replace(suffix, '')
                        break

                chamber = 'joint' if comm_name.startswith('Joint') else 'lower'

                if comm_name in comm_cache:
                    committee = comm_cache[comm_name]
                else:
                    if comm_name.strip() == "":
                        continue
                    committee = Committee(chamber, comm_name)
                    committee.add_source(url)
                    comm_cache[comm_name] = committee

                committee.add_member(name, mtype)

        # Merge in special committees, then persist everything.
        special = self.scrape_house_special(comm_cache.keys())
        for name, comm in special.items():
            comm_cache[name] = comm

        for committee in comm_cache.values():
            self.save_committee(committee)
Esempio n. 51
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Parse committee rosters out of a leadership PDF.

        The PDF is converted to text and walked line by line: committee
        headings start a new Committee, legislator lines (ending in a
        party tag like "(R") add members.  Joint subcommittees appear at
        the end of the Senate list and are scraped under 'joint'.

        Fix: the final save now guards against *comm* being None, which
        previously raised TypeError when no committee heading was found.
        """
        # The 2015 House PDF needs special text repair.
        if chamber == 'lower' and year == 2015:
            text = self._fix_house_text(filename)
        else:
            text = convert_pdf(filename, type='text-nolayout')

        # Committee names are sometimes wrapped across lines in the PDF;
        # rejoin the known offenders so heading detection works.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
             'Judicial Branch, Law Enforcement, and Justice'),
            (r'Natural Resources and\s+Transportation',
             'Natural Resources and Transportation'),
            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
             'Federal Relations, Energy, and Telecommunications')):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        def is_committee_name(line):
            # Heuristic: continuation headings are not new committees;
            # otherwise look for committee-ish keywords or a lone
            # title-cased word.
            if '(cont.)' in line.lower():
                return False
            for s in ('committee', ' and ', 'business', 'resources',
                      'legislative', 'administration', 'government', 'local',
                      'planning', 'judicial', 'natural',
                      'general', 'health', 'human', 'education'):
                if s in line.lower():
                    return True
            if line.istitle() and len(line.split()) == 1:
                return True
            return False

        def is_legislator_name(line):
            # Legislator lines carry a party tag like "(R" or "(D".
            return re.search(r'\([RD]', line)

        comm = None
        in_senate_subcommittees = False
        while True:
            try:
                line = lines.next()
            except StopIteration:
                break
            # Replace Unicode variants with ASCII equivalents
            line = line.replace(" ", " ").replace("‐", "-")

            if 'Subcommittees' in line:
                # These appear in both chambers' lists, so de-dup the scraping
                if chamber == 'lower':
                    break
                elif chamber == 'upper':
                    self.info("Beginning scrape of joint subcommittees")

                in_senate_subcommittees = True
                chamber = 'joint'
                continue

            if is_committee_name(line):
                subcommittee = None

                if in_senate_subcommittees:
                    committee = ('Joint Appropriations/Finance & Claims')
                    subcommittee = line.strip()
                else:
                    committee = line.strip()

                # Flush the previous committee before starting a new one.
                if comm and comm['members']:
                    self.save_committee(comm)

                comm = Committee(chamber,
                                 committee=committee,
                                 subcommittee=subcommittee)
                comm.add_source(url)

            elif is_legislator_name(line):
                name, party = line.rsplit('(', 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                if re.search(' Ch', party):
                    role = 'chair'
                elif ' VCh' in party:
                    role = 'vice chair'
                elif ' MVCh' in party:
                    role = 'minority vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # Save the final committee, if any was found.
        if comm and comm['members']:
            self.save_committee(comm)
Esempio n. 52
0
    def scrape_comm(self, url, chamber):
        """Scrape committees from the JSON listing posted at *url*.

        The listing names the chair and vice-chair directly; remaining
        members come from each committee's own page.
        """
        for item in self.post(url).json()['Data']:
            comm_name = item['CommitteeName']
            committee = Committee(chamber, comm_name)

            chairman = str(item['ChairName'])
            vice_chair = str(item['ViceChairName'])

            comm_url = self.get_comm_url(chamber, item['CommitteeId'],
                                         comm_name)
            members = self.scrape_member_info(comm_url)

            # The feed uses the string 'None' for vacant seats.
            if vice_chair != 'None':
                committee.add_member(vice_chair, 'Vice-Chair')
            if chairman != 'None':
                committee.add_member(chairman, 'Chairman')

            for member in members:
                # Skip entries already added as chair/vice-chair.
                if chairman in member or vice_chair in member:
                    continue
                # Collapse runs of whitespace inside the name.
                member = " ".join(member.split())
                if member:
                    committee.add_member(member)

            committee.add_source(comm_url)
            committee.add_source(url)
            self.save_committee(committee)
Esempio n. 53
0
    def scrape(self, chamber, term):
        """Scrape *chamber* committees for *term* from the XML API.

        The feed flags subcommittees, whose parent name is recovered by
        splitting the full name on 'Subcommittee'.
        """
        self.validate_term(term)
        session = self.get_session_for_term(term)
        try:
            session_id = self.get_session_id(session)
        except KeyError:
            raise NoDataForPeriod

        # not getting the floor committees maybe try it during the new session
        # for committee_type in ('S', 'F'):
        #     self.scrape_index(chamber, session, session_id, committee_type)

        url = base_url + 'xml/committees.asp?session=%s' % session_id

        with self.urlopen(url) as page:
            # The feed is occasionally malformed; parse in recovery mode.
            root = etree.fromstring(page.bytes, etree.XMLParser(recover=True))

            # Committees are filed under a per-chamber <body> element.
            body = '//body[@Body="%s"]/committee' % {
                'upper': 'S',
                'lower': 'H'
            }[chamber]
            for com in root.xpath(body):
                # NOTE(review): relies on the attribute order of the
                # <committee> element -- confirm against the feed schema.
                c_id, name, short_name, sub = com.values()
                # the really good thing about AZ xml api is that their committee element
                # tells you whether this is a sub committee or not
                if sub == '1':
                    # bad thing is that the committee names are no longer consistant
                    # so we can try to get the parent name:
                    parent = name.split('Subcommittee')[0].strip()
                    # and maybe the Sub Committee's name
                    try:
                        name = name[name.index('Subcommittee'):]
                    except ValueError:
                        # but if that doesn't work out then we will fix it manually
                        # shouldnt be too hard since parent and subcommittee will be the same
                        #self.log("I am my own grandpa: %s" % name)
                        pass

                    c = Committee(chamber,
                                  parent,
                                  short_name=short_name,
                                  subcommittee=name,
                                  session=session,
                                  az_committee_id=c_id)
                else:
                    c = Committee(chamber,
                                  name,
                                  short_name=short_name,
                                  session=session,
                                  az_committee_id=c_id)

                c.add_source(url)
                #for some reason they don't always have any info on the committees'
                try:
                    self.scrape_com_info(session, session_id, c_id, c)
                except HTTPError:
                    pass

                # Skip committees for which no members could be found.
                if not c['members']:
                    continue
                self.save_committee(c)
Esempio n. 54
0
    def scrape_joint_committee(self, committee_name, url):
        """Scrape a joint committee's membership from its page at *url*.

        Three page styles are handled: legacy state.tn.us tables, the
        Government Operations ("gov-opps") committee split across the
        House and Senate sites, and the current list-based layout.
        """
        com = Committee('joint', committee_name)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            if 'state.tn.us' in url:
                for el in page.xpath(
                        "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]/a"
                ):
                    member_name = el.text
                    # Strip the title prefix: len("Senator ") == 8,
                    # len("Representative ") == 15.  The 17-character
                    # fallback presumably strips some other title --
                    # TODO confirm which.
                    if 'Senator' in member_name:
                        member_name = member_name[8:len(member_name)]
                    elif 'Representative' in member_name:
                        member_name = member_name[15:len(member_name)]
                    else:
                        member_name = member_name[17:len(member_name)]
                    com.add_member(member_name, 'member')
            elif 'gov-opps' in url:
                # Membership is split across the Senate and House sites.
                links = ['senate', 'house']
                for link in links:
                    chamber_link = self.base_href + '/' + link + '/committees/gov-opps.html'
                    with self.urlopen(chamber_link) as chamber_page:
                        chamber_page = lxml.html.fromstring(chamber_page)
                        for mem in chamber_page.xpath(
                                "//div[@class='col1']/ul[position() <= 2]/li/a"
                        ):
                            # Entries look like "Name, Role" or just "Name".
                            member = [
                                item.strip()
                                for item in mem.text_content().split(',', 1)
                            ]
                            if len(member) > 1:
                                member_name, role = member
                            else:
                                member_name, role = member[0], 'member'
                            if member_name != "":
                                com.add_member(member_name, role)
                        com.add_source(chamber_link)
            else:
                # If the member sections all state "TBA", skip saving this committee.
                li_text = page.xpath(
                    "//div[@class='col1']/ul[position() <= 3]/li/text()")
                if set(li_text) == set(['TBA']):
                    return
                for el in page.xpath(
                        "//div[@class='col1']/ul[position() <= 3]/li/a"):
                    # Same "Name[, Role]" convention as the gov-opps pages.
                    member = [
                        item.strip()
                        for item in el.text_content().split(',', 1)
                    ]
                    if len(member) > 1:
                        member_name, role = member
                    else:
                        member_name, role = member[0], 'member'
                    if member_name != "":
                        com.add_member(member_name, role)
            com.add_source(url)
            self.save_committee(com)
Esempio n. 55
0
 def get_joint_committees_data(self, name, url):
     """Scrape one joint committee's roster page at *url* and save it.

     The page lays senate members out in the first column and house
     members in the second; each entry is "Name" or "Name, Role".
     (Refactor: the original had four near-identical copy-pasted
     branches and shadowed the *name* parameter with a member name.)
     """
     page = self.get(url).text
     html = lxml.html.fromstring(page)
     committee = Committee('joint', name)

     def _clean(text, junk):
         # Collapse unwanted tokens to spaces. Mirrors the original
         # behavior exactly: the ascii-encode/strip runs only when a
         # token was actually present in the text.
         for token in junk:
             if token in text:
                 text = text.replace(token,
                                     ' ').encode('ascii',
                                                 'ignore').strip()
         return text

     def _add_members(cells, chamber, title):
         # cells: <strong> nodes whose text runs are "Name[, Role]".
         if len(cells) > 0:
             member_string = list(cells[0].itertext())
             member_name = _clean(member_string[0],
                                  ['\r\n', u'\xa0', u'\u2013', title])
             if len(member_string) > 1:
                 role = _clean(member_string[1],
                               ['\r\n', u'\xa0', u'\u2013', ','])
                 committee.add_member(member_name, role=role,
                                      chamber=chamber)
             else:
                 committee.add_member(member_name, chamber=chamber)

     table = html.xpath("//section[@class=' row-equal-height no-padding']")
     for td in table:
         _add_members(td.xpath('div[1]/div/div/div[2]/div/p/strong'),
                      'senate', 'Sen.')
         _add_members(list(td.xpath('div[2]/div/div/div[2]/div/p/strong')),
                      'house', 'Rep.')
     committee.add_source(url)
     self.save_committee(committee)
Esempio n. 56
0
    def scrape_current(self, chamber, term):
        """Scrape current committees for *chamber* from the Kansas API."""
        if chamber == 'upper':
            com_types = ['special_committees', 'senate_committees']
        else:
            com_types = ['house_committees']

        committee_json = json.loads(self.urlopen(ksapi.url + 'ctte/'))

        for com_type in com_types:
            for committee_data in committee_json['content'][com_type]:

                # Special committees are recorded as joint committees.
                if com_type == 'special_committees':
                    com_chamber = 'joint'
                else:
                    com_chamber = chamber

                committee = Committee(com_chamber, committee_data['TITLE'])

                com_url = ksapi.url + 'ctte/%s/' % committee_data['KPID']
                try:
                    detail_json = self.urlopen(com_url)
                except scrapelib.HTTPError:
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)['content']

                # Leadership roles first, then plain members.
                for detail_key, role in (('CHAIR', 'chairman'),
                                         ('VICECHAIR', 'vice-chairman'),
                                         ('RMMEM', 'ranking member')):
                    for entry in details[detail_key]:
                        committee.add_member(entry['FULLNAME'], role)
                for entry in details['MEMBERS']:
                    committee.add_member(entry['FULLNAME'])

                if not committee['members']:
                    self.warning('skipping blank committee %s' %
                                 committee_data['TITLE'])
                else:
                    committee.add_source(com_url)
                    self.save_committee(committee)
Esempio n. 57
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Extract committee rosters from a converted PDF and save them.

        *filename* is a local PDF path; *url* is recorded as the source.
        Committee headers and member lines are recognized heuristically.
        """
        text = convert_pdf(filename, type='text-nolayout')

        # Hot garbage: rejoin committee names the PDF wrapped across lines.
        for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
             'Judicial Branch, Law Enforcement, and Justice'),
            (r'Natural Resources and\s+Transportation',
             'Natural Resources and Transportation'),
            (r'Federal Relations, Energy,\sand\sTelecommunications',
             'Federal Relations, Energy, and Telecommunications')):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

        def is_committee_name(line):
            # Continuation headers are never new committees.
            if '(cont.)' in line.lower():
                return False
            for s in ('committee', ' and ', 'business', 'resources',
                      'legislative', 'administration', 'government', 'local',
                      'planning', 'judicial', 'natural', 'resources',
                      'general', 'health', 'human'):
                if s in line.lower():
                    return True
            if line.istitle() and len(line.split()) == 1:
                return True
            return False

        def is_legislator_name(line):
            # Member lines always carry a party tag like "(R" or "(D".
            return re.search(r'\([RD]', line)

        comm = None
        in_senate_subcommittees = False
        while True:
            try:
                line = next(lines)
            except StopIteration:
                break

            if 'Joint Appropriations/Finance &' in line:
                # Toss the line continuation.
                next(lines)

                # Everything after this header is a joint subcommittee.
                in_senate_subcommittees = True
                chamber = 'joint'
                continue

            if is_committee_name(line):
                subcommittee = None

                if in_senate_subcommittees:
                    committee = ('Joint Appropriations/Finance & Claims')
                    subcommittee = line
                else:
                    committee = line

                # Flush the previous committee before starting a new one.
                if comm and comm['members']:
                    self.save_committee(comm)

                comm = Committee(chamber,
                                 committee=committee,
                                 subcommittee=subcommittee)
                comm.add_source(url)

            elif is_legislator_name(line):
                name, party = line.rsplit('(', 1)
                name = name.strip()
                if re.search('[^V] Ch', party):
                    role = 'chair'
                elif 'V Ch' in party:
                    role = 'vice chair'
                else:
                    role = 'member'
                comm.add_member(name, role)

        # BUG FIX: `comm` is still None when no committee header was ever
        # matched; the original unguarded `if comm['members']:` raised
        # TypeError here (the in-loop check above already guards on `comm`).
        if comm and comm['members']:
            self.save_committee(comm)
Esempio n. 58
0
    def scrape(self, term, chambers):
        """Scrape Indiana committees for *term* via the IGA API.

        *chambers* is accepted for interface compatibility; each
        committee's chamber comes from the API payload instead.
        """
        t = next(
            (item for item in self.metadata["terms"] if item["name"] == term),
            None)
        session = max(t["sessions"])

        subcomms = self.get_subcommittee_info(session)

        api_base_url = "https://api.iga.in.gov"
        html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(
            session)
        client = ApiClient(self)
        r = client.get("committees", session=session)
        all_pages = client.unpaginate(r)
        for comm_info in all_pages:
            # this is kind of roundabout, but needed in order
            # to take advantage of all of our machinery to make
            # sure we're not overloading their api
            comm_link = comm_info["link"]
            comm_name = comm_link.split("/")[-1]
            if "withdrawn" in comm_name or "conference" in comm_name:
                continue
            try:
                comm_json = client.get("committee",
                                       committee_link=comm_link[1:])
            except HTTPError:
                self.logger.warning("Page does not exist")
                continue
            try:
                chamber = comm_json["chamber"]["name"]
            except KeyError:
                chamber = 'joint'
            else:
                if chamber == "Senate":
                    chamber = "upper"
                elif chamber == "House":
                    chamber = "lower"
                else:
                    raise AssertionError(
                        "Unknown committee chamber {}".format(chamber))

            name = comm_json["name"]
            try:
                owning_comm = subcomms[name]
            except KeyError:
                # Not a subcommittee: create a top-level committee.
                name = name.replace("Statutory Committee on", "").strip()
                comm = Committee(chamber, name)
            else:
                name = name.replace("Statutory Committee on",
                                    "").replace("Subcommittee", "").strip()
                comm = Committee(chamber, owning_comm, subcommittee=name)

            chair = self.process_special_members(comm, comm_json, "chair")
            vicechair = self.process_special_members(comm, comm_json,
                                                     "viceChair")
            ranking = self.process_special_members(comm, comm_json,
                                                   "rankingMinMember")

            # leadership is also listed in membership
            # so we have to make sure we haven't seen them yet
            comm_members = [m for m in [chair, vicechair, ranking] if m]

            for mem in comm_json["members"]:
                mem_name = mem["firstName"] + " " + mem["lastName"]
                if mem_name not in comm_members:
                    comm_members.append(mem_name)
                    comm.add_member(mem_name)

            # BUG FIX: html_source was previously assigned only when
            # comm_name started with "committee_", so other names raised
            # NameError on the first pass and silently attached a *stale*
            # URL from an earlier committee on later passes.
            if comm_name.startswith("committee_"):
                comm.add_source(html_base_url + comm_name[len("committee_"):])

            comm.add_source(api_base_url + comm_link)
            self.save_committee(comm)
Esempio n. 59
0
    def scrape_joint_committee(self, committee_name, url):
        """Dispatch on the URL style and save the joint-committee roster."""
        if 'state.tn.us' in url:
            com = Committee('joint', committee_name)
            doc = lxml.html.fromstring(self.get(url).text)

            cells = doc.xpath(
                "//div[@class='Blurb']/table//tr[2 <= position() and  position() < 10]/td[1]"
            )
            for cell in cells:
                if cell.xpath('text()') == ['Vacant']:
                    continue

                (member_name, ) = cell.xpath('a/text()')
                stray_text = cell.xpath('text()')
                role = stray_text[0].strip(' ,') if stray_text else 'member'

                # Drop the title prefix ("Senator ", "Representative ",
                # or a longer honorific) before recording the name.
                if 'Senator' in member_name:
                    member_name = member_name[8:]
                elif 'Representative' in member_name:
                    member_name = member_name[15:]
                else:
                    member_name = member_name[17:]
                com.add_member(member_name, role)

            com.add_source(url)
            self.save_committee(com)

        elif 'gov-opps' in url:
            com = Committee('joint', committee_name)
            # The original fetched and parsed this page without using the
            # result; kept so the HTTP side effect is identical.
            lxml.html.fromstring(self.get(url).text)

            officer_xp = ('//h2[contains(text(), "Committee Officers")]/'
                          'following-sibling::div/ul/li/a')
            member_xp = ('//h2[contains(text(), "Committee Members")]/'
                         'following-sibling::div/ul/li/a')
            for wing in ('senate', 'house'):
                chamber_link = self.base_href + '/' + wing + '/committees/gov-opps.html'
                chamber_doc = lxml.html.fromstring(self.get(chamber_link).text)

                anchors = (chamber_doc.xpath(officer_xp) +
                           chamber_doc.xpath(member_xp))
                for a in anchors:
                    member_name = ' '.join(
                        [x.strip() for x in a.xpath('.//text()') if x.strip()])
                    role_nodes = a.xpath('small')
                    if role_nodes:
                        role = role_nodes[0].xpath('text()')[0].strip()
                    else:
                        role = 'member'
                    com.add_member(member_name, role)

                com.add_source(chamber_link)

            com.add_source(url)
            self.save_committee(com)

        else:
            self._scrape_committee(committee_name, url, 'joint')
Esempio n. 60
0
    def scrape(self, chamber, term):
        """Scrape Delaware committees for *chamber* during *term*.

        Joint committees are scraped exactly once, piggybacked on the
        'lower' pass.
        """

        urls = {
            'upper': 'http://legis.delaware.gov/LIS/LIS%s.nsf/SCommittees',
            'lower': 'http://legis.delaware.gov/LIS/LIS%s.nsf/HCommittees'
        }

        # Mapping of term names to session numbers (see metadata).
        term2session = {
            "2015-2016": "148",
            "2013-2014": "147",
            "2011-2012": "146"
        }

        session = term2session[term]

        if chamber == "lower":
            # only scrape joint comms once
            self.scrape_joint_committees(term, session)

        url = urls[chamber] % (session, )
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        for row in page.xpath('//tr'):
            if len(row.xpath('./td')) > 0:
                # if statement removes header tr (which has no <td> cells)
                comm = row.xpath('.//a')[1]
                comm_name = comm.text_content().strip()

                comm_url = comm.attrib["href"]

                # Fetch the committee's detail page for the member list.
                comm_page = lxml.html.fromstring(self.get(comm_url).text)
                comm_page.make_links_absolute(comm_url)
                committee = Committee(chamber, comm_name)
                committee.add_source(comm_url)
                committee.add_source(url)

                # The chair appears in a "sub_title" div as "Chairman: Name".
                chair = comm_page.xpath(".//div[@class='sub_title']")
                chair = chair[0].text.replace("Chairman:", "").strip()
                committee.add_member(chair, "Chairman")

                # Remaining members sit in two-cell tables: a header cell
                # ("Vice ..." or "Members:") and a content cell of names.
                for table in comm_page.xpath(".//table"):
                    header, content = table.xpath(".//td")
                    header = header.text_content().strip()
                    content = content.text_content().strip()
                    if "Vice" in header:
                        if content:
                            committee.add_member(content, "Vice-Chairman")
                    elif header == "Members:":
                        for m in content.split("\n"):
                            committee.add_member(m.strip())

                self.save_committee(committee)