Esempio n. 1
0
    def scrape_senate_committee(self, url):
        """Scrape a single senate committee page and save it with members.

        The committee name comes from the page's first <h3>; members are the
        links inside div#committeeright, with the role encoded in the text
        that follows each link.
        """
        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//h3/text()')[0]
        name = name.replace(' Committee', '')

        com = Committee(chamber='upper', committee=name)

        for member in doc.xpath('//div[@id="committeeright"]//a'):
            member_name = member.text.strip()

            # don't add clerks
            if member_name == 'Committee Clerk':
                continue

            # skip phone links
            if member.get("href").startswith("tel:"):
                continue

            # BUG FIX: member.tail is None when no text follows the link,
            # which made the `in` checks below raise TypeError.
            tail = member.tail or ''
            if 'Committee Chair' in tail:
                role = 'chair'
            elif 'Majority Vice' in tail:
                role = 'majority vice chair'
            elif 'Minority Vice' in tail:
                role = 'minority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        self.save_committee(com)
Esempio n. 2
0
    def scrape_committee(self, name, url, chamber):
        """Build a committee from its membership page and save it.

        Roles are parsed from a trailing " (...)" suffix on each member name;
        an unrecognized suffix is a hard error.
        """
        com = Committee(chamber, name)
        com.add_source(url)
        doc = lxml.html.fromstring(self.get(url).text)

        member_xpath = '//div[@id="members"]/div[@id="members"]/p/a/text()'
        for raw in doc.xpath(member_xpath):
            # Strip chamber titles and surrounding whitespace.
            member = raw.replace('Representative ', '').replace('Senator ', '').strip()
            role = 'member'
            if ' (' in member:
                member, raw_role = member.split(' (')
                if 'Vice-Chair' in raw_role:
                    role = 'vice-chair'
                elif 'Co-Chair' in raw_role:
                    role = 'co-chair'
                elif 'Chair' in raw_role:
                    role = 'chair'
                else:
                    raise Exception('unknown role: %s' % raw_role)
            com.add_member(member, role)

        self.save_committee(com)
Esempio n. 3
0
    def scrape_committee(self, chamber, url):
        """Scrape one committee page, skipping malformed pages."""
        doc = lxml.html.fromstring(self.urlopen(url))

        names = doc.xpath('//span[@class="committeeShortName"]/text()')
        # Because of http://www.malegislature.gov/Committees/Senate/S29 this
        # XXX: hack had to be pushed in. Remove me ASAP. This just skips
        #      malformed pages.
        if not names:
            self.warning("Had to skip this malformed page.")
            return

        com = Committee(chamber, names[0])
        com.add_source(url)

        # get both titles and names, order is consistent
        titles = doc.xpath('//p[@class="rankingMemberTitle"]/text()')
        ranking = doc.xpath('//p[@class="rankingMemberName"]/a/text()')
        for title, member in zip(titles, ranking):
            com.add_member(member, title)

        for member in doc.xpath('//div[@class="committeeRegularMembers"]//a/text()'):
            com.add_member(member)

        # Only persist committees that actually have members.
        if com['members']:
            self.save_committee(com)
Esempio n. 4
0
    def scrape_assembly(self):
        """Scrape Assembly Committees.

        Collects standing-committee member pages from the assembly index and
        saves one Committee per page; the first listed member is the chair.
        """
        assembly_committees_url = "http://assembly.state.ny.us/comm/"

        with self.urlopen(assembly_committees_url) as html:
            doc = lxml.html.fromstring(html)
            standing_committees, subcommittees, legislative_commissions, task_forces = doc.cssselect('#sitelinks ul')
            committee_paths = set(l.get('href') for l in standing_committees.cssselect("li a[href]")
                                  if l.get("href").startswith('?sec=mem'))

        for committee_path in committee_paths:
            committee_url = assembly_committees_url + committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)

                # BUG FIX: committee_name used to be unbound (NameError) on
                # the first iteration — or silently stale from the previous
                # page — when no heading matched; now such pages are skipped.
                committee_name = None
                for h in cdoc.cssselect("#content .pagehdg"):
                    if h.text:
                        committee_name = h.text.split('Committee Members')[0].strip()
                        break
                if committee_name is None:
                    self.warning("no committee name found at %s" % committee_url)
                    continue

                committee = Committee("lower", committee_name)
                committee.add_source(committee_url)
                members = cdoc.cssselect("#sitelinks")[0]

                # The first listed member is the chair.
                for i, member in enumerate(members.iter('span')):
                    member_name = member.xpath('li/a')[0].text
                    if i == 0:
                        committee.add_member(member_name, 'chair')
                    else:
                        committee.add_member(member_name)

                self.save_committee(committee)
Esempio n. 5
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees and their members."""
        main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"]'):
                # The committee name is either the plain h2 text or, when the
                # h2 wraps a link, the link's text. The original duplicated
                # the entire member loop in both branches; deduplicated here.
                name = comm_names.xpath('h2')[0].text
                if name is None:
                    name = comm_names.xpath('h2/a')[0].text

                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    # CONSISTENCY FIX: the two original branches disagreed
                    # ('Chairperson' vs 'chairperson'); normalized to lowercase.
                    if 'Chairperson' in senator:
                        role = 'chairperson'
                        # magic offsets strip surrounding title markup
                        # (taken verbatim from the original) — TODO confirm
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
Esempio n. 6
0
    def scrape_upper_committee(self, name, url):
        """Scrape an upper-chamber committee roster, de-duplicating members."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            comm = Committee('upper', name)
            comm.add_source(url)

            member_div = page.xpath("//div[@class = 'committee-members']")[0]

            seen = set()
            for link in member_div.xpath(".//a"):
                if not link.text:
                    continue

                member = link.text.strip()

                # Sometimes NY is cool and splits names across a
                # couple links
                sibling = link.getnext()
                split_name = (sibling is not None
                              and sibling.tag == 'a'
                              and sibling.attrib['href'] == link.attrib['href'])
                if split_name:
                    member = "%s %s" % (member, sibling.text.strip())

                member = re.sub(r'\s+', ' ', member)

                if not member or member in seen:
                    continue
                seen.add(member)

                member_name, role = parse_name(member)
                comm.add_member(member_name, role)

            self.save_committee(comm)
Esempio n. 7
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a lower-chamber (or joint) committee page and save it.

        If `parent` is given the page is treated as a subcommittee of it.
        "Joint" in either name selects the joint chamber.
        """
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        if 'Joint' in name or (parent and 'Joint' in parent):
            chamber = 'joint'
        else:
            chamber = 'lower'

        if parent:
            comm = Committee(chamber, parent, subcommittee=name)
        else:
            comm = Committee(chamber, name)
        comm.add_source(url)

        xpath = "//a[contains(@href, 'District')]"
        for link in page.xpath(xpath):
            member = link.xpath('string()').strip()
            member = re.sub(r'\s+', ' ', member)

            if not member:
                continue

            match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member)
            # BUG FIX: links not matching the "Rep. ..." pattern used to
            # raise AttributeError on match.group(); skip them instead.
            if match is None:
                continue
            member = match.group(4).strip()
            role = match.group(1) or 'member'

            comm.add_member(member, role.lower())

        self.save_committee(comm)
Esempio n. 8
0
    def standing_comm(self):
        """Scrape Nebraska standing committees from their detail pages."""
        main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_links in page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a'):
                detail_link = comm_links.attrib['href']

                with self.urlopen(detail_link) as detail_page:
                    detail_page = lxml.html.fromstring(detail_page)
                    name = detail_page.xpath('/html/body[@class="home blog"]/div[@id="page"]/div[@id="content"]/div[@class="content_header"]/div[@class="content_header_right"]/a')[0].text
                    # Drop the heading's last word and rebuild the name;
                    # str.join replaces the original manual concatenation
                    # loop (which appended then sliced a trailing space).
                    comm_name = ' '.join(name.split()[0:-1])
                    committee = Committee('upper', comm_name)

                    for senators in detail_page.xpath('/html/body[@class="home blog"]/div[@id="page"]/div[@id="sidebar"]/ul[1]/li[1]/ul/li/a'):
                        senator = senators.text
                        if 'Chairperson' in senator:
                            role = 'Chairperson'
                            # magic offsets strip title markup around the
                            # name (kept verbatim) — TODO confirm
                            senator = senator[6:-13]
                        else:
                            role = 'member'
                            senator = senator[6:-1]
                        committee.add_member(senator, role)
                    committee.add_source(main_url)
                    committee.add_source(detail_link)
                    self.save_committee(committee)
Esempio n. 9
0
    def scrape_lower_committee(self, name, parent, url):
        """Scrape a lower-chamber (or joint) committee page and save it."""
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        chamber = "joint" if ("Joint" in name or (parent and "Joint" in parent)) else "lower"

        if parent:
            comm = Committee(chamber, parent, subcommittee=name)
        else:
            comm = Committee(chamber, name)
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'District')]"):
            raw = re.sub(r"\s+", " ", link.xpath("string()").strip())
            if not raw:
                continue

            # An optional leading "(Co-)(Vice )Chair" is the role; the text
            # after "Rep. " (up to any parenthesis) is the member's name.
            parsed = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", raw)
            comm.add_member(parsed.group(4).strip(),
                            (parsed.group(1) or "member").lower())

        self.save_committee(comm)
Esempio n. 10
0
    def scrape(self, chamber, term):
        """Scrape standing committees for one chamber in the latest term."""
        self.validate_term(term, latest_only=True)

        chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

        url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
                comm_name = comm_link.text.strip()

                # Drop leading "House" or "Senate" from name
                comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

                comm = Committee(chamber, comm_name)
                # BUG FIX: the committee was saved with no source URL, unlike
                # every other scraper in this module.
                comm.add_source(url)

                for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):

                    name = mbr_link.text.strip()

                    # An <i> element right after the member link holds the role.
                    next_el = mbr_link.getnext()
                    if next_el is not None and next_el.tag == 'i':
                        # renamed from `type`, which shadowed the builtin
                        role = next_el.text.strip()
                    else:
                        role = 'member'

                    comm.add_member(name, role)

                self.save_committee(comm)
Esempio n. 11
0
    def scrape_approp_subcommittees(self, url):
        """Scrape Appropriations subcommittees; roles are name suffixes."""
        doc = lxml.html.fromstring(self.urlopen(url))

        # Suffix on a legislator's name encodes the member's role.
        # Checked longest-first so the slices below remove the exact suffix.
        suffix_roles = (
            ('(MVC)', 'minority vice chairman'),
            ('(VC)', 'vice chairman'),
            ('(C)', 'chairman'),
        )

        for strong in doc.xpath('//strong'):
            com = Committee(chamber='upper', committee='Appropriations',
                            subcommittee=strong.text.strip())
            com.add_source(url)

            legislators = strong.getnext().tail.replace('Senators', '').strip()
            for leg in re.split(', | and ', legislators):
                role = 'member'
                for suffix, suffix_role in suffix_roles:
                    if leg.endswith(suffix):
                        role = suffix_role
                        # also drop the space before the suffix
                        leg = leg[:-(len(suffix) + 1)]
                        break
                com.add_member(leg, role=role)

            self.save_committee(com)
Esempio n. 12
0
    def scrape(self, term, chambers):
        """Scrape DC Council committees from the committees index."""
        com_url = 'http://www.dccouncil.washington.dc.us/committees'
        doc = lxml.html.fromstring(self.urlopen(com_url))
        doc.make_links_absolute(com_url)

        for url in set(doc.xpath('//a[contains(@href, "committee-on")]/@href')):
            doc = lxml.html.fromstring(self.urlopen(url))

            # The name lives in an h1 on most pages, an h2 on the rest.
            try:
                name = doc.xpath('//h1/text()')[0].replace('Committee on ', '')
            except IndexError:
                name = doc.xpath('//h2/text()')[0].replace('Committee on ', '')

            # skip link to Committees page
            if name == 'Committees':
                continue

            com = Committee('upper', name)

            for chair in doc.xpath('//h3[text()="Committee Chair"]/following-sibling::p'):
                com.add_member(chair.text_content(), role='chairperson')

            for member in doc.xpath('//h3[text()="Councilmembers"]/following-sibling::ul//a'):
                com.add_member(member.text_content(), role='member')

            com.add_source(url)
            self.save_committee(com)
Esempio n. 13
0
    def scrape_upper_committee(self, url):
        """Extract committees and members from the senate PDF roster.

        The PDF is converted to XML; bold lines starting with "Comisi" open
        a new committee and subsequent "Hon." lines list its members, with
        the role after a dash (Presidente, Vicepresidente, Secretario...).
        """
        filename, resp = self.urlretrieve(url)
        root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))
        try:
            for page in root.xpath('/pdf2xml/page'):
                comm = None
                for line in page.findall('text'):
                    text = line.findtext('b')
                    if text is not None and text.startswith('Comisi'):
                        comm = Committee('upper', text)
                        comm.add_source(url)
                    elif line.text and line.text.startswith('Hon.'):
                        # BUG FIX: member lines appearing before any
                        # committee header used to crash on a None committee.
                        if comm is None:
                            continue
                        line_text = line.text.replace(u'–', '-')
                        name_split = line_text.split(u'-', 1)
                        title = 'member'
                        if len(name_split) >= 2:
                            name_split[1] = name_split[1].strip()
                            if name_split[1] in ('Presidenta', 'Presidente'):
                                title = 'chairman'
                            elif name_split[1] in ('Vicepresidente', 'Vicepresidenta'):
                                title = 'vicechairman'
                            elif name_split[1] in ('Secretaria', 'Secretario'):
                                title = 'secretary'
                        if name_split[0] != 'VACANTE':
                            comm.add_member(name_split[0].replace('Hon.', ''), title)
                # BUG FIX: the original passed None to save_committee when a
                # page contained no committee header.
                # NOTE(review): like the original, only the last committee
                # on a page is saved — confirm pages hold one committee each.
                if comm is not None:
                    self.save_committee(comm)
        finally:
            # BUG FIX: always remove the downloaded PDF, even on error.
            os.remove(filename)
Esempio n. 14
0
    def scrape_committee(self, term, chambers, href, name):
        """Scrape one committee page, deriving the chamber from its URL."""
        page = self.get(href).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(href)
        members = page.xpath("//div[@class='view-content']"
                             "//a[contains(@href, 'members')]")

        if "/joint/" in href:
            chamber = "joint"
        elif "/senate/" in href:
            chamber = "upper"
        elif "/house/" in href:
            chamber = "lower"
        else:
            # BUG FIX: replaced a bare Python 2 `print` statement with the
            # scraper's logging helper used elsewhere in this module.
            self.warning("XXX: Fail! %s" % (href))
            return

        cttie = Committee(chamber, name)

        for a in members:
            member = a.text
            # The enclosing pane's title tells us the member's role.
            role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0]
            role = {"Legislative Members": "member",
                    "Chairman": "chair",
                    "Vice Chairman": "member"}[role]

            if member is None or member.startswith("District"):
                continue

            cttie.add_member(member, role=role)

        cttie.add_source(href)
        self.save_committee(cttie)
Esempio n. 15
0
    def scrape_joint_committee(self, url):
        """Scrape a joint committee page, de-duplicating (name, title) pairs."""
        doc = lxml.html.fromstring(self.urlopen(url))

        # The heading is an h1 or, failing that, an h2.
        headings = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
        comm = Committee('joint', headings[0].strip())
        comm.add_source(url)

        member_links = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                             doc.xpath('//a[contains(@href, "Senators")]'))

        seen = set()
        for a in member_links:
            parent_content = a.getparent().text_content()
            # A "Title: ..." container carries the member's role.
            if ':' in parent_content:
                title = parent_content.split(':')[0].strip()
            else:
                title = 'member'

            member = a.text.split(' (')[0].strip()
            if (member, title) not in seen:
                seen.add((member, title))
                comm.add_member(member, title)

        if comm['members']:
            self.save_committee(comm)
Esempio n. 16
0
    def _scrape_upper_committee(self, name, url2):
        """Scrape a senate committee's assignment page for its members."""
        cat = "Assignments.asp"
        url3 = "".join((url2, cat))

        committee = Committee('upper', name)
        committee.add_source(url2)

        page = self.lxmlize(url3)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        # First listed member chairs the committee, the second is vice-chair.
        for i, link in enumerate(members):
            if i == 0:
                role = "Chairman"
            elif i == 1:
                role = "Vice-Chairman"
            else:
                role = "member"

            member_name = link.xpath('string()')
            member_name = member_name.replace('Senator ', '')
            # BUG FIX: raw string — '[\s]' in a plain string is an invalid
            # escape sequence (warning on modern Python, error on 3.12+).
            member_name = re.sub(r'[\s]{2,}', ' ', member_name).strip()

            committee.add_member(member_name, role)

        self.save_committee(committee)
Esempio n. 17
0
    def scrape_upper(self):
        """Scrape the senate committee listing table and save each committee."""
        url = "http://senadopr.us/Lists/Listado%20de%20Comisiones/Comisiones%20del%20Senado.aspx"
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
            table = doc.xpath(
                '//table[@id="{C05AFE0D-D977-4033-8D7B-C43ABF948A4A}-{3E52C91B-AFC8-4493-967A-C8A47AC4E7B6}"]'
            )

            for row in table[0].iterchildren("tr"):
                cells = list(row)
                anchor = cells[0].find("a")
                if anchor is None:
                    continue

                com_source = anchor.get("href")
                # if committee does not have a url use the default.
                if com_source == "http://senadopr.us/":
                    com_source = url

                # check the committee name to see if it's a join one.
                if cells[1].text == "Comisi\xf3n Conjunta":
                    chamber = "joint"
                else:
                    chamber = "upper"

                com = Committee(chamber, anchor.text)
                com.add_source(com_source)
                com.add_member(clean_spaces(cells[2].find("a").text), "chairman")
                self.save_committee(com)
Esempio n. 18
0
    def scrape_senate_committee(self, name, url2):
        """Scrape a senate committee's assignment page for its members."""
        cat = "Assignments.asp"
        url3 = "".join((url2, cat))

        committee = Committee("upper", name)
        committee.add_source(url2)

        text = self.get(url3).text
        page = lxml.html.fromstring(text)

        members = page.xpath('//table[@id="table38"]//font/a/b')

        for link in members:
            # First listed member is the chairman, second the vice-chairman.
            role = "member"
            if link == members[0]:
                role = "Chairman"
            if link == members[1]:
                role = "Vice-Chairman"

            member_name = link.xpath("string()")
            member_name = member_name.replace("Senator ", "")
            # BUG FIX: raw string — "[\s]" in a plain string is an invalid
            # escape sequence (warning on modern Python, error on 3.12+).
            member_name = re.sub(r"[\s]{2,}", " ", member_name).strip()

            committee.add_member(member_name, role)

        self.save_committee(committee)
Esempio n. 19
0
    def select_special_comm(self):
        """Scrape Nebraska select/special committees and their members."""
        main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php"
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_names in page.xpath('//div[@class="content_box"]'):
                # Name is the plain h2 text or, when the h2 wraps a link,
                # the link's text. The original duplicated the entire member
                # loop in both branches; deduplicated here.
                name = comm_names.xpath("h2")[0].text
                if name is None:
                    name = comm_names.xpath("h2/a")[0].text

                committee = Committee("upper", name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    # CONSISTENCY FIX: the duplicated branches disagreed
                    # ('Chairperson' vs 'chairperson'); normalized to lower.
                    if "Chairperson" in senator:
                        role = "chairperson"
                        # magic offsets strip surrounding title markup
                        # (kept verbatim from the original) — TODO confirm
                        senator = senator[5:-13].strip()
                    else:
                        role = "member"
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
Esempio n. 20
0
    def scrape_committee(self, chamber, name, url):
        """Scrape a committee page; detects joint committees and subcommittees."""
        page = lxml.html.fromstring(self.urlopen(url))

        # A "Joint Committee" heading overrides the chamber passed in.
        if page.xpath("//h3[. = 'Joint Committee']"):
            chamber = 'joint'

        heading = page.xpath("//h3[@align='center']/text()")[0]
        subcommittee = heading if "Subcommittee" in heading else None

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'member=')]"):
            member = link.text.strip()

            # The member's role sits in the preceding table cell.
            mtype = link.xpath("string(../preceding-sibling::td[1])")
            comm.add_member(member, mtype.strip(": \r\n\t").lower())

        if not comm['members']:
            self.warning('not saving %s, appears to be empty' % name)
        else:
            self.save_committee(comm)
Esempio n. 21
0
    def scrape_senate_comm(self):
        """Scrape senate standing committees from the Maine senate site."""
        url = 'http://www.maine.gov/legis/senate/Senate-Standing-Committees.html'

        doc = lxml.html.fromstring(self.urlopen(url))

        # committee titles
        for item in doc.xpath('//span[@style="FONT-SIZE: 11pt"]'):
            text = item.text_content().strip()
            # some contain COMMITTEE ON & some are blank, drop those
            if not text or text.startswith('COMMITTEE'):
                continue

            # titlecase committee name
            com = Committee('upper', text.title())
            com.add_source(url)

            # up two and get ul sibling
            for leg in item.xpath('../../following-sibling::ul[1]/li'):
                full_text = leg.text_content().strip()
                role = 'chair' if 'Chair' in full_text else 'member'
                # keep only the part before " of ..."
                lname = full_text.split(' of ')[0].strip()
                com.add_member(lname, role)

            self.save_committee(com)
Esempio n. 22
0
    def scrape_senate_committee(self, term, link):
        """Scrape one Minnesota senate committee membership page."""
        with self.urlopen(link) as html:
            doc = lxml.html.fromstring(html)

            # strip first 30 and last 10
            # Minnesota Senate Committees - __________ Committee
            committee_name = doc.xpath('//title/text()')[0][30:-10]

            com = Committee('upper', committee_name)

            # BUG FIX: `role` was unbound (NameError) when the first row had
            # no "Role:" prefix; default to member. A "Role:" row still sets
            # the role for the plain-name rows that follow it, as before.
            role = 'member'

            # first id=bio table is members
            for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
                row = fix_whitespace(row.text_content())

                # switch role
                if ':' in row:
                    position, name = row.split(': ')
                    role = position.lower().strip()
                else:
                    name = row

                # add the member
                com.add_member(name.strip(), role)

            com.add_source(link)
            self.save_committee(com)
Esempio n. 23
0
    def scrape_house_committee(self, committee_name, link):
        """Scrape individual committee page and add members"""

        doc = lxml.html.fromstring(self.urlopen(link))

        # Detect subcommittee pages from any page heading.
        is_subcommittee = any('subcommittee' in h1.lower()
                              for h1 in doc.xpath('//h1/text()'))

        if is_subcommittee:
            committee_name = committee_name.replace(' Subcommittee', '')
            com = Committee('lower', committee_name, 'Subcommittee')
        else:
            com = Committee('lower', committee_name, None)

        # Members live in the first two lists of the left column; the link
        # tail (if any) carries the member's role.
        for a in doc.xpath("//div[@class='col1']/ul[position()<3]/li/a"):
            member = a.text
            role = (a.tail or '').strip(', ') or 'member'
            if member:
                com.add_member(member, role)

        com.add_source(link)
        if com['members']:
            self.save_committee(com)
Esempio n. 24
0
    def scrape_reps_comm(self):
        """Scrape Maine house committees and their members."""
        url = 'http://www.maine.gov/legis/house/hsecoms.htm'

        root = lxml.html.fromstring(self.urlopen(url))

        # Headings sit in every other <center> element; the matching member
        # list is the count-th <ul> in the body.
        for count, n in enumerate(range(1, 12, 2), start=1):
            comm_name = root.xpath('string(//body/center[%s]/h1/a)' % (n))
            committee = Committee('lower', comm_name)

            for el in root.xpath('/html/body/ul[%s]/li/a' % (count)):
                rep = el.text
                paren = rep.find('(')
                if paren != -1:
                    rep = rep[15:paren].strip()
                if 'chair' in rep.lower():
                    role = 'chair'
                    rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
                else:
                    role = 'member'
                committee.add_member(rep, role)
            committee.add_source(url)

            self.save_committee(committee)
Esempio n. 25
0
    def scrape_reps_comm(self):
        """Scrape Maine house committees and their members."""
        url = 'http://www.maine.gov/legis/house/hsecoms.htm'

        with self.urlopen(url) as page:
            root = lxml.html.fromstring(page)

            # Headings sit in every other <center>; member lists are the
            # matching <ul> elements counted from 1.
            for count, n in enumerate(range(1, 12, 2), start=1):
                comm_name = root.xpath('string(//body/center[%s]/h1/a)' % (n))
                committee = Committee('lower', comm_name)

                for el in root.xpath('/html/body/ul[%s]/li/a' % (count)):
                    rep = el.text
                    paren = rep.find('(')
                    if paren != -1:
                        # no .strip() here, matching the original behavior
                        rep = rep[15:paren]
                    committee.add_member(rep)
                committee.add_source(url)

                self.save_committee(committee)
Esempio n. 26
0
    def scrape_senate_committee(self, url):
        """Scrape a single senate committee page and save it with members.

        The committee name comes from the first <h6>; members are links in
        div#committeelist, with the role encoded in the trailing text.
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//h6/text()')[0]

        com = Committee(chamber='upper', committee=name)

        for member in doc.xpath('//div[@id="committeelist"]//a'):
            member_name = member.text.strip()

            # don't add clerks
            if member_name == 'Committee Clerk':
                continue

            # BUG FIX: member.tail is None when no text follows the link,
            # which made the `in` checks below raise TypeError.
            tail = member.tail or ''
            if 'Committee Chair' in tail:
                role = 'chair'
            elif 'Majority Vice' in tail:
                role = 'majority vice chair'
            elif 'Minority Vice' in tail:
                role = 'minority vice chair'
            else:
                role = 'member'

            com.add_member(member_name, role=role)

        com.add_source(url)
        self.save_committee(com)
Esempio n. 27
0
    def scrape(self, chamber, term):
        """Scrape assembly committees from the membership list page.

        The committee panel alternates (name, spacer, content) div triples
        after the section headers are filtered out.
        """
        url = "http://www.assembly.ab.ca/net/index.aspx?p=membership_list"
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        committees = doc.xpath('//div[@id="_ctl0_Panel_committees"]')
        divs = committees[0].xpath("div")[1:]
        # Drop the section-header divs, keeping only name/spacer/content triples.
        divs = [div for div in divs
                if div.attrib.get("class") != "committeetype_header"]
        divs = iter(divs)

        while True:
            try:
                name, _, content = itertools.islice(divs, 3)
            # BUG FIX: the original `except ValueError, StopIteration:` is
            # Python 2 syntax that caught only ValueError while binding it
            # to the name StopIteration (and is a syntax error on Python 3).
            # Unpacking an exhausted islice raises ValueError, so that is
            # the correct terminator.
            except ValueError:
                break

            committee_name = name.text_content()[4:]
            committee = Committee("lower", committee_name)
            for td in content.xpath("table/descendant::td"):
                if td.xpath('a[contains(@href, "number")]'):
                    member = td.xpath("a")[0].text_content()
                    # The link tail carries the role in parentheses, if any.
                    role = (td.xpath("a")[0].tail or "").strip("() ")
                    committee.add_member(member, role or "member")

            xpath = 'table/descendant::td/a[contains(@href, "committees")]/@href'
            committee_url = content.xpath(xpath).pop()
            committee.add_source(url)
            committee.add_source(committee_url)
            self.save_committee(committee)
Esempio n. 28
0
    def _scrape_lower_special_committees(self):
        """Scrape the House special (and joint) committees accordion."""
        url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
        page = self.lxmlize(url)

        committee_list = page.xpath('//table[@id="table106"]//div[@class='
            '"exBody1A"]/div[@class="accordion"]')[0]

        for header in committee_list.xpath('./h3'):
            committee_name = self._normalize_committee_name(
                header.xpath('string()').strip())

            # "Joint ..." committees belong to both chambers.
            chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

            committee = Committee(chamber, committee_name)
            committee.add_source(url)

            member_rows = header.xpath('./following-sibling::div['
                '@class="pane"]//tr[@class="linkStyle2"]')

            for row in member_rows:
                raw_name = row.xpath('normalize-space(string(./td[1]))')
                member_name = ' '.join(filter(None, name_tools.split(raw_name)))
                member_role = self._normalize_member_role(
                    row.xpath('normalize-space(string(./td[2]))'))

                committee.add_member(member_name, member_role)

            self.save_committee(committee)
Esempio n. 29
0
    def standing_comm(self):
        """Scrape Nebraska standing committees.

        Nebraska's legislature is unicameral, so every committee is filed
        under the 'upper' chamber. Each committee's detail page supplies
        the name (heading) and the senator list (sidebar).
        """
        main_url = "http://www.nebraskalegislature.gov/committees/standing-committees.php"
        with self.urlopen(main_url) as page:
            page = lxml.html.fromstring(page)

            for comm_links in page.xpath(
                '//div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a'
            ):
                detail_link = comm_links.attrib["href"]

                with self.urlopen(detail_link) as detail_page:
                    detail_page = lxml.html.fromstring(detail_page)
                    name = detail_page.xpath(
                        '//div[@id="content"]/div[@class="content_header"]/div[@class="content_header_right"]/a'
                    )[0].text
                    # Drop the trailing word of the heading (e.g.
                    # "Committee") and rejoin — replaces the original
                    # manual concatenate-then-trim loop.
                    comm_name = " ".join(name.split()[:-1])
                    committee = Committee("upper", comm_name)

                    for senators in detail_page.xpath('//div[@id="sidebar"]/ul[1]/li[1]/ul/li/a'):
                        senator = senators.text
                        if "Chairperson" in senator:
                            role = "Chairperson"
                            # Strip the "Sen. " prefix and the
                            # ", Chairperson" suffix.
                            senator = senator[6:-13].strip()
                        else:
                            role = "member"
                            # Strip the "Sen. " prefix only.
                            senator = senator[6:].strip()
                        committee.add_member(senator, role)
                    committee.add_source(main_url)
                    committee.add_source(detail_link)
                    self.save_committee(committee)
Esempio n. 30
0
    def scrape(self, chamber, term):
        """Scrape PA committee assignments from the per-member listing.

        Each member's <li> carries one <a> per committee assignment; role
        and optional subcommittee are in the text following the link.
        Committees are accumulated keyed by (chamber, name, subcommittee)
        and saved once at the end.
        """
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        if chamber == 'upper':
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/senators_ca.cfm')
        else:
            url = ('http://www.legis.state.pa.us/cfdocs/legis/'
                   'home/member_information/representatives_ca.cfm')

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            committees = {}

            for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
                member_name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
                # Trim the trailing 4 characters of the bio-link text.
                member_name = member_name[0:-4]

                for link in li.xpath("a"):
                    if not link.tail:
                        continue

                    committee_name = re.sub(r"\s+", " ", link.tail.strip())
                    subcommittee_name = None
                    role = 'member'

                    rest = link.getnext().text
                    if rest:
                        match = re.match(r',\s+(Subcommittee on .*)\s+-',
                                         rest)
                        if match:
                            subcommittee_name = match.group(1)
                            role = rest.split('-')[1].strip().lower()
                        else:
                            role = rest.replace(', ', '').strip().lower()

                        if role == 'chairman':
                            role = 'chair'

                    key = (chamber, committee_name, subcommittee_name)
                    if key in committees:
                        committee = committees[key]
                    else:
                        committee = Committee(chamber, committee_name)
                        committee.add_source(url)

                        if subcommittee_name:
                            committee['subcommittee'] = subcommittee_name

                        committees[key] = committee

                    committee.add_member(member_name, role)

            for committee in committees.values():
                self.save_committee(committee)
Esempio n. 31
0
    def scrape_house_special(self, scraped_committees):
        """Scrape LA House special committees not already scraped.

        Skips names present in *scraped_committees* (scraped from the
        full-committees page) and returns a dict of name -> Committee.
        """
        url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
        root = lxml.html.fromstring(self.get(url).text)
        root.make_links_absolute('http://house.louisiana.gov')

        # Displayed membership type -> role name. 'Co-Chairmain' typo is
        # preserved exactly as the original scraper emitted it.
        role_map = {
            'Chairman': 'chairman',
            'Co-Chairmain': 'co-chairmain',
            'Vice Chair': 'vice chair',
            'Ex Officio': 'ex officio',
            'Interim Member': 'interim',
        }

        committees = {}
        for anchor in root.xpath("//a[contains(@href,'H_Cmtes/')]"):
            comm_name = self.normalize_committee_name(
                anchor.xpath('normalize-space(string())'))

            # skip committees that have already been scraped from
            # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
            if comm_name in scraped_committees:
                continue

            comm_url = anchor.get('href').replace('../', '')

            try:
                text = self.get(comm_url).text
            except HTTPError:
                self.logger.warning("Link not working, skipping.")
                continue

            # check for no record found
            if re.search('No records returned.', text):
                self.logger.warning("No record found, skipping.")
                continue

            chamber = 'joint' if comm_name.startswith('Joint') else 'lower'
            committee = Committee(chamber, comm_name)
            committee.add_source(url)

            detail = lxml.html.fromstring(text)
            detail.make_links_absolute('http://house.louisiana.gov')

            for row in detail.xpath('//table[@id="table1"]//tbody/tr'):
                cells = row.xpath('./td')
                mname = cells[0].xpath('normalize-space(string())')
                mtype = cells[1].xpath('normalize-space(string())')
                committee.add_member(mname, role_map.get(mtype, 'member'))

            committees[comm_name] = committee

        return committees
Esempio n. 32
0
 def scrape_committee(self, committee_url, committee_name, chamber):
     """Scrape one committee's membership table.

     Pages that return HTTP 500 are tolerated and skipped.
     """
     doc = self.lxmlize(committee_url, ignore=[500])
     if doc is None:
         return
     committee = Committee(chamber=chamber, committee=committee_name)
     # Each membership row is "<td>role</td><td>name</td>".
     for row in doc.xpath("//div[@id='membership']//tbody/tr"):
         member_role, member_name = [
             cell.text_content().strip() for cell in row.xpath("./td")
         ]
         committee.add_member(member_name, role=member_role)
     committee.add_source(committee_url)
     self.save_committee(committee)
Esempio n. 33
0
    def scrape_senate(self):
        """Scrape Senate Committees"""
        # Committee data comes from the nyss_billyslation models rather
        # than a live page.
        for raw_name, comm in nyss_billyslation.models.committees.items():
            display_name = raw_name.title().replace('And', 'and')
            committee = Committee('upper', display_name)
            for member in comm.members:
                committee.add_member(member.fullname)
            self.save_committee(committee)
Esempio n. 34
0
    def scrape_lower_committee(self, name, url):
        """Scrape an Assembly committee roster page."""
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            committee = Committee('lower', name)
            committee.add_source(url)

            # Member links point at per-member pages ("mem?ad...").
            for anchor in doc.xpath("//a[contains(@href, 'mem?ad')]"):
                committee.add_member(anchor.text.strip())

            self.save_committee(committee)
Esempio n. 35
0
    def scrape_page(self, a, chamber, term):
        """Scrape the committee detail page linked from anchor *a*.

        Tries the primary member-link table layout first, then a plain
        text-cell fallback layout; warns if neither yields members.
        """
        page, text = self.lxmlize(a.attrib['href'])
        comm_name = a.text_content()
        # Twitter widget id, when the page embeds one.
        twitter_ids = re.findall("setUser\('(.*)'\)", text)
        twitter_id = twitter_ids[0] if twitter_ids else None
        role_suffixes = {", Chair": "chair", ", Vice-Chair": "member"}

        committee = Committee(chamber, comm_name, twitter=twitter_id)
        committee.add_source(a.attrib['href'])

        # Primary layout: member detail links inside 545/540-wide tables.
        seen = set()
        found_any = False
        for table in page.xpath("//table[@width='545' or @width='540']"):
            names = table.xpath(
                ".//a[contains(@href, 'MemberDetailPage')]/text()")
            for raw in names:
                person = raw.strip()
                role = "member"
                for suffix in role_suffixes:
                    if person.endswith(suffix):
                        role = role_suffixes[suffix]
                        person = person[:-len(suffix)].strip()
                if person in seen:
                    continue
                seen.add(person)
                committee.add_member(person, role)
                found_any = True

        if found_any:
            self.save_committee(committee)
            return

        # Fallback layout: bare text cells inside 466-wide tables.
        seen = set()
        found_any = False
        for table in page.xpath("//table[@width='466']"):
            if "committee members" not in table.text_content().lower():
                continue
            for cell_text in table.xpath(".//td/text()"):
                person = cell_text.strip()
                if not person or person in seen:
                    continue
                seen.add(person)
                committee.add_member(person, "member")
                found_any = True

        if found_any:
            self.save_committee(committee)
            return

        self.warning("Unable to scrape!")
Esempio n. 36
0
 def scrape_reps_committees(self, term_name, chamber):
     """Scrape House committees from the active-committees listing.

     BUG FIX: the original guarded ``committee_parts[1]`` with
     ``len(committee_parts) > 1``... actually with ``> 0``, which is
     always true after str.split, so a row without a comma raised
     IndexError. The guard is now ``> 1`` with a None fallback.
     """
     url = '{base}ActiveCommittees.aspx'.format(base=self.reps_url_base)
     with self.urlopen(url) as page_string:
         page = lxml.html.fromstring(page_string)
         table = page.xpath('id("contentdata")/table[1]')[0]
         # Last tr has the date
         trs = table.xpath('tr')[:-1]
         for tr in trs:
             committee_parts = [
                 part.strip() for part in tr.text_content().split(',')
             ]
             committee_name = committee_parts[0].title().strip()
             # A row without a comma has no status field.
             status = (committee_parts[1].strip()
                       if len(committee_parts) > 1 else None)
             committee_url = tr.xpath('td/a')[0].attrib.get('href')
             committee_url = '{base}{url}'.format(base=self.reps_url_base,
                                                  url=committee_url)
             actual_chamber = chamber
             if committee_name.startswith('Joint'):
                 actual_chamber = 'joint'
             committee = Committee(actual_chamber,
                                   committee_name,
                                   status=status)
             with self.urlopen(committee_url) as committee_page_string:
                 committee_page = lxml.html.fromstring(
                     committee_page_string)
                 # First tr has the title (sigh)
                 mem_trs = committee_page.xpath('id("memGroup")/tr')[1:]
                 for mem_tr in mem_trs:
                     mem_code = None
                     mem_links = mem_tr.xpath('td/a[1]')
                     if len(mem_links):
                         mem_code = mem_links[0].attrib.get('href')
                     # Output is "Rubble, Barney, Neighbor"
                     mem_parts = mem_tr.text_content().strip().split(',')
                     if self.no_members_text in mem_parts:
                         continue
                     mem_name = (mem_parts[1].strip() + ' ' +
                                 mem_parts[0].strip())
                     # Sometimes Senator abbreviation is in the name
                     mem_name = mem_name.replace('Sen. ', '')
                     mem_role = 'member'
                     if len(mem_parts) > 2:
                         # Handle the case where there is a comma in the
                         # role name
                         mem_role = ', '.join(
                             [p.strip() for p in mem_parts[2:]]).lower()
                     committee.add_member(mem_name,
                                          role=mem_role,
                                          _code=mem_code)
                 committee.add_source(url)
                 committee.add_source(committee_url)
                 self.save_committee(committee)
Esempio n. 37
0
    def scrape(self, chamber, term):
        """Scrape joint-committee memberships for one term.

        Members of both houses appear on the same pages, so the 'lower'
        pass is a no-op. For each committee we save one per-chamber
        Committee plus a combined 'joint' Committee.
        """
        if chamber == 'lower':
            # Committee members from both houses are listed
            # together. So, we'll only scrape once.
            return None

        session = None

        # Even thought each term spans two years, committee
        # memberships don't appear to change. So we only
        # need to scrape the first year of the term.
        for t in self.metadata["terms"]:
            if term == t["name"]:
                session = t['sessions'][-1]
                break
        else:
            raise NoDataForPeriod(term)

        list_url = self.urls["list"] % (session, )
        listing = lxml.html.fromstring(self.urlopen(list_url))
        committees = {
            el.text.strip(): el.get("href")
            for el in listing.xpath(".//a[contains(@href, 'CommitteeMembers')]")
        }

        for comm_name in committees:
            self.log(comm_name)
            detail_url = self.urls["detail"] % (committees[comm_name], )
            detail = lxml.html.fromstring(self.urlopen(detail_url))
            # Strip a leading "NN-" committee number when present.
            if re.match('\d{1,2}-', comm_name):
                comm_name = comm_name.split('-', 1)[1]
            jcomm = Committee('joint', comm_name.strip())
            for table in detail.xpath(
                    ".//table[contains(@id, 'CommitteeMembers')]"):
                rows = table.xpath(".//tr")
                # First row's first cell says "Senator" or "Representative".
                header = rows[0].xpath('.//td')[0].text_content().strip()
                member_chamber = 'upper' if header == 'Senator' else 'lower'
                comm = Committee(member_chamber, comm_name.strip())
                for row in rows[1:]:
                    tds = row.xpath('.//td')
                    member = tds[0].text_content().strip()
                    role = ('chairman'
                            if tds[3].text_content().strip() == 'Chairman'
                            else 'member')
                    comm.add_member(member, role, chamber=member_chamber)
                    jcomm.add_member(member, role, chamber=member_chamber)

                comm.add_source(detail_url)
                self.save_committee(comm)

            jcomm.add_source(detail_url)
            self.save_committee(jcomm)
Esempio n. 38
0
def _committee_data(lines, chamber, url, name_dict):
    '''Given a list of lines of committee data from a td element
    on the committees page, extract the commitee name, the members,
    and yeild a committee object. Also yield the name dict incase
    the calling function needs it for something.
    '''
    name_pattern = r'\s{,20}(?:(.+)\:)?\s{,20}(.+?) \((?:\w\-([^)]+))'

    # Functions to identify unused data.
    junk = [
        lambda s: s != 'On Call', lambda s: 'Staff:' not in s,
        lambda s: 'Secretary:' not in s, lambda s: s.strip(),
        lambda s: not s.isupper()
    ]

    # Toss unused data.
    for j in junk:
        lines = filter(j, lines)

    if (len(lines) < 2) or (u'\xa0' in lines):
        return

    lines = lines[::-1]
    kw = {'chamber': chamber}
    kw['committee'] = lines.pop().strip()

    if lines[-1].startswith('Meets'):
        kw['meetings_info'] = lines.pop().strip()

    c = Committee(**kw)

    for name in reversed(lines):
        kwargs = {}
        m = re.search(name_pattern, name)
        if m:
            title, name, city = m.groups()
            if title:
                title = title.lower()
            house = re.search(r'(Sen\.|Rep\.)\s+', name)
            if house:
                house = house.group()
                if 'Sen.' in house:
                    kwargs['chamber'] = 'upper'
                elif 'Rep.' in house:
                    kwargs['chamber'] = 'lower'
                name = name.replace(house, '').strip()
            name_dict[city.lower()].add(name)
            c.add_member(name, role=(title or 'member'), **kwargs)

    c.add_source(url)

    return name_dict, c
Esempio n. 39
0
    def scrape_joint_comm(self, chamber, session):
        """Scrape Maine joint committees from the House membership XLS.

        Downloads the spreadsheet, then walks its rows; consecutive rows
        sharing a committee-name cell belong to the same committee.

        Improvements over the original: the local file is written with a
        context manager in binary mode (xls is binary data, and the handle
        is closed even on error), and the two byte-identical add_member
        calls are factored into one.
        """
        fileurl = 'http://www.maine.gov/legis/house/commlist.xls'

        joint = urllib.urlopen(fileurl).read()
        with open('me_joint.xls', 'wb') as f:
            f.write(joint)

        wb = xlrd.open_workbook('me_joint.xls')
        sh = wb.sheet_by_index(0)

        cur_comm_name = ''
        chamber = 'joint'

        # Row 0 is the header.
        for rownum in range(1, sh.nrows):
            comm_name = sh.cell(rownum, 0).value

            first_name = sh.cell(rownum, 3).value
            middle_name = sh.cell(rownum, 4).value
            last_name = sh.cell(rownum, 5).value
            jrsr = sh.cell(rownum, 6).value  # suffix column (e.g. Jr./Sr.)
            full_name = first_name + " " + middle_name + " " + last_name + " " + jrsr

            party = sh.cell(rownum, 7).value
            legalres = sh.cell(rownum, 8).value
            address1 = sh.cell(rownum, 9).value
            address2 = sh.cell(rownum, 10).value
            town = sh.cell(rownum, 11).value
            state = sh.cell(rownum, 12).value
            zipcode = int(sh.cell(rownum, 13).value)
            phone = str(sh.cell(rownum, 14).value)
            home_email = sh.cell(rownum, 15).value
            leg_email = sh.cell(rownum, 16).value

            leg_chamber = sh.cell(rownum, 2).value
            chair = sh.cell(rownum, 1).value
            role = "member"

            if chair == 1:
                role = leg_chamber + " " + "Chair"

            # A change in the name column starts a new committee.
            if comm_name != cur_comm_name:
                cur_comm_name = comm_name
                committee = Committee(chamber, comm_name)
                committee.add_source(fileurl)

            committee.add_member(full_name, role=role, party=party,
                                 legalres=legalres, address1=address1,
                                 address2=address2, town=town, state=state,
                                 zipcode=zipcode, phone=phone,
                                 home_email=home_email, leg_email=leg_email)

            # NOTE: saved after every row, as in the original; assumes
            # save_committee upserts rather than duplicates.
            self.save_committee(committee)
Esempio n. 40
0
    def scrape_session(self, term, chambers, session):
        """Scrape all committees for one session via the committee web
        service client (``self.cservice``).

        Lists committees for the session GUID, fetches each committee's
        detail record, and saves it with its members. ``self.ctty_cache``
        (keyed by committee code) lets a committee seen earlier under a
        different chamber be promoted to 'joint' instead of re-created.
        """
        sid = self.metadata['session_details'][session]['_guid']
        committees = self.cservice.GetCommitteesBySession(sid)

        #if committees.strip() == "":
        #    return  # If we get here, it's a problem.
        # Commenting this out for future debugging. - PRT

        committees = committees['CommitteeListing']
        for committee in committees:
            cid = committee['Id']
            # The listing entry is shallow; fetch the full record.
            committee = self.cservice.GetCommittee(cid)

            name, typ, guid, code, description = [
                committee[x]
                for x in ['Name', 'Type', 'Id', 'Code', 'Description']
            ]
            chamber = {
                "House": "lower",
                "Senate": "upper",
                "Joint": "joint"
            }[typ]
            ctty = None
            if code in self.ctty_cache:
                ctty = self.ctty_cache[code]
                # Same code under a different chamber, and the description
                # mentions "joint": reuse the cached committee as joint.
                # Otherwise discard the cache hit and build a fresh one.
                if (ctty['chamber'] != chamber) and (description and 'joint'
                                                     in description.lower()):
                    ctty['chamber'] = 'joint'
                else:
                    ctty = None

            if ctty is None:
                ctty = Committee(chamber,
                                 name,
                                 code=code,
                                 _guid=guid,
                                 description=description)
                self.ctty_cache[code] = ctty

            members = committee['Members']['CommitteeMember']
            for member in members:
                # Member['Name'] is a mapping with First/Last entries.
                name = "{First} {Last}".format(
                    **dict(member['Member']['Name']))
                role = member['Role']
                ctty.add_member(name, role, _guid=member['Member']['Id'])

            ctty.add_source(self.csource)
            ctty.add_source(CTTIE_URL.format(**{
                "sid": sid,
                "cttie": guid,
            }))
            self.save_committee(ctty)
Esempio n. 41
0
def test_committee():
    """add_member records the given role (default 'member') plus extras."""
    committee = Committee('upper', 'committee name')
    committee.add_member('Washington', role='chairman')
    committee.add_member('Filmore', note='note')

    expected = [
        {'name': 'Washington', 'role': 'chairman'},
        {'name': 'Filmore', 'role': 'member', 'note': 'note'},
    ]
    assert_equal(committee['members'], expected)
Esempio n. 42
0
 def get_jmfc(self, name, url):
     """Gets the Joint Millennium Fund Committee info"""
     with self.urlopen(url) as jfmc_page:
         html = lxml.html.fromstring(jfmc_page)
         committee = Committee('joint', name)
         # Membership lives in the third <table> on the page; each row
         # pairs a Senate cell with a House cell.
         table = html.xpath('//table')[2]
         for row in table.xpath('tbody/tr'):
             # Normalize CRLFs and non-breaking spaces in the cell text.
             senate, house = [ td.text.replace('\r\n', ' ').replace(u'\xa0', ' ') \
                               for td in row.xpath('td') ]
             # NOTE(review): str.strip('Sen.') strips the CHARACTER SET
             # {'S', 'e', 'n', '.'} from both ends, not the literal
             # prefix, so it can also eat leading/trailing name letters
             # -- confirm this is intended. The split(',') result is
             # splatted, so any text after a comma becomes an extra
             # positional argument to add_member (presumably the role).
             committee.add_member(*senate.strip('Sen.').strip().split(','))
             committee.add_member(*house.strip('Rep.').strip().split(','))
         committee.add_source(url)
         self.save_committee(committee)
Esempio n. 43
0
    def scrape_committee(self, chamber, name, url, subcommittee=None):
        """Scrape a committee roster page; recurses into linked
        subcommittees found in the page's <ul><li><a> links.

        Deduplicates via self._seen on (name, subcommittee); skips
        non-legislative members and empty committees.
        """
        if subcommittee:
            # Drop a leading "NN-" numeric prefix and any HOUSE/SENATE
            # qualifier from the subcommittee name.
            pieces = subcommittee.split('-')
            if len(pieces) > 1:
                subcommittee = '-'.join(pieces[1:])
            subcommittee = re.sub(r'^(HOUSE|SENATE)\s+', '',
                                  subcommittee.strip())

        key = (name, subcommittee)
        if key in self._seen:
            return
        self._seen.add(key)

        comm = Committee(chamber, name, subcommittee=subcommittee)
        comm.add_source(url)

        with self.urlopen(url) as page:
            doc = lxml.html.fromstring(page)

            for tr in doc.xpath('//table[@class="gridtable"]/'
                                'tr[position()>1]'):
                # Empty first cell means plain membership.
                mtype = tr.xpath('string(td[1])') or 'member'

                words = tr.xpath('string(td[3])').split()
                title = words[0]
                member = ' '.join(words[1:])

                if title == 'Senator':
                    member_chamber = 'upper'
                elif title == 'Representative':
                    member_chamber = 'lower'
                else:
                    # skip non-legislative members
                    continue

                comm.add_member(member, mtype, chamber=member_chamber)

            for anchor in doc.xpath('//ul/li/a'):
                self.scrape_committee(chamber,
                                      name,
                                      urlescape(anchor.attrib['href']),
                                      subcommittee=anchor.text.strip())

            if not comm['members']:
                self.warning('not saving empty committee %s' % name)
            else:
                self.save_committee(comm)
Esempio n. 44
0
    def scrape_upper_committee(self, name, url):
        """Scrape a Senate committee page; members link to biographies."""
        doc = lxml.html.fromstring(self.urlopen(url))

        committee = Committee('upper', name)
        committee.add_source(url)

        for anchor in doc.xpath("//a[contains(@href, 'biographies')]"):
            # Collapse internal whitespace in the link text.
            member = re.sub(r'\s+', ' ', anchor.xpath("string()").strip())
            if member:
                committee.add_member(member)

        self.save_committee(committee)
Esempio n. 45
0
    def scrape_joint_committees(self, term, session):
        """Scrape Delaware joint committees from the legislature sidebar.

        Member names are scrubbed of titles/honorifics, then a role is
        parsed from a trailing "(...)" group or a ", Chair"/", Vice-Chair"
        suffix; everyone else is a plain "Member".
        """
        url = "http://legis.delaware.gov/legislature.nsf/testside.html?OpenPage&BaseTarget=right"
        page = self.lxmlize(url)
        joint_comms = page.xpath("//a[text()='Joint Committees']")
        for li in joint_comms[0].getnext().xpath("./li/a"):
            comm_name = li.text
            comm_link = li.attrib["href"]

            if comm_name.strip() == "Sunset": #I don't even want to go into it.
                new_link = "http://legis.delaware.gov/Sunset/"\
                    "Sunset.nsf/general+Info/JSC+Members?opendocument"
                assert new_link != comm_link, "Remove Sunset Committee special casing"
                comm_link = new_link

            committee = Committee("joint", comm_name)
            committee.add_source(comm_link)
            comm_page = self.lxmlize(comm_link)

            # Titles/honorifics scrubbed out of the raw member text,
            # in this exact order.
            titles = ("Senator",
                      "Representative",
                      "(D)", "(R)",
                      "House Minority Whip",
                      "House Majority Whip",
                      "Senate Minority Whip",
                      "Senate Majority Whip",
                      "House Minority Leader",
                      "House Majority Leader",
                      "Senate Minority Leader",
                      "Senate Majority Leader",
                      "President Pro Tempore",
                      "Speaker of the House")

            for person in comm_page.xpath("//a/b"):
                person_name = person.text_content()
                for title in titles:
                    person_name = person_name.replace(title, "")
                person_name = person_name.strip().strip(",")

                role = "Member"
                if person_name.strip()[-1] == ")":
                    # Role in trailing parens, e.g. "Jane Doe (Chair)".
                    person_name, role = person_name.rsplit("(", 1)
                    role = role.replace(")", "").strip()
                elif ", Vice-Chair" in person_name:
                    role = "Vice-Chair"
                    person_name = person_name.replace(", Vice-Chair", "")
                elif ", Chair" in person_name:
                    role = "Chair"
                    person_name = person_name.replace(", Chair", "")

                person_name = person_name.strip().strip(",").strip()
                committee.add_member(person_name, role)
            self.save_committee(committee)
Esempio n. 46
0
    def scrape_house_committee(self, name, url):
        """Scrape a PR House committee page.

        Member names are the tails of <img> bullet elements; the three
        officers sit in the "directiva" div, each after a labelled <b>.
        """
        committee = Committee('lower', name)
        committee.add_source(url)

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')

            # all members are tails of images (they use img tags for bullets)

            # Officers: label text -> role to record.
            officers = (
                ('Presidente:', 'chairman'),
                ('Vice Presidente:', 'vice chairman'),
                ('Secretario(a):', 'secretary'),
            )
            for label, role in officers:
                hit = directiva.xpath(
                    'b[text()="%s"]/following-sibling::img[1]' % label)
                if hit:
                    committee.add_member(clean_spaces(hit[0].tail), role)

            # Remaining members are bullet images in the reps div.
            for img in reps.xpath('.//img'):
                committee.add_member(clean_spaces(img.tail))

            self.save_committee(committee)
Esempio n. 47
0
    def scrape_lower_committee(self, name, url):
        """Scrape a PR House committee page; save only if members found.

        Officer and member names are the tails of <img> bullet elements.
        """
        com = Committee('lower', name)
        com.add_source(url)
        doc = self.lxmlize(url)

        contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')
        # all members are tails of images (they use img tags for bullets)
        # first three members are in the directiva div
        chair = directiva.xpath('b[text()="Presidente:"]/following-sibling::img[1]')
        vchair = directiva.xpath('b[text()="Vice Presidente:"]/following-sibling::img[1]')
        sec = directiva.xpath('b[text()="Secretario(a):"]/following-sibling::img[1]')
        member = 0
        if chair and chair[0].tail is not None:
            chair = chair[0].tail
            com.add_member(clean_spaces(chair), 'chairman')
            member += 1
        if vchair and vchair[0].tail is not None:
            vchair = vchair[0].tail
            com.add_member(clean_spaces(vchair), 'vice chairman')
            member += 1
        # BUG FIX: the original tested `sec and sec is not None`, which is
        # always true for a non-empty list, and could pass a None tail to
        # clean_spaces. Mirror the chair/vchair checks instead.
        if sec and sec[0].tail is not None:
            sec = sec[0].tail
            com.add_member(clean_spaces(sec), 'secretary')
            member += 1

        for img in reps.xpath('.//img'):
            member_name = clean_spaces(img.tail)
            if member_name is not None:
                com.add_member(member_name)
                member += 1
        # Only persist committees that actually have members.
        if member > 0:
            self.save_committee(com)
Esempio n. 48
0
    def scrape_comm(self, chamber, term_name):
        """Scrape MS committee membership from the per-chamber XML feed.

        *chamber* arrives as 'h' or 's' (used in the feed URL) and is
        then mapped to 'lower'/'upper'.
        """
        url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
        with self.urlopen(url) as comm_page:
            root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())
            if chamber == 'h':
                chamber = "lower"
            else:
                chamber = "upper"
            for mr in root.xpath('//committee'):
                name = mr.xpath('string(name)')
                comm = Committee(chamber, name)

                chair = mr.xpath('string(chair)').replace(", Chairman", "")
                if chair:
                    comm.add_member(chair, role="Chairman")
                vice_chair = mr.xpath('string(vice_chair)')
                vice_chair = vice_chair.replace(", Vice-Chairman", "")
                if vice_chair:
                    comm.add_member(vice_chair, role="Vice-Chairman")

                # Members are semicolon-separated; entries after the first
                # carry one leading space. BUG FIX: skip empty entries
                # (e.g. from a trailing ';'), which previously raised
                # IndexError on leg[0].
                for leg in mr.xpath('string(members)').split(";"):
                    if not leg:
                        continue
                    comm.add_member(leg[1:] if leg[0] == " " else leg)
                comm.add_source(url)
                self.save_committee(comm)
Esempio n. 49
0
 def get_joint_committees_data(self, name, url):
     """Scrape a joint committee's membership page and save the committee.

     Fetches *url*, pulls the Senate and House member blocks out of each
     equal-height row section, cleans the scraped text, and records every
     member (with a role when one is listed) on a new joint Committee.

     :param name: committee name used for the Committee record
     :param url: committee page URL (also recorded as the source)
     """
     def _clean(text, junk):
         # Strip layout artifacts / title prefixes from scraped text.
         # NOTE: mirrors the original cleaning exactly, including the
         # ascii-encode step (this code targets Python 2 str semantics).
         for ch in junk:
             if ch in text:
                 text = text.replace(ch,
                                     ' ').encode('ascii',
                                                 'ignore').strip()
         return text

     def _add_members(nodes, title, chamber):
         # nodes: <strong> elements; the first holds "Name[, Role]" text.
         if len(nodes) > 0:
             member_string = list(nodes[0].itertext())
             member_name = _clean(member_string[0],
                                  ['\r\n', u'\xa0', u'\u2013', title])
             if len(member_string) > 1:
                 role = _clean(member_string[1],
                               ['\r\n', u'\xa0', u'\u2013', ','])
                 committee.add_member(member_name, role=role,
                                      chamber=chamber)
             else:
                 committee.add_member(member_name, chamber=chamber)

     page = self.get(url).text
     html = lxml.html.fromstring(page)
     committee = Committee('joint', name)
     # Each section holds the Senate block (div[1]) and House block (div[2]).
     table = html.xpath("//section[@class=' row-equal-height no-padding']")
     for td in table:
         _add_members(td.xpath('div[1]/div/div/div[2]/div/p/strong'),
                      'Sen.', 'senate')
         _add_members(td.xpath('div[2]/div/div/div[2]/div/p/strong'),
                      'Rep.', 'house')
     committee.add_source(url)
     self.save_committee(committee)
Esempio n. 50
0
    def scrape_lower_committee(self, name, url):
        """Scrape a lower-chamber committee page and save its membership.

        Every member appears as a link whose href contains 'mem?ad'; the
        link text is "Name (Role)" and is parsed by the module-level
        parse_name helper.
        """
        doc = lxml.html.fromstring(self.urlopen(url))

        committee = Committee('lower', name)
        committee.add_source(url)

        for anchor in doc.xpath("//a[contains(@href, 'mem?ad')]"):
            raw = re.sub(r'\s+', ' ', anchor.text.strip())
            member_name, member_role = parse_name(raw)
            committee.add_member(member_name, member_role)

        self.save_committee(committee)
Esempio n. 51
0
 def get_jlfc(self, name, url):
     """Scrape the Joint Legislative Oversight Committee membership."""
     doc = lxml.html.fromstring(self.urlopen(url))
     committee = Committee('joint', name)
     xpath_tmpl = '//h3[contains(text(), "%s")]/following-sibling::p[1]'
     for label in ('Senate', 'House'):
         block = doc.xpath(xpath_tmpl % label)[0].text_content()
         for line in block.split('\r\n'):
             if not line.strip():
                 continue
             # Each line is "Name[, role]"; the splat passes the role
             # through to add_member when one is present.
             parts = line.replace(u'\xa0', ' ').split(',')
             committee.add_member(*parts,
                                  chamber=_REV_CHAMBERS[label.lower()])
     committee.add_source(url)
     self.save_committee(committee)
Esempio n. 52
0
    def scrape(self, chamber, term):
        """Scrape Vermont committees for *chamber* (latest term only).

        Each <li> on the listing page is a committee heading; the table rows
        following it hold the members, until the next heading (a row whose
        text contains 'COMMITTEE').
        """
        self.validate_term(term, latest_only=True)

        chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]

        url = (
            'http://www.leg.state.vt.us/legdir/comms.cfm?Body=%s&Session=2014'
            % chamber_abbr)
        html = self.urlopen(url)
        page = lxml.html.fromstring(html)

        for li in page.xpath("//li"):
            # Strip the room number from the committee name
            comm_name = re.match(r'[^\(]+', li.text_content()).group(0).strip()

            # Strip chamber from beginning of committee name
            comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '', comm_name)
            # normalize case of committee name
            comm_name = comm_name.title()

            comm = Committee(chamber, comm_name)
            comm.add_source(url)

            for tr in li.xpath("../../following-sibling::tr"):

                name = tr.text_content().strip()

                # Break when we reach the next committee
                if 'COMMITTEE' in name:
                    break

                # Raw strings: \w, \s, \. are regex escapes, not string
                # escapes (non-raw form raised invalid-escape warnings).
                match = re.search(
                    r'^([\w\s\.]+),\s+'
                    r'(Chair|Vice Chair|Vice-Chair|Ranking Member|Clerk)$',
                    name)
                if match:
                    name = match.group(1)
                    mtype = match.group(2).lower()
                else:
                    mtype = 'member'

                # "Jane Doe of Burlington" -> "Jane Doe", except names that
                # legitimately contain "of" (listed in DOUBLED_NAMES).
                if not name.startswith(DOUBLED_NAMES):
                    name = re.sub(r'of [\w\s\.]+$', '', name)

                comm.add_member(name, mtype)

            # Only save committees that actually yielded members.
            if comm['members']:
                self.save_committee(comm)
Esempio n. 53
0
    def scrape_senate_committees(self, term_name, chamber):
        """Scrape standing Senate committees for each year of *term_name*.

        *term_name* looks like "2013-2014"; the listing URL is keyed on the
        two-digit year. Committees with "joint" in the name are filed under
        the 'joint' chamber instead of *chamber*.
        """
        for year in (t[2:] for t in term_name.split('-')):
            # Skip listing pages for sessions that haven't happened yet.
            if int(year) > int(str(dt.datetime.now().year)[2:]):
                self.log("Not running session %s, it's in the future." %
                         (term_name))
                continue

            listing_url = '{base}{year}info/com-standing.htm'.format(
                base=self.senate_url_base, year=year)
            listing = lxml.html.fromstring(self.urlopen(listing_url))

            for p in listing.xpath('id("mainContent")/table/*[3]/p'):
                anchors = p.xpath('a[1]')
                if not anchors:
                    continue
                anchor = anchors[0]
                committee_name = anchor.text_content().strip()
                committee_url = anchor.attrib.get('href')

                body = 'joint' if 'joint' in committee_name.lower() else chamber
                committee = Committee(body, committee_name)

                detail = lxml.html.fromstring(self.urlopen(committee_url))
                member_items = detail.xpath(
                    "//div[@id='mainContent']/ul/ul[1]/li")
                if not member_items:
                    # Fallback for pages without the nested-list layout.
                    # This MIGHT cause issues.
                    member_items = detail.xpath(
                        "//div[@id='mainContent']//li")

                for li in member_items:
                    parts = li.text_content().strip().split(',')
                    member_name = parts[0]
                    if member_name == "":
                        continue
                    # "Name, District, Role" -> third field is the role.
                    role = parts[2].lower() if len(parts) > 2 else 'member'
                    committee.add_member(member_name, role=role)

                committee.add_source(listing_url)
                committee.add_source(committee_url)
                self.save_committee(committee)
Esempio n. 54
0
    def scrape_committee(self, chamber, url):
        """Scrape one committee page and save it if it has any members.

        The committee name comes from the page <title>; members are the
        legislator-profile links, with the role taken from the sibling
        <span> when present.
        """
        # NOTE(review): TLS verification is disabled here (verify=False);
        # presumably the site serves a bad certificate — confirm before
        # removing.
        doc = lxml.html.fromstring(self.get(url, verify=False).text)

        committee = Committee(chamber, doc.xpath('//title/text()')[0])
        committee.add_source(url)

        for link in doc.xpath('//a[contains(@href, "/Legislators/Profile")]'):
            spans = link.xpath('../span')
            member_role = spans[0].text.lower() if spans else 'member'
            committee.add_member(link.text, member_role)

        if committee['members']:
            self.save_committee(committee)
Esempio n. 55
0
    def scrape_house_special(self, scraped_committees):
        """Scrape Louisiana House special committees.

        :param scraped_committees: committee names already scraped from the
            full committee list; these are skipped to avoid duplicates.
        :returns: dict mapping committee name -> populated Committee object.
            Committees are NOT saved here; the caller is responsible.
        """
        url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
        text = self.urlopen(url)
        page = lxml.html.fromstring(text)
        page.make_links_absolute('http://house.louisiana.gov')

        # First pass: collect name -> URL for committees not yet scraped.
        committees = {}
        for el in page.xpath("//a[contains(@href,'../H_Cmtes/')]"):
            comm_name = el.xpath('normalize-space(string())')
            comm_name = self.normalize_committee_name(comm_name)

            # skip committees that have already been scraped from
            # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
            if comm_name not in scraped_committees:
                comm_url = el.get('href').replace('../', '')
                committees[comm_name] = comm_url

        # Second pass: fetch each committee page and build its membership;
        # the dict values are replaced in place with Committee objects.
        for name, url in committees.items():
            chamber = 'joint' if name.startswith('Joint') else 'lower'
            committee = Committee(chamber, name)
            committee.add_source(url)

            text = self.urlopen(url)
            page = lxml.html.fromstring(text)
            page.make_links_absolute('http://house.louisiana.gov')

            for row in page.xpath('//table[@id="table1"]//tbody/tr'):
                member_info = row.xpath('./td')
                mname = member_info[0].xpath('normalize-space(string())')
                mtype = member_info[1].xpath('normalize-space(string())')
                # Normalize the displayed role to a lowercase slug.
                # NOTE(review): 'Co-Chairmain' looks like a typo for
                # 'Co-Chairman'; if the site spells it correctly this branch
                # never matches and co-chairs fall through to 'member' —
                # verify against the live page before changing.
                if mtype == 'Chairman':
                    mtype = 'chairman'
                elif mtype == 'Co-Chairmain':
                    mtype = 'co-chairmain'
                elif mtype == 'Vice Chair':
                    mtype = 'vice chair'
                elif mtype == 'Ex Officio':
                    mtype = 'ex officio'
                elif mtype == 'Interim Member':
                    mtype = 'interim'
                else:
                    mtype = 'member'
                committee.add_member(mname, mtype)

            committees[name] = committee

        return committees
Esempio n. 56
0
    def _scrape_standing_committees(self):
        """Scrapes the Standing Committees page of the Nebraska state
        legislature, saving one Committee per listed committee page.
        """
        main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
        page = self.lxmlize(main_url)

        committee_nodes = self.get_nodes(
            page,
            '//div[@class="main-content"]/div[@class="panel panel-leg"][1]/'
            'div[@class="list-group"]/a[@class="list-group-item"]')

        for committee_node in committee_nodes:
            committee_page_url = committee_node.attrib['href']
            committee_page = self.lxmlize(committee_page_url)

            name_text = self.get_node(
                committee_page,
                '//div[@class="container view-front"]/div[@class="row"]/'
                'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]')
            # The <h1> ends with the word "Committee"; drop the last word
            # and rejoin the rest to get the bare committee name.
            committee_name = ' '.join(name_text.split()[0:-1])
            # Nebraska's legislature is unicameral; everything is 'upper'.
            committee = Committee('upper', committee_name)

            members = self.get_nodes(
                committee_page,
                '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
                'div[@class="block-box"][1]/ul[@class="list-unstyled '
                'feature-content"]/li/a/text()[normalize-space()]')

            for member in members:
                # Strip the "Sen." title. The original pattern was
                # r'\Sen\.\s+' — the stray backslash made \S match any
                # non-whitespace character instead of the literal 'S'.
                member_name = re.sub(r'Sen\.\s+', '', member)
                member_name = re.sub(r', Chairperson', '', member_name).strip()
                if 'Chairperson' in member:
                    member_role = 'Chairperson'
                else:
                    member_role = 'member'
                committee.add_member(member_name, member_role)

            committee.add_source(main_url)
            committee.add_source(committee_page_url)

            self.save_committee(committee)
Esempio n. 57
0
    def scrape(self, chamber, term):
        """Scrape committees for *term*.

        The site lists members of both houses together, so all the work
        happens on the 'upper' pass and the 'lower' pass is a no-op.
        """
        if chamber == 'lower':
            # Committee members from both houses are listed
            # together. So, we'll only scrape once.
            return None

        # Even though each term spans two years, committee memberships
        # don't appear to change, so only the first year is scraped.
        year = None
        for t in self.metadata["terms"]:
            if term == t["name"]:
                year = t["start_year"]
                break

        if not year:
            raise NoDataForPeriod(term)

        # Collect committee name -> detail href from the listing page.
        list_url = self.urls["list"] % (year, )
        committees = {}
        with self.urlopen(list_url) as page:
            doc = lxml.html.fromstring(page)
            for el in doc.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
                committees[el.text] = el.get("href")

        for comm_name, href in committees.items():
            self.log(comm_name)
            detail_url = self.urls["detail"] % (href,)
            with self.urlopen(detail_url) as page:
                doc = lxml.html.fromstring(page)
                tables = doc.xpath(
                    ".//table[contains(@id, 'CommitteeMembers')]")
                for table in tables:
                    rows = table.xpath(".//tr")
                    # The header row's first cell names the member chamber
                    # ("Senator" vs anything else).
                    header = rows[0].xpath('.//td')[0].text_content().strip()
                    body = 'upper' if header == 'Senator' else 'lower'
                    comm = Committee(body, comm_name)
                    for row in rows[1:]:
                        tds = row.xpath('.//td')
                        member = tds[0].text_content().strip()
                        if tds[3].text_content().strip() == 'Chairman':
                            role = 'chairman'
                        else:
                            role = 'member'
                        self.log(member)
                        self.log(role)
                        comm.add_member(member, role)
                    comm.add_source(detail_url)
                    self.save_committee(comm)
Esempio n. 58
0
    def scrape_upper_committee(self, link, name):
        """Build (but do not save) an upper-chamber Committee.

        :param link: anchor element whose href points at the committee page
        :param name: committee name for the Committee record
        :returns: populated Committee object
        """
        url = re.sub(r'\s+', '', link.attrib['href'])
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        comm = Committee('upper', name)
        comm.add_source(url)

        for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
            member = re.sub(r'^Delegate\s+', '',
                            member_link.text_content().strip())
            # The role (e.g. "Chair") lives in the element right after the
            # member link; default to plain membership.
            role = member_link.getnext().text or 'member'
            comm.add_member(member, role.strip())

        return comm
Esempio n. 59
0
    def scrape(self, chamber, term):
        """Scrape Vermont committees for *chamber* (latest term only).

        Each <li> on the listing page is a committee heading; the table rows
        that follow hold the members, until the next heading row.
        """
        self.validate_term(term, latest_only=True)

        chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]

        url = ('http://www.leg.state.vt.us/lms/legdir/comms.asp?Body=%s' %
               chamber_abbr)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for li in page.xpath("//li"):
                # Strip the room number from the committee name
                comm_name = re.match(r'[^\(]+',
                                     li.text_content()).group(0).strip()

                # Strip chamber from beginning of committee name
                comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '',
                                   comm_name)

                comm_name = comm_name.title()

                comm = Committee(chamber, comm_name)
                comm.add_source(url)

                for tr in li.xpath("../../following-sibling::tr"):
                    # Break when we reach the next committee
                    if tr.xpath("th/li"):
                        break

                    name = tr.xpath("string()").strip()

                    # Raw strings: \w, \s, \. are regex escapes, not string
                    # escapes (non-raw form raised invalid-escape warnings).
                    match = re.search(
                        r'^([\w\s\.]+),\s+'
                        r'(Chair|Vice Chair|Ranking Member|Clerk)$', name)
                    if match:
                        name = match.group(1)
                        mtype = match.group(2).lower()
                    else:
                        mtype = 'member'

                    # Drop a trailing "of <town>" from the member name.
                    name = re.sub(r'of [\w\s\.]+$', '', name)

                    comm.add_member(name, mtype)

                self.save_committee(comm)
Esempio n. 60
0
    def process_committee(self, data):
        """Convert an OCD organization record into a saved Committee.

        Records whose classification isn't 'committee' are ignored. When the
        parent organization has a name, the record is stored as a
        subcommittee of that parent; otherwise it is a top-level committee.
        """
        if data['classification'] != 'committee':
            return

        parent = parse_psuedo_id(data['parent_id'])
        chamber = parent['classification']
        if 'name' in parent:
            committee = Committee(chamber, parent['name'],
                                  subcommittee=data['name'])
        else:
            committee = Committee(chamber, data['name'])

        for membership in self.memberships[data['_id']]:
            committee.add_member(membership['person_name'],
                                 role=membership['role'])

        for source in data['sources']:
            committee.add_source(source['url'])

        self.save_committee(committee)