def get_jlfc(self, name, url):
    """Scrape membership of the Joint Legislative Oversight Committee.

    Fetches *url*, then reads the paragraph that follows each chamber's
    <h3> heading and records one member per non-blank line.
    """
    doc = lxml.html.fromstring(self.urlopen(url))
    committee = Committee('joint', name)
    heading_xpath = '//h3[contains(text(), "%s")]/following-sibling::p[1]'
    for chamber in ('Senate', 'House'):
        paragraph = doc.xpath(heading_xpath % chamber)[0]
        for line in paragraph.text_content().split('\r\n'):
            if not line.strip():
                continue
            # "Name, Role" splits into positional args for add_member.
            parts = line.replace(u'\xa0', ' ').split(',')
            committee.add_member(
                *parts, chamber=_REV_CHAMBERS[chamber.lower()])
    committee.add_source(url)
    self.save_committee(committee)
Beispiel #2
0
    def scrape(self, chamber, term):
        """Scrape joint committee memberships for *term*.

        Members from both chambers are listed on the same pages, so
        the scrape runs only once (skipped for the lower chamber).

        Raises:
            NoDataForPeriod: if *term* is not found in the metadata.
        """
        if chamber == 'lower':
            # Committee members from both houses are listed
            # together. So, we'll only scrape once.
            return None

        year = None

        # Even though each term spans two years, committee
        # memberships don't appear to change. So we only
        # need to scrape the first year of the term.
        for t in self.metadata["terms"]:
            if term == t["name"]:
                year = t["start_year"]
                break

        if not year:
            raise NoDataForPeriod(term)

        list_url = self.urls["list"] % (year, )
        committees = {}
        with self.urlopen(list_url) as page:
            page = lxml.html.fromstring(page)
            for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
                committees[el.text] = el.get("href")

        for c in committees:
            self.log(c)
            detail_url = self.urls["detail"] % (committees[c], )
            with self.urlopen(detail_url) as page:
                page = lxml.html.fromstring(page)
                # Strip a leading "NN-" ordinal from the committee name.
                # (Raw string fix: '\d' is an invalid escape sequence in
                # a plain string literal.)
                if re.match(r'\d{1,2}-', c):
                    c = c.split('-', 1)[1]
                comm = Committee('joint', c.strip())
                for table in page.xpath(
                        ".//table[contains(@id, 'CommitteeMembers')]"):
                    rows = table.xpath(".//tr")
                    # The header row names which chamber this table covers.
                    chamber = rows[0].xpath('.//td')[0].text_content().strip()
                    chamber = 'upper' if chamber == 'Senator' else 'lower'
                    for row in rows[1:]:
                        tds = row.xpath('.//td')
                        name = tds[0].text_content().strip()
                        role = ('chairman'
                                if tds[3].text_content().strip() == 'Chairman'
                                else 'member')
                        comm.add_member(name, role, chamber=chamber)
                    comm.add_source(detail_url)
                self.save_committee(comm)
Beispiel #3
0
    def scrape_house_special(self, scraped_committees):
        """Scrape Louisiana House special committees.

        Skips any committee whose name already appears in
        *scraped_committees*. Returns the ``committees`` dict; its
        values start as page URLs and are replaced in place with
        Committee objects as each page is scraped.
        """
        url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
        text = self.get(url).text
        page = lxml.html.fromstring(text)
        page.make_links_absolute('http://house.louisiana.gov')

        committees = {}
        for el in page.xpath("//a[contains(@href,'../H_Cmtes/')]"):
            comm_name = el.xpath('normalize-space(string())')
            comm_name = self.normalize_committee_name(comm_name)

            # skip committees that have already been scraped from
            # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
            if comm_name not in scraped_committees:
                comm_url = el.get('href').replace('../', '')
                committees[comm_name] = comm_url

        for name, url in committees.items():
            chamber = 'joint' if name.startswith('Joint') else 'lower'
            committee = Committee(chamber, name)
            committee.add_source(url)

            text = self.get(url).text
            page = lxml.html.fromstring(text)
            page.make_links_absolute('http://house.louisiana.gov')

            for row in page.xpath('//table[@id="table1"]//tbody/tr'):
                member_info = row.xpath('./td')
                mname = member_info[0].xpath('normalize-space(string())')
                mtype = member_info[1].xpath('normalize-space(string())')
                # Map site role labels to normalized role strings.
                # NOTE(review): 'Co-Chairmain' presumably mirrors a typo
                # on the source site — confirm before changing either side.
                if mtype == 'Chairman':
                    mtype = 'chairman'
                elif mtype == 'Co-Chairmain':
                    mtype = 'co-chairmain'
                elif mtype == 'Vice Chair':
                    mtype = 'vice chair'
                elif mtype == 'Ex Officio':
                    mtype = 'ex officio'
                elif mtype == 'Interim Member':
                    mtype = 'interim'
                else:
                    mtype = 'member'
                committee.add_member(mname, mtype)

            # Replacing an existing key's value is safe while iterating
            # items(): the key set never changes size.
            committees[name] = committee

        return committees
Beispiel #4
0
 def get_joint_committees_data(self, name, url):
     """Scrape senate and house members of a joint committee page.

     Each row holds an optional senate member (first div) and an
     optional house member (second div); entries with two text pieces
     carry a role in the second piece.

     The previous version repeated the token-stripping cleanup four
     times; it is factored into one helper with identical behavior.
     """
     def _clean(text, tokens):
         # NOTE(review): encode() returns bytes on Python 3, where the
         # 'in'/replace calls below would raise TypeError — this mirrors
         # the original Python 2 str semantics; confirm target runtime.
         value = text.encode('ascii', 'ignore')
         for token in tokens:
             if token in value:
                 value = value.replace(token, '').strip()
         return value

     page = self.get(url).text
     html = lxml.html.fromstring(page)
     committee = Committee('joint', name)
     table = html.xpath("//section[@class=' row-equal-height no-padding']")
     for td in table:
         senate_members = td.xpath('div[1]/div/div/div[2]/div/p/strong')
         if senate_members:
             pieces = list(senate_members[0].itertext())
             member_name = _clean(pieces[0], ['\r\n', 'Sen.'])
             if len(pieces) > 1:
                 role = _clean(pieces[1], ['\r\n', ','])
                 committee.add_member(member_name, role=role,
                                      chamber='senate')
             else:
                 committee.add_member(member_name, chamber='senate')
         house_members = td.xpath('div[2]/div/div/div[2]/div/p/strong')
         if house_members:
             pieces = list(house_members[0].itertext())
             if len(pieces) > 1:
                 member_name = _clean(pieces[0], ['\r\n', 'Rep.'])
                 role = _clean(pieces[1], ['\r\n', ','])
                 committee.add_member(member_name, role=role,
                                      chamber='house')
             else:
                 # Single-piece house entries additionally strip NBSPs.
                 member_name = _clean(pieces[0], ['\r\n', u'\xa0', 'Rep.'])
                 committee.add_member(member_name, chamber='house')
     committee.add_source(url)
     self.save_committee(committee)
Beispiel #5
0
    def scrape_senate_committees(self, term_name, chamber):
        """Scrape standing-committee rosters for each year of a term."""
        current_yy = int(str(dt.datetime.now().year)[2:])
        for year in (part[2:] for part in term_name.split('-')):
            if int(year) > current_yy:
                self.log("Not running session %s, it's in the future." % (
                    term_name
                ))
                continue
            url = '{base}{year}info/com-standing.htm'.format(
                base=self.senate_url_base, year=year)
            page = lxml.html.fromstring(self.urlopen(url))
            for paragraph in page.xpath('id("mainContent")/table/*[3]/p'):
                anchors = paragraph.xpath('a[1]')
                if not anchors:
                    continue
                anchor = anchors[0]
                committee_name = anchor.text_content().strip()
                committee_url = anchor.attrib.get('href')

                # Committees with "joint" in the name get their own tag.
                if 'joint' in committee_name.lower():
                    committee_chamber = "joint"
                else:
                    committee_chamber = chamber

                committee = Committee(committee_chamber, committee_name)
                committee_page = lxml.html.fromstring(
                    self.urlopen(committee_url))
                lis = committee_page.xpath(
                    "//div[@id='mainContent']/ul/ul[1]/li")
                if not lis:
                    # Fallback for flatter page layouts.
                    # This MIGHT cause issues.
                    lis = committee_page.xpath(
                        "//div[@id='mainContent']//li")
                for li in lis:
                    pieces = li.text_content().strip().split(',')
                    role = pieces[2].lower() if len(pieces) > 2 else 'member'
                    committee.add_member(pieces[0], role=role)
                committee.add_source(url)
                committee.add_source(committee_url)
                self.save_committee(committee)
Beispiel #6
0
    def scrape(self, chamber, term):
        """Scrape Vermont committees for the latest term.

        Each committee is an <li>; its members follow in sibling table
        rows until the next committee heading appears.
        """
        self.validate_term(term, latest_only=True)

        chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]

        url = ('http://www.leg.state.vt.us/lms/legdir/comms.asp?Body=%s' %
               chamber_abbr)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for li in page.xpath("//li"):
                # Strip the room number from the committee name
                comm_name = re.match(r'[^\(]+', li.text_content()).group(0).strip()

                # Strip chamber from beginning of committee name
                comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '',
                                   comm_name)
                # normalize case of committee name
                comm_name = comm_name.title()

                comm = Committee(chamber, comm_name)
                comm.add_source(url)

                for tr in li.xpath("../../following-sibling::tr"):

                    name = tr.text_content().strip()

                    # Break when we reach the next committee
                    if 'COMMITTEE' in name:
                        break

                    # Raw strings: '\w'/'\s'/'\.' are invalid escape
                    # sequences in a plain string literal.
                    match = re.search(
                        r'^([\w\s\.]+),\s+'
                        r'(Chair|Vice Chair|Vice-Chair|Ranking Member|Clerk)$',
                        name)
                    if match:
                        name = match.group(1)
                        mtype = match.group(2).lower()
                    else:
                        mtype = 'member'

                    # Drop a trailing "of <town>" qualifier, except for
                    # names that legitimately contain "of".
                    if not name.startswith(DOUBLED_NAMES):
                        name = re.sub(r'of [\w\s\.]+$', '', name)

                    comm.add_member(name, mtype)

                self.save_committee(comm)
Beispiel #7
0
    def _scrape_standing_committees(self):
        """Scrapes the Standing Committees page of the Nebraska state
        legislature."""
        main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
        page = self.lxmlize(main_url)

        committee_nodes = self.get_nodes(
            page,
            '//div[@class="main-content"]/div[@class="panel panel-leg"][1]/'
            'div[@class="list-group"]/a[@class="list-group-item"]')

        for committee_node in committee_nodes:
            committee_page_url = committee_node.attrib['href']
            committee_page = self.lxmlize(committee_page_url)

            name_text = self.get_node(
                committee_page,
                '//div[@class="container view-front"]/div[@class="row"]/'
                'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]')
            # Drop the heading's trailing word and rejoin the rest as the
            # committee name (replaces a manual concatenation loop).
            committee_name = ' '.join(name_text.split()[0:-1])
            committee = Committee('upper', committee_name)

            members = self.get_nodes(
                committee_page,
                '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
                'div[@class="block-box"][1]/ul[@class="list-unstyled '
                'feature-content"]/li/a/text()[normalize-space()]')

            for member in members:
                # BUG FIX: the pattern was r'\Sen\.\s+' — '\S' matches
                # ANY non-space character, so the intended literal
                # "Sen." prefix matched only by accident.
                member_name = re.sub(r'Sen\.\s+', '', member)
                member_name = re.sub(r', Chairperson', '', member_name).strip()
                if 'Chairperson' in member:
                    member_role = 'Chairperson'
                else:
                    member_role = 'member'
                committee.add_member(member_name, member_role)

            committee.add_source(main_url)
            committee.add_source(committee_page_url)

            self.save_committee(committee)
Beispiel #8
0
    def scrape_upper_committee(self, link, name):
        """Build and return an upper-chamber Committee from the detail
        page that *link* points at."""
        url = re.sub(r'\s+', '', link.attrib['href'])
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        comm = Committee('upper', name)
        comm.add_source(url)

        for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
            member = member_link.text_content().strip()
            member = re.sub(r'^Delegate\s+', '', member)
            # Role text sits in the element right after the name link.
            role = member_link.getnext().text or 'member'
            comm.add_member(member, role.strip())

        return comm
    def scrape_joint_committees(self, term, session):
        """Scrape Delaware joint committees from the sidebar listing.

        Follows each 'Joint Committees' link, scrubs titles and party
        tags from member names, and extracts Chair/Vice-Chair roles.
        """
        url = "http://legis.delaware.gov/legislature.nsf/testside.html?OpenPage&BaseTarget=right"
        page = self.lxmlize(url)
        joint_comms = page.xpath("//a[text()='Joint Committees']")
        comm_list = joint_comms[0].getnext()
        for li in comm_list.xpath("./li/a"):
            comm_name = li.text
            comm_link = li.attrib["href"]

            # Special-cased members page for the Sunset Committee; the
            # assert trips once the listed link matches, signalling the
            # special case can be deleted.
            if comm_name.strip(
            ) == "Sunset":  # hard-coded workaround for this committee
                new_link = "http://legis.delaware.gov/Sunset/"\
                    "Sunset.nsf/general+Info/JSC+Members?opendocument"
                assert new_link != comm_link, "Remove Sunset Committee special casing"
                comm_link = new_link

            committee = Committee("joint", comm_name)
            committee.add_source(comm_link)
            comm_page = self.lxmlize(comm_link)
            people = comm_page.xpath("//a/b")
            # Titles and leadership positions to scrub out of names.
            things_to_replace = [
                "Senator", "Representative", "(D)", "(R)",
                "House Minority Whip", "House Majority Whip",
                "Senate Minority Whip", "Senate Majority Whip",
                "House Minority Leader", "House Majority Leader",
                "Senate Minority Leader", "Senate Majority Leader",
                "President Pro Tempore", "Speaker of the House"
            ]
            for person in people:
                person_name = person.text_content()
                for thing in things_to_replace:
                    person_name = person_name.replace(thing, "")
                person_name = person_name.strip().strip(",")
                role = "Member"
                # A role can appear as a trailing "(Role)" or as a
                # ", Chair" / ", Vice-Chair" suffix.
                if person_name.strip()[-1] == ")":
                    person_name, role = person_name.rsplit("(", 1)
                    role = role.replace(")", "").strip()
                elif ", Vice-Chair" in person_name:
                    role = "Vice-Chair"
                    person_name = person_name.replace(", Vice-Chair", "")
                elif ", Chair" in person_name:
                    role = "Chair"
                    person_name = person_name.replace(", Chair", "")
                person_name = person_name.strip().strip(",").strip()
                committee.add_member(person_name, role)
            self.save_committee(committee)
Beispiel #10
0
    def scrape(self, chamber, term):
        """Scrape Vermont committees for the 2011-2012 term.

        Raises:
            NoDataForPeriod: for any other term.
        """
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]

        url = ('http://www.leg.state.vt.us/lms/legdir/comms.asp?Body=%s' %
               chamber_abbr)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for li in page.xpath("//li"):
                # Strip the room number from the committee name.
                # NOTE(review): li.text is None when the <li> begins
                # with a child element, which would make re.match raise;
                # li.text_content() would be safer — confirm against
                # live markup before changing.
                comm_name = re.match(r'[^\(]+', li.text).group(0).strip()

                # Strip chamber from beginning of committee name
                comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '',
                                   comm_name)

                # Normalize case of committee name.
                comm_name = comm_name.title()

                comm = Committee(chamber, comm_name)
                comm.add_source(url)

                for tr in li.xpath("../../following-sibling::tr"):
                    # Break when we reach the next committee
                    if tr.xpath("th/li"):
                        break

                    name = tr.xpath("string()").strip()

                    # Raw strings: '\w'/'\s'/'\.' are invalid escape
                    # sequences in a plain string literal.
                    match = re.search(
                        r'^([\w\s\.]+),\s+'
                        r'(Chair|Vice Chair|Ranking Member|Clerk)$', name)
                    if match:
                        name = match.group(1)
                        mtype = match.group(2).lower()
                    else:
                        mtype = 'member'

                    # Drop a trailing "of <town>" qualifier.
                    name = re.sub(r'of [\w\s\.]+$', '', name)

                    comm.add_member(name, mtype)

                self.save_committee(comm)
Beispiel #11
0
    def scrape_page(self, a, chamber, term):
        """Scrape the committee detail page linked from anchor *a*.

        Tries the member-link table layout first, then a plain-text
        table fallback; warns when neither layout yields any members.
        """
        page, text = self.lxmlize(a.attrib['href'])
        committee_name = a.text_content()
        # Raw string fix: '\(' is an invalid escape in a plain literal.
        twitter_ids = re.findall(r"setUser\('(.*)'\)", text)
        twitter_id = twitter_ids[0] if twitter_ids else None
        # NOTE(review): mapping ", Vice-Chair" to plain "member" looks
        # deliberate but is surprising — confirm before changing.
        roles = {", Chair": "chair", ", Vice-Chair": "member"}

        committee = Committee(chamber, committee_name, twitter=twitter_id)

        committee.add_source(a.attrib['href'])

        tables = page.xpath("//table[@width='545' or @width='540']")
        added = False

        for table in tables:
            people = table.xpath(
                ".//a[contains(@href, 'MemberDetailPage')]/text()")
            for person in [x.strip() for x in people]:
                role = "member"
                # Peel a recognized role suffix off the name.
                for flag in roles:
                    if person.endswith(flag):
                        role = roles[flag]
                        person = person[:-len(flag)].strip()
                committee.add_member(person, role)
                added = True

        if added:
            self.save_committee(committee)
            return

        # Fallback layout: a fixed-width table of bare names.
        tables = page.xpath("//table[@width='466']")
        added = False
        for table in tables:
            if "committee members" in table.text_content().lower():
                for person in table.xpath(".//td/text()"):
                    person = person.strip()
                    if person != "":
                        committee.add_member(person, "member")
                        added = True

        if added:
            self.save_committee(committee)
            return

        self.warning("Unable to scrape!")
Beispiel #12
0
    def scrape_senate_committee(self, name, url):
        """Scrape member assignments for a single senate committee."""
        # The assignments live on a sibling page of the default one.
        url = url.replace('Default.asp', 'Assignments.asp')

        committee = Committee('upper', name)
        committee.add_source(url)

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)
            member_links = page.xpath(
                '//table[@bordercolor="#EBEAEC"]/tr/td/font/a')
            for member_link in member_links:
                member = member_link.xpath('string()')
                committee.add_member(member.replace('Senator ', '').strip())

        self.save_committee(committee)
Beispiel #13
0
    def scrape_house_sub_committee(self, sub_committee_name, url):
        """Scrape members of a lower-chamber subcommittee."""
        find_expr = "//div[@class='col1']/ul[position()<3]/li"

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            com = Committee('lower', sub_committee_name)

            for el in page.xpath(find_expr):
                # Entries look like "Name" or "Name, Role".
                pieces = [p.strip() for p in el.text_content().split(',', 1)]
                member_name = pieces[0]
                role = pieces[1] if len(pieces) > 1 else 'member'
                if member_name != "":
                    com.add_member(member_name, role)

            com.add_source(url)
            self.save_committee(com)
Beispiel #14
0
    def scrape_senate_comm(self):
        """Scrape every non-blacklisted senate committee roster."""
        base = 'http://webserver.rilin.state.ri.us'

        linklist = self.scrape_comm_list('ComMemS')
        if linklist is None:
            return
        for a in linklist:
            comm_name = a.text
            self.log(comm_name)
            if comm_name in COMM_BLACKLIST:
                self.log("XXX: Blacklisted")
                continue
            url = base + a.attrib['href']
            self.log("url " + url)
            committee = Committee('upper', comm_name)
            self.add_members(committee, url)
            committee.add_source(url)
            self.save_committee(committee)
Beispiel #15
0
    def scrape_current(self, chamber, term):
        """Scrape current Kansas committees for *chamber* from the API."""
        if chamber == 'upper':
            com_types = ['special_committees', 'senate_committees']
        else:
            com_types = ['house_committees']

        with self.urlopen(ksapi.url + 'ctte/') as committee_request:
            committee_json = json.loads(committee_request)

            for com_type in com_types:
                for committee_data in committee_json['content'][com_type]:
                    # Special committees span both houses -> 'joint'.
                    if com_type == 'special_committees':
                        com_chamber = 'joint'
                    else:
                        com_chamber = chamber

                    committee = Committee(com_chamber, committee_data['TITLE'])

                    com_url = ksapi.url + 'ctte/%s/' % committee_data['KPID']
                    try:
                        detail_json = self.urlopen(com_url)
                    except scrapelib.HTTPError:
                        self.warning("error fetching committee %s" % com_url)
                        continue
                    details = json.loads(detail_json)['content']
                    for chair in details['CHAIR']:
                        committee.add_member(chair['FULLNAME'], 'chairman')
                    for vicechair in details['VICECHAIR']:
                        committee.add_member(vicechair['FULLNAME'],
                                             'vice-chairman')
                    for ranked in details['RMMEM']:
                        committee.add_member(ranked['FULLNAME'],
                                             'ranking member')
                    for member in details['MEMBERS']:
                        committee.add_member(member['FULLNAME'])

                    if not committee['members']:
                        self.warning('skipping blank committee %s' %
                                     committee_data['TITLE'])
                    else:
                        committee.add_source(com_url)
                        self.save_committee(committee)
Beispiel #16
0
    def scrape_upper_committee(self, name, url):
        """Scrape one NY senate committee membership page.

        Handles a separately-listed chair, names split across two
        adjacent links, and de-duplicates members already added.
        """
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        comm = Committee('upper', name)
        comm.add_source(url)

        member_div = page.xpath("//div[@class = 'committee-members']")[0]

        # The chair is listed apart from the members block.
        xpath = '//label[contains(., "Chair:")]/following-sibling::a/text()'
        chair = page.xpath(xpath)
        if chair:
            comm.add_member(chair.pop().strip(), 'chair')

        # Names already recorded (e.g. the chair), to avoid duplicates.
        seen = set([member['name'] for member in comm['members']])
        for link in member_div.xpath(".//a"):
            if not link.text:
                try:
                    # On one vice chair, the text was nested differently.
                    member = link[0].tail.strip()
                except (IndexError, AttributeError):
                    continue
            else:
                member = link.text.strip()

            next_elem = link.getnext()
            if (next_elem is not None and
                next_elem.tag == 'a' and
                next_elem.attrib['href'] == link.attrib['href']):
                # Sometimes NY is cool and splits names across a
                # couple links
                member = "%s %s" % (member, next_elem.text.strip())

            # Collapse internal whitespace before comparing/recording.
            member = re.sub(r'\s+', ' ', member)

            if member in seen or not member:
                continue
            seen.add(member)

            name, role = parse_name(member)
            comm.add_member(name, role)

        # Don't save committees that ended up with no members.
        if comm['members']:
            self.save_committee(comm)
    def scrape_senate_comm(self):
        """Scrape Maine senate committee rosters, skipping joint ones."""
        url = 'http://legisweb1.mainelegislature.org/wp/senate/legislative-committees/'
        listing = lxml.html.fromstring(self.get(url).text)
        for committee_url in listing.xpath('//address/a/@href'):

            # Exclude the committee listing document
            if committee_url.endswith('.docx'):
                continue

            html = self.get(committee_url).text
            page = lxml.html.fromstring(html)

            (committee_name, ) = page.xpath(
                '//h1[contains(@class, "entry-title")]/text()')
            committee_name = re.sub(r'\(.*?\)', "", committee_name)

            # Joint committees list both chambers; skip them here.
            if re.search(r'(?s)Committee Members.*Senate:.*House:.*', html):
                continue

            committee = Committee('upper', committee_name)
            committee.add_source(committee_url)

            members = (page.xpath('//address/a/text()') or
                       page.xpath('//p/a/text()'))
            for member in members:
                if member.isspace():
                    continue

                member = re.sub(r'^Senator ', "", member)
                member = re.sub(r' of .*', "", member)

                role = 'member'
                if member.endswith(", Chair"):
                    role = 'chair'
                    member = re.sub(r', Chair', "", member)

                committee.add_member(member, role)

            self.save_committee(committee)
Beispiel #18
0
    def scrape(self, term, chambers):
        """Build NJ committees from the legislature's Access databases.

        Reads the Committee table for names/metadata and the COMember
        table for active member assignments, then saves every
        committee that was built.
        """
        year_abr = term[0:4]

        self._init_mdb(year_abr)
        members_csv = self.access_to_csv('COMember')
        info_csv = self.access_to_csv('Committee')

        comm_dictionary = {}

        # Committee info database
        for rec in info_csv:
            abrv = rec["Code"]
            comm_name = rec["Description"]
            comm_type = rec["Type"]
            aide = rec["Aide"]
            contact_info = rec["Phone"]

            # "A..." codes are Assembly, "S..." are Senate.
            if abrv[0] == "A":
                chamber = "lower"
            elif abrv[0] == "S":
                chamber = "upper"

            comm = Committee(chamber, comm_name, comm_type=comm_type,
                             aide=aide, contact_info=contact_info)
            comm.add_source('http://www.njleg.state.nj.us/downloads.asp')
            comm_dictionary[abrv] = comm

        # Committee member database
        POSITIONS = {
            'C': 'chair',
            'V': 'vice-chair',
            '': 'member'
        }
        for member_rec in members_csv:
            # assignment=P means they are active, assignment=R means removed
            if member_rec['Assignment_to_Committee'] == 'P':
                comm = comm_dictionary[member_rec["Code"]]
                leg = member_rec["Member"]
                role = POSITIONS[member_rec["Position_on_Committee"]]
                comm.add_member(leg, role=role)

        # BUG FIX: the save call previously sat outside the loop and
        # saved only the last committee touched; save them all.
        for comm in comm_dictionary.values():
            self.save_committee(comm)
Beispiel #19
0
    def scrape(self, chamber, term):
        """Scrape NC standing and select committees for *chamber*."""
        base_url = 'http://www.ncga.state.nc.us/gascripts/Committees/Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails='

        ctypes_by_chamber = {
            'upper': ['Senate%20Standing', 'Senate%20Select'],
            'lower': ['House%20Standing', 'House%20Select']
        }

        for ctype in ctypes_by_chamber[chamber]:
            listing_url = base_url + ctype
            with self.urlopen(listing_url) as data:
                doc = lxml.html.fromstring(data)
                doc.make_links_absolute(listing_url)
                for link in doc.xpath('//ul/li/a'):
                    url = link.get('href')
                    committee = Committee(chamber, link.text)
                    self.scrape_committee(committee, url)
                    committee.add_source(url)
                    self.save_committee(committee)
Beispiel #20
0
 def scrape_committees(self, chamber):
     """Scrape committee rosters for *chamber* from the listing page.

     Each row yields committee attributes (zipped against _TD_ONE or
     _TD_TWO depending on field count) plus member names, pairing a
     'Chair'/'Vice Chair' label with the name that follows it.
     """
     url = _COMMITTEE_URL % _CHAMBERS[chamber]
     page = self.get(url).text
     html = lxml.html.fromstring(page)
     table = html.xpath(
         'body/section[2]/div/div/div/section[2]/div[2]/div/div/div/div'
     )[1:]
     for row in table:
         # committee name, description, hours of operation,
         # secretary and office_phone
         text = list(row[0].xpath('div')[0].itertext())
         # NOTE(review): encode() yields bytes on Python 3; the cleanup
         # below assumes Python 2 str semantics — confirm the runtime.
         attributes = [
             list(
                 value.replace(u'\xa0', ' ').replace(
                     'Secretary:', '').encode('ascii', 'ignore')
                 for value in text if 'Email:' not in value
                 and value != '\n' and 'Phone:' not in value)
         ]
         for i in range(len(attributes[0])):
             if 'Room' in attributes[0][i]:
                 # Drop the room suffix from location-bearing fields.
                 attributes[0][i] = attributes[0][i].split(
                     'Room')[0].replace(', ', ' ')
         if len(attributes[0]) > 5:
             com = dict(zip(_TD_ONE, attributes[0]))
         else:
             com = dict(zip(_TD_TWO, attributes[0]))
         committee = Committee(chamber, **com)
         committee.add_source(url)
         # membership
         for td in row[1].xpath('div'):
             td_text = list(td.itertext())
             members = list(
                 value for value in td_text
                 if value != ' ' and value != '\n' and value != ',')
         # NOTE(review): `members` keeps only the LAST div's values (the
         # loop above overwrites it each pass, and it is unbound if the
         # row has no divs) — verify each row has exactly one member div.
         role = "member"
         for member in members:
             if (member in ['Chair', 'Vice Chair']):
                 # The label applies to the next name in the list.
                 role = member.lower()
                 continue
             else:
                 committee.add_member(member, role=role)
                 role = "member"
         self.save_committee(committee)
Beispiel #21
0
    def scrape_session(self, term, chambers, session):
        """Scrape all committees for *session* via the SOAP service.

        Committees whose code was already seen are looked up in
        ``self.ctty_cache`` so a committee appearing under both
        chambers can be re-tagged as 'joint' instead of duplicated.
        """
        sid = self.metadata['session_details'][session]['_guid']
        committees = self.cservice.GetCommitteesBySession(
            sid)['CommitteeListing']
        for committee in committees:
            cid = committee['Id']
            # Fetch the full record; the listing only carries the id.
            committee = self.cservice.GetCommittee(cid)

            name, typ, guid, code, description = [
                committee[x]
                for x in ['Name', 'Type', 'Id', 'Code', 'Description']
            ]
            chamber = {
                "House": "lower",
                "Senate": "upper",
                "Joint": "joint"
            }[typ]
            ctty = None
            if code in self.ctty_cache:
                ctty = self.ctty_cache[code]
                # A cached committee seen under a different chamber whose
                # description mentions "joint" is promoted to 'joint';
                # otherwise fall through and build a fresh object below.
                if (ctty['chamber'] != chamber) and (description and 'joint'
                                                     in description.lower()):
                    ctty['chamber'] = 'joint'
                else:
                    ctty = None

            if ctty is None:
                ctty = Committee(chamber,
                                 name,
                                 code=code,
                                 _guid=guid,
                                 description=description)
                self.ctty_cache[code] = ctty

            members = committee['Members']['CommitteeMember']
            for member in members:
                # Member names arrive as a structured Name mapping.
                name = "{First} {Last}".format(
                    **dict(member['Member']['Name']))
                role = member['Role']
                ctty.add_member(name, role, _guid=member['Member']['Id'])

            ctty.add_source(self.csource)
            self.save_committee(ctty)
Beispiel #22
0
    def scrape_committee(self, chamber, url):
        """Scrape one committee page: ranking members, then regular members."""
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            committee_name = doc.xpath(
                '//span[@class="committeeShortName"]/text()')[0]
            committee = Committee(chamber, committee_name)
            committee.add_source(url)

            # Ranking-member titles and names are parallel lists in the
            # same document order, so zip them together.
            member_titles = doc.xpath('//p[@class="rankingMemberTitle"]/text()')
            member_names = doc.xpath('//p[@class="rankingMemberName"]/a/text()')
            for member_title, member_name in zip(member_titles, member_names):
                committee.add_member(member_name, member_title)

            # Everyone else gets the default role.
            for member_name in doc.xpath(
                    '//div[@class="committeeRegularMembers"]//a/text()'):
                committee.add_member(member_name)

            self.save_committee(committee)
Beispiel #23
0
    def get_committee_obj(self):
        """Build, populate, and return this page's upper-chamber Committee."""
        name = self.get_name()
        # NOTE(review): the result is unused, but the call is kept in case
        # get_url() has side effects relied on elsewhere.
        url = self.get_url()
        parent_name = self.get_parent_name()

        # A parent name means this page describes a subcommittee.
        if parent_name is None:
            committee_name, subcommittee = name, None
        else:
            committee_name, subcommittee = parent_name, name

        self.committee = Committee('upper', committee_name,
                                   subcommittee=subcommittee)
        self.add_members()
        self.add_sources()
        return self.committee
Beispiel #24
0
    def _scrape_lower_standing_committee(self, committee_name, url):
        """Scrape the member roster of one House standing committee."""
        doc = self.lxmlize(url)

        comm = Committee('lower', committee_name)
        comm.add_source(url)

        member_rows = doc.xpath('//table[@id="body_ListView1_itemPlaceholder'
            'Container"]/tr[@class="linkStyle2"]')

        for member_row in member_rows:
            raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
            # Re-join the name parts produced by name_tools, dropping
            # any empty fragments.
            cleaned_name = ' '.join(
                part for part in name_tools.split(raw_name) if part)
            raw_role = member_row.xpath('normalize-space(string(./td[2]))')
            comm.add_member(cleaned_name,
                            self._normalize_member_role(raw_role))

        self.save_committee(comm)
Beispiel #25
0
    def scrape_committee(self, chamber, name, url):
        """Scrape one committee's roster, reclassifying joint committees."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            # A "Joint Committee" heading on the page overrides the
            # chamber the caller passed in.
            if page.xpath("//h3[. = 'Joint Committee']"):
                chamber = 'joint'

            committee = Committee(chamber, name)
            committee.add_source(url)

            for anchor in page.xpath("//a[contains(@href, 'member=')]"):
                member_name = anchor.text.strip()

                # The role label lives in the table cell just before the
                # anchor's parent; normalize punctuation and case.
                role = anchor.xpath("string(../preceding-sibling::td[1])")
                role = role.strip(": \r\n\t").lower()

                committee.add_member(member_name, role)

            self.save_committee(committee)
Beispiel #26
0
    def scrape_upper_committee(self, url):
        """Scrape one Senate committee page; save only if members exist."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            comm_name = page.xpath("string(//div[@class='contentheading'])")
            committee = Committee('upper', comm_name)
            committee.add_source(url)

            for cell in page.xpath('//table/tr/td'):
                raw = cell.xpath('string(a[@class="senatorLN"])')
                # Keep only the text before the parenthesized party/district
                # marker, e.g. "Smith, John (R)" -> "Smith, John".
                paren = raw.find('(')
                member = raw[0:paren].strip()
                if member:
                    committee.add_member(member)

            # Pages with no senator links yield an empty roster; skip those.
            if committee['members']:
                self.save_committee(committee)
Beispiel #27
0
    def scrape(self, chamber, term):
        """Scrape KY standing (and joint interim/statutory) committees."""
        if chamber == 'upper':
            urls = ["http://www.lrc.ky.gov/committee/standing_senate.htm"]
            # also invoke joint scraper
            self.scrape('joint', term)
        elif chamber == 'lower':
            urls = ["http://www.lrc.ky.gov/committee/standing_house.htm"]
        else:
            urls = [
                "http://www.lrc.ky.gov/committee/interim.htm",
                "http://www.lrc.ky.gov/committee/statutory.htm"
            ]
            chamber = 'joint'

        # Committee links appear under several different URL patterns.
        link_xpaths = [
            "//a[contains(@href, 'standing/')]",
            "//a[contains(@href, 'interim')]",
            "//a[contains(@href, 'statutory')]"
        ]

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            links = []
            for xpath in link_xpaths:
                links.extend(doc.xpath(xpath))

            for link in links:
                # Drop the trailing "(H)"/"(S)" chamber marker and any
                # periods, then title-case the name.
                name = re.sub(r'\s+\((H|S)\)$', '', link.text).strip().title()
                name = name.replace(".", "")
                comm = Committee(chamber, name)
                members_url = link.attrib['href'].replace('home.htm',
                                                          'members.htm')
                self.scrape_members(comm, members_url)
                # Only save committees that actually listed members.
                if comm['members']:
                    self.save_committee(comm)
    def scrape_house_committees(self):
        """Scrape Michigan House committees from the committee drop-down.

        Iterates the listing page's <option> values, fetches each
        committee's detail page, and records members with their roles.
        """
        base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
        html = self.get('http://house.mi.gov/mhrpublic/committee.aspx').text
        doc = lxml.html.fromstring(html)

        # get values out of drop down
        for opt in doc.xpath('//option'):
            name = opt.text
            # skip invalid choice
            if opt.text in ('Statutory Committees', 'Select One'):
                continue
            if 'have not been created' in opt.text:
                self.warning('no committees yet for the house')
                return
            com_url = base_url + opt.get('value')
            com_html = self.get(com_url).text
            cdoc = lxml.html.fromstring(com_html)
            com = Committee(chamber='lower', committee=name)
            com.add_source(com_url)

            # (Removed a dead loop that scanned the *listing* page's
            # "memberLink" anchors and clobbered `name` without using the
            # result; it could also crash on anchors with no text.)

            # Each memberPanelRow div on the detail page holds one
            # member's link plus a span describing their role.
            members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
            for mem in members:
                name = mem.xpath('./a')
                if name:
                    name = name[0].text.strip()
                else:
                    # this is a blank row
                    continue
                text = mem.xpath('./span')[0].text
                if 'Committee Chair' in text:
                    role = 'chair'
                elif 'Vice-Chair' in text:
                    role = 'vice chair'
                else:
                    role = 'member'
                com.add_member(name, role=role)

            self.save_committee(com)
Beispiel #29
0
    def get_jmfc(self, name, url):
        """Gets the Joint Millennium Fund Committee info.

        Each row of the page's third table pairs a Senate member
        ("Sen. Last, First") with a House member ("Rep. Last, First").
        """
        jfmc_page = self.urlopen(url)
        html = lxml.html.fromstring(jfmc_page)
        committee = Committee('joint', name)
        table = html.xpath('//table')[2]
        for row in table.xpath('tbody/tr'):
            senate, house = [ td.text.replace('\r\n', ' ').replace(u'\xa0', ' ') \
                              for td in row.xpath('td') ]

            # BUG FIX: the old code used str.strip('Sen.')/str.strip('Rep.'),
            # which strips the *character set* (e.g. {'S','e','n','.'}) from
            # BOTH ends and corrupts names ending in those letters
            # ("Nelson" -> "Nelso").  Remove the title prefix explicitly.
            if senate.startswith('Sen.'):
                senate = senate[len('Sen.'):]
            if house.startswith('Rep.'):
                house = house[len('Rep.'):]

            sen_data = senate.strip().split(',')
            hou_data = house.strip().split(',')

            # Only add entries that actually look like "Last, First".
            if len(sen_data) > 1 and sen_data[1].strip() != "":
                committee.add_member(*sen_data)
            if len(hou_data) > 1 and hou_data[1].strip() != "":
                committee.add_member(*hou_data)

        committee.add_source(url)
        self.save_committee(committee)
Beispiel #30
0
    def scrape(self, chamber, term):
        """Scrape KY standing committees for the upper or lower chamber."""
        chamber_urls = {
            'upper':
                "http://www.lrc.ky.gov/org_adm/committe/standing_senate.htm",
            'lower':
                "http://www.lrc.ky.gov/org_adm/committe/standing_house.htm",
        }
        # Joint committees are not handled by this scraper.
        if chamber not in chamber_urls:
            return
        url = chamber_urls[chamber]

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath("//a[contains(@href, 'standing/')]"):
                # Drop the trailing "(H)"/"(S)" chamber marker.
                name = re.sub(r'\s+\((H|S)\)$', '', link.text).strip()
                comm = Committee(chamber, name)
                members_url = link.attrib['href'].replace('home.htm',
                                                          'members.htm')
                self.scrape_members(comm, members_url)
                self.save_committee(comm)