Example #1
    def scrape_old_legislators(self, chamber, session):
        """
        Scrape pre-2009 legislators.
        """
        if chamber == "upper":
            chamber_name = "Senate"
        else:
            chamber_name = "House"

        if int(session) < 2008:
            filename = "district.htm"
        else:
            filename = "MembersDistrict.htm"

        leg_list_url = "http://legis.state.sd.us/sessions/%s/%s" % (session, filename)
        leg_list = self.soup_parser(self.urlopen(leg_list_url))

        for district_str in leg_list.findAll("h2"):
            district = district_str.contents[0].split(" ")[1].lstrip("0")

            for row in district_str.findNext("table").findAll("tr")[1:]:
                if row.findAll("td")[1].contents[0].strip() != chamber_name:
                    continue

                full_name = row.td.a.contents[0].strip()

                party = row.findAll("td")[3].contents[0].strip()
                occupation = row.findAll("td")[4].contents[0].strip()

                legislator = Legislator(session, chamber, district, full_name, party=party, occupation=occupation)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
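
Every example on this page follows the same three-step lifecycle: build a Legislator record, attach the URL it was scraped from with add_source(), and persist it with save_legislator(). A minimal stand-in sketch of that contract (the real Legislator class comes from the scraper framework these snippets import; the constructor fields here are inferred from the calls in the examples):

class Legislator(dict):
    # Stand-in for the framework's Legislator record.
    def __init__(self, session, chamber, district, full_name, **kwargs):
        super(Legislator, self).__init__(
            session=session, chamber=chamber, district=district,
            full_name=full_name, sources=[], **kwargs)

    def add_source(self, url):
        # record the page this legislator's data was scraped from
        self['sources'].append(url)
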
Example #2
    def scrape_new_legislators(self, chamber, session):
        """
        Scrape legislators from 2009 and later.
        """

        if chamber == "upper":
            search = "Senate Members"
        else:
            search = "House Members"

        leg_list_url = "http://legis.state.sd.us/sessions/%s/MemberMenu.aspx" % session
        leg_list = self.soup_parser(self.urlopen(leg_list_url))

        list_div = leg_list.find(text=search).findNext("div")

        for link in list_div.findAll("a"):
            full_name = link.contents[0].strip()

            leg_page_url = "http://legis.state.sd.us/sessions/%s/%s" % (session, link["href"])
            leg_page = self.soup_parser(self.urlopen(leg_page_url))

            party = leg_page.find(id="ctl00_contentMain_spanParty").contents[0].strip()

            district = leg_page.find(id="ctl00_contentMain_spanDistrict").contents[0]
            district = district.strip().lstrip("0")

            occ_span = leg_page.find(id="ctl00_contentMain_spanOccupation")
            if len(occ_span.contents) > 0:
                occupation = occ_span.contents[0].strip()
            else:
                occupation = None

            legislator = Legislator(session, chamber, district, full_name, party=party, occupation=occupation)
            legislator.add_source(leg_page_url)
            self.save_legislator(legislator)
Example #3
    def scrape_senators(self, chamber, term):
        sen_url = 'http://www.ohiosenate.gov/directory.html'
        with self.urlopen(sen_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//table[@class="fullWidth"]/tr/td'):
                sen_link = el.xpath('a[@class="senatorLN"]')[1]

                full_name = sen_link.text
                full_name = full_name[0:-2]
                if full_name == 'To Be Announced':
                    continue

                district = el.xpath('string(h3)').split()[1]

                party = el.xpath('string(a[@class="senatorLN"]/span)')

                if party == "D":
                    party = "Democrat"
                elif party == "R":
                    party = "Republican"

                leg = Legislator(term, chamber, district, full_name,
                        '', '', '', party)
                leg.add_source(sen_url)

                self.save_legislator(leg)
Example #4
    def scrape_senate(self, term):
        url = 'http://www.senate.leg.state.mn.us/members/member_list.php'

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            for row in doc.xpath('//tr'):
                tds = row.xpath('td')
                if len(tds) == 5 and tds[1].text_content() in self._parties:
                    district = tds[0].text_content()
                    party = tds[1].text_content()
                    name_a = tds[2].xpath('a')[0]
                    name = name_a.text.strip()
                    addr, phone = tds[3].text_content().split(u'\xa0\xa0')
                    email = tds[4].text_content()

                    leg = Legislator(term, 'upper', district, name,
                                     party=self._parties[party],
                                     office_address=addr, office_phone=phone)

                    if '@' in email:
                        leg['email'] = email

                    leg.add_source(url)

                    self.save_legislator(leg)
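
The address/phone cell in the Minnesota table above separates its two values with a pair of non-breaking spaces (U+00A0), which is why the split key is u'\xa0\xa0' rather than ordinary whitespace. A quick illustration with made-up cell text:

addr, phone = u'100 Capitol Blvd.\xa0\xa0651-555-0100'.split(u'\xa0\xa0')
assert (addr, phone) == (u'100 Capitol Blvd.', u'651-555-0100')
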
Example #5
    def scrape(self, chamber, term):
        # this beautiful page is loaded from the council page via AJAX
        url = 'http://www.dccouncil.washington.dc.us/include/linkedpage.aspx?linkedpage=2&page=17'

        # do nothing if they're trying to get a lower chamber
        if chamber == 'lower':
            return

        with self.urlopen(url) as data:
            base_doc = lxml.html.fromstring(data)

            for link in base_doc.xpath('//a'):
                leg_url = 'http://www.dccouncil.washington.dc.us/' + link.get('href')
                with self.urlopen(leg_url) as leg_html:
                    doc = lxml.html.fromstring(leg_html)
                    name = link.text

                    # Name, District
                    title = doc.get_element_by_id('PageTitle')
                    district = title.text.rsplit(', ')[-1]

                    # party
                    party = get_surrounding_block(doc, 'Political Affiliation')
                    if 'Democratic' in party:
                        party = 'Democratic'
                    else:
                        party = 'Independent'

                    legislator = Legislator(term, 'upper', district, name,
                                            party=party)
                    legislator.add_source(leg_url)
                self.save_legislator(legislator)
Example #6
    def scrape_senators(self, chamber, year):
        sen_url = 'http://www.ohiosenate.gov/directory.html'
        with self.urlopen(sen_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            for el in root.xpath('//table[@class="fullWidth"]/tr/td'):
                sen_link = el.xpath('a[@class="senatorLN"]')[1]
                full_name = sen_link.text
                full_name = full_name[:-2]
                district = el.xpath('string(h3)').split()[1]
                party = el.xpath('string(a[@class="senatorLN"]/span)')

                first_name = full_name.split()[0]
                last_name = full_name.split()[1]
                middle_name = ''

                leg = Legislator('128', chamber, district, full_name, 
                        first_name, last_name, middle_name, party)
                leg.add_source(sen_url)

                self.save_legislator(leg)
Example #7
    def scrape_reps(self, chamber, session, term_name):
        # There are only 99 districts
        for district in range(1, 100):
            rep_url = 'http://www.house.state.oh.us/components/com_displaymembers/page.php?district=' + str(district)
            with self.urlopen(rep_url) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                for el in root.xpath('//table[@class="page"]'):
                    rep_link = el.xpath('tr/td/title')[0]
                    full_name = rep_link.text
                    party = full_name[-2]
                    full_name = full_name[:-3]
                    first_name = ""
                    last_name = ""
                    middle_name = ""

                    if party == "D":
                        party = "Democrat"
                    elif party == "R":
                        party = "Republican"

                    leg = Legislator(term_name, chamber, str(district), full_name, first_name, last_name, middle_name, party)
                    leg.add_source(rep_url)
                    # save inside the loop so every matched representative is kept
                    self.save_legislator(leg)
Example #8
    def scrape_legislator_data(self, url, chamber):
        with self.lxml_context(url) as page:
            legislator_table = page.get_element_by_id("ctl00_PlaceHolderMain_dlMembers")
            legislators = legislator_table.cssselect('a')
            for legislator in legislators:
                name = legislator.text_content()
                full_name, first_name, middle_name, last_name = self.separate_name(name)
                name_for_url = last_name.lower()
                name_for_url = re.sub("'", "", name_for_url)
        
                if chamber == 'upper':
                    legislator_page_url = "http://www.leg.wa.gov/senate/senators/Pages/" + name_for_url + ".aspx"
                else: 
                    legislator_page_url = "http://www.leg.wa.gov/house/representatives/Pages/" + name_for_url + ".aspx"

                with self.lxml_context(legislator_page_url) as legislator_page:
                    try:
                        full_name, first_name, middle_name, last_name = self.scrape_legislator_name(legislator_page)
                    except Exception:
                        # name lookup failed; bail out of the member loop
                        break
    
                    party_element = legislator_page.get_element_by_id("ctl00_PlaceHolderMain_lblParty")
                    
                    if party_element.text_content() == '(R)':
                        party = 'Republican'
                    else:
                        party = 'Democrat'
  
                    district_element = legislator_page.get_element_by_id("ctl00_PlaceHolderMain_hlDistrict")
                    district = district_element.text_content()        
               
                    legislator = Legislator('2009-2010', chamber, district, full_name, "", "", "", party)
                    legislator.add_source(legislator_page_url)
                    self.save_legislator(legislator)
Example #9
    def scrape_legislator_data(self, chamber, session):
        with self.urlopen(house_url(chamber)) as page_html:
            page = lxml.html.fromstring(page_html)
            legislator_table = page.get_element_by_id("ctl00_PlaceHolderMain_dlMembers")
            legislators = legislator_table.cssselect('a')
            for legislator in legislators:
                name = legislator.text_content()
                full_name, first_name, middle_name, last_name = separate_name(name)
                name_for_url = last_name.lower()
                name_for_url = re.sub("'", "", name_for_url)
        
                legislator_page_url = legs_url(chamber, name_for_url)

                with self.urlopen(legislator_page_url) as legislator_page_html:
                    legislator_page = lxml.html.fromstring(legislator_page_html)
                    try:
                        full_name, first_name, middle_name, last_name = self.scrape_legislator_name(legislator_page)
                    except Exception:
                        # name lookup failed; bail out of the member loop
                        break
    
                    party_element = legislator_page.get_element_by_id("ctl00_PlaceHolderMain_lblParty")
                    
                    if party_element.text_content() == '(R)':
                        party = 'Republican'
                    else:
                        party = 'Democrat'
  
                    district_element = legislator_page.get_element_by_id("ctl00_PlaceHolderMain_hlDistrict")
                    district = district_element.text_content()        
               
                    legislator = Legislator(session, chamber, district, full_name, "", "", "", party)
                    legislator.add_source(legislator_page_url)
                    self.save_legislator(legislator)
Example #10
    def scrape_reps(self, chamber, session):
        rep_url = 'http://www.maine.gov/legis/house/dist_mem.htm'

        with self.urlopen(rep_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            # There are 151 districts
            for district in range(1, 152):

                if (district % 10) == 0:
                    path = 'string(/html/body/p[%s]/a[3])' % (district+4)
                else:
                    path = 'string(/html/body/p[%s]/a[2])' % (district+4)
                name = root.xpath(path)

                if len(name) > 0:
                    if name.split()[0] != 'District':
                        mark = name.find('(')
                        party = name[mark + 1]
                        name = name[15 : mark]

                        firstname = ""
                        lastname = ""
                        middlename = ""

                        if party == "V":
                            name = "Vacant"

                        leg = Legislator(session, chamber, district, name, firstname, lastname, middlename, party)
                        leg.add_source(rep_url)
                        self.save_legislator(leg)
Example #11
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            with self.urlopen(url) as details_page:
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                party = root.xpath('string(//party)')
                district = root.xpath('string(//district)')
                first_name, middle_name, last_name = "", "", ""

                home_phone = root.xpath('string(//h_phone)')
                bis_phone = root.xpath('string(//b_phone)')
                capital_phone = root.xpath('string(//cap_phone)')
                other_phone = root.xpath('string(//oth_phone)')
                org_info = root.xpath('string(//org_info)')
                email_name = root.xpath('string(//email_address)')
                email = '%s@%s.ms.gov' % (email_name, chamber)
                if party == 'D':
                    party = 'Democratic'
                else:
                    party = 'Republican'

                leg = Legislator(term, chamber, district, leg_name, first_name,
                                 last_name, middle_name, party, role=role,
                                 home_phone=home_phone, bis_phone=bis_phone,
                                 capital_phone=capital_phone,
                                 other_phone=other_phone, org_info=org_info,
                                 email=email)
                leg.add_source(url)
                self.save_legislator(leg)
        except scrapelib.HTTPError as e:
            self.warning(str(e))
Example #12
    def scrape(self, chamber, year):
        session = "%d-%d" % (int(year), int(year) + 1)

        url = "http://www.ncga.state.nc.us/gascripts/members/"\
            "memberList.pl?sChamber="

        if chamber == 'lower':
            url += 'House'
        else:
            url += 'Senate'

        with self.urlopen(url) as (resp, data):
            leg_list = self.soup_parser(data)
            leg_table = leg_list.find('div', id='mainBody').find('table')

            for row in leg_table.findAll('tr')[1:]:
                party = row.td.contents[0].strip()
                if party == 'Dem':
                    party = 'Democrat'
                elif party == 'Rep':
                    party = 'Republican'

                district = row.findAll('td')[1].contents[0].strip()
                full_name = row.findAll('td')[2].a.contents[0].strip()
                full_name = full_name.replace(u'\u00a0', ' ')
                (first_name, last_name, middle_name, suffix) = split_name(
                    full_name)

                legislator = Legislator(session, chamber, district, full_name,
                                        first_name, last_name, middle_name,
                                        party, suffix=suffix)
                legislator.add_source(url)
                self.save_legislator(legislator)
Example #13
    def scrape_reps(self, year):
        if year != '2009':
            return

        leg_page_url = "http://www.flhouse.gov/Sections/Representatives/"\
            "representatives.aspx"
        leg_page = BeautifulSoup(self.urlopen(leg_page_url))

        table = leg_page.find('table',
                              id='ctl00_ContentPlaceHolder1_ctrlContentBox'\
                                  '_ctrlPageContent_ctl00_dgLegislators')

        for row in table.findAll('tr')[1:]:
            full = row.findAll('td')[1].a.contents[0].replace('  ', ' ')

            district = row.findAll('td')[3].contents[0]
            party = row.findAll('td')[2].contents[0]

            if party == 'D':
                party = 'Democrat'
            elif party == 'R':
                party = 'Republican'

            leg = Legislator(year, 'lower', district, full, party=party)
            leg.add_source(leg_page_url)
            self.save_legislator(leg)
Example #14
    def scrape_rep(self, name, term, url):
        # special-case names that confuse name_tools
        if name == 'Franklin, A.B.':
            name = 'Franklin, A. B.'
        elif ', Jr., ' in name:
            # str.replace returns a new string, so the result must be re-assigned
            name = name.replace(', Jr., ', ' ')
            name += ', Jr.'
        elif ', III, ' in name:
            name = name.replace(', III, ', ' ')
            name += ', III'

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            district = page.xpath(
                "//a[contains(@href, 'Maps')]")[0].attrib['href']
            district = re.search(r"district(\d+)\.pdf", district).group(1)

            if "Democrat&nbsp;District" in text:
                party = "Democratic"
            elif "Republican&nbsp;District" in text:
                party = "Republican"
            elif "Independent&nbsp;District" in text:
                party = "Independent"
            else:
                party = "Other"

            leg = Legislator(term, 'lower', district, name, party=party)
            leg.add_source(url)
            self.save_legislator(leg)
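
Python strings are immutable, so replace() returns a new string instead of modifying the receiver, and the result has to be re-assigned, as the ', Jr., ' handling above does. A quick illustration:

name = 'Franklin, Jr., A.B.'
name.replace(', Jr., ', ' ')         # returns a new string; name is unchanged
name = name.replace(', Jr., ', ' ')  # re-assigning keeps the change
assert name == 'Franklin A.B.'
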
Example #15
    def scrape(self, chamber, year):
        year = int(year)
        session = internal_sessions[year][0][1]
        # iterating through subsessions would be a better way to do this..
        if year % 2 == 0 and (year != dt.date.today().year and year + 1 != dt.date.today().year):
            raise NoDataForYear(year)

        if chamber == 'upper':
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
        else:
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

        #body = unicode(self.urlopen(url), 'latin-1')
        with self.urlopen(url) as body:
            page = lxml.html.fromstring(body)

            for row in page.cssselect("#ctl00_C_dgLegData tr"):
                if len(row.cssselect("td a")) > 0:
                    rep_url = list(row)[0].cssselect("a[href]")[0].get("href")

                    legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)', list(row)[0].text_content())
                    if legpart:
                        full_name, party = legpart[0]

                        district = str(int(list(row)[2].text_content()))

                        leg = Legislator(session, chamber, district, full_name,
                                         party=party)
                        leg.add_source(rep_url)

                        leg = self.add_committees(leg, rep_url, session)
                        self.save_legislator(leg)
Example #16
    def scrape(self, chamber, term):
        self.validate_term(term)

        if chamber == 'lower':
            title = 'Representative'
        else:
            title = 'Senator'

        url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % term
        leg_list = self.soup_parser(self.urlopen(url))

        for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
            tds = row.findAll('td')

            leg_title = tds[1].find(text=True)
            if leg_title == title:
                fullname = tds[0].find(text=True)
                last_name = fullname.split(',')[0]
                first_name = fullname.split(' ')[1]
                middle_name = ''
                if len(fullname.split(' ')) > 2:
                    middle_name = fullname.split(' ')[2]

                leg = Legislator(term, chamber, tds[3].find(text=True),
                                 fullname, first_name, last_name,
                                 middle_name, tds[2].find(text=True))
                leg.add_source(url)
                self.save_legislator(leg)
Example #17
    def scrape_legislator_data(self, url, chamber):
        party_fulls = {'R': 'Republican', 'D': 'Democrat'}
        with self.urlopen(url) as page:
            page = BeautifulSoup(page)
            for data in page.find('table', id='ctl00_mainCopy_DataList1')('td'):
                spans = data('span')
                if len(spans) == 0:
                    self.debug('Found an empty cell in %s. Continuing' % url)
                    continue
                full_name = ' '.join([span.string.strip() for span in spans])
                if len(spans[0].string.strip().split()) == 2:
                    first_name, middle_name = spans[0].string.strip().split()
                else:
                    first_name, middle_name = spans[0].string.strip(), ''
                last_name = spans[1].string.strip()

                details_url = get_abs_url(url, data.find('a')['href'])
                with self.urlopen(details_url) as details:
                    details = BeautifulSoup(details)
                    district = details.find('a', id='ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                    party = party_fulls[details.find('span', id='ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                    leg = Legislator('2010', chamber, district, full_name,
                                     first_name, last_name, middle_name, party)
                    leg.add_source(details_url)

                    comms_table = details.find('table', id='ctl00_mainCopy_MembershipGrid')
                    for comms_raw_data in comms_table('tr')[1:]:
                        comm_data = comms_raw_data('td')
                        comm_role_type = comm_data[0].string.strip()
                        comm_name = comm_data[1]('a')[0].string.strip()
                        leg.add_role(comm_role_type, '2010', chamber=chamber, committee=comm_name)

                    self.save_legislator(leg)
Example #18
    def scrape(self, chamber, term):
        urls = {'upper': 'http://www.legislature.state.al.us/senate/senators/senateroster_alpha.html',
                'lower': 'http://www.legislature.state.al.us/house/representatives/houseroster_alpha.html'}

        url = urls[chamber]

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            for row in doc.xpath('//strong[starts-with(text(), "MEMBERS")]/following-sibling::table/tr')[1:]:
                name, party, district, office, phone = row.getchildren()

                # if the name column contains a link it isn't vacant
                link = name.xpath('a')
                if link:
                    name = name.text_content()
                    name = ' '.join(normalize_name(name))

                    party = party.text_content()
                    district = district.text_content()
                    office = office.text_content()
                    phone = phone.text_content()

                    leg = Legislator(term, chamber, district, name, party,
                                     phone=phone, office=office,
                                     url=link[0].get('href'))
                    leg.add_source(url)
                    self.save_legislator(leg)
Example #19
    def scrape(self, chamber, term):
        url = self.urls[term][chamber]

        if url is None:
            raise NoDataForPeriod(term)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for row in page.xpath("//tr")[1:]:
                name = row.xpath("td")[0].text_content()
                name = name.split(",")
                if len(name) == 2:
                    fullname = "%s %s" % (name[1].strip(), name[0].strip())
                elif len(name) == 3:
                    fullname = "%s %s, %s" % (name[1].strip(), name[0].strip(), name[2].strip())
                else:
                    fullname = " ".join(name).strip()

                # The most recent General Assembly's legislator list is formatted slightly differently from archived versions
                if term == "106th General Assembly":
                    party = row.xpath("td")[1].text_content().strip()
                    district = row.xpath("td")[3].text_content().replace("District ", "").strip()
                else:
                    party, district = row.xpath("td")[1].text_content().split("-")
                    party = party.strip()
                    district = district.strip()

                leg = Legislator(term, chamber, district, fullname, party=party)
                leg.add_source(url)
                self.save_legislator(leg)
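
The comma-split logic above turns directory-style "Last, First[, Suffix]" names back into display order. For example:

name = 'Doe, Jane, Jr.'.split(',')
fullname = "%s %s, %s" % (name[1].strip(), name[0].strip(), name[2].strip())
assert fullname == 'Jane Doe, Jr.'
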
Example #20
    def scrape(self, chamber, year):
        found = False
        for session in metadata['sessions']:
            if session['name'] == year:
                found = True
                break
        if not found:
            raise NoDataForYear(year)

        if chamber == 'lower':
            title = 'Representative'
        else:
            title = 'Senator'

        url = 'http://www.le.state.ut.us/asp/roster/roster.asp?year=%s' % year
        leg_list = self.soup_parser(self.urlopen(url))

        for row in leg_list.findAll('table')[1].findAll('tr')[1:]:
            tds = row.findAll('td')

            leg_title = tds[1].find(text=True)
            if leg_title == title:
                fullname = tds[0].find(text=True)
                last_name = fullname.split(',')[0]
                first_name = fullname.split(' ')[1]
                middle_name = ''
                if len(fullname.split(' ')) > 2:
                    middle_name = fullname.split(' ')[2]

                leg = Legislator(year, chamber, tds[3].find(text=True),
                                 fullname, first_name, last_name,
                                 middle_name, tds[2].find(text=True))
                leg.add_source(url)
                self.save_legislator(leg)
Example #21
    def fetch_member(self, url, name, term, chamber):
        party_map = {'R': 'Republican', 'D': 'Democratic', 'I': 'Independent'}
        party_district_re = re.compile(
            r'\((R|D|I)\) - (?:House|Senate) District\s+(\d+)')

        url = 'http://leg6.state.va.us' + url

        # handle resignations, special elections
        match = re.search(r'-(Resigned|Member) (\d{1,2}/\d{1,2})?', name)
        if match:
            action, date = match.groups()
            # split only on the last hyphen so hyphenated surnames survive
            name = name.rsplit('-', 1)[0]
            if action == 'Resigned':
                pass # TODO: set end date
            elif action == 'Member':
                pass # TODO: set start date

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            party_district_line = doc.xpath('//h3/font/text()')[0]
            party, district = party_district_re.match(party_district_line).groups()

            leg = Legislator(term, chamber, district, name.strip(),
                             party=party_map[party])
            leg.add_source(url)
            self.save_legislator(leg)
Example #22
    def scrape(self, chamber, term):
        self.validate_term(term)

        if chamber == 'upper':
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
        else:
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

        with self.urlopen(url) as body:
            page = lxml.html.fromstring(body)

            for row in page.cssselect("#ctl00_C_dgLegData tr"):
                if len(row.cssselect("td a")) > 0:
                    rep_url = list(row)[0].cssselect("a[href]")[0].get("href")

                    legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)', list(row)[0].text_content())
                    if legpart:
                        full_name, party = legpart[0]

                        district = str(int(list(row)[2].text_content()))

                        leg = Legislator(term, chamber, district, full_name,
                                         party=party)
                        leg.add_source(rep_url)

                        leg = self.add_committees(leg, rep_url, term)
                        self.save_legislator(leg)
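
The regex in this example captures the member's name and single-letter party code from a cell like "Smith, John (R)" in one pass. A small self-contained demonstration (the sample text is made up):

import re

legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)', 'Smith, John (R)')
assert legpart == [('Smith, John', 'R')]
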
Example #23
    def scrape(self, chamber, term):
        if term != '2011-2012':
            raise NoDataForPeriod(term)

        chamber_name = {'upper': 'Senate',
                        'lower': 'House'}[chamber]

        url = ("http://www.in.gov/cgi-bin/legislative/listing/"
               "listing-2.pl?data=alpha&chamber=%s" % chamber_name)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for link in page.xpath("//div[@id='col2']/p/a"):
                name = link.text.strip()

                details = link.getnext().text.strip()

                party = details.split(',')[0]
                if party == 'Democrat':
                    party = 'Democratic'

                district = re.search(r'District (\d+)', details).group(1)
                district = district.lstrip('0')

                leg = Legislator(term, chamber, district, name,
                                 '', '', '', party)
                leg.add_source(url)

                self.save_legislator(leg)
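
re.search pulls the district number out of the details string with a capture group, and lstrip('0') then removes leading zeros; note that str.lstrip treats its argument as a set of characters to strip, not a literal prefix. For example:

import re

details = 'Democratic, District 007'
district = re.search(r'District (\d+)', details).group(1).lstrip('0')
assert district == '7'
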
Example #24
    def scrape_reps(self, chamber, term):
        # There are 99 House districts
        for district in xrange(1, 100):
            rep_url = ('http://www.house.state.oh.us/components/'
                       'com_displaymembers/page.php?district=%d' % district)

            with self.urlopen(rep_url) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                for el in root.xpath('//table[@class="page"]'):
                    rep_link = el.xpath('tr/td/title')[0]
                    full_name = rep_link.text
                    party = full_name[-2]
                    full_name = full_name[0:-3]

                    if party == "D":
                        party = "Democratic"
                    elif party == "R":
                        party = "Republican"

                    leg = Legislator(term, chamber, str(district),
                                     full_name, '', '', '', party)
                    leg.add_source(rep_url)
                    # save inside the loop so every matched representative is kept
                    self.save_legislator(leg)
Example #25
    def scrape_house(self, term):
        url = 'http://www.house.leg.state.mn.us/members/housemembers.asp'
        office_addr = ''' State Office Building,
100 Rev. Dr. Martin Luther King Jr. Blvd.
Saint Paul, Minnesota 55155'''

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # skip first header row
            for row in doc.xpath('//tr')[1:]:
                tds = [td.text_content().strip() for td in row.xpath('td')]
                if len(tds) == 5:
                    district = tds[0]
                    name, party = tds[1].rsplit(' ', 1)
                    if party == '(R)':
                        party = 'Republican'
                    elif party == '(DFL)':
                        party = 'Democratic-Farmer-Labor'
                    addr = tds[2] + office_addr
                    phone = tds[3]
                    email = tds[4]

                    leg = Legislator(term, 'lower', district, name,
                                     party=party, office_address=addr,
                                     office_phone=phone, email=email)
                    leg.add_source(url)
                    self.save_legislator(leg)
Example #26
    def scrape(self, chamber, term):
        # Pennsylvania doesn't make member lists easily available
        # for previous sessions, unfortunately
        if term != '2009-2010':
            raise NoDataForPeriod(term)

        leg_list_url = legislators_url(chamber)

        with self.urlopen(leg_list_url) as page:
            page = lxml.html.fromstring(page)

            for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
                full_name = link.text[0:-4]
                district = re.search(r"District (\d+)", link.tail).group(1)

                party = link.text[-2]
                if party == 'R':
                    party = 'Republican'
                elif party == 'D':
                    party = 'Democratic'

                legislator = Legislator(term, chamber, district,
                                        full_name, party=party)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
Example #27
    def scrape_pre_58_legislators(self, chamber, term, suffix):
        url = 'http://leg.mt.gov/css/Sessions/%s%s/legname.asp' % (term, suffix)
        legislator_page = ElementTree(lxml.html.fromstring(self.urlopen(url)))

        if term == '57':
            if chamber == 'upper':
                tableName = '57th Legislatore Roster Senate (2001-2002)'
                startRow = 3
            else:
                tableName = '57th Legislator Roster (House)(2001-2002)'
                startRow = 5
        elif term == '56':
            if chamber == 'upper':
                tableName = 'Members of the Senate'
                startRow = 3
            else:
                tableName = 'Members of the House'
                startRow = 5

        for table in legislator_page.xpath("//table"):
            if table.attrib.get('name') == tableName:
                parse_names = False
                for row in table.getchildren():
                    if row.tag != 'tr':
                        continue
                    celldata = row.getchildren()[0].text_content().strip()
                    if parse_names and len(celldata) != 0:
                        name, party_letter = celldata.rsplit(' (', 1)
                        party_letter = party_letter[0]

                        nameParts = [namePart.strip() for namePart in name.split(',')]
                        assert len(nameParts) < 4
                        if len(nameParts) == 2:
                            last_name, first_name = nameParts
                        elif len(nameParts) == 3:
                            last_name = ' '.join(nameParts[0:2])
                            first_name = nameParts[2]
                        else:
                            # single-token name; treat it as the last name
                            first_name, last_name = '', nameParts[0]

                        district = row.getchildren()[2].text_content().strip()

                        if party_letter == 'R':
                            party = 'Republican'
                        elif party_letter == 'D':
                            party = 'Democrat'
                        else:
                            party = party_letter

                        legislator = Legislator(term, chamber, district, '%s %s' % (first_name, last_name), \
                                                first_name, last_name, '', party)
                        legislator.add_source(url)
                        self.save_legislator(legislator)

                    if celldata == "Name (Party)":
                        # The table headers seem to vary in size, but the last row
                        # always seems to start with 'Name (Party)' -- once we find
                        # that, start parsing legislator names
                        parse_names = True
Example #28
    def scrape(self, chamber, term):
        mtype = {'upper':'senator', 'lower': 'representative'}[chamber]

        extra_fields = {
            'phone':
                './phone-numbers/phone-number[@title="Capitol Phone"]/@number',
            'district_phone':
                './phone-numbers/phone-number[@title="District Phone"]/@number'
        }

        addr_fields = {
            'capitol_address':
                './addresses/address[@title="Capitol Address"]',
            'district_address':
                './addresses/address[@title="District Office Address"]',
        }

        party_map = {'DEM': 'Democratic', 'REP': 'Republican'}

        with self.urlopen('http://www.leg.state.or.us/xml/members.xml') as html:
            doc = lxml.html.fromstring(html)

            for member in doc.xpath('//member[@member-type="%s"]' % mtype):
                first_name = member.get('first-name')
                last_name = member.get('last-name')
                party = party_map[member.get('party')]

                # extra_fields
                extra_dict = {}
                for name, xpath in extra_fields.iteritems():
                    result = member.xpath(xpath)
                    if result:
                        extra_dict[name] = result[0]

                # address fields
                for name, xpath in addr_fields.iteritems():
                    result = member.xpath(xpath)
                    if result:
                        # read the attributes off the matched <address> element
                        addr = result[0]
                        extra_dict[name] = '%s %s, %s %s' % (
                            addr.get('street-address'),
                            addr.get('city'),
                            addr.get('state'),
                            addr.get('postal-code'))


                leg = Legislator(term, chamber, member.get('district-number'),
                                 full_name=first_name+' '+last_name,
                                 first_name=first_name,
                                 last_name=last_name,
                                 middle_name=member.get('middle-initial'),
                                 party=party,
                                 email=member.get('e-mail'),
                                 website=member.get('website'),
                                 oregon_member_id=member.get('leg-member-id'),
                                 **extra_dict)
                leg.add_source('http://www.leg.state.or.us/xml/members.xml')


                self.save_legislator(leg)
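
The per-member field lookups above rely on XPath attribute predicates ([@title="..."]/@number), which return a list of matching attribute values. A self-contained sketch of the same pattern against an inline document (the XML shape is an assumption, inferred from the xpaths in the example):

import lxml.etree

member = lxml.etree.fromstring(
    '<member>'
    '<phone-numbers>'
    '<phone-number title="Capitol Phone" number="555-0100"/>'
    '</phone-numbers>'
    '</member>')

result = member.xpath('./phone-numbers/phone-number[@title="Capitol Phone"]/@number')
assert result == ['555-0100']
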
Example #29
    def scrape(self, chamber, year):
        if year != '2009':
            raise NoDataForYear(year)
        term = "%s-%d" % (year, int(year) + 1)

        # What Vermont claims are Word and Excel files are actually
        # just HTML tables
        # What Vermont claims is a CSV file is actually one row of comma
        # separated values followed by a ColdFusion error.
        leg_url = "http://www.leg.state.vt.us/legdir/"\
            "memberdata.cfm/memberdata.doc?FileType=W"
        leg_table = BeautifulSoup(self.urlopen(leg_url))

        for tr in leg_table.findAll('tr')[1:]:
            leg_cham = tr.findAll('td')[3].contents[0]
            if leg_cham == 'H' and chamber == 'upper':
                continue
            if leg_cham == 'S' and chamber == 'lower':
                continue

            district = tr.findAll('td')[5].contents[0]
            district = district.replace(' District', '').strip()
            first = tr.findAll('td')[6].contents[0]

            middle = tr.findAll('td')[7]
            if len(middle.contents) == 0:
                middle = ''
            else:
                middle = middle.contents[0].strip()

            last = tr.findAll('td')[8].contents[0]

            if len(middle) == 0:
                full = "%s, %s" % (last, first)
            else:
                full = "%s, %s %s." % (last, first, middle)

            official_email = tr.findAll('td')[9]
            if len(official_email.contents) == 0:
                official_email = ''
            else:
                official_email = official_email.contents[0]

            party = tr.findAll('td')[4].contents[0]
            if party == 'D':
                party = 'Democrat'
            elif party == 'R':
                party = 'Republican'
            elif party == 'I':
                party = 'Independent'
            elif party == 'P':
                party = 'Progressive'

            leg = Legislator(term, chamber, district, full,
                             first, last, middle, party,
                             official_email=official_email)
            leg.add_source(leg_url)
            self.save_legislator(leg)
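
The cell reads in this example use BeautifulSoup's findAll('td') plus .contents[0], the first child node of a tag, which for a plain text cell is its string. A tiny sketch of that access pattern (BeautifulSoup 3 style, matching the imports these snippets assume):

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3

row = BeautifulSoup('<tr><td>Smith</td><td>D</td></tr>').tr
party = row.findAll('td')[1].contents[0]
assert party == 'D'
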
Example #30
    def scrape(self, chamber, year):
        if int(year) != 2009:
            return
        session = "%s-%d" % (year, int(year) + 1)

        # What Vermont claims are Word and Excel files are actually
        # just HTML tables
        # What Vermont claims is a CSV file is actually one row of comma
        # separated values followed by a ColdFusion error.
        leg_url = "http://www.leg.state.vt.us/legdir/memberdata.cfm/memberdata.doc?FileType=W"
        leg_table = BeautifulSoup(self.urlopen(leg_url))

        for tr in leg_table.findAll("tr")[1:]:
            leg_cham = tr.findAll("td")[3].contents[0]
            if leg_cham == "H" and chamber == "upper":
                continue
            if leg_cham == "S" and chamber == "lower":
                continue

            district = tr.findAll("td")[5].contents[0]
            district = district.replace(" District", "").strip()
            first = tr.findAll("td")[6].contents[0]

            middle = tr.findAll("td")[7]
            if len(middle.contents) == 0:
                middle = ""
            else:
                middle = middle.contents[0].strip()

            last = tr.findAll("td")[8].contents[0]

            if len(middle) == 0:
                full = "%s, %s" % (last, first)
            else:
                full = "%s, %s %s." % (last, first, middle)

            official_email = tr.findAll("td")[9]
            if len(official_email.contents) == 0:
                official_email = ""
            else:
                official_email = official_email.contents[0]

            party = tr.findAll("td")[4].contents[0]
            if party == "D":
                party = "Democrat"
            elif party == "R":
                party = "Republican"
            elif party == "I":
                party = "Independent"
            elif party == "P":
                party = "Progressive"

            leg = Legislator(
                session, chamber, district, full, first, last, middle, party, official_email=official_email
            )
            leg.add_source(leg_url)
            self.save_legislator(leg)