Example #1
    def scrape(self, chamber, term):
        # Pennsylvania doesn't make member lists easily available
        # for previous sessions, unfortunately
        self.validate_term(term, latest_only=True)

        leg_list_url = legislators_url(chamber)

        with self.urlopen(leg_list_url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(leg_list_url)

            for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
                full_name = link.text
                district = link.getparent().getnext().tail.strip()
                district = re.search("District (\d+)", district).group(1)

                party = link.text[-2]
                if party == 'R':
                    party = 'Republican'
                elif party == 'D':
                    party = 'Democratic'

                url = link.get('href')

                legislator = Legislator(term, chamber, district,
                                        full_name, party=party, url=url)
                legislator.add_source(leg_list_url)
                self.save_legislator(legislator)
Example #2
    def scrape_legislator(self, chamber, term, name, url):
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

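        # headline text starts "DISTRICT NN"; take the number and strip leading zeros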
        district = page.xpath(
            '//h1[contains(., "DISTRICT")]/text()'
        ).pop().split()[1].strip().lstrip('0')

        party = page.xpath('//h2').pop().text_content()
        party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)

        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        elif party == 'I':
            party = 'Independent'

        photo_url = page.xpath(
            "//img[contains(@src, 'images/members/')]")[0].attrib['src']


        leg = Legislator(term, chamber, district, name, party=party,
                         photo_url=photo_url, url=url)
        leg.add_source(url)
        self.scrape_offices(leg, page)
        self.save_legislator(leg)
Example #3
    def scrape_upper(self, chamber, term):
        url = 'http://www.senate.michigan.gov/members/memberlist.htm'
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
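        # rows 1-38 hold the 38 senators; row 0 is the header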
        for row in doc.xpath('//table[@width=550]/tr')[1:39]:
            # party, dist, member, office_phone, office_fax, office_loc
            party, dist, member, phone, fax, loc = row.getchildren()
            party = abbr[party.text]
            district = dist.text_content().strip()
            name = member.text_content().strip()
            if name == 'Vacant':
                self.info('district %s is vacant', district)
                continue
            leg_url = member.xpath('a/@href')[0]
            office_phone = phone.text
            office_fax = fax.text
            office_loc = loc.text
            leg = Legislator(term=term, chamber=chamber,
                             district=district,
                             full_name=name,
                             party=party,
                             url=leg_url)

            leg.add_office('capitol', 'Capitol Office',
                           address=office_loc,
                           fax=office_fax,
                           phone=office_phone)


            leg.add_source(url)
            self.save_legislator(leg)
Example #4
    def scrape(self, chamber, term):
        term_slug = term[:-2]
        url = MEMBER_LIST_URL[chamber] % term_slug

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

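        # assumes the member list is the fifth table on the page, with two header rows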
        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name, _, _, district, party = row.xpath('td')
            district = district.text
            party = {'D':'Democratic', 'R': 'Republican',
                     'I': 'Independent'}[party.text]
            leg_url = name.xpath('a/@href')[0]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                continue

            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]

            leg = Legislator(term, chamber, district, name, party=party,
                             url=leg_url, photo_url=photo_url)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Example #5
  def scrape(self, chamber, term):
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
      url = ('http://www.rilin.state.ri.us/Documents/Senators.xls')
      rep_type = 'Senator '
    elif chamber == 'lower':
      url = ('http://www.rilin.state.ri.us/Documents/Representatives.xls')
      rep_type = 'Representative '

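    # write the downloaded spreadsheet to disk so xlrd can open it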
    with self.urlopen(url) as senator_xls:
      with open('ri_senate.xls', 'w') as f:
        f.write(senator_xls)

    wb = xlrd.open_workbook('ri_senate.xls')
    sh = wb.sheet_by_index(0)

    for rownum in xrange(1, sh.nrows):
      d = {}
      for field, col_num in excel_mapping.iteritems():
        d[field] = str(sh.cell(rownum, col_num).value)
      district_name = "District " + d['district']
      full_name = re.sub(rep_type, '', d['full_name']).strip()
      leg = Legislator(term, chamber, district_name, full_name,
                       '', '', '',
                       d['party'], 
                       office_address=d['address'],
                       town_represented=d['town_represented'],
                       email=d['email'])
      leg.add_source(url)

      self.save_legislator(leg)
Example #6
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            with self.urlopen(url) as details_page:
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                party = root.xpath('string(//party)')
                district = root.xpath('string(//district)')
                first_name, middle_name, last_name = "", "", ""

                home_phone = root.xpath('string(//h_phone)')
                bis_phone = root.xpath('string(//b_phone)')
                capital_phone = root.xpath('string(//cap_phone)')
                other_phone = root.xpath('string(//oth_phone)')
                org_info = root.xpath('string(//org_info)')
                email_name = root.xpath('string(//email_address)')
                # chamber is 'upper'/'lower'; map it to the mail domain (cf. Example #7)
                email = '%s@%s.ms.gov' % (
                    email_name, {'upper': 'senate', 'lower': 'house'}[chamber])
                if party == 'D':
                    party = 'Democratic'
                else:
                    party = 'Republican'

                leg = Legislator(term, chamber, district, leg_name, first_name,
                                 last_name, middle_name, party, role=role,
                                 home_phone = home_phone, bis_phone=bis_phone,
                                 capital_phone=capital_phone,
                                 other_phone=other_phone, org_info=org_info,
                                 email=email, url=url)
                leg.add_source(url)
                self.save_legislator(leg)
        except scrapelib.HTTPError, e:
            self.warning(str(e))
Example #7
    def scrape_details(self, chamber, term, leg_name, leg_link, role):
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.urlopen(url)
            root = lxml.etree.fromstring(details_page.bytes)
            party = root.xpath('string(//PARTY)')
            district = root.xpath('string(//DISTRICT)')
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')
            bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)')
            cap_room = root.xpath('string(//CAP_ROOM)')

            if party == 'D':
                party = 'Democratic'
            else:
                party = 'Republican'

            leg = Legislator(term, chamber, district, leg_name,
                             party=party,
                             role=role,
                             org_info=org_info,
                             url=url,
                             photo_url=photo)
            leg.add_source(url)

            kwargs = {}

            if email_name.strip() != "":
                email = '%s@%s.ms.gov' % (email_name, {
                    "upper": "senate",
                    "lower": "house"
                }[chamber])
                kwargs['email'] = email

            if capital_phone != "":
                kwargs['phone'] = capital_phone

            if cap_room != "":
                kwargs["address"] = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                kwargs['address'] = CAP_ADDRESS

            leg.add_office('capitol',
                           'Capitol Office',
                           **kwargs)

            self.save_legislator(leg)
        except scrapelib.HTTPError, e:
            self.warning(str(e))
Example #8
    def scrape_reps(self, chamber, term):
        # There are 99 House districts
        for district in xrange(1, 100):
            rep_url = "http://www.house.state.oh.us/components/" "com_displaymembers/page.php?district=%d" % district

            with self.urlopen(rep_url) as page:
                page = lxml.html.fromstring(page)

                for el in page.xpath('//table[@class="page"]'):
                    rep_link = el.xpath("tr/td/title")[0]
                    full_name = rep_link.text
                    party = full_name[-2]
                    full_name = full_name[0:-3]

                    if full_name == "Vacant Posit":
                        continue

                    if party == "D":
                        party = "Democratic"
                    elif party == "R":
                        party = "Republican"

                    leg = Legislator(term, chamber, str(district), full_name, party=party, url=rep_url)
                    leg.add_source(rep_url)

                    self.save_legislator(leg)
Example #9
    def scrape_upper(self, term):
        url = "http://www.nysenate.gov/senators"
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        xpath = (
            '//div[contains(@class, "views-row")]/'
            'div[contains(@class, "last-name")]/'
            'span[contains(@class, "field-content")]/a')
        for link in page.xpath(xpath):
            if link.text in (None, 'Contact', 'RSS'):
                continue
            name = link.text.strip()

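            # the "District NN" label sits three ancestors up, in the third div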
            district = link.xpath("string(../../../div[3]/span[1])")
            district = re.match(r"District (\d+)", district).group(1)

            photo_link = link.xpath("../../../div[1]/span/a/img")[0]
            photo_url = photo_link.attrib['src']

            legislator = Legislator(term, 'upper', district,
                                    name, party="Unknown",
                                    photo_url=photo_url)
            legislator.add_source(url)

            contact_link = link.xpath("../span[@class = 'contact']/a")[0]
            contact_url = contact_link.attrib['href']
            self.scrape_upper_offices(legislator, contact_url)

            legislator['url'] = contact_url.replace('/contact', '')

            self.save_legislator(legislator)
Example #10
    def scrape_2011Leg(self, chamber, term, url):
        """2011 Scraper for legislators"""
        parties = {'(D)': 'Democratic', '(R)': 'Republican'}
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)
            table = page.xpath('//table[contains(@id, "GridView1")]')[0]
            for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
                params = {}
                district = row.xpath('td/span[contains(@id, "LabelDistrict")]/font')[0].text
                last_name_a = row.xpath('td/a[contains(@id, "HyperLinkLast")]')[0]
                member_url = last_name_a.get('href')
                last_name = last_name_a.text_content().strip()
                first_names = row.xpath('td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
                first_name = first_names.split()[0]
                middle_name = ' '.join(first_names.split()[1:])
                party = row.xpath('td/span[contains(@id, "LabelParty")]/font')[0].text
                party = parties[party]
                params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                    " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
                params['photo_url'] = row.xpath('td/a[contains(@id, "HyperLinkChairJPG")]/img')[0].attrib['src']
                params['email'] = row.xpath('td/a[contains(@id, "HyperLinkEmail")]')[0].text
                params['phone'] = row.xpath('td/span[contains(@id, "LabelPhone2")]')[0].text

                full_name = first_names + " " + last_name
                leg = Legislator(term, chamber, district, full_name,
                                 first_name, last_name, middle_name, party,
                                 url=member_url, **params)
                leg.add_source(url)
                self.save_legislator(leg)
Example #11
 def _scrape_speaker_of_the_house(self, url, term, chamber):
     """The speaker of the house has a special page, because he is just OH so special</sarcasm>
     
     Main page url like: http://www1.legis.ga.gov/legis/2011_12/house/speaker/index.htm
     but need to scrape: http://www1.legis.ga.gov/legis/2011_12/house/speaker/bio.html
     """
     if url.endswith("index.htm"):
         url = url.replace("index.htm", "bio.html")
     with self.lxml_context(url) as page:
         path = '//div[@id="title"]'
         speaker_info_div = page.xpath(path)
         if speaker_info_div and len(speaker_info_div) == 1:
             # This isn't exactly great but it's the best/quickest solution for now
             speaker_info = speaker_info_div[0].text_content().split()
             name = speaker_info[2] + " " + speaker_info[3]
             party = None
             if "R-" in speaker_info[4]:
                 party = "Republican"
             elif "D-" in speaker_info[4]:
                 party = "Democrat"
             elif "I-" in speaker_info[4]:
                 party = "Independent"
             
             district = None
             if "district" in speaker_info[6].lower():
                 district = speaker_info[7].strip(")")
             
             legislator = Legislator(term,
                                     chamber,
                                     district,
                                     name,
                                     party=party)
             legislator.add_source(url)
             return legislator
Example #12
    def scrape_rep(self, name, term, url):
        # special case names that confuses name_tools
        if name == "Franklin, A.B.":
            name = "Franklin, A. B."
        elif ", Jr., " in name:
            name = name.replace(", Jr., ", " ")
            name += ", Jr."
        elif ", III, " in name:
            name = name.replace(", III, ", " ")
            name += ", III"

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            district = page.xpath("//a[contains(@href, 'Maps')]")[0].attrib["href"]
            district = re.search("district(\d+).pdf", district).group(1)

            if "Democrat&nbsp;District" in text:
                party = "Democratic"
            elif "Republican&nbsp;District" in text:
                party = "Republican"
            elif "Independent&nbsp;District" in text:
                party = "Independent"
            else:
                party = "Other"

            leg = Legislator(term, "lower", district, name, party=party)
            leg.add_source(url)
            self.save_legislator(leg)
Example #13
    def scrape_senator(self, name, term, url):
        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            district = page.xpath(
                "string(//*[starts-with(text(), 'Senator ')])")

            district = re.search(r'District (\d+)', district).group(1)

            try:
                party = page.xpath(
                    "//b[contains(text(), 'Party')]")[0].getnext().tail
                party = party.strip()
            except IndexError:
                party = 'N/A'

            if party == 'No Party (Independent)':
                party = 'Independent'
            elif party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, 'upper', district, name, party=party,
                             url=url)
            leg.add_source(url)
            self.save_legislator(leg)
Example #14
    def scrape_member(self, chamber, term, member_url):
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)
            root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@src')[0]
        full_name = root.xpath('//div[starts-with(@class,"bioPicContainer")]/img/@alt')[0]

        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:','')

        district = root.xpath('//div[@id="District"]//div[starts-with(@class,"widgetContent")]')
        if len(district):
            district = district[0].text.strip()
            district = clean_district(district)

        party = root.xpath('//span[@class="legislatorAffiliation"]/text()')[0]

        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            party = 'Other'

        leg = Legislator(term, chamber, district, full_name, party=party,
                         photo_url=photo_url, url=member_url, email=email)
        leg.add_source(member_url)

        self.save_legislator(leg)
Example #15
    def scrape(self, chamber, term):
        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/html-pages/housemembers.html'
        else:
            url = 'http://www.scstatehouse.gov/html-pages/senatemembersd.html'

        with self.urlopen(url) as data:
            doc = lxml.html.fromstring(data)
            rows = doc.xpath('//pre/div[@class="sansSerifNormal"]')

            for row in rows:
                member_a = row.xpath('a')[0]

                name_party = member_a.text_content()
                if name_party.find('[D]') != -1:
                    party = 'Democratic'
                    full_name = name_party.partition('[D]')[0].strip()
                elif name_party.find('[R]') != -1:
                    party = 'Republican'
                    full_name = name_party.partition('[R]')[0].strip()

                photo_url = 'http://www.scstatehouse.gov/members/gif/' + re.search('(\d+)\.html', member_a.attrib['href']).group(1) + '.jpg'

                other_data = row.text_content().encode('ascii', 'ignore')
                od_result = re.search('^.+District (\d+) - (.+)Count.+$', other_data)
                district = od_result.group(1)

                contentb = re.search('^.+\(C\) (.+,.*\d+).*Bus. (\(\d+\) \d+-\d+).+$', other_data)
                office_address = office_phone = None
                if contentb is not None:
                    office_address = contentb.group(1)
                    office_phone = contentb.group(2)

                legislator = Legislator(term, chamber, district, full_name,
                                        party=party, photo_url=photo_url,
                                        office_address=office_address,
                                        office_phone=office_phone)
                legislator.add_source(url)
                self.save_legislator(legislator)
Example #16
    def scrape_upper(self, term):
        url = "http://oksenate.gov/Senators/Default.aspx"
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for a in doc.xpath('//table[@summary]')[1].xpath('.//td//a[contains(@href, "biographies")]'):
            name, party = a.text.rsplit(None, 1)

            if party == '(D)':
                party = 'Democratic'
            elif party == '(R)':
                party = 'Republican'

            tail = a.xpath('..')[0].tail
            if tail:
                district = tail.split()[1]
            else:
                district = a.xpath('../../span')[1].text.split()[1]
            url = a.get('href')

            leg = Legislator(term, 'upper', district, name, party=party, url=url)
            leg.add_source(url)
            self.scrape_upper_offices(leg, url)
            self.save_legislator(leg)
Example #17
    def scrape_lower(self, term):
        url = "http://assembly.state.ny.us/mem/?sh=email"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

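            # member links and mailto links appear in the same order, so zip() pairs them up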
            for link, email in zip(
                page.xpath("//a[contains(@href, '/mem/')]"), page.xpath("//a[contains(@href, 'mailto')]")
            ):
                name = link.text.strip()
                if name == "Assembly Members":
                    continue
                # empty seats
                if "Assembly District" in name:
                    continue
                leg_url = link.get("href")

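                # the district text ends in an ordinal ("1st", "22nd"); rstrip drops those suffix letters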
                district = link.xpath("string(../following-sibling::" "div[@class = 'email2'][1])")
                district = district.rstrip("rthnds")

                legislator = Legislator(term, "lower", district, name, party="Unknown", url=leg_url)
                legislator.add_source(url)

                email = email.text_content().strip()
                if email:
                    legislator["email"] = email
                self.save_legislator(legislator)
Example #18
    def scrape_senate(self, term):
        urls = (
         'http://www.senadopr.us/senadores/Pages/Senadores%20Acumulacion.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
         'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx')

        for counter, url in enumerate(urls):
            leg_page_html = self.urlopen(url)
            doc = lxml.html.fromstring(leg_page_html)
            doc.make_links_absolute(url)
            table = doc.xpath('//table[@summary="Listado de Senadores"]')[0]

            # skip first row
            for row in table.xpath('tr')[1:]:
                tds = row.xpath('td')

                name = tds[0].text_content().title().replace('Hon.','',1).strip()
                party = tds[1].text_content()
                phone = tds[2].text_content()
                email = tds[3].text_content()
                #shapefiles denote 0 as At-Large Districts
                if counter == 0:
                    district = 'At-Large'
                else:
                    district = str(counter)

                #Code to guess the picture
                namefixed = unicode(name.replace(".",". "))  #Those middle names abbreviations are sometimes weird.
                namefixed = unicodedata.normalize('NFKD', namefixed).encode('ascii', 'ignore') #Remove the accents
                nameparts = namefixed.split()
                if nameparts[1].endswith('.'):
                    lastname = nameparts[2]
                else:
                    lastname = nameparts[1]

                # Construct the photo url
                picture_filename = 'http://www.senadopr.us/Fotos%20Senadores/sen_' + (nameparts[0][0] + lastname).lower() + '.jpg'

                try:
                    picture_data = self.urlopen(picture_filename)  # Checking to see if the file is there
                    leg = Legislator(term, 'upper', district, name,
                                     party=party,
                                     email=email, url=url,
                                     photo_url=picture_filename)

                except scrapelib.HTTPError:         # If not, leave out the photo_url
                    leg = Legislator(term, 'upper', district, name,
                                     party=party, phone=phone, email=email,
                                     url=url)

                leg.add_office('capitol', 'Oficina del Capitolio',
                               phone=phone)
                leg.add_source(url)
                self.save_legislator(leg)
Example #19
    def scrape_upper(self, term):
        url = "http://www.nysenate.gov/senators"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath('//a[contains(@href, "/senator/")]'):
                if link.text in (None, "Contact", "RSS"):
                    continue
                name = link.text.strip()

                district = link.xpath("string(../../../div[3]/span[1])")
                district = re.match(r"District (\d+)", district).group(1)

                photo_link = link.xpath("../../../div[1]/span/a/img")[0]
                photo_url = photo_link.attrib["src"]

                legislator = Legislator(term, "upper", district, name, party="Unknown", photo_url=photo_url)
                legislator.add_source(url)

                contact_link = link.xpath("../span[@class = 'contact']/a")[0]
                contact_url = contact_link.attrib["href"]
                self.scrape_upper_contact_info(legislator, contact_url)

                legislator["url"] = contact_url.replace("/contact", "")

                self.save_legislator(legislator)
Example #20
    def scrape_rep(self, name, term, url):
        # special case names that confuses name_tools
        if name == 'Franklin, A.B.':
            name = 'Franklin, A. B.'
        elif ', Jr., ' in name:
            name = name.replace(', Jr., ', ' ')
            name += ', Jr.'
        elif ', III, ' in name:
            name = name.replace(', III, ', ' ')
            name += ', III'

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            district = page.xpath(
                "//a[contains(@href, 'district')]")[0].attrib['href']
            district = re.search("district(\d+).pdf", district).group(1)

            if "Democrat&nbsp;District" in text:
                party = "Democratic"
            elif "Republican&nbsp;District" in text:
                party = "Republican"
            elif "Independent&nbsp;District" in text:
                party = "Independent"
            else:
                party = "Other"

            leg = Legislator(term, 'lower', district, name, party=party,
                             url=url)
            leg.add_source(url)
            self.save_legislator(leg)
Example #21
    def scrape(self, chamber, term):
        self.validate_term(term)

        if chamber == 'upper':
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
        else:
            url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"

        with self.urlopen(url) as body:
            page = lxml.html.fromstring(body)

            for row in page.cssselect("#ctl00_C_dgLegData tr"):
                if len(row.cssselect("td a")) > 0:
                    rep_url = list(row)[0].cssselect("a[href]")[0].get("href")
                    rep_url = 'http://legis.wi.gov/w3asp/contact/' + rep_url

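                    # cell text reads like "Last, First (P)"; capture the name and one-letter party code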
                    legpart = re.findall(r'([\w\-\,\s\.]+)\s+\(([\w])\)', list(row)[0].text_content())
                    if legpart:
                        full_name, party = legpart[0]

                        # skip if the legislator is vacant (occurred in 2011 session)
                        if full_name == 'Vacant':
                            continue

                        party = PARTY_DICT[party]

                        district = str(int(list(row)[2].text_content()))

                        leg = Legislator(term, chamber, district, full_name,
                                         party=party, url=rep_url)
                        leg.add_source(rep_url)

                        leg = self.add_committees(leg, rep_url, term, chamber)
                        self.save_legislator(leg)
Example #22
    def scrape_lower(self, term):
        url = "http://le.utah.gov/house2/representatives.jsp"
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath("//tr")[1:]:
            tds = row.xpath("td")

            district = tds[0].text_content()
            if tds[1].text_content() == "Empty":
                self.log("district %s is empty" % district)
                continue
            a = tds[1].xpath("a")[0]
            name = a.text_content()
            leg_url = a.get("href")

            party = tds[2].text_content()
            if party == "D":
                party = "Democratic"
            elif party == "R":
                party = "Republican"
            else:
                raise ValueError("unknown party")

            # get photo
            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//img[@alt="photo"]/@src')[0]

            leg = Legislator(term, "lower", district, name, party=party, photo_url=photo_url, url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Example #23
    def scrape_legislator_data(self, url, chamber):
        party_fulls = {'R' : 'Republican', 'D' : 'Democrat'}
        with self.urlopen(url) as page:
            page = BeautifulSoup(page)
            for data in page.find('table', id = 'ctl00_mainCopy_DataList1')('td'):
                spans = data('span')
                if len(spans) == 0:
                    self.debug('Found an empty cell in %s. Continuing' % url)
                    continue
                full_name = ' '.join([span.string.strip() for span in spans])
                if len(spans[0].string.strip().split()) == 2:
                    first_name, middle_name = spans[0].string.strip().split()
                else:
                    first_name, middle_name = spans[0].string.strip(), ''
                last_name = spans[1].string.strip()

                details_url = get_abs_url(url, data.find('a')['href'])
                with self.urlopen(details_url) as details:
                    details = BeautifulSoup(details)
                    district = details.find('a', id = 'ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                    party = party_fulls[details.find('span', id = 'ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                    leg = Legislator('2010', chamber, district, full_name, first_name, 
                            last_name, middle_name, party)
                    leg.add_source(details_url)

                    comms_table = details.find('table', id = 'ctl00_mainCopy_MembershipGrid')
                    for comms_raw_data in comms_table('tr')[1:]:
                        comm_data = comms_raw_data('td')
                        comm_role_type = comm_data[0].string.strip()
                        comm_name = comm_data[1]('a')[0].string.strip()
                        leg.add_role(comm_role_type, '2010', chamber = chamber, committee = comm_name)

                    self.save_legislator(leg)
Example #24
 def scrape_2011Leg(self, chamber, term, url):
     """2011 Scraper for legislators"""
     titles = {'lower': 'Representative', 'upper': 'Senator'}
     parties = {'D': 'Democrat', 'R': 'Republican'}
     with self.urlopen(url) as page:
         page = lxml.html.fromstring(page)
         page.make_links_absolute(url)
         table = page.xpath('//table[contains(@id, "GridView1")]')[0]
         for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
             params = {}
             district = row.xpath('td/span[contains(@id, "LabelDis")]/font')[0].text + " " + \
                 row.xpath('td/span[contains(@id, "LabelDistrict2")]/font')[0].text
             # Replace any / in district name to allow json file to save.
             district = district.replace('/', '-')
             params['title'] = titles.get(chamber, '')
             last_name = row.xpath('td/a[contains(@id, "HyperLinkLast")]/font')[0].text.strip()
             first_names = row.xpath('td/span[contains(@id, "LabelFirst")]/font')[0].text.strip()
             first_name = first_names.split()[0]
             middle_name = ' '.join(first_names.split()[1:])
             party = row.xpath('td/span[contains(@id, "LabelParty")]/font')[0].text
             party = party.replace('(', '')
             party = party.replace(')', '')
             party = parties.get(party, '') # Expand party from initial letter.
             params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                 " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
             params['photo_url'] = row.xpath('td/a[contains(@id, "HyperLinkChairJPG")]/img')[0].attrib['src']
             params['email'] = row.xpath('td/a[contains(@id, "HyperLinkEmail")]')[0].text
             params['phone'] = row.xpath('td/span[contains(@id, "LabelPhone2")]')[0].text
             
             full_name = first_names + " " + last_name
             leg = Legislator(term, chamber, district, full_name, 
                     first_name, last_name, middle_name, party, **params)
             leg.add_source(url)
             self.save_legislator(leg)
Example #25
    def scrape_legislator(self, chamber, term, name, url):
        with self.urlopen(url) as page:
            # Alaska fails at unicode, some of the pages have broken
            # characters. They're not in data we care about so just
            # replace them.
            page = page.decode('utf8', 'replace')
            page = lxml.html.fromstring(page)

            name = re.sub(r'\s+', ' ', name)

            info = page.xpath('string(//div[@id = "fullpage"])')

            district = re.search(r'District ([\w\d]+)', info).group(1)
            party = re.search(r'Party: (.+) Toll-Free', info).group(1).strip()
            email = re.search(r'Email: ([\w_]+@legis\.state\.ak\.us)',
                              info).group(1)

            # for consistency
            if party == 'Democrat':
                party = 'Democratic'

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email, url=url)
            leg.add_source(url)

            self.save_legislator(leg)
Example #26
    def scrape(self, chamber, term):
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator '
        elif chamber == 'lower':
            url = (
             'http://webserver.rilin.state.ri.us/Documents/Representatives.xls')
            rep_type = 'Representative '

        self.urlretrieve(url, 'ri_leg.xls')

        wb = xlrd.open_workbook('ri_leg.xls')
        sh = wb.sheet_by_index(0)

        for rownum in xrange(1, sh.nrows):
            d = {}
            for field, col_num in excel_mapping.iteritems():
                d[field] = sh.cell(rownum, col_num).value
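            # xlrd returns numeric cells as floats (e.g. 3.0); normalize to "3"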
            dist = str(int(d['district']))
            district_name = dist
            full_name = re.sub(rep_type, '', d['full_name']).strip()
            translate = {
                "Democrat"    : "Democratic",
                "Republican"  : "Republican",
                "Independent" : "Independent"
            }
            leg = Legislator(term, chamber, district_name, full_name,
                             '', '', '',
                             translate[d['party']],
                             town_represented=d['town_represented'],
                             email=d['email'])
            leg.add_office('district', 'Address', address=d['address'])
            leg.add_source(url)
            self.save_legislator(leg)
Example #27
    def scrape(self, chamber, session):
        url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory( url, chamber, session )

        for person in people_pages:
            district = person
            p_url = people_pages[district]
            metainf = self.process_person( p_url )

            p = Legislator( session, chamber, district, metainf['name'],
                party=metainf['party'],
                # some additional things the website provides:
                occupation=metainf['occupation'],
                photo_url=metainf['photo_url'],
                url=metainf['homepage'])
            if "email" in metainf:
                p['email'] = metainf['email']
            if "number" in metainf:
                p.add_office('capitol', 'Capitol Office',
                             phone=metainf['number'],
                             address='200 E. Colfax\nDenver, CO 80203'
                            )

            p.add_source( p_url )

            if 'ctty' in metainf:
                for ctty in metainf['ctty']:
                    p.add_role( 'committee member',
                        term=session,
                        chamber=chamber,
                        committee=clean_committee(ctty),
                        position="member"
                    )
            self.save_legislator( p )
Example #28
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_chamber_listing_url( chamber ))
        for leg in metainf:
            p = Legislator( session, chamber, leg['district'], leg['name'],
                party=leg['party'],
                # some additional things the website provides:
                photo_url=leg['image'],
                url=leg['homepage'],
                room=leg['room'],
                phone=leg['phone'],
                fax=leg['fax'],
                email=leg['email'],
                address=leg['addr'])

            for source in leg['source']:
                p.add_source( source )

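            # committees named "Joint Legislative ..." belong to the joint chamber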
            try:
                for ctty in leg['ctty']:
                    flag='Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role( 'committee member',
                        term=session,
                        chamber=ctty_chamber,
                        committee=ctty['name'],
                        position="member")
            except KeyError:
                self.log( "XXX: Warning, %s has no scraped Committees" %
                    leg['name'] )

            self.save_legislator( p )
Example #29
    def scrape_lower(self, term):
        url = "http://www.okhouse.gov/Members/Default.aspx"
        page = lxml.html.fromstring(self.urlopen(url))
        page.make_links_absolute(url)

        for tr in page.xpath("//table[@class='rgMasterTable']/tbody/tr")[1:]:
            name = tr.xpath('.//td[1]/a')[0].text.strip()
            district = tr.xpath('.//td[3]')[0].text_content().strip()
            party = tr.xpath('.//td[4]')[0].text_content().strip()
            party = {'R': 'Republican', 'D': 'Democratic'}[party]

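            # member detail pages are keyed by district number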
            leg_url = 'http://www.okhouse.gov/District.aspx?District=' + district
            leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//a[contains(@href, "HiRes")]/@href')[0]

            if name.startswith('House District'):
                self.warning("skipping %s %s" % (name, leg_url))
                continue

            leg = Legislator(term, 'lower', district, name, party=party,
                             photo_url=photo_url, url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            # Scrape offices.
            self.scrape_lower_offices(leg_doc, leg)

            self.save_legislator(leg)
Example #30
    def scrape(self, chamber, session):
        url = self.get_district_list(chamber, session)
        people_pages = self.scrape_directory( url, chamber, session )

        for person in people_pages:
            district = person
            p_url = people_pages[district]
            metainf = self.process_person( p_url )

            p = Legislator( session, chamber, district, metainf['name'],
                party=metainf['party'],
                # some additional things the website provides:
                occupation=metainf['occupation'],
                photo_url=metainf['photo_url'],
                url=metainf['homepage'])

            phone = metainf['number'] if 'number' in metainf else None
            email = metainf['email'] if 'email' in metainf else None
            p.add_office('capitol', 'Capitol Office',
                             phone=phone,
                             address='200 E. Colfax\nDenver, CO 80203',
                             email=email
                            )

            p.add_source( p_url )
            self.save_legislator( p )
Example #31
    def scrape_page(self, chamber, term, url):
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[contains(concat(' ', "
                "normalize-space(@class), ' '), ' memberModule ')]"):

            img = legislator.xpath(
                ".//div[@class='thumbnail']//img")[0].attrib['src']
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()
            homepage = homepage.attrib['href']
            party = data.xpath(
                ".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
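            # remaining text nodes hold the office address; the last one is the phone number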
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)
            office = "\n".join(office_lines)
            h3 = data.xpath("./h3")
            if len(h3):
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District",
                                                            "").strip()
            else:
                district = re.findall("\d+\.png",
                                      legislator.attrib['style'])[-1].split(
                                          ".", 1)[0]

            full_name = re.sub("\s+", " ", full_name).strip()
            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             party=party,
                             url=homepage,
                             photo_url=img)

            leg.add_office('capitol',
                           'Capitol Office',
                           address=office,
                           phone=phone)

            self.scrape_homepage(leg, chamber, homepage, term)

            leg.add_source(url)
            self.save_legislator(leg)
Example #32
    def scrape(self, chamber, session):
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            p = Legislator( session, chamber, leg['district'], leg['name'],
                party=leg['party'],
                # some additional things the website provides:
                photo_url=leg['image'],
                url=leg['homepage'],
                email=leg['email'])
            p.add_office('capitol', 'Capitol Office', address=leg['addr'],
                         phone=leg['phone'], fax=leg['fax'] or None)

            for source in leg['source']:
                p.add_source( source )

            try:
                for ctty in leg['ctty']:
                    flag='Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    p.add_role( 'committee member',
                        term=session,
                        chamber=ctty_chamber,
                        committee=ctty['name'],
                        position="member")
            except KeyError:
                self.log( "XXX: Warning, %s has no scraped Committees" %
                    leg['name'] )

            self.save_legislator( p )
Example #33
    def scrape(self, chamber, term):
        """
        Scrapes legislators for the current term only
        """
        self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub('\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')
            leg_url = inner.xpath('p/a/@href')[0]

            leg = Legislator(term, chamber, district, name, party=party,
                             email=email)

            phones = get_phones(inner)
            leg.add_office('district', 'District Office',
                           address=get_address(inner), fax=get_fax(inner),
                           phone=phones.get('home') or phones.get('business'))
            leg.add_office('capitol', 'Capitol Office', phone=phones.get('office'))

            leg.add_source(url)
            leg['photo_url'] = img_url
            leg['url'] = leg_url

            for com in inner.xpath('p/a[contains(@href, "committees")]'):
                role = com.tail.strip()
                if not role:
                    role = 'member'
                leg.add_role('committee member',
                             term=term,
                             chamber=chamber,
                             committee=com.text,
                             position=role)

            self.save_legislator(leg)
Example #34
    def scrape_senate(self, term):

        index_url = 'http://www.senate.mn/members/index.php'
        doc = lxml.html.fromstring(self.urlopen(index_url))
        doc.make_links_absolute(index_url)

        leg_data = defaultdict(dict)

        # get all the tds in a certain div
        tds = doc.xpath('//div[@id="hide_show_alpha_all"]//td[@style="vertical-align:top;"]')
        for td in tds:
            # each td has 2 <a>s- site & email
            main_link, email = td.xpath('.//a')
            # get name
            name = main_link.text_content().split(' (')[0]
            leg = leg_data[name]
            leg['leg_url'] = main_link.get('href')
            leg['photo_url'] = td.xpath('./preceding-sibling::td/a/img/@src')[0]
            if 'mailto:' in email.get('href'):
                leg['email'] = email.get('href').replace('mailto:', '')

        self.info('collected preliminary data on %s legislators', len(leg_data))
        assert leg_data

        # use CSV for most of data
        csv_url = 'http://www.senate.mn/members/member_list_ascii.php?ls='
        csvfile = self.urlopen(csv_url)

        for row in csv.DictReader(StringIO(csvfile)):
            if not row['First Name']:
                continue
            name = '%s %s' % (row['First Name'], row['Last Name'])
            party = self._parties[row['Party']]
            leg = Legislator(term, 'upper', row['District'].lstrip('0'), name,
                             party=party,
                             first_name=row['First Name'],
                             last_name=row['Last Name'],
                             **leg_data[name]
                            )
            leg.add_office('capitol', 'Capitol Office',
                           address='{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row)
                           )


            leg.add_source(csv_url)
            leg.add_source(index_url)

            self.save_legislator(leg)
Example #35
    def scrape(self, term, chambers):
        year_slug = term[5:]

        # Load all members via the private API
        legislator_dump_url = \
                'http://legislature.vermont.gov/people/loadAll/{}'.\
                format(year_slug)
        json_data = self.urlopen(legislator_dump_url)
        legislators = json.loads(json_data)['data']

        # Parse the information from each legislator
        for info in legislators:
            # Strip whitespace from strings
            info = { k:v.strip() for k, v in info.iteritems() }

            leg = Legislator(
                    term=term,
                    chamber=('upper' if info['Title'] == 'Senator' else 'lower'),
                    district=info['District'].replace(" District", ""),
                    party=info['Party'],
                    email=info['Email'],
                    full_name="{0}{1} {2}".format(
                            info['FirstName'],
                            (" " + info['MI'] if info['MI'] else ""),
                            info['LastName']
                            ),
                    photo_url='http://legislature.vermont.gov/assets/Documents/Legislators/{}.jpg'.format(
                            info['Email'][:-len("@leg.state.vt.us")])
                    )
            leg.add_source(legislator_dump_url)
            leg.add_office(
                    type='district',
                    name='District Office',
                    address="{0}{1}\n{2}, {3} {4}".format(
                            info['MailingAddress1'],
                            ("\n" + info['MailingAddress2']
                                    if info['MailingAddress2']
                                    else ""),
                            info['MailingCity'],
                            info['MailingState'],
                            info['MailingZIP']
                            ),
                    phone=(info['HomePhone'] if info['HomePhone'] else None),
                    email=(info['HomeEmail'] if info['HomeEmail'] else None)
                    )
            self.save_legislator(leg)
Example #36
    def scrape_house(self, term):
        url = 'http://www.house.leg.state.mn.us/members/housemembers.asp'

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # skip first header row
        for row in doc.xpath('//tr')[1:]:
            tds = [td.text_content().strip() for td in row.xpath('td')]
            if len(tds) != 5:
                # some rows are spacers without the full five columns
                continue
            district = tds[0].lstrip('0')
            name, party = tds[1].rsplit(' ', 1)
            if party == '(R)':
                party = 'Republican'
            elif party == '(DFL)':
                party = 'Democratic-Farmer-Labor'
            leg_url = row.xpath('td[2]/p/a/@href')[0]
            addr = tds[2]
            phone = tds[3]
            email = tds[4]

            leg = Legislator(term,
                             'lower',
                             district,
                             name,
                             party=party,
                             email=email,
                             url=leg_url)

            addr = ('{0} State Office Building\n'
                    '100 Rev. Dr. Martin Luther King Jr. Blvd.\n'
                    'St. Paul, MN 55155').format(addr)
            leg.add_office('capitol',
                           'Capitol Office',
                           address=addr,
                           phone=phone)

            # add photo_url
            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            img_src = leg_doc.xpath('//img[contains(@src, "memberimg")]/@src')
            if img_src:
                leg['photo_url'] = img_src[0]

            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Example #37
    def scrape(self, chamber, session):
        # All other years are stored in a pdf
        # http://www.capitol.hawaii.gov/session2009/misc/statehood.pdf
        if year_from_session(session) != 2009:
            raise NoDataForPeriod(session)

        if chamber == 'upper':
            legislators_page_url = BASE_URL + "/site1/info/direct/sendir.asp"
        else:
            legislators_page_url = BASE_URL + "/site1/info/direct/repdir.asp"

        with self.urlopen(legislators_page_url) as legislators_page_html:
            legislators_page = lxml.html.fromstring(legislators_page_html)

            # get all rows (except first) of first table
            legislators_data = legislators_page.xpath('//table[1]/tr')[1:]
            # group legislator data in sets of 3
            legislators_data = grouper(3, legislators_data)

            for name_and_party, district, email in legislators_data:
                element, attribute, link, pos = name_and_party.iterlinks().next()
                source = BASE_URL + link

                name_and_party = name_and_party.cssselect('td')
                name_and_party = name_and_party[0]
                name, sep, party =  name_and_party.text_content().partition("(")
                # remove space at the beginning
                name = name.strip()

                if party == 'R)':
                    party = 'Republican'
                else:
                    party = 'Democratic'

                district = district.cssselect('td')
                district = district[1]
                district = district.text_content()

                email = email.cssselect('a')
                email = email[0]
                email = email.text_content()
                # Remove white space
                email = email.strip()

                leg = Legislator(session, chamber, district, name,
                                 party=party, official_email=email)
                leg.add_source(source)
                self.save_legislator(leg)
Example #38
    def scrape(self, chamber, term):
        urls = {'lower': "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
                'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html"}
        detail_re = re.compile('\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)')

        self.validate_term(term, latest_only=True)

        with self.urlopen(urls[chamber]) as html:
            doc = lxml.html.fromstring(html)

            # data on this page is <li>s that have anchor tags
            for a in doc.cssselect('li a'):
                link = a.get('href')
                # tags don't close so we get the <li> and <a> content and diff them
                name_text = a.text_content()
                detail_text = a.getparent().text_content().replace(name_text, '')

                # ignore if it is not a valid link
                if link:
                    # handle names
                    names = name_text.split(',')
                    last_name = names[0]
                    first_name = names[1].strip()
                    # TODO: try to trim first name to remove middle initial
                    if len(names) > 2:
                        suffixes = names[2]
                    else:
                        suffixes = ''

                    # handle details
                    details = detail_text.strip()
                    party, district = detail_re.match(details).groups()
                    party = PARTY_DICT[party]

                    leg = Legislator(term, chamber, district,
                                     ' '.join((first_name, last_name)),
                                     first_name, last_name, '',
                                     party, suffixes=suffixes)
                    leg_url = BASE_URL+link
                    leg.add_source(url=leg_url)

                    with self.urlopen(leg_url) as leg_html:
                        leg_doc = lxml.html.fromstring(leg_html)
                        img_src = leg_doc.xpath('//img[@align="left"]/@src')
                        if img_src:
                            leg['photo_url'] = BASE_URL + img_src[0]

                    self.save_legislator(leg)
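`BASE_URL` and `PARTY_DICT` are module-level names this example assumes; plausible definitions for the Maryland Manual pages (guesses, not taken from the source) would be:

BASE_URL = 'http://www.msa.md.gov'  # assumed root for the relative links above
PARTY_DICT = {'D': 'Democratic', 'R': 'Republican'}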
Beispiel #39
0
    def scrape_legislators(self, term, chamber, leg_page, member_url,
                           main_url):
        full_name = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[1]/td/h2'
        )[0].text
        if len(full_name.split()) == 3:
            first_name = full_name.split()[1]
            middle_name = ''
            last_name = full_name.split()[2]
            full_name = first_name + ' ' + last_name
        else:
            first_name = full_name.split()[1]
            middle_name = full_name.split()[2]
            last_name = full_name.split()[3]
            full_name = first_name + ' ' + middle_name + ' ' + last_name
        district = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[5]/td[2]'
        )[0].text
        party = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[6]/td[2]'
        )[0].text
        full_address = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[2]/td[2]'
        )[0].text
        phone = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[3]/td[2]'
        )[0].text
        email = leg_page.xpath(
            '//div[@class="content"][1]/table[1]//tr[1]/td[2]/table//tr[4]/td[2]/a'
        )[0].text

        if party == 'Democrat':
            party = 'Democratic'
        leg = Legislator(term,
                         chamber,
                         district,
                         full_name,
                         first_name,
                         last_name,
                         middle_name,
                         party,
                         full_address=full_address,
                         phone=phone,
                         email=email,
                         url=member_url)
        leg.add_source(member_url)
        leg.add_source(main_url)
        self.save_legislator(leg)
Beispiel #40
0
    def scrape_member(self, chamber, term, member_url):
        with self.urlopen(member_url) as page:
            root = lxml.html.fromstring(page)

            name_and_party = root.xpath(
                'string(//td[@class="SiteNames"])').split()

            title = name_and_party[0]
            if title == 'Representative':
                chamber = 'lower'
            elif title == 'Senator':
                chamber = 'upper'

            full_name = ' '.join(name_and_party[1:-1])

            party = name_and_party[-1]
            if party == '(R)':
                party = 'Republican'
            elif party == '(D)':
                party = 'Democratic'

            img = root.xpath('//img[@class="SitePhotos"]')[0]
            photo_url = img.attrib['src']

            # Need to figure out a cleaner method for this later
            info_box = root.xpath('string(//table[@class="InfoTable"])')
            district = re.search(r'District(.+)\r', info_box).group(1)

            leg = Legislator(term,
                             chamber,
                             district,
                             full_name,
                             party=party,
                             photo_url=photo_url)
            leg.add_source(member_url)

            try:
                leg['email'] = re.search(r'Email(.+)\r', info_box).group(1)
            except AttributeError:
                pass

            try:
                leg['occupation'] = re.search(r'Occupation(.+)\r',
                                              info_box).group(1)
            except AttributeError:
                pass

            self.save_legislator(leg)
Beispiel #41
0
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        abbr = {'D': 'Democratic', 'R': 'Republican'}

        if chamber == 'lower':
            url = 'http://house.michigan.gov/replist.asp'
            with self.urlopen(url) as html:
                doc = lxml.html.fromstring(html)
                # skip two rows at top
                for row in doc.xpath('//table[@cellspacing=0]/tr')[2:]:
                    tds = [x.text_content().strip() for x in row.xpath('td')]
                    (district, last_name, first_name, party, office, phone,
                     email) = tds
                    leg = Legislator(term=term,
                                     chamber=chamber,
                                     district=str(int(district)),
                                     full_name=first_name + " " + last_name,
                                     first_name=first_name,
                                     last_name=last_name,
                                     party=abbr[party],
                                     office=office,
                                     phone=phone,
                                     email=email)
                    leg.add_source(url)
                    self.save_legislator(leg)
        else:
            url = 'http://www.senate.michigan.gov/members/memberlist.htm'
            with self.urlopen(url) as html:
                doc = lxml.html.fromstring(html)
                for row in doc.xpath('//table[@width=550]/tr')[1:39]:
                    # party, dist, member, office_phone, office_fax, office_loc
                    party = abbr[row.xpath('td[1]/text()')[0]]
                    district = row.xpath('td[2]/a/text()')[0]
                    name = row.xpath('td[3]/a/text()')[0]
                    office_phone = row.xpath('td[4]/text()')[0]
                    office_fax = row.xpath('td[5]/text()')[0]
                    office_loc = row.xpath('td[6]/text()')[0]
                    leg = Legislator(term=term,
                                     chamber=chamber,
                                     district=district,
                                     full_name=name,
                                     party=party,
                                     office_phone=office_phone,
                                     office_fax=office_fax,
                                     office_loc=office_loc)
                    leg.add_source(url)
                    self.save_legislator(leg)
Beispiel #42
0
    def scrape_member(self, chamber, year, member_url):
        member_page = self.urlopen(member_url)
        doc = lxml.html.fromstring(member_page)

        photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
        name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
        full_name = ' '.join(name_pieces[1:-1]).strip()

        party = name_pieces[-1]
        if party == '(R)':
            party = 'Republican'
        elif party == '(D)':
            party = 'Democratic'
        elif party == '(I)':
            party = 'Independent'

        district = doc.xpath(
            '//span[@id="districtHeader"]/text()')[0].split()[-1]

        leg = Legislator(year,
                         chamber,
                         district,
                         full_name,
                         party=party,
                         photo_url=photo_url,
                         url=member_url)
        leg.add_source(member_url)

        address = '\n'.join(
            doc.xpath(
                '//div[@id="FrankfortAddresses"]//span[@class="bioText"]/text()'
            ))
        phone = None
        phone_numbers = doc.xpath(
            '//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()')
        for num in phone_numbers:
            if num.startswith('Annex: '):
                phone = num.replace('Annex: ', '')

        if address.strip() == "":
            self.warning("Missing Capitol Office!!")
        else:
            leg.add_office('capitol',
                           'Capitol Office',
                           address=address,
                           phone=phone)

        self.save_legislator(leg)
Beispiel #43
0
    def scrape_senators(self, term):
        url = "http://www.flsenate.gov/Senators/"
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'Senators/s')]"):
            name = link.text.strip()
            name = re.sub(r'\s+', ' ', name)
            leg_url = link.get('href')
            leg_doc = lxml.html.fromstring(self.urlopen(leg_url))
            leg_doc.make_links_absolute(leg_url)

            if 'Vacant' in name:
                continue

            # Special case - name_tools gets confused
            # by 'JD', thinking it is a suffix instead of a first name
            if name == 'Alexander, JD':
                name = 'JD Alexander'
            elif name == 'Vacant':
                # unreachable: vacant seats were already skipped above
                name = 'Vacant Seat'

            district = link.xpath("string(../../td[1])")
            party = link.xpath("string(../../td[2])")

            # for consistency
            if party == 'Democrat':
                party = 'Democratic'

            if term != '2013-2014':
                raise ValueError('Please change the senate photo_url string.')
            photo_url = leg_doc.xpath('//div[@id="sidebar"]//img/@src').pop()

            leg = Legislator(term,
                             'upper',
                             district,
                             name,
                             party=party,
                             photo_url=photo_url,
                             url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)

            self.scrape_sen_offices(leg, leg_url)

            self.save_legislator(leg)
Beispiel #44
0
    def scrape_lower_legislator(self, url, leg_info, term):
        page = self.lxmlize(url)
        photo = xpath_one(page, '//img[@rel="lightbox"]').attrib['src']
        infoblk = xpath_one(
            page, '//font/b[contains(text(), "CAUCUS/DELEGATION MEMBERSHIP")]')
        infoblk = infoblk.getparent()
        info = infoblk.text_content()
        cty = xpath_one(infoblk, "./b[contains(text(), 'ASSIGNMENTS')]")
        cty = cty.getnext()

        partyblk = filter(lambda x: "District" in x,
                          page.xpath('//p[@align="center"]//text()'))[0]

        party_flags = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }

        if leg_info['name'].startswith("Vacant"):
            return

        party = 'other'
        for p in party_flags:
            if p in partyblk:
                party = party_flags[p]

        if party == 'other':
            raise Exception("unknown party in %r" % partyblk)

        kwargs = {"url": url, "party": party, "photo_url": photo}

        leg = Legislator(term, 'lower', leg_info['dist'], leg_info['name'],
                         **kwargs)

        kwargs = {
            "address": leg_info['office'],
            "phone": leg_info['phone'],
        }

        if leg_info['email'] != "":
            kwargs['email'] = leg_info['email']

        leg.add_office('district', 'District Office', **kwargs)

        leg.add_source(url)
        self.save_legislator(leg)
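`self.lxmlize` and `xpath_one` are assumed helpers here; a minimal sketch consistent with how they are called (the names and error handling are guesses, not the project's actual code):

import lxml.html

def xpath_one(el, expr):
    # insist on exactly one match so layout changes fail loudly
    results = el.xpath(expr)
    if len(results) != 1:
        raise Exception('expected one match for %r, got %d' % (expr, len(results)))
    return results[0]

class LXMLMixin(object):
    def lxmlize(self, url):
        # fetch a page (urlopen comes from the scraper base class) and
        # return an lxml tree with links made absolute
        html = self.urlopen(url)
        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)
        return page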
Beispiel #45
0
    def scrape_lower(self, term):
        url = 'http://le.utah.gov/house2/representatives.jsp'
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//tr')[1:]:
            tds = row.xpath('td')

            district = tds[0].text_content()
            if tds[1].text_content() == 'Empty':
                self.log('district %s is empty' % district)
                continue
            a = tds[1].xpath('a')[0]
            name = a.text_content()
            leg_url = a.get('href')

            party = tds[2].text_content()
            if party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise ValueError('unknown party')

            # get photo
            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//img[@alt="photo"]/@src')[0]
            email = leg_doc.xpath('//a[starts-with(@href, "mailto")]')[0].text

            address = leg_doc.xpath('//b[text()="Address:"]')[0].tail.strip()
            cell = leg_doc.xpath('//b[text()="Cell Phone:"]')
            if cell:
                cell = cell[0].tail.strip()
            else:
                cell = None

            leg = Legislator(term, 'lower', district, name,
                             party=party, photo_url=photo_url, email=email,
                             url=leg_url)
            leg.add_office('district', 'Home', address=address, phone=cell)

            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Beispiel #46
0
    def scrape_rep(self, name, term, url):

        with self.urlopen(url) as text:
            page = lxml.html.fromstring(text)

            xpath = '//table[@id="table41"]/tr/td/font'
            name = page.xpath(xpath)[3].xpath('p')[0].text
            name = name.replace('Representative', '').strip().strip(',')

            district = page.xpath(
                "//a[contains(@href, 'district')]")[0].attrib['href']
            district = re.search(r"district(\d+)\.pdf", district).group(1)

            if "Democrat&nbsp;District" in text:
                party = "Democratic"
            elif "Republican&nbsp;District" in text:
                party = "Republican"
            elif "Independent&nbsp;District" in text:
                party = "Independent"
            else:
                party = "Other"

            kwargs = {"party": party,
                      "url": url}

            photo = page.xpath("//img[@rel='lightbox']")
            if len(photo) > 0:
                photo = photo[0]
                photo_url = "http://house.louisiana.gov/H_Reps/%s" % (
                    photo.attrib['src']
                )
                kwargs['photo_url'] = photo_url
            else:
                self.warning("No photo found :(")

            district_office = _get_b_tail(page, 'DISTRICT OFFICE')
            email = page.xpath('//a[starts-with(@href, "mailto")]/@href')[0]
            # split off extra parts of mailto: link
            email = email.split(':')[1].split('?')[0]

            leg = Legislator(term, 'lower', district, name, email=email,
                             **kwargs)
            leg.add_office('district', 'District Office',
                           address=district_office)
            leg.add_source(url)

            self.save_legislator(leg)
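`_get_b_tail` is another assumed helper; judging by its call site, it locates the <b> element whose text contains a label and returns the text that follows it. A sketch under that assumption:

def _get_b_tail(page, text):
    # find the <b> label and return the trailing text after it
    b = page.xpath('//b[contains(text(), "%s")]' % text)[0]
    return (b.tail or '').strip()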
Beispiel #47
0
    def scrape(self, chamber, term_name):

        for t in self.metadata['terms']:
            if t['name'] == term_name:
                session = t['sessions'][-1]
                slug = self.metadata['session_details'][session]['slug']

        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug, slug)
        leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (slug, chamber_slug)

        resp = json.loads(self.urlopen(leg_json_url))

        for item in resp:
            # empty district
            if 'District No' in item['FullName']:
                continue
            leg = Legislator(term_name, chamber, item['DistrictNbr'],
                             item['FullName'], party=item['Party'],
                             photo_url=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # fetch office from legislator page
            try:
                doc = lxml.html.fromstring(self.urlopen(leg_url))
                if not doc.xpath('//div'):
                    self.warning('invalid page, maybe a weird PDF?')
                else:
                    address = doc.xpath('//div[@class="contactAddress"]')[0].text_content()
                    address2 = doc.xpath('//div[@class="contactAddress2"]')
                    if address2:
                        address += ' ' + address2[0].text_content()
                    address += '\n' + doc.xpath('//div[@class="contactCityStateZip"]')[0].text_content()
                    phone = doc.xpath('//div[@class="contactPhone"]')[0].text_content()

                    leg.add_office('district', 'District Address', address=address,
                                   phone=phone)
            except scrapelib.HTTPError:
                self.warning('could not fetch %s' % leg_url)

            leg.add_source(leg_url)
            self.save_legislator(leg)
Beispiel #48
0
    def scrape(self, chamber, term):
        url = self.urls[term][chamber]
        version = self.urls[term]['version']

        if url is None:
            raise NoDataForPeriod(term)

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for row in page.xpath("//tr")[1:]:

                name = row.xpath("td")[0].text_content()
                name = name.split(",")
                if len(name) == 2:
                    fullname = "%s %s" % (name[1].strip(), name[0].strip())
                elif len(name) == 3:
                    fullname = "%s %s, %s" % (name[1].strip(), name[0].strip(),
                                              name[2].strip())
                else:
                    fullname = ' '.join(name).strip()

                # Most recent general assembly legislators list is slightly different than archived versions
                if version >= 2:
                    party = row.xpath("td")[1].text_content().strip()
                    district = row.xpath("td")[3].text_content().replace(
                        "District ", "").strip()
                    phone = email = ''

                    if version >= 3:
                        phone = row.xpath("td")[6].text_content().strip()
                        # NOTE: this reads the same cell as the phone; the
                        # email column index looks wrong in the source
                        email = row.xpath("td")[6].text_content().strip()
                else:
                    party, district = row.xpath("td")[1].text_content().split(
                        "-")
                    party = party.strip()
                    district = district.strip()
                    phone = email = ''

                leg = Legislator(term,
                                 chamber,
                                 district,
                                 fullname,
                                 party=party,
                                 email=email)
                leg.add_source(url)
                self.save_legislator(leg)
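The `self.urls` mapping read at the top of this scraper is assumed to be keyed by term, holding a URL per chamber plus a `version` number that tracks which page layout the parsing branches expect; a sketch with hypothetical URLs:

urls = {
    '2011-2012': {
        'lower': 'http://example.gov/house/members.html',   # hypothetical
        'upper': 'http://example.gov/senate/members.html',  # hypothetical
        'version': 3,
    },
    '2007-2008': {
        'lower': None,  # None makes scrape() raise NoDataForPeriod
        'upper': None,
        'version': 2,
    },
}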
Beispiel #49
0
    def scrape_upper(self, term):
        url = 'http://www.utahsenate.org/aspx/roster.aspx'
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//tr')[1:]:
            tds = row.xpath('td')

            # 1st has district
            district = tds[0].text_content()

            # 3rd has name and email
            person = tds[2].xpath('span[@class="person"]')[0]
            if '(D)' in person.text_content():
                party = 'Democratic'
            elif '(R)' in person.text_content():
                party = 'Republican'
            else:
                raise ValueError('unknown party')
            a = person.xpath('a')[0]
            name = a.text_content()
            leg_url = a.get('href')
            email = tds[2].xpath('span[@class="email"]/a/text()')[0]

            # text is split by br in 4th td, join with a space
            address = ' '.join(row.xpath('td[4]/font/text()'))

            # get photo
            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)
            photo_url = leg_doc.xpath('//p[@class="photo"]/img/@src')[0]

            leg = Legislator(term,
                             'upper',
                             district,
                             name,
                             party=party,
                             email=email,
                             address=address,
                             photo_url=photo_url,
                             url=leg_url)
            leg.add_source(url)
            leg.add_source(leg_url)
            self.save_legislator(leg)
Beispiel #50
0
    def scrape_2011Leg(self, chamber, term, url):
        """2011 Scraper for legislators"""
        parties = {'(D)': 'Democratic', '(R)': 'Republican'}
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)
            table = page.xpath('//table[contains(@id, "GridView1")]')[0]
            for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
                params = {}
                district = row.xpath(
                    'td/span[contains(@id, "LabelDistrict")]/font')[0].text
                last_name_a = row.xpath(
                    'td/a[contains(@id, "HyperLinkLast")]')[0]
                member_url = last_name_a.get('href')
                last_name = last_name_a.text_content().strip()
                first_names = row.xpath(
                    'td/span[contains(@id, "LabelFirst")]/font')[0].text.strip(
                    )
                first_name = first_names.split()[0]
                middle_name = ' '.join(first_names.split()[1:])
                party = row.xpath(
                    'td/span[contains(@id, "LabelParty")]/font')[0].text
                party = parties[party]
                params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                    " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
                params['photo_url'] = row.xpath(
                    'td/a[contains(@id, "HyperLinkChairJPG")]/img'
                )[0].attrib['src']
                params['email'] = row.xpath(
                    'td/a[contains(@id, "HyperLinkEmail")]')[0].text
                params['phone'] = row.xpath(
                    'td/span[contains(@id, "LabelPhone2")]')[0].text

                full_name = first_names + " " + last_name
                leg = Legislator(term,
                                 chamber,
                                 district,
                                 full_name,
                                 first_name,
                                 last_name,
                                 middle_name,
                                 party,
                                 url=member_url,
                                 **params)
                leg.add_source(url)
                self.save_legislator(leg)
Beispiel #51
0
    def scrape(self, chamber, term):
        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/html-pages/housemembers.html'
        else:
            url = 'http://www.scstatehouse.gov/html-pages/senatemembersd.html'

        with self.urlopen(url) as data:
            doc = lxml.html.fromstring(data)
            rows = doc.xpath('//pre/div[@class="sansSerifNormal"]')

            for row in rows[1:]:
                member_a = row.xpath('a')[0]

                name_party = member_a.text_content()
                if name_party.find('[D]') != -1:
                    party = 'Democratic'
                    full_name = name_party.partition('[D]')[0].strip()
                elif name_party.find('[R]') != -1:
                    party = 'Republican'
                    full_name = name_party.partition('[R]')[0].strip()
                else:
                    # skip rows with no recognizable party tag rather than
                    # reuse values from a previous iteration
                    continue

                photo_url = 'http://www.scstatehouse.gov/members/gif/' + re.search(
                    r'(\d+)\.html', member_a.attrib['href']).group(1) + '.jpg'

                other_data = row.text_content().encode('ascii', 'ignore')
                od_result = re.search(r'^.+District (\d+) - (.+)Count.+$',
                                      other_data)
                district = od_result.group(1)

                office_address = office_phone = None
                contentb = re.search(
                    r'^.+\(C\) (.+,.*\d+).*Bus\. (\(\d+\) \d+-\d+).+$',
                    other_data)
                if contentb is not None:
                    office_address = contentb.group(1)
                    office_phone = contentb.group(2)

                legislator = Legislator(term,
                                        chamber,
                                        district,
                                        full_name,
                                        party=party,
                                        photo_url=photo_url,
                                        office_address=office_address,
                                        office_phone=office_phone)
                legislator.add_source(url)
                self.save_legislator(legislator)
Beispiel #52
0
    def get_member(self, term, chamber, kpid):
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        slug = {'2013-2014': 'b2013_14', '2015-2016': 'b2015_16'}[term]
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug,
                                                                      kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            (photo_url,
             ) = legislator_page.xpath('//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            self.warning("{}'s legislator bio page not found".format(
                content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        legislator = Legislator(
            term,
            chamber,
            str(content['DISTRICT']),
            content['FULLNAME'],
            party=party,
            url=leg_url,
            photo_url=photo_url,
            occupation=content['OCCUPATION'],
        )

        address = ('Room %s\n'
                   'Kansas State Capitol Building\n'
                   '300 SW 10th St.\n'
                   'Topeka, KS 66612') % content['OFFICENUM']

        legislator.add_office('capitol',
                              'Capitol Office',
                              phone=content['OFFPH'] or None,
                              address=address,
                              email=content['EMAIL'])

        legislator.add_source(url)
        self.save_legislator(legislator)
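`ksapi` is an assumed companion module carrying the Kansas Legislature API root; a hypothetical stand-in (the real base URL and API revision may differ):

# ksapi.py -- hypothetical constants module
url = 'http://www.kslegislature.org/li/api/v11/rev-1/'  # assumed API root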
Beispiel #53
0
    def scrape_lower(self, term):
        url = "http://www.okhouse.gov/Members/Default.aspx"
        page = lxml.html.fromstring(self.urlopen(url))

        for link in page.xpath("//a[contains(@href, 'District')]")[3:]:
            name = link.text.strip()
            district = link.xpath("string(../../td[3])").strip()

            party = link.xpath("string(../../td[4])").strip()
            if party == 'R':
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'

            leg = Legislator(term, 'lower', district, name, party=party)
            leg.add_source(url)
            self.save_legislator(leg)
Beispiel #54
0
    def _scrape_upper(self, roster_page, roster_url, term):
        # TODO: photo_urls http://www.senate.texas.gov/members.php

        for tbl in roster_page.xpath('//table[@class="memdir"]'):
            leg_a = tbl.xpath('.//a')[0]
            name = leg_a.text
            leg_url = leg_a.get('href')
            district = tbl.xpath(
                './/span[contains(text(), "District:")]')[0].tail.lstrip('0')
            party = tbl.xpath('.//span[contains(text(), "Party:")]')[0].tail
            legislator = Legislator(term,
                                    'upper',
                                    district,
                                    name,
                                    party=party,
                                    url=leg_url)

            for addr in tbl.xpath('.//td[@headers]'):
                fax = phone = address = None
                lines = [addr.text]
                for child in addr.getchildren():
                    # when we get to span tag we just ingested a phone #
                    if child.tag == 'span' and child.text:
                        if 'TEL' in child.text:
                            phone = lines.pop()
                        elif 'FAX' in child.text:
                            fax = lines.pop()
                    elif child.tail:
                        lines.append(child.tail)

                address = '\n'.join(line.strip() for line in lines if line)
                if 'CAP' in addr.get('headers'):
                    office_type = 'capitol'
                    office_name = 'Capitol Office'
                else:
                    office_type = 'district'
                    office_name = 'District Office'
                legislator.add_office(office_type,
                                      office_name,
                                      address=address,
                                      phone=phone,
                                      fax=fax)

            legislator.add_source(roster_url)
            legislator.add_source(leg_url)
            self.save_legislator(legislator)
Beispiel #55
0
    def scrape_senators(self, term):
        url = "http://www.flsenate.gov/Senators/"
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            for link in page.xpath("//a[contains(@href, 'Senators/s')]"):
                name = link.text.strip()
                name = re.sub(r'\s+', ' ', name)
                leg_url = link.get('href')

                if 'Vacant' in name:
                    continue

                # Special case - name_tools gets confused
                # by 'JD', thinking it is a suffix instead of a first name
                if name == 'Alexander, JD':
                    name = 'JD Alexander'
                elif name == 'Vacant':
                    # unreachable: vacant seats were already skipped above
                    name = 'Vacant Seat'

                district = link.xpath("string(../../td[1])")
                party = link.xpath("string(../../td[2])")

                # for consistency
                if party == 'Democrat':
                    party = 'Democratic'

                photo_url = ("http://www.flsenate.gov/userContent/"
                             "Senators/2010-2012/photos/s%03d.jpg" %
                             (int(district)))

                leg = Legislator(term,
                                 'upper',
                                 district,
                                 name,
                                 party=party,
                                 photo_url=photo_url,
                                 url=leg_url)
                leg.add_source(url)
                leg.add_source(leg_url)

                self.scrape_sen_offices(leg, leg_url)

                self.save_legislator(leg)
Beispiel #56
0
    def scrape(self, term, chambers):
        base_url = 'http://news.legislature.ne.gov/dist'

        # there are 49 districts
        for district in range(1, 50):
            if district < 10:
                rep_url = base_url + '0' + str(district) + '/biography/'
            else:
                rep_url = base_url + str(district) + '/biography/'

            try:
                html = self.get(rep_url).text
                page = lxml.html.fromstring(html)

                full_name = page.xpath('//div[@class="content_header_right"]/a')[0].text.split(' ',1)[1].strip()
                if full_name == 'Seat Vacant':
                    continue

                # This is hacky, are lis always the same?
                address = page.xpath('//div[@id="sidebar"]/ul[1]/li[3]')[0].text.strip() + '\n'
                address += page.xpath('//div[@id="sidebar"]/ul[1]/li[4]')[0].text.strip() + '\n'
                address += page.xpath('//div[@id="sidebar"]/ul[1]/li[5]')[0].text.strip()
                phone = page.xpath('//div[@id="sidebar"]/ul[1]/li[6]')[0].text.split()
                if len(phone) > 2:
                    phone = phone[1] + ' ' + phone[2]
                else:
                    phone = None
                mailto = page.xpath('//div[@id="sidebar"]/ul[1]/li[contains(text(), "Email:")]/a/@href')[0]
                email = mailto[7:]

                photo_url = \
                        "http://www.nebraskalegislature.gov/media/images/blogs/dist%02d.jpg" \
                        % district

                # Nebraska is officially nonpartisan
                party = 'Nonpartisan'
                leg = Legislator(term, 'upper', str(district), full_name,
                                 party=party, email=email, url=rep_url,
                                 photo_url=photo_url)
                leg.add_source(rep_url)
                leg.add_office('capitol', 'Capitol Office', address=address,
                               phone=phone)
                self.save_legislator(leg)
            except scrapelib.HTTPError:
                self.warning('could not retrieve %s' % rep_url)
Beispiel #57
0
    def scrape_senate(self, term):
        urls = (
            'http://www.senadopr.us/senadores/Pages/Senadores%20Acumulacion.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx',
            'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx')

        for counter, url in enumerate(urls):
            with self.urlopen(url) as leg_page_html:
                doc = lxml.html.fromstring(leg_page_html)
                table = doc.xpath(
                    '//table[@summary="Listado de Senadores"]')[0]

                # skip first row
                for row in table.xpath('tr')[1:]:
                    tds = row.xpath('td')
                    img = row.xpath('.//img/@src')
                    # avoid carrying over the previous row's photo
                    photo_url = img[0] if img else None
                    name = tds[1].text_content().title()
                    party = tds[2].text_content()
                    phone = tds[3].text_content()
                    email = tds[4].text_content()

                    if counter == 0:
                        district = 'At-Large'
                    else:
                        district = str(counter)

                    leg = Legislator(term,
                                     'upper',
                                     district,
                                     name,
                                     party=party,
                                     photo_url=photo_url,
                                     phone=phone,
                                     email=email)

                    leg.add_source(url)
                    self.save_legislator(leg)
Beispiel #58
0
    def scrape_2011Leg(self, chamber, term, url):
        """2011 Scraper for legislators"""
        titles = {'lower': 'Representative', 'upper': 'Senator'}
        parties = {'D': 'Democratic', 'R': 'Republican'}
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)
            table = page.xpath('//table[contains(@id, "GridView1")]')[0]
            for row in table.xpath('tr[td/a[contains(@href, "memberpage")]]'):
                params = {}
                district = row.xpath('td/span[contains(@id, "LabelDis")]/font')[0].text + " " + \
                    row.xpath('td/span[contains(@id, "LabelDistrict2")]/font')[0].text
                # Replace any / in district name to allow json file to save.
                district = district.replace('/', '-')
                params['title'] = titles.get(chamber, '')
                last_name = row.xpath(
                    'td/a[contains(@id, "HyperLinkLast")]/font')[0].text.strip(
                    )
                first_names = row.xpath(
                    'td/span[contains(@id, "LabelFirst")]/font')[0].text.strip(
                    )
                first_name = first_names.split()[0]
                middle_name = ' '.join(first_names.split()[1:])
                party = row.xpath(
                    'td/span[contains(@id, "LabelParty")]/font')[0].text
                party = party.replace('(', '')
                party = party.replace(')', '')
                party = parties.get(party,
                                    '')  # Expand party from initial letter.
                params['office_address'] = row.xpath('td/span[contains(@id, "LabelRoom")]')[0].text + \
                    " " + row.xpath('td/span[contains(@id, "LabelRoom2")]')[0].text
                params['photo_url'] = row.xpath(
                    'td/a[contains(@id, "HyperLinkChairJPG")]/img'
                )[0].attrib['src']
                params['email'] = row.xpath(
                    'td/a[contains(@id, "HyperLinkEmail")]')[0].text
                params['phone'] = row.xpath(
                    'td/span[contains(@id, "LabelPhone2")]')[0].text

                full_name = first_names + " " + last_name
                leg = Legislator(term, chamber, district, full_name,
                                 first_name, last_name, middle_name, party,
                                 **params)
                leg.add_source(url)
                self.save_legislator(leg)
Beispiel #59
0
    def scrape_senate(self, term):
        BASE_URL = 'http://www.senate.leg.state.mn.us/'
        url = 'http://www.senate.leg.state.mn.us/members/member_list.php'

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            for row in doc.xpath('//tr'):
                tds = row.xpath('td')
                if len(tds) == 5 and tds[1].text_content() in self._parties:
                    district = tds[0].text_content().lstrip('0')
                    party = tds[1].text_content()
                    name_a = tds[2].xpath('a')[0]
                    name = name_a.text.strip()
                    leg_url = BASE_URL + name_a.get('href')
                    addr, phone = tds[3].text_content().split(u'\xa0\xa0')
                    email = tds[4].text_content()

                    leg = Legislator(term, 'upper', district, name,
                                     party=self._parties[party],
                                     url=leg_url)

                    if 'State Office' in addr:
                        addr = ('100 Rev. Dr. Martin Luther King Jr. Blvd.\n'
                                'Room {0}\n'
                                'St. Paul, MN 55155-1206').format(addr)
                    elif 'Capitol' in addr:
                        addr = ('75 Rev. Dr. Martin Luther King Jr. Blvd.\n'
                                'Room {0}\n'
                                'St. Paul, MN 55155-1606').format(addr)
                    leg.add_office('capitol', 'Capitol Office', address=addr,
                                   phone=phone)

                    if '@' in email:
                        leg['email'] = email

                    with self.urlopen(leg_url) as leg_html:
                        leg_doc = lxml.html.fromstring(leg_html)
                        img_src = leg_doc.xpath('//img[@height=164]/@src')
                        if img_src:
                            leg['photo_url'] = BASE_URL + img_src[0]

                    leg.add_source(url)

                    self.save_legislator(leg)
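`self._parties` is assumed to map the Senate roster's party abbreviations to full names, with Minnesota's DFL spelled out; a likely definition:

_parties = {'DFL': 'Democratic-Farmer-Labor',
            'R': 'Republican',
            'I': 'Independent'}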
Beispiel #60
0
    def scrape_legislator(self, chamber, term, name, url):
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        name = re.sub(r'\s+', ' ', name)

        info = page.xpath('string(//div[@id = "fullpage"])')

        district = re.search(r'District: ([\w\d]+)', info)
        if district is None:
            maddr = page.xpath(
                "//div[@id='fullpage']//a[contains(@href, 'mailto')]")
            if maddr == []:
                return  # Needed for http://senate.legis.state.ak.us/senator.php?id=cog ..
            maddr = maddr[0]
            district = maddr.getnext().tail
            # This hack needed for http://house.legis.state.ak.us/rep.php?id=dru
            # please remove as soon as this is alive.
        else:
            district = district.group(1)

        party = re.search(r'Party: (.+)', info).group(1).strip()
        email = re.search(r'Email: ([\w_]+@legis\.state\.ak\.us)', info)

        if email is None:
            email = re.search(r'Email: (.+@akleg\.gov)', info)

        email = email.group(1)

        # for consistency
        if party == 'Democrat':
            party = 'Democratic'

        leg = Legislator(term,
                         chamber,
                         district,
                         name,
                         party=party,
                         email=email,
                         url=url)
        self.scrape_address(leg, page, 'bioleft')
        self.scrape_address(leg, page, 'bioright')
        leg.add_source(url)

        self.save_legislator(leg)
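`scrape_address` is assumed to pull a mailing address out of the 'bioleft'/'bioright' columns and attach it to the legislator; a minimal sketch under that assumption (the div layout and office naming are guesses):

def scrape_address(self, leg, page, div_id):
    # grab the text of the given bio column and, if non-empty,
    # record it as an office address
    text = page.xpath('string(//div[@id="%s"])' % div_id).strip()
    if text:
        leg.add_office('district', 'District Office', address=text)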