Example #1
    def scrape(self, chamber, term):
        if chamber == 'lower':
            return
        html = self.urlopen(self.legislators_url)
        doc = lxml.html.fromstring(html)
        members = doc.xpath('//div[@id="cdlist"]/div[@class="cd"]')
        for member in members:
            member_xpath = member.xpath
            res = {}

            title_name = member_xpath('div[@class="cdinfo"]/text()')[0].strip()
            res['url'] = self.base_url + member_xpath('div[@class="cdinfo"]/a/@href')[0]
            district = member_xpath('div[@class="cdinfo"]/a')[0].text_content().strip()
            if len(member_xpath('div[@class="cdinfo"]/a')) > 1:
                res['email'] = member_xpath('div[@class="cdinfo"]/a')[1].text_content().strip()
            else:
                res['email'] = None
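            # NOTE: `titles` is assumed to be a module-level list of
            # honorifics (e.g. 'Senator'); it is not shown in this excerpt.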
            for t in titles:
                if t in title_name:
                    res['title'] = t
                    res['full_name'] = title_name.replace(t, '').strip()

            leg = Legislator(term, chamber, district, **res)
            leg.update(**res)
            self.save_legislator(leg)
Example #2
    def scrape_bio(self, term, chamber, district, name, url):
        # this opens the committee section without having to do another request
        url += '&TableRow=1.5.5'
        frame_doc = self.lxmlize(url)
        actual_url = frame_doc.xpath("//frame[@name='right']/@src")[0]
        doc = self.lxmlize(actual_url)

        # party is in one of these
        party = doc.xpath('//div[@id="page_header"]')[0].text.strip()[-3:]
        if '(D)' in party:
            party = 'Democratic'
        elif '(R)' in party:
            party = 'Republican'
        else:
            raise AssertionError("No party found for {name}".format(name=name))

        leg = Legislator(term, chamber, district, name, party=party)

        photo_url = doc.xpath('//img[contains(@src, "jpg")]/@src')
        if photo_url:
            leg['photo_url'] = photo_url[0]

        contact_info = self.scrape_contact_info(doc)
        leg.update(contact_info)
        return leg
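
This example relies on a `lxmlize` helper that isn't shown above. A minimal sketch of what such a helper typically does (hypothetical; the real implementation may differ):

    def lxmlize(self, url):
        # hypothetical helper: fetch a page and return an lxml document
        # with links resolved against the page URL
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        return doc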
Example #3
 def scrape(self, chamber, term):
     if chamber == 'lower':
         return
     html = self.urlopen(self.legislators_url)
     doc = lxml.html.fromstring(html)
     table = doc.xpath('//table')[27]
     row = table.xpath('tr')[1]
     members = row.xpath('td')
     for member in members:
         res = {}
         member_xpath = member.xpath
         name = member_xpath('span/b/a/text()')
         if name:
             res['full_name'] = name[0]
         else:
             res['full_name'] = member_xpath('a/span/b/text()')[0].strip()
         res['url'] = member_xpath('span/a/@href')[0].strip()
         district = member_xpath('span/a')[0].text_content().strip()
         title_el = member_xpath('span/a/text()')
         if len(title_el) > 1:
             res['title'] = title_el[1].strip()
         else:
             res['title'] = ""
         leg = Legislator(term, chamber, district, **res)
         leg.update(**res)
         self.save_legislator(leg)
Example #4
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        chamber_name = {"upper": "Senate", "lower": "House"}[chamber]

        url = "http://www.in.gov/cgi-bin/legislative/listing/" "listing-2.pl?data=alpha&chamber=%s" % chamber_name

        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            for link in page.xpath("//div[@id='col2']/p/a"):

                name = link.text.strip()
                href = link.get("href")

                details = link.getnext().text.strip()

                party = details.split(",")[0]
                if party == "Democrat":
                    party = "Democratic"

                district = re.search(r"District (\d+)", details).group(1)
                district = district.lstrip("0")

                # Get the legislator's bio page.

                leg = Legislator(term, chamber, district, name, party=party, url=href)
                leg.add_source(url)
                leg.add_source(href)

                details = self.scrape_details(chamber, term, href, page, party, leg)
                if details:
                    leg.update(details)

                self.save_legislator(leg)
Example #6
    def process_person(self, person):
        term = self.metadata['terms'][-1]['name']
        chamber = None
        district = None
        party = None
        name = person['name']
        url = person['links'][0]['url']
        photo_url = person['image']
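        # `person` is assumed to follow the pupa/Popolo export shape
        # ('name', 'links', 'image', 'contact_details', 'sources',
        # 'extras'), and `self.memberships` to map person ids to
        # membership dicts with 'org' and 'post' keys.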

        for membership in self.memberships[person['_id']]:
            org = membership['org']
            post = membership['post']
            if not org:
                # a membership without an organization can't be classified
                self.warning('membership without org: %r' % membership)
                continue
            classification = org.get('classification') or org.get(
                'organization__classification')
            if classification in ('upper', 'lower'):
                chamber = classification
                district = post['label']
            elif classification == 'party':
                party = org['name']
            elif classification == 'legislature':  # DC
                chamber = 'upper'
                district = post['label']

        district_office = {}
        capitol_office = {}
        email = ''
        for detail in person['contact_details']:
            # rename voice->phone
            if detail['type'] == 'voice':
                detail['type'] = 'phone'
            elif detail['type'] == 'email':
                email = detail['value']
            if 'district' in detail['note'].lower():
                district_office[detail['type']] = detail['value']
            elif 'capitol' in detail['note'].lower():
                capitol_office[detail['type']] = detail['value']

        leg = Legislator(term,
                         chamber,
                         district,
                         name,
                         party=party,
                         url=url,
                         photo_url=photo_url,
                         email=email)

        if district_office:
            leg.add_office('district', 'District Office', **district_office)
        if capitol_office:
            leg.add_office('capitol', 'Capitol Office', **capitol_office)

        for source in person['sources']:
            leg.add_source(source['url'])

        leg.update(**person['extras'])

        self.save_legislator(leg)
Example #7
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib['value'])
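        # option.text is expected to look like
        # "Sen. Jane Doe, Republican, District 12" (hypothetical sample,
        # inferred from the regexes below)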
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        if district == '[N/A]':
            msg = 'No district found for %r; skipping.'
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)

        # Scrape leg page.
        try:
            html = self.urlopen(url)
        except scrapelib.HTTPError as exc:
            # As of July 2014, this only happens when a page has
            # gone missing from their varnish server.
            # if exc.response.status_code is 503:
            self.logger.exception(exc)
            self.logger.warning('Skipping legislator at url: %s' % url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
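        # Each table row is assumed to hold exactly three cells:
        # committee name, committee type, and the member's role.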
        for tr in doc.xpath(
                '//div[@class="legislator-committees-container"]//table//tr'):
            committee, committee_type, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            if committee != "Committee Name":
                leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(address=dist_office,
                       name='Capitol Office',
                       type='capitol',
                       phone=phone)

        self.save_legislator(leg)
Example #8
    def process_person(self, person):
        term = self.metadata['terms'][-1]['name']
        chamber = None
        district = None
        party = None
        name = person['name']
        url = person['links'][0]['url']
        photo_url = person['image']

        for membership in self.memberships[person['_id']]:
            org = membership['org']
            post = membership['post']
            if not org:
                # a membership without an organization can't be classified
                self.warning('membership without org: %r' % membership)
                continue
            classification = org.get('classification') or org.get('organization__classification')
            if classification in ('upper', 'lower'):
                chamber = classification
                district = post['label']
            elif classification == 'party':
                party = org['name']
            elif classification == 'legislature':      # DC
                chamber = 'upper'
                district = post['label']

        district_office = {}
        capitol_office = {}
        email = ''
        for detail in person['contact_details']:
            # rename voice->phone
            if detail['type'] == 'voice':
                detail['type'] = 'phone'
            elif detail['type'] == 'email':
                email = detail['value']
            if 'district' in detail['note'].lower():
                district_office[detail['type']] = detail['value']
            elif 'capitol' in detail['note'].lower():
                capitol_office[detail['type']] = detail['value']

        leg = Legislator(term, chamber, district, name,
                         party=party, url=url,
                         photo_url=photo_url,
                         email=email
                         )

        if district_office:
            leg.add_office('district', 'District Office', **district_office)
        if capitol_office:
            leg.add_office('capitol', 'Capitol Office', **capitol_office)

        for source in person['sources']:
            leg.add_source(source['url'])

        leg.update(**person['extras'])

        self.save_legislator(leg)
Example #9
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib["value"])
        name, party, district = re.split(r"\s*,\s*", option.text.strip())
        name = re.sub(r"^(Sen\.|Rep\.)\s+", "", name)
        district = re.sub(r"^District\s+", "", district)
        if district == "[N/A]":
            msg = "No district found for %r; skipping."
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        try:
            html = self.urlopen(url)
        except scrapelib.HTTPError as exc:
            # As of July 2014, this only happens when a page has
            # gone missing from their varnish server.
            # if exc.response.status_code is 503:
            self.logger.exception(exc)
            self.logger.warning("Skipping legislator at url: %s" % url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath("//table//tr"):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if "member" in role.lower():
                role = "committee member"
            elif "chair" in role.lower():
                role = "chair"
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath("//address")
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r" {2,}", "", dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(address=dist_office, name="District Office", type="district", phone=phone)

        self.save_legislator(leg)
Example #10
    def scrape_bio(self, term, chamber, district, name, url):
        # this opens the committee section without having to do another request
        url += '&TableRow=1.5.5'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # party is in one of these; join the text nodes so the substring
        # checks below run against a string rather than a list
        party = ' '.join(doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()'))
        if '(D)' in party:
            party = 'Democratic'
        elif '(R)' in party:
            party = 'Republican'

        leg = Legislator(term, chamber, district, name, party=party, url=url)

        photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
        if photo_url:
            leg['photo_url'] = photo_url[0]

        roles = defaultdict(lambda: {})
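        # assumes `from collections import defaultdict`; one role dict
        # accumulates per committee name as the loop below runs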

        position = 'member'
        for text in doc.xpath('//td[@width="584"]/descendant::font/text()'):
            text = text.strip()
            if text == 'Committee Chair:':
                position = 'chair'
            elif text == 'Committee Co-chair:':
                position = 'co-chair'
            else:
                for committee in text.splitlines():
                    roles[committee].update(
                        role='committee member',
                        term=term,
                        chamber=chamber,
                        committee=committee,
                        party=party,
                        position=position)

        for role in roles.values():
            leg.add_role(**role)

        contact_info = self.scrape_contact_info(doc)
        leg.update(contact_info)

        return leg
Example #12
 def scrape(self, chamber, term):
     if chamber == 'lower':
         return
     html = self.urlopen(self.legislators_url)
     doc = lxml.html.fromstring(html)
     cells = doc.xpath('//table/tbody/tr/td')
     for cell in cells:
         cell_xpath = cell.xpath
         res = {}
         res['full_name'] = cell_xpath('a')[0].text_content()
         res['email'] = cell_xpath('a/@href')[1].replace('mailto:','')
         res['title'] = cell_xpath('text()')[0].strip()
         res['phone'] = cell_xpath('text()')[1].strip()
         res['url'] = self.base_url + cell_xpath('a/@href')[0]
         # NOTE: no district is parsed on this page; the literal string
         # 'district' is passed through as a placeholder.
         leg = Legislator(term, chamber, 'district', **res)
         print(leg)
         leg.update(**res)
         self.save_legislator(leg)
Example #13
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib['value'])
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        if district == '[N/A]':
            msg = 'No district found for %r; skipping.'
            self.logger.warning(msg, name)
            return
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath('//table//tr'):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(
            address=dist_office, name='District Office',
            type='district', phone=phone)

        self.save_legislator(leg)
Example #14
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber]

        url = ("http://www.in.gov/cgi-bin/legislative/listing/"
               "listing-2.pl?data=alpha&chamber=%s" % chamber_name)

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        for link in page.xpath("//div[@id='col2']/p/a"):

            name = link.text.strip()
            href = link.get('href')

            details = link.getnext().text.strip()

            party = details.split(',')[0]
            if party == 'Democrat':
                party = 'Democratic'

            district = re.search(r'District (\d+)', details).group(1)
            district = district.lstrip('0')

            # Get the legislator's bio page.

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             url=href)
            leg.add_source(url)
            leg.add_source(href)

            details = self.scrape_details(chamber, term, href, page, party,
                                          leg)
            if details:
                leg.update(details)

            self.fix_hotgarbage(leg)
            self.save_legislator(leg)
Example #15
    def scrape_legislator(self, chamber, term, option):
        url = urlparse.urljoin(self.url, option.attrib['value'])
        name, party, district = re.split(r'\s*,\s*', option.text.strip())
        name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
        district = re.sub(r'^District\s+', '', district)
        leg = Legislator(term, chamber, district, name, party=party)
        leg.add_source(self.url)

        # Scrape leg page.
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(self.url)
        leg.add_source(url)

        # Scrape committees.
        for tr in doc.xpath('//table//tr'):
            committee, role = tr
            committee = committee.text_content().strip()
            role = role.text_content().strip()
            if 'member' in role.lower():
                role = 'committee member'
            elif 'chair' in role.lower():
                role = 'chair'
            leg.add_role(role, term, chamber=chamber, committee=committee)

        # Scrape offices.
        dist_office, phone = doc.xpath('//address')
        dist_office = dist_office.text_content().strip()
        dist_office = re.sub(r' {2,}', '', dist_office)

        phone = phone.text_content().strip()
        email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
        photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')

        leg.update(email=email, photo_url=photo_url)
        leg.add_office(
            address=dist_office, name='District Office',
            type='district', phone=phone)

        self.save_legislator(leg)
Example #16
    def scrape_bio(self, term, chamber, district, name, url):
        # this opens the committee section without having to do another request
        url += '&TableRow=1.5.5'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # party is in one of these; join the text nodes so the substring
        # checks below run against a string rather than a list
        party = ' '.join(doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()'))
        if '(D)' in party:
            party = 'Democratic'
        elif '(R)' in party:
            party = 'Republican'

        leg = Legislator(term, chamber, district, name, party=party, url=url)

        photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
        if photo_url:
            leg['photo_url'] = photo_url[0]

        contact_info = self.scrape_contact_info(doc)
        leg.update(contact_info)
        return leg
Example #18
    def scrape(self, chamber, term):
        self.validate_term(term, latest_only=True)

        chamber_name = {'upper': 'Senate',
                        'lower': 'House'}[chamber]

        url = ("http://www.in.gov/cgi-bin/legislative/listing/"
               "listing-2.pl?data=alpha&chamber=%s" % chamber_name)

        page = self.urlopen(url)
        page = lxml.html.fromstring(page)

        for link in page.xpath("//div[@id='col2']/p/a"):

            name = link.text.strip()
            href = link.get('href')

            details = link.getnext().text.strip()

            party = details.split(',')[0]
            if party == 'Democrat':
                party = 'Democratic'

            district = re.search(r'District (\d+)', details).group(1)
            district = district.lstrip('0')

            # Get the legislator's bio page.

            leg = Legislator(term, chamber, district, name, party=party,
                             url=href)
            leg.add_source(url)
            leg.add_source(href)

            details = self.scrape_details(chamber, term, href, page, party, leg)
            if details:
                leg.update(details)

            self.fix_hotgarbage(leg)
            self.save_legislator(leg)
Example #19
    def scrape(self, chamber, term):

        for tdata in self.metadata["terms"]:
            if term == tdata["name"]:
                year = tdata["start_year"]
                session_number = tdata["session_number"]
                break

        # Fetch the csv.
        url = "http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt" % (
            session_number,
            year,
            chamber == "upper" and "Senate" or "House",
        )

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = ["last_name", "first_name", "party", "district", "address", "city", "state", "zip"]
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(csv_parser)

        for entry in csv_parser:
            if not entry:
                continue

            # City and address are used as-is here; other versions of
            # this scraper re-capitalize them with .title().

            # District.
            district = entry["district"]
            hd_or_sd, district = district.split()
            del entry["district"]

            # Party: map the letter code; party is passed to Legislator
            # separately, so drop it from the row.
            party = {"D": "Democratic", "R": "Republican"}[entry.pop("party")]

            # Get full name properly capped.
            fullname = _fullname = "%s %s" % (entry["first_name"].capitalize(), entry["last_name"].capitalize())

            city_lower = entry["city"].lower()

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]

            # Get the office.
            address = "\n".join([entry["address"], "%s, %s %s" % (entry["city"], entry["state"], entry["zip"])])

            office = dict(name="District Office", type="district", phone=None, fax=None, email=None, address=address)

            try:
                deets = self._scrape_details(detail_url)
            except NoDetails:
                self.logger.warning("No details found at %r" % detail_url)
                continue

            # Add the details and delete junk.
            entry.update(deets)
            del entry["first_name"], entry["last_name"]

            legislator = Legislator(term, chamber, district, fullname, party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator["url"] = detail_url

            office["phone"] = deets.get("phone")
            office["fax"] = deets.get("fax")
            legislator.add_office(**office)

            self.save_legislator(legislator)
Example #20
    def scrape(self, chamber, term):
        term_slug = term[:-2]
        url = MEMBER_LIST_URL[chamber] % term_slug

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name, _, _, district, party = row.xpath('td')
            district = district.text

            if party.text_content().strip() == "":
                self.warning("Garbage party: Skipping!")
                continue

            party = {'D':'Democratic', 'R': 'Republican',
                     'I': 'Independent'}[party.text]
            leg_url = name.xpath('a/@href')[0]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                name = name.strip('*')
                continue

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            leg = Legislator(term, chamber, district, name, party=party,
                             url=leg_url)
            leg.add_source(url)

            hotgarbage = (
                'Senate Biography Information for the 98th General '
                'Assembly is not currently available.')
            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning('No legislator bio available for ' + name)
                self.save_legislator(leg)
                continue

            photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]
            photo_url_parsed = urlparse(photo_url)
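            # percent-encode the path component so characters such as
            # spaces don't produce an invalid photo URL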
            encoded_path = quote(photo_url_parsed.path)
            photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
            leg.update(photo_url=photo_url)
            leg.add_source(leg_url)

            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                email = email[0].tail.strip()
            else:
                email = None

            # function for turning an IL contact info table to office details
            def _table_to_office(table, office_type, office_name, email=None):
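                # note: this nested helper closes over `leg` from the
                # enclosing scrape() and adds the parsed office to it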
                addr = ''
                phone = ''
                fax = None
                for row in table.xpath('tr'):
                    row = row.text_content().strip()
                    # skip rows that aren't part of address
                    if 'Office:' in row or row == 'Cook County':
                        continue
                    # fax number row ends with FAX
                    elif 'FAX' in row:
                        fax = row.replace(' FAX', '')
                    # phone number starts with ( [make it more specific?]
                    elif row.startswith('('):
                        phone = row
                    # everything else is an address
                    else:
                        addr += (row + '\n')
                if addr.strip() != ',':
                    leg.add_office(office_type, office_name,
                                   address=addr.strip(), phone=phone, fax=fax, email=email)

            # extract both offices from tables
            table = leg_doc.xpath('//table[contains(string(), "Springfield Office")]')
            if table:
                _table_to_office(table[3], 'capitol', 'Springfield Office', email)
            table = leg_doc.xpath('//table[contains(string(), "District Office")]')
            if table:
                _table_to_office(table[3], 'district', 'District Office')

            self.save_legislator(leg)
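
The Illinois example above assumes a module-level `MEMBER_LIST_URL` dict mapping chamber to a URL template that takes the term slug. A hypothetical sketch (placeholder URLs, not taken from the source):

    MEMBER_LIST_URL = {
        'upper': 'http://example.gov/senate/members.asp?GA=%s',  # placeholder
        'lower': 'http://example.gov/house/members.asp?GA=%s',   # placeholder
    }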
Example #21
    def scrape(self, chamber, term):

        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Scrape committees. Also produce a name dictionary that can be
        # used for fuzzy matching between the committee page names and the
        # all-caps csv names.
        for name_dict, _ in scrape_committees(year, chamber):
            pass

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = [
            'last_name', 'first_name', 'party', 'district', 'address', 'city',
            'state', 'zip'
        ]
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        for entry in csv_parser:
            if not entry:
                continue

            # City.
            entry['city'] = entry['city'].title()

            # Address.
            entry['address'] = entry['address'].title()

            # District.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party: map the letter code; party is passed to Legislator
            # separately, so drop it from the row.
            party = {'D': 'Democratic', 'R': 'Republican'}[entry.pop('party')]

            # Get full name properly capped.
            _fullname = '%s %s' % (entry['first_name'].capitalize(),
                                   entry['last_name'].capitalize())

            city_lower = entry['city'].lower()
            fullname = difflib.get_close_matches(_fullname,
                                                 name_dict[city_lower],
                                                 cutoff=0.5)

            # If there are no close matches with the committee page,
            # use the title-capped first and last name.
            if len(fullname) < 1:
                fullname = _fullname
                # msg = 'No matches found for "%s" with "%s" from %r'
                # self.debug(msg % (_fullname, fullname,
                #                   name_dict[city_lower]))
            else:
                fullname = fullname[0]
                # if _fullname != fullname:
                #     msg = 'matched "%s" with "%s" from %r'
                #     self.debug(msg % (_fullname, fullname,
                #                       name_dict[city_lower]))

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]
            deets = self._scrape_details(detail_url)

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            self.save_legislator(legislator)
Example #22
    def scrape(self, chamber, term):

        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = [
            'last_name', 'first_name', 'party', 'district', 'address', 'city',
            'state', 'zip'
        ]
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(csv_parser)

        for entry in csv_parser:
            if not entry:
                continue

            # City and address are used as-is here; other versions of
            # this scraper re-capitalize them with .title().

            # District.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party: map the letter code; party is passed to Legislator
            # separately, so drop it from the row.
            party = {'D': 'Democratic', 'R': 'Republican'}[entry.pop('party')]

            # Get full name properly capped.
            fullname = _fullname = '%s %s' % (entry['first_name'].capitalize(),
                                              entry['last_name'].capitalize())

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]

            # Get the office.
            address = '\n'.join([
                entry['address'],
                '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])
            ])

            office = dict(name='District Office',
                          type='district',
                          phone=None,
                          fax=None,
                          email=None,
                          address=address)

            deets = self._scrape_details(detail_url)

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term,
                                    chamber,
                                    district,
                                    fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            office['phone'] = deets.get('phone')
            office['fax'] = deets.get('fax')
            legislator.add_office(**office)

            self.save_legislator(legislator)
Example #23
    def scrape(self, chamber, term):
        term_slug = term[:-2]
        url = MEMBER_LIST_URL[chamber] % term_slug

        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        for row in doc.xpath('//table')[4].xpath('tr')[2:]:
            name, _, _, district, party = row.xpath('td')
            district = district.text

            if party.text_content().strip() == "":
                self.warning("Garbage party: Skipping!")
                continue

            party = {
                'D': 'Democratic',
                'R': 'Republican',
                'I': 'Independent'
            }[party.text]
            leg_url = name.xpath('a/@href')[0]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith('*'):
                name = name.strip('*')
                continue

            leg_html = self.urlopen(leg_url)
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            leg = Legislator(term,
                             chamber,
                             district,
                             name,
                             party=party,
                             url=leg_url)
            leg.add_source(url)

            hotgarbage = ('Senate Biography Information for the 98th General '
                          'Assembly is not currently available.')
            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning('No legislator bio available for ' + name)
                self.save_legislator(leg)
                continue

            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]
            photo_url_parsed = urlparse(photo_url)
            encoded_path = quote(photo_url_parsed.path)
            photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
            leg.update(photo_url=photo_url)
            leg.add_source(leg_url)

            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                leg['email'] = email[0].tail

            # function for turning an IL contact info table to office details
            def _table_to_office(table, office_type, office_name):
                addr = ''
                phone = ''
                fax = None
                for row in table.xpath('tr'):
                    row = row.text_content().strip()
                    # skip rows that aren't part of address
                    if 'Office:' in row or row == 'Cook County':
                        continue
                    # fax number row ends with FAX
                    elif 'FAX' in row:
                        fax = row.replace(' FAX', '')
                    # phone number starts with ( [make it more specific?]
                    elif row.startswith('('):
                        phone = row
                    # everything else is an address
                    else:
                        addr += (row + '\n')
                if addr.strip() != ',':
                    leg.add_office(office_type,
                                   office_name,
                                   address=addr.strip(),
                                   phone=phone,
                                   fax=fax)

            # extract both offices from tables
            table = leg_doc.xpath(
                '//table[contains(string(), "Springfield Office")]')
            if table:
                _table_to_office(table[3], 'capitol', 'Springfield Office')
            table = leg_doc.xpath(
                '//table[contains(string(), "District Office")]')
            if table:
                _table_to_office(table[3], 'district', 'District Office')

            self.save_legislator(leg)
Example #24
    def scrape(self, chamber, term):

        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Scrape committees. Also produce a name dictionary that can be
        # used for fuzzy matching between the committee page names and the
        # all-caps csv names.
        # for name_dict, _ in scrape_committees(year, chamber):
        #     pass

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = ['last_name', 'first_name', 'party', 'district',
                      'address', 'city', 'state', 'zip']
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(csv_parser)

        for entry in csv_parser:
            if not entry:
                continue

            # City and address are used as-is here; other versions of
            # this scraper re-capitalize them with .title().

            # District.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party: map the letter code; party is passed to Legislator
            # separately, so drop it from the row.
            party = {'D': 'Democratic', 'R': 'Republican'}[entry.pop('party')]

            # Get full name properly capped.
            fullname = _fullname = '%s %s' % (entry['first_name'].capitalize(),
                                              entry['last_name'].capitalize())

            # The difflib fuzzy-matching of the name against the committee
            # pages (active in the other Montana examples) is disabled in
            # this version.

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]

            # Get the office.
            address = '\n'.join([
                entry['address'],
                '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])
                ])

            office = dict(
                name='District Office', type='district', phone=None,
                fax=None, email=None,
                address=address)

            deets = self._scrape_details(detail_url)
            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term, chamber, district, fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            office['phone'] = deets.get('phone')
            office['fax'] = deets.get('fax')
            legislator.add_office(**office)

            # Sanity check (previously interactive ipdb debugging): the
            # numeric id in the photo filename should appear in the url.
            match = re.search(r'(\d+)\.jpg', legislator['photo_url'])
            if not match or match.group(1) not in legislator['url']:
                self.logger.warning('photo/url mismatch for %s' % fullname)

            self.save_legislator(legislator)
Example #25
    def scrape(self, chamber, term):

        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Scrape committees. Also produce a name dictionary that can be
        # used for fuzzy matching between the committee page names and the
        # all-caps csv names.
        for name_dict, _ in scrape_committees(year, chamber):
            pass

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, chamber == 'upper' and 'Senate' or 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = ['last_name', 'first_name', 'party', 'district',
                      'address', 'city', 'state', 'zip']
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        for entry in csv_parser:
            if not entry:
                continue

            # City.
            entry['city'] = entry['city'].title()

            # Address.
            entry['address'] = entry['address'].title()

            # District.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party: map the letter code; party is passed to Legislator
            # separately, so drop it from the row.
            party = {'D': 'Democratic', 'R': 'Republican'}[entry.pop('party')]

            # Get full name properly capped.
            _fullname = '%s %s' % (entry['first_name'].capitalize(),
                                   entry['last_name'].capitalize())

            city_lower = entry['city'].lower()
            fullname = difflib.get_close_matches(
                           _fullname, name_dict[city_lower], cutoff=0.5)

            # If there are no close matches with the committee page,
            # use the title-capped first and last name.
            if len(fullname) < 1:
                fullname = _fullname
                # msg = 'No matches found for "%s" with "%s" from %r'
                # self.debug(msg % (_fullname, fullname,
                #                   name_dict[city_lower]))
            else:
                fullname = fullname[0]
                # if _fullname != fullname:
                #     msg = 'matched "%s" with "%s" from %r'
                #     self.debug(msg % (_fullname, fullname,
                #                       name_dict[city_lower]))

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]
            deets = self._scrape_details(detail_url)

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term, chamber, district, fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            self.save_legislator(legislator)
Example #26
    def parse_legislator(self, tr, term, chamber):
        """
        Given a tr element, get specific data from it.
        """

        strip = methodcaller("strip")
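        # `methodcaller` is assumed to be imported from `operator`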

        xpath = 'td[contains(@class, "views-field-field-%s-%s")]%s'
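        # e.g. xpath % ('senator', 'district-value', '/text()') expands to
        # 'td[contains(@class, "views-field-field-senator-district-value")]/text()'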

        xp = {
            "url": [("lname-value-1", "/a/@href"), ("member-lname-value-1", "/a/@href")],
            "district": [("district-value", "/text()")],
            "party": [("party-value", "/text()")],
            "full_name": [("feedbackurl-value", "/a/text()")],
            "address": [("feedbackurl-value", "/p/text()"), ("feedbackurl-value", "/p/font/text()")],
        }

        titles = {"upper": "senator", "lower": "member"}

        funcs = {"full_name": lambda s: s.replace("Contact Senator", "").strip(), "address": parse_address}

        rubberstamp = lambda _: _
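        # identity fallback for keys with no cleanup function in `funcs`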
        tr_xpath = tr.xpath
        res = collections.defaultdict(list)
        for k, xpath_info in xp.items():
            for vals in xpath_info:
                f = funcs.get(k, rubberstamp)
                vals = (titles[chamber],) + vals
                vals = map(f, map(strip, tr_xpath(xpath % vals)))

                res[k].extend(vals)

        # Photo.
        try:
            res["photo_url"] = tr_xpath("td/p/img/@src")[0]
        except IndexError:
            pass

        # Addresses.
        addresses = res["address"]
        try:
            # list() forces evaluation here so the ValueError is caught,
            # and so the result supports slicing and .remove() below
            addresses = list(map(dict, filter(None, addresses)))
        except ValueError:
            # Sometimes legislators only have one address, in which
            # case this awful hack is helpful.
            addresses = list(map(dict, filter(None, [addresses])))

        for address in addresses[:]:

            # Toss results that don't have required keys.
            if not set(["street", "city", "zip"]) < set(address):
                if address in addresses:
                    addresses.remove(address)

        # Re-key the addresses
        offices = []
        if addresses:
            # Mariko Yamada's addresses wouldn't parse correctly as of
            # 3/23/2013, so here we're forced to test whether any
            # addresses were even found.
            addresses[0].update(type="capitol", name="Capitol Office")
            offices.append(addresses[0])

            for office in addresses[1:]:
                office.update(type="district", name="District Office")
                offices.append(office)

            for office in offices:
                street = office["street"]
                street = "%s\n%s, %s %s" % (street, office["city"], "CA", office["zip"])
                office["address"] = street
                office["fax"] = None
                office["email"] = None

                del office["street"], office["city"], office["zip"]

        res["offices"] = offices
        del res["address"]

        # Remove junk from assembly member names.
        junk = "Contact Assembly Member "
        res["full_name"] = res["full_name"].pop().replace(junk, "")

        # Normalize party.
        for party in res["party"][:]:
            if party:
                if party == "Democrat":
                    party = "Democratic"
                res["party"] = party
                break
            else:
                res["party"] = None

        # Mariko Yamada also didn't have a url that lxml would parse
        # as of 3/22/2013.
        if res["url"]:
            res["url"] = res["url"].pop()
        else:
            del res["url"]

        # strip leading zero
        res["district"] = str(int(res["district"].pop()))

        # Add a source for the url.
        leg = Legislator(term, chamber, **res)
        leg.update(**res)

        return leg
Example #27
    def scrape(self, term, chambers):
        # The mayor doesn't sit on council.
        url = 'http://www.phila.gov/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The mayor's name doesn't appear on the mayor's page!
        full_name = re.search('Mayor (.+)', doc.xpath('//title/text()')[0].strip()).group(1)
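        # `parse_full_name`, `clean_string`, `tel_regex`, and `parse_phones`
        # are module-level helpers not shown in this excerpt.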
        first_name, middle_name, last_name = parse_full_name(full_name)
        mayor = Person(full_name, first_name, last_name, middle_name)
        mayor.add_source(url)

        url = 'http://www.phila.gov/mayor/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        lines   = map(clean_string, doc.xpath('//div[contains(text(),"Mailing Address")]/following-sibling::text()')[1:])
        address = '\n'.join(lines)
        phone   = '-'.join(tel_regex.search(doc.xpath('//strong[contains(text(),"Phone")]/following-sibling::text()[1]')[0]).groups())
        fax     = '-'.join(tel_regex.search(doc.xpath('//strong[contains(text(),"Fax")]/following-sibling::text()[1]')[0]).groups())
        email   = clean_string(doc.xpath('//strong[contains(text(),"Email")]/following-sibling::text()[1]')[0])

        mayor.update(dict(url=url, email=email))
        mayor.add_office('capitol', 'Office of the Mayor', address=address, phone=phone, fax=fax)
        mayor.add_role('Mayor', term)
        mayor.add_source(url)

        self.save_object(mayor)



        council_url = 'http://philadelphiacitycouncil.net/council-members/'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        urls = set(doc.xpath('//a[contains(@href, "/council-members/council")]/@href'))
        assert len(urls) <= 17, 'expected at most 17 unique councilmember URLs, found %d' % len(urls)

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            optional  = dict() # fields not all legislators will have
            full_name = []
            first_name  = ''
            middle_name = ''
            last_name   = ''
            suffixes    = ''
            roles     = []
            lines     = []
            lines_office2 = []
            has_office2 = False
            reached_contact_form = False
            phone1    = None
            phone1_office2 = None
            phone2    = None
            phone2_office2 = None
            fax       = None
            fax_office2 = None
            office_name = None
            district  = 'At-Large' # default
            photo_url = (
                doc.xpath('//img[contains(@title, "brian picture")]/@src') or  # Special case for BRIAN J. O’NEILL
                doc.xpath('//img[contains(@class, "size-full")]/@src') or
                doc.xpath('//img[contains(@class, "size-medium")]/@src') or
                doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
            )[0]

            # That's an en dash, not a hyphen.
            parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
            for index, part in enumerate(filter(None, parts)):
                part = clean_string(part)
                if index == 0:
                    if 'Councilman' in part:
                        optional['gender'] = 'Male'
                    elif 'Councilwoman' in part:
                        optional['gender'] = 'Female'
                    elif 'Council President' in part:
                        roles.append('Council President')
                    part = re.sub(r'^Council(?:man|woman| President)\s+', '', part)
                    full_name.append(part)
                    first_name, middle_name, last_name = parse_full_name(full_name[0])
                elif part in ('Jr.', 'Sr.'):
                    full_name.append(part)
                    suffixes = part
                elif 'District' in part:
                    district = part
                else:
                    roles.append(part)
            full_name = ', '.join(full_name)

            contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
            doc = lxml.html.fromstring(self.urlopen(contact_url))
            doc.make_links_absolute(contact_url)

            # @todo email, personal_url are sometimes in another paragraph.

            parts = doc.xpath('//div[@class="post-entry"]//text()')
            parts = map(clean_string, parts)
            consuming_address_lines = False
            for part in filter(None, parts):

                # Special case for Curtis Jones Jr.
                if re.match(r'^Local Office:', part):
                    consuming_address_lines = True
                    has_office2 = True
                    office_name = 'Local Office'

                if re.match(r'City Hall Office', part) or re.match(r'^Hours', part) or re.match(r'.*facebook', part) or re.match(r'.*twitter', part) or reached_contact_form:
                    continue

                elif re.match(r'^Contact Council.*man', part) or re.match(r'^Contact CMAL', part):
                    reached_contact_form = True
                    continue

                elif re.match(r'^City Hall.+Room', part):
                    consuming_address_lines = True
                    lines.append(part)

                elif re.match(r'^FAX:', part, re.I) or re.match(r'^F:', part, re.I):
                    consuming_address_lines = False
                    if has_office2 and fax_office2 is None:
                        fax_office2 = '-'.join(tel_regex.search(part).groups())
                    elif fax is None:
                        fax = '-'.join(tel_regex.search(part).groups())

                elif tel_regex.search(part):
                    consuming_address_lines = False
                    if has_office2 and phone1_office2 is None and phone2_office2 is None:
                        phone1_office2, phone2_office2 = parse_phones(part)
                    elif phone1 is None and phone2 is None:
                        phone1, phone2 = parse_phones(part)

                elif '@' in part:
                    consuming_address_lines = False
                    optional['email'] = re.search(r'\S+@\S+', part).group()

                elif re.match(r'^Neighborhood Office.*', part):
                    consuming_address_lines = False
                    has_office2 = True

                elif re.match(r'.*Office.*', part) or re.match(r'.*Heroes Hall.*', part):

                    # Special case for Curtis Jones Jr.
                    if re.match(r'.*Local Office.*', part):
                        continue

                    if len(lines_office2) > 0:
                        consuming_address_lines = False
                    else:
                        consuming_address_lines = True
                        office_name = part.strip(':;,.')

                elif consuming_address_lines:
                    if has_office2:
                        lines_office2.append(cleanup_address(part, False))
                    else:
                        lines.append(cleanup_address(part))

                elif re.match(r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$', part):
                    pass

                else:
                    self.logger.warning('Skipped: ' + part)

            # Some Councilmembers have no zip code or only a 5-digit zip code.
            # All that changes between them is a room number.
            address = '\n'.join(lines)
            address_office2 = '\n'.join(lines_office2)

            legislator = Legislator(term, 'upper', district, full_name, first_name, last_name, middle_name, suffixes=suffixes, url=url, photo_url=photo_url, party=None)
            legislator.update(optional)

            if re.search(r'\S', address):
                legislator.add_office('capitol', 'City Hall Office', address=address, phone=phone1, secondary_phone=phone2, fax=fax)

            if re.search(r'\S', address_office2):
                legislator.add_office('district', office_name, address=address_office2, phone=phone1_office2, secondary_phone=phone2_office2, fax=fax_office2)

            legislator.add_source(url)

            for role in roles:
                legislator.add_role(role, term)

            self.save_legislator(legislator)
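
The scraper above relies on several module-level helpers (tel_regex, clean_string, parse_full_name, parse_phones, cleanup_address) that this listing does not show. Below is a minimal sketch of what they might look like, inferred from the call sites; the implementations are assumptions, not the original module's code.

import re

# Hypothetical helpers; the names match the call sites above, but the
# bodies are guesses at the expected behavior.

# Matches US phone numbers such as "215-686-3446" or "(215) 686 3446".
tel_regex = re.compile(r'\(?(\d{3})\)?[-. ]?\s*(\d{3})[-. ]?\s*(\d{4})')

def clean_string(s):
    # Collapse whitespace (including non-breaking spaces) and trim.
    return re.sub(r'\s+', ' ', s.replace(u'\xa0', ' ')).strip()

def parse_full_name(full_name):
    # Naive (first, middle, last) split; real names need more care.
    parts = full_name.split()
    if len(parts) == 2:
        return parts[0], '', parts[1]
    return parts[0], ' '.join(parts[1:-1]), parts[-1]

def parse_phones(text):
    # Return up to two dash-joined numbers found in one line of text.
    phones = ['-'.join(groups) for groups in tel_regex.findall(text)]
    phones += [None, None]
    return phones[0], phones[1]

def cleanup_address(part, strip_trailing_comma=True):
    # Normalize one address line; optionally drop a trailing comma.
    part = clean_string(part)
    return part.rstrip(',') if strip_trailing_comma else part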
Example #28
    def parse_legislator(self, tr, term, chamber,
            strip=methodcaller('strip'),
            xpath='td[contains(@class, "views-field-field-%s-%s")]%s',
            xp={'url':       ('lname-value-1', '/a/@href'),
                'district':  ('district-value', '/text()'),
                'party':     ('party-value', '/text()'),
                'full_name': ('feedbackurl-value', '/a/text()'),
                'address':   ('feedbackurl-value', '/p/text()')},
            titles={'upper': 'senator', 'lower': 'member'},
            funcs={
                'full_name': lambda s: s.replace('Contact Senator', '').strip(),
                'address': parse_address,
                }):
        '''
        Given a tr element, get specific data from it.
        '''
        rubberstamp = lambda _: _
        tr_xpath = tr.xpath
        res = {}
        for k, v in xp.items():

            f = funcs.get(k, rubberstamp)
            v = (titles[chamber],) + v
            v = map(f, map(strip, tr_xpath(xpath % v)))

            if len(v) == 1:
                res[k] = v[0]
            else:
                res[k] = v

        # Photo.
        try:
            res['photo_url'] = tr_xpath('td/p/img/@src')[0]
        except IndexError:
            pass

        # Addresses.
        addresses = res['address']
        try:
            addresses = map(dict, filter(None, addresses))
        except ValueError:
            # Sometimes legislators only have one address, in which
            # case this awful hack is helpful.
            addresses = map(dict, filter(None, [addresses]))

        for x in addresses[:]:  # iterate over a copy; the loop may remove items
            try:
                x['zip'] = x['zip'].replace('CA ', '')
            except KeyError:
                # No zip? Toss.
                addresses.remove(x)

        # Re-key the addresses
        addresses[0].update(type='capitol', name='Capitol Office')
        offices = [addresses[0]]
        for office in addresses[1:]:
            office.update(type='district', name='District Office')
            offices.append(office)

        for office in offices:
            street = office['street']
            street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                        office['zip'])
            office['address'] = street
            office['fax'] = None
            office['email'] = None

            del office['street'], office['city'], office['zip']
        res['offices'] = offices
        del res['address']

        # Remove junk from assembly member names.
        junk = 'Contact Assembly Member '
        res['full_name'] = res['full_name'].replace(junk, '')

        # convert party
        if res['party'] == 'Democrat':
            res['party'] = 'Democratic'
        # strip leading zero
        res['district'] = str(int(res['district']))

        # Add a source for the url.
        leg = Legislator(term, chamber, **res)
        leg.update(**res)

        return leg
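
The parse_address helper assumed above receives each address text node, and the caller later feeds its results through dict() and expects 'street', 'city' and 'zip' keys (with the zip still carrying a 'CA ' prefix that the caller strips). A hedged sketch consistent with that calling convention; the regex and behavior are assumptions:

import re

# Hypothetical parse_address: returns key/value pairs that dict() can
# consume, or None for text fragments that are not address lines.
_ADDRESS_RE = re.compile(
    r'(?P<street>.+?),\s*(?P<city>[^,]+),?\s+(?P<zip>CA\s+\d{5}(?:-\d{4})?)$')

def parse_address(text):
    match = _ADDRESS_RE.search(text.strip())
    if match is None:
        return None
    return list(match.groupdict().items())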
Example #29
    def scrape(self, chamber, term):

        for tdata in self.metadata['terms']:
            if term == tdata['name']:
                year = tdata['start_year']
                session_number = tdata['session_number']
                break

        # Fetch the csv.
        url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
            (session_number, year, 'Senate' if chamber == 'upper' else 'House')

        # Parse it.
        data = self.urlopen(url)
        data = data.replace('"""', '"')  # weird triple quotes
        data = data.splitlines()

        fieldnames = ['last_name', 'first_name', 'party', 'district',
                      'address', 'city', 'state', 'zip']
        csv_parser = csv.DictReader(data, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(csv_parser)

        for entry in csv_parser:
            if not entry:
                continue

            # District.
            district = entry['district']
            hd_or_sd, district = district.split()
            del entry['district']

            # Party.
            party_letter = entry.pop('party')
            party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]

            # Get the full name properly capitalized.
            fullname = '%s %s' % (entry['first_name'].capitalize(),
                                  entry['last_name'].capitalize())

            # Get any info at the legislator's detail_url.
            detail_url = district_leg_urls[hd_or_sd][district]

            # Get the office.
            address = '\n'.join([
                entry['address'],
                '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])
                ])

            office = dict(
                name='District Office', type='district', phone=None,
                fax=None, email=None,
                address=address)

            deets = self._scrape_details(detail_url)

            # Add the details and delete junk.
            entry.update(deets)
            del entry['first_name'], entry['last_name']

            legislator = Legislator(term, chamber, district, fullname,
                                    party=party)
            legislator.update(entry)
            legislator.add_source(detail_url)
            legislator.add_source(url)
            legislator['url'] = detail_url

            office['phone'] = deets.get('phone')
            office['fax'] = deets.get('fax')
            legislator.add_office(**office)

            self.save_legislator(legislator)
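
This Montana scraper assumes two private helpers that are not shown: _district_legislator_dict, which maps 'HD'/'SD' and a district number to a detail-page URL, and _scrape_details, which returns extra fields scraped from that page. A minimal sketch of the latter, assuming lxml.html is imported as in the other scrapers; the label-matching logic is invented only to show the expected return shape:

    def _scrape_details(self, detail_url):
        # Hypothetical sketch: pull "Label: value" contact lines from the
        # detail page. Only the return shape matters to the caller above.
        doc = lxml.html.fromstring(self.urlopen(detail_url))
        deets = {}
        for label, key in (('Phone', 'phone'), ('Fax', 'fax'), ('Email', 'email')):
            hits = [t for t in doc.xpath('//text()') if t.strip().startswith(label + ':')]
            if hits:
                deets[key] = hits[0].split(':', 1)[1].strip()
        return deets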
Example #30
    def parse_legislator(
        self,
        tr,
        term,
        chamber,
        strip=methodcaller('strip'),
        xpath='td[contains(@class, "views-field-field-%s-%s")]%s',
        xp={
            'url': ('lname-value-1', '/a/@href'),
            'district': ('district-value', '/text()'),
            'party': ('party-value', '/text()'),
            'full_name': ('feedbackurl-value', '/a/text()'),
            'address': ('feedbackurl-value', '/p/text()')
        },
        titles={
            'upper': 'senator',
            'lower': 'member'
        },
        funcs={
            'full_name': lambda s: s.replace('Contact Senator', '').strip(),
            'address': parse_address,
        }):
        '''
        Given a tr element, get specific data from it.
        '''
        rubberstamp = lambda _: _
        tr_xpath = tr.xpath
        res = {}
        for k, v in xp.items():

            f = funcs.get(k, rubberstamp)
            v = (titles[chamber], ) + v
            v = map(f, map(strip, tr_xpath(xpath % v)))

            if len(v) == 1:
                res[k] = v[0]
            else:
                res[k] = v

        # Photo.
        try:
            res['photo_url'] = tr_xpath('td/p/img/@src')[0]
        except IndexError:
            pass

        # Addresses.
        addresses = res['address']
        try:
            addresses = map(dict, filter(None, addresses))
        except ValueError:
            # Sometimes legislators only have one address, in which
            # case this awful hack is helpful.
            addresses = map(dict, filter(None, [addresses]))

        for x in addresses[:]:  # iterate over a copy; the loop may remove items
            try:
                x['zip'] = x['zip'].replace('CA ', '')
            except KeyError:
                # No zip? Toss.
                addresses.remove(x)

        # Re-key the addresses
        addresses[0].update(type='capitol', name='Capitol Office')
        offices = [addresses[0]]
        for office in addresses[1:]:
            office.update(type='district', name='District Office')
            offices.append(office)

        for office in offices:
            street = office['street']
            street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                        office['zip'])
            office['address'] = street
            office['fax'] = None
            office['email'] = None

            del office['street'], office['city'], office['zip']
        res['offices'] = offices
        del res['address']

        # Remove junk from assembly member names.
        junk = 'Contact Assembly Member '
        res['full_name'] = res['full_name'].replace(junk, '')

        # convert party
        if res['party'] == 'Democrat':
            res['party'] = 'Democratic'
        # strip leading zero
        res['district'] = str(int(res['district']))

        # Add a source for the url.
        leg = Legislator(term, chamber, **res)
        leg.update(**res)

        return leg
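
For context, a caller would feed parse_legislator one table row at a time. A hedged sketch of such a driver, in which the roster URL and the row xpath are placeholders rather than the original scraper's values:

    def scrape(self, chamber, term):
        # Hypothetical driver for parse_legislator above.
        url = 'http://example.com/legislator-roster'  # placeholder
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)
        for tr in doc.xpath('//table//tr[td]'):  # placeholder row selector
            leg = self.parse_legislator(tr, term, chamber)
            if leg is None:
                continue
            leg.add_source(url)
            self.save_legislator(leg)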
Example #31
    def parse_assembly(self, tr, term, chamber):
        '''
        Given a tr element, get specific data from it.
        '''

        strip = methodcaller('strip')

        xpath = 'td[contains(@class, "views-field-field-%s-%s")]%s'
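        # Filled as (chamber title, field suffix, xpath tail), yielding e.g.
        # td[contains(@class, "views-field-field-senator-district-value")]/text()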

        xp = {
            'url':       [('lname-value-1', '/a/@href'),
                          ('member-lname-value-1', '/a/@href')],
            'district':  [('district-value', '/text()')],
            'party':     [('party-value', '/text()')],
            'full_name': [('feedbackurl-value', '/a/text()')],
            'address':   [('feedbackurl-value', '/p/text()'),
                          ('feedbackurl-value', '/p/font/text()')]
            }

        titles = {'upper': 'senator', 'lower': 'member'}

        funcs = {
            'full_name': lambda s: s.replace('Contact Senator', '').strip(),
            'address': parse_address,
            }

        rubberstamp = lambda _: _
        tr_xpath = tr.xpath
        res = collections.defaultdict(list)
        for k, xpath_info in xp.items():
            for vals in xpath_info:
                f = funcs.get(k, rubberstamp)
                vals = (titles[chamber],) + vals
                vals = map(f, map(strip, tr_xpath(xpath % vals)))

                res[k].extend(vals)

        # Photo.
        try:
            res['photo_url'] = tr_xpath('td/p/img/@src')[0]
        except IndexError:
            pass

        # Addresses.
        addresses = res['address']
        try:
            addresses = map(dict, filter(None, addresses))
        except ValueError:
            # Sometimes legislators only have one address, in which
            # case this awful hack is helpful.
            addresses = map(dict, filter(None, [addresses]))

        for address in addresses[:]:

            # Toss results that don't have required keys.
            if not set(['street', 'city', 'zip']) <= set(address):
                if address in addresses:
                    addresses.remove(address)

        # Re-key the addresses
        offices = []
        if addresses:
            # Mariko Yamada's addresses wouldn't parse correctly as of
            # 3/23/2013, so here we're forced to test whether any
            # addresses were even found.
            addresses[0].update(type='capitol', name='Capitol Office')
            offices.append(addresses[0])

            for office in addresses[1:]:
                office.update(type='district', name='District Office')
                offices.append(office)

            for office in offices:
                street = office['street']
                street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                            office['zip'])
                office['address'] = street
                office['fax'] = None
                office['email'] = None

                del office['street'], office['city'], office['zip']

        res['offices'] = offices
        del res['address']

        # Remove junk from assembly member names.
        junk = 'Contact Assembly Member '

        try:
            res['full_name'] = res['full_name'].pop().replace(junk, '')
        except IndexError:
            return

        # Normalize party.
        for party in res['party'][:]:
            if party:
                if party == 'Democrat':
                    party = 'Democratic'
                res['party'] = party
                break
            else:
                res['party'] = None

        # Mariko Yamada also didn't have a url that lxml would parse
        # as of 3/22/2013.
        if res['url']:
            res['url'] = res['url'].pop()
        else:
            del res['url']

        # strip leading zero
        res['district'] = str(int(res['district'].pop()))

        # Add a source for the url.
        leg = Legislator(term, chamber, **res)
        leg.update(**res)

        return leg
    def scrape(self, term, chambers):
        # The mayor doesn't sit on council.
        url = 'http://www.phila.gov/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        # The mayor's name doesn't appear on the mayor's page!
        name  = re.search('Mayor (.+)', doc.xpath('//title/text()')[0].strip()).group(1)
        mayor = Person(name)
        mayor.add_source(url)

        url = 'http://www.phila.gov/mayor/'
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        lines   = map(clean_string, doc.xpath('//div[contains(text(),"Mailing Address")]/following-sibling::text()')[1:])
        address = '\n'.join(lines)
        phone   = '-'.join(tel_regex.search(doc.xpath('//strong[contains(text(),"Phone")]/following-sibling::text()[1]')[0]).groups())
        fax     = '-'.join(tel_regex.search(doc.xpath('//strong[contains(text(),"Fax")]/following-sibling::text()[1]')[0]).groups())
        email   = clean_string(doc.xpath('//strong[contains(text(),"Email")]/following-sibling::text()[1]')[0])

        mayor.update(dict(url=url, email=email))
        mayor.add_office('capitol', 'Office of the Mayor', address=address, phone=phone, fax=fax)
        mayor.add_role('Mayor', term)
        mayor.add_source(url)

        self.save_object(mayor)



        council_url = 'http://philadelphiacitycouncil.net/council-members/'
        doc = lxml.html.fromstring(self.urlopen(council_url))
        doc.make_links_absolute(council_url)

        urls = set(doc.xpath('//a[contains(@href, "/council-members/council")]/@href'))
        assert len(urls) <= 17, 'expected at most 17 unique councilmember URLs, found %d' % len(urls)

        for url in urls:
            doc = lxml.html.fromstring(self.urlopen(url))
            doc.make_links_absolute(url)

            optional  = dict() # fields not all legislators will have
            name      = []
            roles     = []
            lines     = []
            phone1    = None
            phone2    = None
            fax       = None
            district  = 'At-Large' # default
            photo_url = (
                doc.xpath('//img[contains(@class, "size-full")]/@src') or
                doc.xpath('//img[contains(@class, "size-medium")]/@src') or
                doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
            )[0]

            # That's an en dash, not a hyphen.
            parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
            for index, part in enumerate(filter(None, parts)):
                part = clean_string(part)
                if index == 0:
                    if 'Councilman' in part:
                        optional['gender'] = 'Male'
                    elif 'Councilwoman' in part:
                        optional['gender'] = 'Female'
                    elif 'Council President' in part:
                        roles.append('Council President')
                    part = re.sub(r'^Council(?:man|woman| President)\s+', '', part)
                    name.append(part)
                elif part in ('Jr.', 'Sr.'):
                    name.append(part)
                elif 'District' in part:
                    district = part
                else:
                    roles.append(part)
            name = ', '.join(name)

            contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
            doc = lxml.html.fromstring(self.urlopen(contact_url))
            doc.make_links_absolute(contact_url)

            # @todo email, second office, personal_url are sometimes in another paragraph.
            if len(doc.xpath('//div[@class="post-entry"]/p')) > 1:
                self.logger.warning('Skipped paragraphs:\n' + '\n'.join(lxml.html.tostring(html) for html in doc.xpath('//div[@class="post-entry"]/p[position()>1]')))

            parts = doc.xpath('//div[@class="post-entry"]/p[position()=1]//text()') or doc.xpath('//div[@class="post-entry"]//text()')
            parts = map(clean_string, parts)
            for part in filter(None, parts):
                if re.match(r'^City Hall', part):
                    lines.append('City Hall, Room %s' % re.search(r'Room (\d+)', part).group(1))
                elif re.match(r'^FAX:', part, re.I):
                    fax = '-'.join(tel_regex.search(part).groups())
                elif tel_regex.search(part):
                    if phone1:
                        self.logger.warning('Already have phone numbers for one office: ' + part)
                    else:
                        phones = tel_regex.findall(part)
                        phone1 = '-'.join(phones[0])
                        if len(phones) == 2:
                            phone2 = '-'.join(phones[1])
                        else:
                            phone2 = phone1[:8] + re.search(r'(?: or |/)(\d{4})$', parts[2]).group(1)
                elif '@' in part:
                    optional['email'] = re.search(r'\S+@\S+', part).group()
                elif re.match(r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$', part):
                    pass
                else: # @todo second office is sometimes in the same paragraph.
                    self.logger.warning('Skipped: ' + part)

            # Some Councilmembers have no zip code or only a 5-digit zip code.
            # All that changes between them is a room number.
            lines.append('Philadelphia, PA 19107-3290')
            address = '\n'.join(lines)

            legislator = Legislator(term, 'upper', district, name, url=url, photo_url=photo_url, party=None)
            legislator.update(optional)
            legislator.add_office('capitol', 'Council Office', address=address, phone=phone1, secondary_phone=phone2, fax=fax)
            legislator.add_source(url)

            for role in roles:
                legislator.add_role(role, term)

            self.save_legislator(legislator)
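
One subtle step above is the phone2 fallback, which rebuilds a second number from a shared prefix when a page lists something like "3446 or 3447" (the original searches parts[2], the third text fragment; a single string stands in for it here). A self-contained illustration with an invented contact line and a stand-in tel_regex:

import re

# Stand-in for the module's tel_regex; the real pattern is not shown here.
tel_regex = re.compile(r'\(?(\d{3})\)?[-. ]?\s*(\d{3})[-. ]?\s*(\d{4})')

part = '215-686-3446 or 3447'  # hypothetical contact line
phones = tel_regex.findall(part)
phone1 = '-'.join(phones[0])   # '215-686-3446'
# Only one full number appears, so reuse its area code and exchange:
phone2 = phone1[:8] + re.search(r'(?: or |/)(\d{4})$', part).group(1)
assert phone2 == '215-686-3447'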