def scrape_counciler(self, url):
        """Scrape a single council member's profile page and yield a Person."""
        page = self.lxmlize(url)
        # Each selector is expected to match exactly one node.
        who, = page.xpath("//h3[@class='subtitle']/text()")
        district, = page.xpath("//div[@class='right-bar']//h2/text()")
        image, = page.xpath(
            "//div[@class='left-bar']//a[@class='image lightbox']//img"
        )

        member = Person(
            primary_org='legislature',
            name=who, district=district,
            image=image.attrib['src']
        )
        member.add_source(url)

        # Maps the label shown on the page to a pupa contact-detail type.
        label_to_type = {
            "Home Phone": "voice",
            "Address": "address",
            "Email": "email",
            "Cell Phone": "voice",
        }
        for cell in page.xpath("//table[@align='center']//td"):
            text = cell.text_content().strip()
            if not text:
                continue

            label, value = text.split(":", 1)
            member.add_contact_detail(type=label_to_type[label],
                                      note=label,
                                      value=value)

        yield member
Example #2
0
	def scrape_alderman(self, ward_num):
		"""Build and return a Person for the alderman of ward ``ward_num``."""
		ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
		alderman_url = self.alderman_url(ward_url)
		page = self.lxmlize(alderman_url)

		# The profile page's sole <h1> holds the alderman's name.
		name = page.xpath("//h1/text()")[0]

		# Passing district/role lets pupa automatically create the
		# membership linking this person to a post in the jurisdiction's
		# "Board of Aldermen" organization.
		person = Person(name=name,
						district="Ward {} Alderman".format(ward_num),
						role="Alderman",
						primary_org="legislature")

		# Additional fields scraped off the profile page.
		person.image = page.xpath("//div/img/@src")[0]
		phone = page.xpath("//strong[text()='Phone:']/../text()")[1].strip()
		person.add_contact_detail(type="voice", value=phone)

		person.add_source(alderman_url, note="profile")
		person.add_source(ward_url, note="ward")

		return person
Example #3
0
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Legislator from an HTML table row lxml Element, and a link to their profile.

    The row's cells are, in order: role, name, district, party, phone, email.
    Returns a ``(Person, profile_url)`` tuple.
    """
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element, party_element,
     phone_element, email_element) = td_elements

    full_name = name_element.text_content().strip()
    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    # The site says "Democrat"; pupa expects "Democratic".
    if party == 'Democrat':
        party = 'Democratic'

    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    # The name cell links to the member's profile page.
    (profile_url, ) = name_element.xpath('a/@href')
    legislator = Person(primary_org=chamber,
                        name=full_name,
                        district=district,
                        party=party)
    legislator.add_contact_detail(type='address', value=address, note='Capitol Office')
    legislator.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    legislator.add_contact_detail(type='email', value=email, note='Capitol Office')

    return legislator, profile_url
Example #4
0
def test_full_person():
    """Round-trip a fully-populated scraped person through the importer."""
    # Build a scrape-side Person with one of every attachable record type.
    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    person.add_link('http://example.com/link')
    person.add_source('http://example.com/source')

    # import person
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name

    # Identifier round-trips; scheme was not supplied so it defaults to ''.
    assert p.identifiers.all()[0].identifier == '1'
    assert p.identifiers.all()[0].scheme == ''

    assert p.other_names.all()[0].name == 'Tommy'
    assert p.other_names.all()[0].start_date == '1880'

    assert p.contact_details.all()[0].type == 'phone'
    assert p.contact_details.all()[0].value == '555-555-1234'
    assert p.contact_details.all()[0].note == 'this is fake'

    assert p.links.all()[0].url == 'http://example.com/link'
    assert p.sources.all()[0].url == 'http://example.com/source'
Example #5
0
    def handle_list_item(self, item):
        """Parse one member tile of the roster listing; yields a Person.

        Fix: ``phone`` and ``email`` were previously left unbound
        (NameError at the add_contact_detail calls) whenever validation
        failed; they now default to None and are only added when valid.
        """
        photo_url = item.xpath('./img/@src')[0]
        url = item.xpath('.//h5/a/@href')[0]
        name_text = item.xpath('.//h5/a/b/text()')[0]

        # Header text looks like "Name (NNA, PARTY)".
        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        # Non-empty bare text nodes of the tile: two address lines, then phone.
        info_texts = [x.strip() for x in item.xpath(
            './div/text()[normalize-space()]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        if phone:
            rep.add_contact_detail(type='voice', value=phone, note='capitol')
        if email:
            rep.add_contact_detail(type='email', value=email, note='capitol')
        rep.add_source(self.url)

        yield rep
Example #6
0
    def scrape_chamber(self, chamber):
        """
        Scrapes legislators for the current term only.

        ``chamber`` is a key of CHAMBERS ('upper'/'lower'); yields a
        Person per sitting member.
        """
        # self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        # One "row-equal-height" div per member.
        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            # The second inner wrapper holds the member's text details.
            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            inner_text = inner.text_content()
            if 'Resigned' in inner_text or 'Substitute' in inner_text:
                continue

            # Strip non-breaking spaces, then collapse whitespace runs.
            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')

            person_url = inner.xpath('p/a/@href')[0]
            # skip roles for now
            role = ''
            # for com in inner.xpath('p/a[contains(@href, "committees")]'):
            #     role = com.tail.strip()

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)
            # Prefer home phone, then business, for the district office;
            # the 'office' number is filed as the capitol number below.
            phones = get_phones(inner)
            phone = phones.get('home') or phones.get('business')
            office_phone = phones.get('office')
            address = get_address(inner)
            fax = get_fax(inner)
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if fax:
                person.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            if office_phone:
                person.add_contact_detail(type='voice', value=office_phone,
                                          note='Capitol Office')
            person.add_source(url)
            person.add_link(person_url)
            yield person
Example #7
0
    def scrape_lower(self, chamber):
        """Scrape the Michigan House roster table; yields a Person per member."""
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        # Column order of the roster table on the page.
        columns = [
            "website",
            "district",
            "name",
            "party",
            "location",
            "phone",
            "email"
        ]

        doc = lxml.html.fromstring(self.get(url).text)

        # skip two rows at top
        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            cells = row.xpath('.//td')
            if not cells:
                continue
            cell = {key: cells[idx] for idx, key in enumerate(columns)}

            district = str(int(cell['district'].text_content().strip()))
            party = cell['party'].text_content().strip()
            phone = cell['phone'].text_content().strip()
            email = cell['email'].text_content().strip()
            leg_url = cell['website'].xpath("./a")[0].attrib['href']
            name = cell['name'].text_content().strip()

            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning('District {} appears vacant, and will be skipped'.format(district))
                continue

            # Expand the office-building abbreviations into full addresses.
            office = cell['location'].text_content().strip()
            office = re.sub(
                ' HOB',
                ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
                office
            )
            office = re.sub(
                ' CB',
                ' State Capitol Building\nLansing, MI 48909',
                office
            )

            photo_url = self.get_photo_url(leg_url)
            person = Person(name=name, district=district, party=abbr[party],
                            primary_org='lower', image=photo_url[0] if photo_url else None)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address', value=office, note='Capitol Office')
            person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

            yield person
Example #8
0
    def scrape_member_page(self, chamber, url):
        """Scrape an Ohio member-listing page; yields a Person per member."""
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' memberModule ')]"
                ):
            img = legislator.xpath(
                ".//div[@class='thumbnail']//img")[0].attrib['src']
            data = legislator.xpath(".//div[@class='data']")[0]
            homepage = data.xpath(".//a[@class='black']")[0]
            full_name = homepage.text_content()

            if "Vacant" in full_name:
                continue

            # NOTE: homepage is rebound from the <a> element to its href.
            homepage = homepage.attrib['href']
            party = data.xpath(
                ".//span[@class='partyLetter']")[0].text_content()
            party = {"R": "Republican", "D": "Democratic"}[party]
            # Bare text nodes of the data div are office-address lines;
            # the last one is the phone number.
            office_lines = data.xpath("child::text()")
            phone = office_lines.pop(-1)
            office = "\n".join(office_lines)
            h3 = data.xpath("./h3")
            if len(h3):
                # District number is the text following the <br> in the <h3>.
                h3 = h3[0]
                district = h3.xpath("./br")[0].tail.replace("District", ""
                                                            ).strip()
            else:
                # Fallback: extract the district from the background-image
                # filename (e.g. "12.png") in the element's inline style.
                district = re.findall(
                    r"\d+\.png",
                    legislator.attrib['style']
                )[-1].split(".", 1)[0]

            full_name = re.sub(r"\s+", " ", full_name).strip()
            # Email addresses follow a fixed, zero-padded pattern.
            email = (
                'rep{0:0{width}}@ohiohouse.gov'
                if chamber == 'lower' else
                'sd{0:0{width}}@ohiosenate.gov'
            ).format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         party=party, primary_org=chamber,
                         image=img)

            leg.add_contact_detail(type='address', value=office, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            self.scrape_homepage(leg, chamber, homepage)

            leg.add_source(url)
            leg.add_link(homepage)
            yield leg
Example #9
0
    def scrape_lower_legislator(self, url, leg_info):
        """Scrape one lower-chamber member's detail page; yields a Person."""
        page = self.lxmlize(url)

        name = page.xpath(
            '//span[@id="body_FormView5_FULLNAMELabel"]/text()'
            )[0].strip()
        # Placeholder names mean the seat has no current occupant.
        if name.startswith("District ") or name.startswith("Vacant "):
            self.warning("Seat is vacant: {}".format(name))
            return

        photo = page.xpath(
            '//img[contains(@src, "/h_reps/RepPics")]'
            )[0].attrib['src']
        party_info = page.xpath(
            '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
            )[0].strip()
        # Normalize the site's party label to pupa's spelling.
        party = {
            "Democrat": "Democratic",
            "Republican": "Republican",
            "Independent": "Independent"
        }[party_info]

        # Email is optional on the page; missing node -> None.
        email_nodes = page.xpath(
            '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
        )
        email = email_nodes[0].strip() if email_nodes else None

        district = leg_info['dist'].replace('Dist', '').strip()

        person = Person(name=name,
                        party=party,
                        district=district,
                        primary_org='lower',
                        image=photo)

        # Only add the contact details that are actually present.
        for value, detail_type in ((leg_info["office"], "address"),
                                   (leg_info["phone"], "voice"),
                                   (email, "email")):
            if value:
                person.add_contact_detail(type=detail_type,
                                          value=value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
Example #10
0
    def scrape_member(self, chamber, member_url):
        """Scrape one Kentucky member's bio page; yields a Person."""
        doc = lxml.html.fromstring(self.get(member_url).text)

        photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
        name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
        # Drop the leading honorific and the trailing party abbreviation.
        full_name = ' '.join(name_pieces[1:-1]).strip()

        # The trailing token is "(R)", "(D)", or "(I)"; anything else is
        # passed through unchanged, as before.
        party = {
            '(R)': 'Republican',
            '(D)': 'Democratic',
            '(I)': 'Independent',
        }.get(name_pieces[-1], name_pieces[-1])

        district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

        person = Person(name=full_name, district=district, party=party,
                        primary_org=chamber, image=photo_url)
        person.add_source(member_url)
        person.add_link(member_url)

        address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                      'span[@class="bioText"]/text()'))

        # Only "Annex:" numbers count; a "(fax)" suffix marks the fax line.
        phone = None
        fax = None
        for num in doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()'):
            if not num.startswith('Annex: '):
                continue
            num = num.replace('Annex: ', '')
            if num.endswith(' (fax)'):
                fax = num.replace(' (fax)', '')
            else:
                phone = num

        # Keep the LAST address on the LRC domain (mirrors the original
        # reduce() over [None] + emails), or None when there is no match.
        email = None
        for candidate in doc.xpath(
                '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'):
            if '@lrc.ky.gov' in str(candidate):
                email = candidate

        if phone:
            person.add_contact_detail(type='voice', value=phone, note='Capitol Office')

        if fax:
            person.add_contact_detail(type='fax', value=fax, note='Capitol Office')

        if email:
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

        if address.strip() == "":
            self.warning("Missing Capitol Office!!")
        else:
            person.add_contact_detail(type='address', value=address, note='Capitol Office')

        yield person
Example #11
0
    def scrape_upper_chamber(self, term):
        """Scrape Puerto Rico Senate members; yields a Person per senator.

        ``term`` is unused but kept for interface compatibility.

        Fix: ``district`` was never reset between iterations, so a senator
        whose type label matched neither branch silently inherited the
        previous senator's district (or raised NameError on the first one).
        It is now reset each pass and unmatched senators are skipped with a
        warning.
        """
        url = 'https://senado.pr.gov/Pages/Senadores.aspx'

        doc = self.lxmlize(url)
        links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')

        for link in links:
            senator_page = self.lxmlize(link)
            profile_links = self.get_nodes(senator_page, '//ul[@class="profiles-links"]/li')

            name_text = self.get_node(senator_page, '//span[@class="name"]').text_content().strip()
            # Convert to title case as some names are in all-caps
            name = re.sub(r'^Hon\.', '', name_text, flags=re.IGNORECASE).strip().title()
            party = profile_links[0].text_content().strip()
            # Translate to English since being an Independent is a universal construct
            if party == "Independiente":
                party = "Independent"

            photo_url = self.get_node(senator_page, '//div[@class="avatar"]//img/@src')

            district = None
            if profile_links[1].text_content().strip() == "Senador por Distrito":
                district_text = self.get_node(
                    senator_page,
                    '//div[@class="module-distrito"]//span[@class="headline"]').text_content()
                district = district_text.replace('DISTRITO', '', 1).replace('\u200b', '').strip()
            elif profile_links[1].text_content().strip() == "Senador por Acumulación":
                district = "At-Large"
            if district is None:
                self.warning('Could not determine district for {}; skipping'.format(name))
                continue

            phone_node = self.get_node(senator_page, '//a[@class="contact-data tel"]')
            phone = phone_node.text_content().strip()
            email_node = self.get_node(senator_page, '//a[@class="contact-data email"]')
            email = email_node.text_content().replace('\u200b', '').strip()

            person = Person(primary_org='upper',
                            district=district,
                            name=name,
                            party=party,
                            image=photo_url)
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_link(link)
            person.add_source(link)

            yield person
Example #12
0
    def scrape_senator_page(self, chamber, url):
        """Scrape the Ohio Senate roster page; yields a Person per senator.

        Fixes: ``party`` was left unbound (NameError on the first member,
        stale on later ones) when the party image matched neither label; it
        now defaults to None each iteration. The member's own page is also
        bound to a distinct name instead of clobbering the roster ``page``.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for legislator in page.xpath(
                "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
                "' portraitContainer ')]"):
            # Portrait URL is embedded in an inline style: "...url(...)..."
            img = legislator.xpath(".//div[@class='profileThumbnailBoundingBox']/@style")[0]
            img = img[img.find('(')+1:img.find(')')]
            full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
            homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[0].attrib['href']
            district = legislator.xpath(".//div[@class='profileDistrict']"
                                        "/a/text()")[0].split("#")[1]

            if "Vacant" in full_name:
                continue

            homepage_doc = lxml.html.fromstring(self.get(homepage_url).text)
            phone = homepage_doc.xpath("//div[@class='phone']/span/text()")[0]

            address_lines = homepage_doc.xpath("//div[@class='address']/span/text()")
            address = "\n".join(address_lines)

            party = None
            party_image = homepage_doc.xpath('//div[@class="senatorParty"]/img/@src')[0]
            if 'Republican' in party_image:
                party = 'Republican'
            elif 'Democrat' in party_image:
                party = 'Democratic'

            # Email addresses follow a fixed, zero-padded pattern.
            email = (
                'rep{0:0{width}}@ohiohouse.gov'
                if chamber == 'lower' else
                'sd{0:0{width}}@ohiosenate.gov'
            ).format(int(district), width=2)

            leg = Person(name=full_name, district=district,
                         primary_org=chamber, image=img, party=party)

            leg.add_contact_detail(type='address', value=address, note='Capitol Office')
            leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            leg.add_source(url)
            leg.add_link(homepage_url)
            yield leg
Example #13
0
    def scrape_chamber(self, chamber):
        """Scrape Indiana legislators for ``chamber`` via the IGA API."""
        client = ApiClient(self)
        session = self.latest_session()
        base_url = "http://iga.in.gov/legislative"
        api_base_url = "https://api.iga.in.gov"
        chamber_name = "senate" if chamber == "upper" else "house"
        response = client.get("chamber_legislators", session=session, chamber=chamber_name)
        for leg in client.unpaginate(response):
            first_name = leg["firstName"]
            last_name = leg["lastName"]
            party = leg["party"]
            link = leg["link"]
            api_link = api_base_url + link
            html_link = base_url + link.replace("legislators/", "legislators/legislator_")
            try:
                html = get_with_increasing_timeout(self, html_link, fail=True,
                                                   kwargs={"verify": False})
            except scrapelib.HTTPError:
                self.logger.warning("Legislator's page is not available.")
                continue
            doc = lxml.html.fromstring(html.text)
            doc.make_links_absolute(html_link)
            # The page has exactly two <address> blocks: mailing address,
            # then phone number.
            address, phone = doc.xpath("//address")
            address = address.text_content().strip()
            address = "\n".join(line.strip() for line in address.split("\n"))
            phone = phone.text_content().strip()
            try:
                district = doc.xpath("//span[@class='district-heading']"
                                     )[0].text.lower().replace("district", "").strip()
            except IndexError:
                self.warning("skipping legislator w/o district")
                continue
            image_link = base_url + link.replace("legislators/", "portraits/legislator_")
            legislator = Person(primary_org=chamber,
                                district=district,
                                name=" ".join([first_name, last_name]),
                                party=party,
                                image=image_link)
            legislator.add_contact_detail(type="address", note="Capitol Office", value=address)
            legislator.add_contact_detail(type="voice", note="Capitol Office", value=phone)
            legislator.add_link(html_link)
            legislator.add_source(html_link)
            legislator.add_source(api_link)

            yield legislator
Example #14
0
    def scrape_rep(self, url):
        """Scrape one House member's detail page; yields a Person.

        Yields nothing when the member has resigned.

        Fix: the resigned branch used ``raise StopIteration``, which PEP
        479 (Python 3.7+) turns into a RuntimeError inside a generator; a
        bare ``return`` is the correct way to end the generator. The email
        contact detail is also only added when an email was found, instead
        of unconditionally passing None.
        """
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        main = page.xpath('//div[@id="main-info"]')[0]
        if 'Resigned' in main.text_content():
            self.warning("Member resigned {}".format(url))
            return   # don't yield anything

        name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
        name = re.sub(r'\s+', ' ', name)
        district_number = page.xpath(
            '//span[contains(text(), "House District:")]'
            '/following-sibling::span/text()')[0].strip()
        # remove anything after first whitespace
        district_number = re.sub(r'\s.*', '', district_number.strip())

        email = None
        email_content = page.xpath('//a[./i[contains(@class,"fa-envelope")]]/text()')
        if email_content and email_content[0].strip():
            email = email_content[0].strip()

        photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

        party = self.get_rep_table_by_header(page, 'Party Affiliation').text.strip()
        party = _party_map[party[0]]  # standardize on the first letter

        # First non-blank paragraph of the main-info div is the address.
        main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
        address = [t.strip() for t in main_p_text if t.strip()][0]

        person = Person(
            name=name,
            district=district_number,
            primary_org='lower',
            party=party,
            image=photo_url,
        )

        person.add_contact_detail(type='address', value=address, note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email, note='District Office')

        person.add_source(url)

        yield person
Example #15
0
    def handle_list_item(self, row):
        """Build a Person from one roster row; returns None for blank rows.

        Fix: the capitol-address check tested a bare generator expression
        (``if (a in row['Address2'] for a in [...])``), which is always
        truthy, so EVERY address — including rows with no second address
        line at all — was filed with note='capitol'. It now uses ``any()``
        and only runs when Address2 is present, restoring the intended
        capitol/district split.
        """
        if not row['First Name']:
            return
        name = '{} {}'.format(row['First Name'], row['Last Name'])
        party = PARTIES[row['Party']]
        leg = Person(name=name, district=row['District'].lstrip('0'),
                     party=party, primary_org='upper', role='Senator',
                     image=self.extra_info[name]['image'])
        leg.add_link(self.extra_info[name]['url'])
        leg.add_contact_detail(type='voice',
                               value=self.extra_info[name]['office_phone'], note='capitol')
        if 'email' in self.extra_info[name]:
            leg.add_contact_detail(type='email',
                                   value=self.extra_info[name]['email'], note='capitol')

        row['Zipcode'] = row['Zipcode'].strip()
        # Accommodate for multiple address column naming conventions.
        address1_fields = [row.get('Address'), row.get('Office Building')]
        address2_fields = [row.get('Address2'), row.get('Office Address')]
        row['Address'] = next((a for a in address1_fields if a is not
                               None), False)
        row['Address2'] = next((a for a in address2_fields if a is not
                                None), False)

        # Second address lines naming a capitol-complex building mean this
        # is a capitol office; otherwise it's a district office.
        capitol_streets = ['95 University Avenue W',
                           '100 Rev. Dr. Martin Luther King']
        if row['Address2'] and any(street in row['Address2']
                                   for street in capitol_streets):
            address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                       .format(**row))
            if 'Rm. Number' in row:
                address = '{0} {1}'.format(row['Rm. Number'], address)
            leg.add_contact_detail(type='address', value=address,
                                   note='capitol')
        elif row['Address2']:
            address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                       .format(**row))
            leg.add_contact_detail(type='address', value=address,
                                   note='district')
        else:
            address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
            leg.add_contact_detail(type='address', value=address,
                                   note='district')

        leg.add_source(self.url)
        leg.add_source(self._html_url)

        return leg
Example #16
0
    def scrape_chamber(self, chamber=None):
        """Scrape legislators from the listing page; yields a Person each.

        ``chamber`` selects the listing URL; each scraped record carries
        its own 'chamber' field which is re-mapped to 'lower'/'upper'.
        """
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower",
                           "Senate": "upper"}[leg['chamber']]
            except KeyError:
                # Deliberate workaround: known-bad legislator pages are
                # reported loudly and skipped rather than crashing the run.
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            person = Person(name=leg['name'], district=leg['district'],
                            party=leg['party'], primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                # Committees prefixed "Joint Legislative" belong to the
                # joint chamber; all others to the member's own chamber.
                for ctty in leg['ctty']:
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    comm = Organization(name=ctty['name'], classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")

            except KeyError:
                self.warn("%s has no scraped Committees" % leg['name'])

            person.add_link(leg['homepage'])

            # Contact details are optional; add only what was scraped.
            if leg['addr']:
                person.add_contact_detail(type='address', value=leg['addr'], note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice', value=leg['phone'], note='Capitol Office')
            if leg['email']:
                person.add_contact_detail(type='email', value=leg['email'], note='Capitol Office')
            if leg['fax']:
                person.add_contact_detail(type='fax', value=leg['fax'], note='Capitol Office')
            yield person
Example #17
0
    def scrape_chamber(self, session):
        """Scrape Oregon legislators for ``session`` via the API; yields Persons."""
        session_key = SESSION_KEYS[session]
        legislators_response = self.api_client.get('legislators', session=session_key)

        for legislator in legislators_response:
            url_name = legislator['WebSiteUrl'].split('/')[-1]
            chamber_name = 'house' if legislator['Chamber'] == 'H' else 'senate'
            img = 'https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg'.format(
                chamber_name, url_name
            )

            # Normalize the party label to the spelling pupa expects.
            party = legislator['Party']
            if party == 'Democrat':
                party = 'Democratic'

            full_name = '{} {}'.format(legislator['FirstName'], legislator['LastName'])
            person = Person(name=full_name,
                            primary_org={'S': 'upper', 'H': 'lower'}[legislator['Chamber']],
                            party=party,
                            district=legislator['DistrictNumber'],
                            image=img)
            person.add_link(legislator['WebSiteUrl'])
            person.add_source(legislator['WebSiteUrl'])

            # Address and phone are optional; email is always present.
            if legislator['CapitolAddress']:
                person.add_contact_detail(type='address', value=legislator['CapitolAddress'],
                                          note='Capitol Office')

            if legislator['CapitolPhone']:
                person.add_contact_detail(type='voice', value=legislator['CapitolPhone'],
                                          note='Capitol Office')

            person.add_contact_detail(type='email', value=legislator['EmailAddress'],
                                      note='Capitol Office')

            yield person
Example #18
0
    def scrape_chamber(self, chamber, session):
        """Yield a Person for each sitting NV legislator in *chamber*.

        Uses the legislature's JSON roster for the session, then fetches
        each member's detail JSON (via an ID scraped out of the listing
        page's inline JavaScript) for district-office contact details.
        """

        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'
        session_slug = self.jurisdiction.session_slugs[session]

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug,
                                                                               session_slug)
        leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' %
                        (session_slug, chamber_slug))

        resp = json.loads(self.get(leg_json_url).text)
        for item in resp:
            # Skip placeholder rows for empty/vacant districts.
            empty_names = ['District No', 'Vacant']
            if any(name in item['FullName'] for name in empty_names):
                continue
            # Roster lists names as "Last, First"; flip to "First Last".
            last, first = item['FullName'].split(",", 1)
            item['FullName'] = "{first} {last}".format(last=last,
                                                       first=first).strip()
            person = Person(name=item['FullName'], district=item['DistrictNbr'],
                            party=item['Party'], primary_org=chamber,
                            image=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # hack to get the legislator ID out of an inline JS call on
            # the member's page
            html = self.get(leg_url).text
            for l in html.split('\n'):
                if 'GetLegislatorDetails' in l:
                    leg_id = l.split(',')[1].split("'")[1]

            # fetch the json used by the page
            leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id='
                               .format(session_slug) + leg_id)
            leg_resp = json.loads(self.get(leg_details_url).text)
            details = leg_resp['legislatorDetails']

            address = details['Address1']
            address2 = details['Address2']
            if address2:
                address += ' ' + address2
            address += '\n%s, NV %s' % (details['City'], details['Zip'])

            phone = details['LCBPhone']
            email = details['LCBEmail']
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            # BUG FIX: this was gated on `phone` (copy-paste error), so a
            # member with a phone but no email got an empty email detail,
            # and one with an email but no phone lost their email.
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            person.add_link(leg_details_url)
            person.add_source(leg_details_url)
            yield person
Example #19
0
    def handle_list_item(self, item):
        """Build a Person from one roster-table row and yield it.

        Parses name/district/party from the link text (e.g.
        "Jane Doe(01A, DFL)"), address and phone from the trailing text
        nodes, and email from the second link. Only contact values that
        pass validation are attached.
        """
        photo_url = item.xpath('./td[1]/a/img/@src')[0]
        info_nodes = item.xpath('./td[2]/p/a')
        name_text = info_nodes[0].xpath('./b/text()')[0]
        url = info_nodes[0].get('href')

        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [x.strip() for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # BUG FIX: `phone` and `email` were previously left unbound when
        # validation failed, raising NameError at add_contact_detail time.
        # Initialize to None and only attach validated values.
        phone = None
        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email = None
        email_node = info_nodes[1]
        email_text = email_node.text
        email_text = email_text.replace('Email: ', '').strip()
        if validate_email_address(email_text):
            email = email_text

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address)
        if phone:
            rep.add_contact_detail(type='voice', value=phone)
        if email:
            rep.add_contact_detail(type='email', value=email)
        rep.add_source(self.url)

        yield rep
Example #20
0
    def _scrape_legislator(self, row, chamber):
        """Turn one MT roster-table row into a Person.

        Roster cells supply name, party, seat and phone/email; the
        member's detail page is fetched for the Capitol mailing address.
        """
        anchor = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
        name = ' '.join(
            piece.strip()
            for piece in anchor.text_content().split('\n')
            if piece.strip())

        letter = row.xpath(
            './td[@class="rosterCell partyCell"]/text()')[0].strip()
        party = {'D': 'Democratic', 'R': 'Republican'}[letter]

        abbr = self._chamber_map[chamber]
        district = row.xpath('./td[@class="rosterCell seatCell"]'
                             '/text()')[0].replace(abbr, '').strip()

        try:
            email = row.xpath('./td[@class="rosterCell emailCell"]'
                              '/a/@href')[0].replace('mailto:', '').strip()
        except IndexError:
            # Not every member publishes an email address.
            email = None

        phone = row.xpath('./td[@class="rosterCell phoneCell"]'
                          '/text()')[0].strip() or None

        details_url = 'https://leg.mt.gov{}'.format(anchor.attrib['href'])
        details_page = lxml.html.fromstring(self.get(details_url).text)

        raw_lines = details_page.xpath(
            '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
            '/p[contains(text(), "Address")]'
            )[0].text_content() \
                .replace('Address', '') \
                .split('\n')
        address = '\n'.join(
            line.strip() for line in raw_lines if line.strip())

        legislator = Person(name=name,
                            district=district,
                            party=party,
                            primary_org=chamber)

        legislator.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
        if phone is not None:
            legislator.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')

        if email is not None:
            legislator.add_contact_detail(type='email', value=email,
                                          note='E-mail')

        legislator.add_link(details_url)
        legislator.add_source(self._roster_url)

        yield legislator
Example #21
0
    def get_member(self, session, chamber, kpid):
        """Fetch one member record from the KS legislative API and yield
        a Person with Capitol Office contact details."""
        url = '%smembers/%s' % (ksapi.url, kpid)
        content = json.loads(self.get(url).text)['content']

        # Canonical party spelling.
        party = content['PARTY']
        if party == 'Democrat':
            party = 'Democratic'

        session_slugs = {
            '2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
            '2019-2020': 'b2019_20',
        }
        leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
            session_slugs[session], kpid)

        try:
            legislator_page = self.lxmlize(leg_url)
            photo_url, = legislator_page.xpath(
                '//img[@class="profile-picture"]/@src')
        except scrapelib.HTTPError:
            # Some members have no bio page; continue without link/photo.
            self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
            leg_url = ''
            photo_url = ''

        person = Person(
            name=content['FULLNAME'],
            district=str(content['DISTRICT']),
            primary_org=chamber,
            party=party,
            image=photo_url,
        )
        person.extras = {'occupation': content['OCCUPATION']}

        note = 'Capitol Office'
        address = '\n'.join([
            'Room {}'.format(content['OFFICENUM']),
            'Kansas State Capitol Building',
            '300 SW 10th St.',
            'Topeka, KS 66612',
        ])
        person.add_contact_detail(type='address', value=address, note=note)
        person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
        if content['OFFPH']:
            person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

        person.add_source(url)
        person.add_link(leg_url)

        yield person
Example #22
0
    def scrape_senator(self, district):
        """Scrape a single ME senator's district page and yield a Person.

        The page's <h1> carries district, name and party; the body <p>
        elements carry the photo and colon-labeled contact fields.
        """
        link = "https://legislature.maine.gov/District-{}".format(district)
        page = lxml.html.fromstring(self.get(link).text)
        page.make_links_absolute(link)

        main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
        title = main.xpath('h1')[0].text
        # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
        title_match = re.match(
            r'District (\d+) - State Senator ([^\(]+) \(([DRI])', title)
        _, name, party = title_match.groups()
        name = re.sub(r'\s+', ' ', name.strip())
        party = _party_map[party]

        image_url = address = phone = email = None

        for p in main.xpath('p'):
            # First paragraph containing an image is the member photo.
            if p.xpath('.//img') and not image_url:
                image_url = p.xpath('.//img/@src')[0]
                continue
            field, _, value = p.text_content().partition(":")
            value = value.strip()
            if field in ('Address', 'Mailing Address'):
                address = value
            elif field in ('Phone', 'Home Phone'):
                phone = value
            elif field == 'Email':
                email = value

        person = Person(
            name=name,
            district=district,
            image=image_url,
            primary_org='upper',
            party=party,
        )

        person.add_link(link)
        person.add_source(link)

        if address:
            person.add_contact_detail(type='address', value=address, note='District Office')

        if phone:
            person.add_contact_detail(
                type='voice', value=clean_phone(phone), note='District Phone')
        # BUG FIX: email was previously added unconditionally, so a page
        # lacking an Email field produced a contact detail with value=None.
        if email:
            person.add_contact_detail(type='email', value=email, note='District Email')

        yield person
Example #23
0
    def scrape_chamber(self, chamber):
        """Scrape the NC General Assembly roster for one chamber.

        Walks the no-photo member list table, skips resigned/deceased
        members, then visits each member's page for photo, district-office
        and Capitol-office contact details. Yields a Person per member.
        """
        url = "http://www.ncga.state.nc.us/gascripts/members/"\
            "memberListNoPic.pl?sChamber="

        if chamber == 'lower':
            url += 'House'
        else:
            url += 'Senate'

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute('http://www.ncga.state.nc.us')
        rows = doc.xpath('//div[@id="mainBody"]/table/tr')

        # Skip the header row; each remaining row has exactly four cells.
        for row in rows[1:]:
            party, district, full_name, counties = row.getchildren()

            party = party.text_content().strip("()")
            party = party_map[party]

            district = district.text_content().replace("District", "").strip()

            # A <span> inside the name cell flags resigned/deceased members.
            notice = full_name.xpath('span')
            if notice:
                notice = notice[0].text_content()
                # skip resigned legislators
                if 'Resigned' in notice or 'Deceased' in notice:
                    continue
            else:
                notice = None
            link = full_name.xpath('a/@href')[0]
            full_name = full_name.xpath('a')[0].text_content()
            # Replace non-breaking spaces in names with plain spaces.
            full_name = full_name.replace(u'\u00a0', ' ')

            # scrape legislator page details
            lhtml = self.get(link).text
            ldoc = lxml.html.fromstring(lhtml)
            ldoc.make_links_absolute('http://www.ncga.state.nc.us')
            photo_url = ldoc.xpath('//a[contains(@href, "pictures")]/@href')[0]
            phone = get_table_item(ldoc, 'Phone:') or None
            address = get_table_item(ldoc, 'Address:') or None
            email = ldoc.xpath('//a[starts-with(@href, "mailto:")]')[0]
            capitol_email = email.text
            # Capitol phone/address live in the two table rows immediately
            # above the row containing the mailto link.
            capitol_phone = email.xpath('ancestor::tr[1]/preceding-sibling::tr[1]/td/span')[0].text
            capitol_address = email.xpath('ancestor::tr[1]/preceding-sibling::tr[2]/td/text()')
            capitol_address = [x.strip() for x in capitol_address]
            capitol_address = '\n'.join(capitol_address) or None
            capitol_phone = capitol_phone.strip() or None

            # save legislator
            person = Person(name=full_name, district=district,
                            party=party, primary_org=chamber,
                            image=photo_url)
            person.extras['notice'] = notice
            person.add_link(link)
            person.add_source(link)
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if capitol_address:
                person.add_contact_detail(type='address', value=capitol_address,
                                          note='Capitol Office')
            if capitol_phone:
                person.add_contact_detail(type='voice', value=capitol_phone,
                                          note='Capitol Office')
            if capitol_email:
                person.add_contact_detail(type='email', value=capitol_email,
                                          note='Capitol Office')
            yield person
Example #24
0
    def scrape_upper_leg_page(self, url, who):
        """Scrape one senator's page and yield a Person.

        NOTE(review): the `who` argument is immediately overwritten by
        the name parsed from the page itself, so the caller's value is
        effectively unused — confirm whether the parameter is needed.
        """
        page = self.lxmlize(url)

        # Re-derive the senator's name from the page's "Senator ..." line.
        (who, ) = [x for x in
                   page.xpath('//tr/td/font/text()') if
                   x.strip().startswith("Senator ")
                   ]
        who = re.search(r'(?u)^\s*Senator\s*(.*?)\s*$', who).group(1)

        if 'Vacant' in who:
            return

        (district, ) = [x for x in
                        page.xpath('//tr/td/font/text()') if
                        x.strip().startswith("District - ")
                        ]
        district = re.search(
            r'(?u)^\s*District\s*-\s*(.*?)\s*$', district).group(1)

        # Flattened, whitespace-stripped text of the "Information:" table;
        # fields are located below by the index of their label.
        info = [x.strip() for x in
                page.xpath('//font[contains(text(), "Information:")]/'
                'ancestor::table[1]//text()') if
                x.strip()
                ]

        parties = {
            "Republican": "Republican",
            "Democrat": "Democratic",
        }
        party_index = info.index("Party:") + 1
        party = parties[info[party_index]]

        phone_index = info.index("District Phone") + 1
        phone = info[phone_index]
        # Sanity-check: a valid US number has exactly 10 digits.
        assert sum(c.isdigit() for c in phone) == 10, "Phone number is invalid: {}".format(phone)

        # Address exists for all lines between party and phone
        address = "\n".join(info[party_index + 2:phone_index - 1])
        address = address.replace("\r", "")

        if not address:
            address = "No Address Found"

        fax_index = info.index("Fax") + 1
        fax = info[fax_index]
        assert sum(c.isdigit() for c in fax) == 10, "Fax number is invalid: {}".format(fax)

        email_index = info.index("E-mail Address") + 1
        email = info[email_index]
        assert "@" in email, "Email info is not valid: {}".format(email)

        person = Person(name=who,
                        district=district,
                        party=party,
                        primary_org="upper")

        # (value, contact-detail type) pairs; falsy values are skipped.
        contacts = [
            (address, "address"),
            (phone, "voice"),
            (email, "email"),
            (fax, "fax"),
        ]

        for value, key in contacts:
            if value:
                person.add_contact_detail(type=key,
                                          value=value,
                                          note="District Office")

        person.add_source(url)
        person.add_link(url)

        yield person
Example #25
0
    def scrape_chamber(self, chamber):
        """Scrape SC legislators for one chamber.

        Yields each committee Organization exactly once, and a Person per
        member with Capitol/District contacts and committee memberships.
        Member pages are parsed via style-attribute-matched elements.
        """
        if chamber == "lower":
            url = "http://www.scstatehouse.gov/member.php?chamber=H"
        else:
            url = "http://www.scstatehouse.gov/member.php?chamber=S"

        # committee name -> Organization, so each is yielded only once
        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        for a in doc.xpath('//a[@class="membername"]'):
            full_name = a.text
            leg_url = a.get("href")

            # Strip the honorific prefix from the roster link text.
            if full_name.startswith("Senator"):
                full_name = full_name.replace("Senator ", "")
            if full_name.startswith("Representative"):
                full_name = full_name.replace("Representative ", "")

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if "Resigned effective" in leg_html:
                self.info("Resigned")
                continue

            # Party and district come from a paragraph identified only by
            # its inline style attribute (site has no semantic markup).
            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')

            if "Republican" in party:
                party = "Republican"
            elif "Democrat" in party:
                party = "Democratic"

            # District # - County - Map
            district = district.split()[1]
            try:
                photo_url = leg_doc.xpath(
                    '//img[contains(@src,"/members/")]/@src')[0]
            except IndexError:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ""
            person = Person(
                name=full_name,
                district=district,
                party=party,
                primary_org=chamber,
                image=photo_url,
            )

            # office address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left; width: 225px;'
                    ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
                capitol_address = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                capitol_phone = phone.strip()

                if capitol_address:
                    person.add_contact_detail(type="address",
                                              value=capitol_address,
                                              note="Capitol Office")

                if capitol_phone:
                    person.add_contact_detail(type="voice",
                                              value=capitol_phone,
                                              note="Capitol Office")
            except IndexError:
                self.warning("no capitol address for {0}".format(full_name))

            # home address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left;'
                    ' width: 225px; margin: 10px 0 0 20px;"]')[0]
                addr = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                if addr:
                    person.add_contact_detail(type="address",
                                              value=addr,
                                              note="District Office")

                if phone:
                    person.add_contact_detail(type="voice",
                                              value=phone,
                                              note="District Office")
            except IndexError:
                self.warning("no district address for {0}".format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath(
                    '//a[contains(@href, "committee.php")]')[1:]:
                # Link text ending in ", " means a role abbreviation follows
                # the committee name.
                if com.text.endswith(", "):
                    committee, role = com.text_content().rsplit(", ", 1)

                    # known roles
                    role = {
                        "Treas.": "treasurer",
                        "Secy.": "secretary",
                        "Secy./Treas.": "secretary/treasurer",
                        "V.C.": "vice-chair",
                        "1st V.C.": "first vice-chair",
                        "Co 1st V.C.": "co-first vice-chair",
                        "2nd V.C.": "second vice-chair",
                        "3rd V.C.": "third vice-chair",
                        "Ex.Officio Member": "ex-officio member",
                        "Chairman": "chairman",
                    }[role]
                else:
                    committee = com.text
                    role = "member"

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee,
                                       classification="committee",
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person
Example #26
0
    def scrape_lower_chamber(self, term):
        """Scrape the PR House roster and yield a Person per representative.

        Each roster entry links to a bio page which supplies party,
        address, phone and (optionally) fax.
        """
        # E-mail contact is now hidden behind webforms. Sadness.

        party_map = {'PNP': 'Partido Nuevo Progresista',
                     'PPD': u'Partido Popular Democr\xe1tico',
                     'PIP': u'Partido Independentista Puertorrique\u00F1o',
                     }

        url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx'
        page = self.lxmlize(url)

        member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
        for member_node in member_nodes:
            member_info = member_node.text_content().strip().split("\n")

            # First line: "Hon. <name>"; last line: district description.
            name = re.sub(r'^Hon\.', '', member_info[0]).strip()
            district_text = member_info[-1].strip()
            if district_text == 'Representante por Acumulación':
                district = 'At-Large'
            else:
                district = district_text.replace("Representante del Distrito ", "").strip()
            photo_url = self.get_node(member_node, './/img/@src')

            rep_link = self.get_node(member_node, ".//a/@href")
            rep_page = self.lxmlize(rep_link)

            party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
            # Albelo doesn't seem to have a "partyBio" as an independent, but we
            # expect this to exist for all other members.
            if not party_node and name == "Manuel A. Natal Albelo":
                party = "Independent"
            else:
                party_text = party_node.text_content().strip()
                party = party_map[party_text]

            address = self.get_node(rep_page, '//h6').text.strip().split("\n")[0].strip()

            # Only grabs the first validated phone number found.
            # Typically, representatives have multiple phone numbers.
            phone_node = self.get_node(
                rep_page,
                '//span[@class="data-type" and contains(text(), "Tel.")]')
            phone = None
            possible_phones = phone_node.text.strip().split("\n")
            for phone_attempt in possible_phones:
                # Don't keep searching phone numbers if a good one is found.
                if phone:
                    break

                phone_text = re.sub(r'^Tel\.[\s]*', '', phone_attempt).strip()
                if validate_phone_number(phone_text):
                    phone = phone_text

            # Fax is optional; validated the same way as a phone number.
            fax_node = self.get_node(
                rep_page,
                '//span[@class="data-type" and contains(text(), "Fax.")]')
            fax = None
            if fax_node:
                fax_text = fax_node.text.strip()
                fax_text = re.sub(r'^Fax\.[\s]*', '', fax_text).strip()
                if validate_phone_number(fax_text):
                    fax = fax_text

            person = Person(primary_org='lower',
                            district=district,
                            name=name,
                            party=party,
                            image=photo_url)

            person.add_link(rep_link)
            person.add_source(rep_link)
            person.add_source(url)

            if address:
                person.add_contact_detail(type='address',
                                          value=address,
                                          note='Capitol Office')
            if phone:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note='Capitol Office')
            if fax:
                person.add_contact_detail(type='fax',
                                          value=fax,
                                          note='Capitol Office')

            yield person
    def scrape(self):
        """Scrape Pittsburgh City Council members and committees.

        Combines the Legistar API (terms, addresses, phones, websites)
        with the Legistar web roster (emails), then yields committee
        Organizations followed by all Person objects.
        """
        body_types = self.body_types()
        city_council, = [body for body in self.bodies()
                         if body["BodyName"] == "City Council"]
        terms = collections.defaultdict(list)

        # Group office records by member name, ignoring vacant seats.
        for office in self.body_offices(city_council):
            if "VACAN" not in office["OfficeRecordFullName"]:
                terms[office["OfficeRecordFullName"].strip()].append(office)

        web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
        web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
        web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

        if self.cache_storage:
            web_scraper.cache_storage = self.cache_storage

        if self.requests_per_minute == 0:
            web_scraper.cache_write_only = False

        # Index the web roster by member name for the email lookup below.
        web_info = {}
        for member in web_scraper.councilMembers():
            web_info[member["Person Name"]] = member

        members = {}
        for member, offices in terms.items():
            person = Person(member)
            # (removed an unused `role = term["OfficeRecordTitle"]` local
            # that was assigned each iteration and never read)
            for term in offices:
                person.add_term("Councilmember",
                                "legislature",
                                start_date = self.toDate(term["OfficeRecordStartDate"]),
                                end_date = self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                        value=web["E-mail"]["label"],
                                        note="E-mail")

            # NOTE(review): `term` here is the last office record from the
            # loop above — presumably any of the member's terms works for
            # sourcing; confirm against person_sources_from_office.
            person_source_data = self.person_sources_from_office(term)
            person_api_url, person_api_response = person_source_data
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", " + person_api_response["PersonCity1"]
                          + ", " + person_api_response["PersonState1"] + " " + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                    value=address,
                                    note="Office address")

            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                    value=person_api_response["PersonPhone"],
                                    note="Office phone")

            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                    value=person_api_response["PersonWWW"],
                                    note="District website")

            members[member] = person


        for body in self.bodies():
            if body["BodyTypeId"] == body_types["Committee"]:
                body_name_clean = body["BodyName"].strip()
                organization = Organization(body_name_clean,
                             classification="committee",
                             parent_id={"name" : "Pittsburgh City Council"})

                organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

                for office in self.body_offices(body):
                    # Only Chair / Vice Chair survive as distinct roles;
                    # everything else (including "Councilmember") becomes
                    # plain "Member". The original condition also tested
                    # `or role == "Councilmember"`, which was redundant
                    # since that value already fails the `not in` test.
                    role = office["OfficeRecordMemberType"]
                    if role not in ("Vice Chair", "Chair"):
                        role = "Member"

                    person = office["OfficeRecordFullName"].strip()
                    if person in members:
                        person = members[person]
                    else:
                        person = Person(person)

                    person.add_membership(body_name_clean,
                                     role=role,
                                     start_date = self.toDate(office["OfficeRecordStartDate"]),
                                     end_date = self.toDate(office["OfficeRecordEndDate"]))

                yield organization

        for person in members.values():
            yield person
Example #28
0
    def handle_list_item(self, item):
        """Build a Person from one FL House roster row and return it.

        Skips vacant/resigned/pending seats. After scraping the member's
        detail page, attempts to confirm an email address by guessing
        first/last-name combinations against the directory-PDF list.
        """
        link = item.xpath('.//div[contains(@class, "rep_style")]/a')[0]
        name = link.text_content().strip()

        if "Vacant" in name or "Resigned" in name or "Pending" in name:
            return

        party = item.xpath(
            './/div[contains(@class, "party_style")]/text()')[0].strip()
        party = {"D": "Democratic", "R": "Republican"}[party]

        district = item.xpath(
            './/div[contains(@class, "district_style")]/text()')[0].strip()

        # The photo URL is keyed by the MemberId query parameter of the
        # member's page link.
        leg_url = link.get("href")
        split_url = parse.urlsplit(leg_url)
        member_id = parse.parse_qs(split_url.query)["MemberId"][0]
        image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(
            member_id)

        name = fix_name(name)
        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=image,
        )
        rep.add_link(leg_url)
        rep.add_source(leg_url)
        rep.add_source(self.url)

        self.scrape_page(RepDetail, leg_url, obj=rep)

        # look for email in the list from the PDF directory - ideally
        # we'd find a way to better index the source data which
        # wouldn't require guessing the email, but this does at least
        # confirm that it's correct

        # deal with some stuff that ends up in name that won't work in
        # email, spaces, quotes, high latin1
        email_name = rep.name.replace('"', "").replace("La ",
                                                       "La").replace("ñ", "n")
        (last, *other) = re.split(r"[-\s,]+", email_name)

        # deal with a missing nickname used in an email address
        if "Patricia" in other:
            other.append("Pat")

        # search through all possible first names and nicknames
        # present - needed for some of the more elaborate concoctions
        found_email = False
        for first in other:
            # NOTE(review): this format string appears redacted in this
            # copy of the source (no %s placeholders, so the % operator
            # would raise TypeError) — presumably a first/last address
            # pattern; restore from the original repository.
            email = "*****@*****.**" % (first, last)
            if email in self.member_emails:
                # it's bad if we can't uniquely match emails, so throw an error
                if email in self.claimed_member_emails:
                    raise ValueError(
                        "Email address %s matches multiple reps - %s and %s." %
                        (email, rep.name, self.claimed_member_emails[email]))

                self.claimed_member_emails[email] = rep.name

                rep.add_contact_detail(type="email",
                                       value=email,
                                       note="Capitol Office")
                rep.add_source(self.directory_pdf_url)

                found_email = True

                break

        if not found_email:
            log.warning("Rep %s does not have an email in the directory PDF." %
                        (rep.name, ))

        return rep
Example #29
0
    def scrape(self, session=None):
        """Yield a Person for each Vermont legislator in *session*.

        Pulls the full member list from the legislature's bulk JSON
        endpoint, then visits each member's page for a photo URL and
        (when present) a state email address.
        """
        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all members via the private API
        legislator_dump_url = (
            'http://legislature.vermont.gov/people/loadAll/{}'.
            format(year_slug))
        json_data = self.get(legislator_dump_url).text
        legislators = json.loads(json_data)['data']

        # Parse the information from each legislator
        for info in legislators:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Skip duplicate record for Christopher Mattos (appointed Rep September 2017)
            if info['PersonID'] == "29034":
                self.info("skipping first Christopher Mattos record")
                continue

            # Gather photo URL from the member's page
            member_url = ('http://legislature.vermont.gov/people/single/{}/{}'.
                          format(year_slug, info['PersonID']))
            page = self.lxmlize(member_url)
            (photo_url, ) = page.xpath('//img[@class="profile-photo"]/@src')

            # Also grab their state email address
            state_email = page.xpath(
                '//dl[@class="summary-table profile-summary"]/'
                'dt[text()="Email"]/following-sibling::dd[1]/a/text()')
            if state_email:
                (state_email, ) = state_email
            else:
                state_email = None

            district = info['District'].replace(" District", "")

            leg = Person(
                primary_org=self.CHAMBERS[info['Title']],
                district=district,
                party=info['Party'].replace("Democrat", "Democratic"),
                name="{0} {1}".format(info['FirstName'], info['LastName']),
                image=photo_url
            )

            # Every member shares the same State House capitol address.
            leg.add_contact_detail(
                note="Capitol Office",
                type='address',
                value='Vermont State House\n115 State Street\nMontpelier, VT 05633'
            )
            if state_email:
                leg.add_contact_detail(note="Capitol Office", type='email', value=state_email)

            # District address is assembled from the mailing-address
            # fields; the optional second line is included only when set.
            leg.add_contact_detail(
                note="District Office",
                type='address',
                value="{0}{1}\n{2}, {3} {4}".format(
                    info['MailingAddress1'],
                    ("\n" + info['MailingAddress2']
                        if info['MailingAddress2'].strip()
                        else ""),
                    info['MailingCity'],
                    info['MailingState'],
                    info['MailingZIP']
                )
            )
            if info['HomePhone']:
                leg.add_contact_detail(note="District Office", type='voice',
                                       value=info['HomePhone'])
            # First non-empty of the three email fields wins.
            district_email = info['Email'] or info['HomeEmail'] or info['WorkEmail']
            if district_email:
                leg.add_contact_detail(note="District Office", type='email', value=district_email)

            leg.add_link(member_url)

            leg.add_source(legislator_dump_url)
            leg.add_source(member_url)

            yield leg
Example #30
0
    def scrape_lower(self, chamber):
        """Yield a Person for each member of the Michigan House.

        Parses the public roster table; vacant districts are skipped
        with a warning.  *chamber* is accepted for interface symmetry
        but this roster is always the lower chamber.
        """
        url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
        # Roster table column order, left to right.
        columns = [
            "website", "district", "name", "party", "location", "phone",
            "email"
        ]

        data = self.get(url).text
        doc = lxml.html.fromstring(data)

        for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
            tds = row.xpath('.//td')
            if not tds:
                # header/formatting rows carry no <td> cells
                continue
            # Map each cell to its column name instead of indexing manually.
            metainf = dict(zip(columns, tds))
            district = str(int(metainf['district'].text_content().strip()))
            party = metainf['party'].text_content().strip()
            phone = metainf['phone'].text_content().strip()
            email = metainf['email'].text_content().strip()
            name = metainf['name'].text_content().strip()
            if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
                self.warning(
                    'District {} appears vacant, and will be skipped'.format(
                        district))
                continue
            leg_url = metainf['website'].xpath("./a")[0].attrib['href']

            # Expand building abbreviations into full mailing addresses.
            office = metainf['location'].text_content().strip()
            office = re.sub(
                ' HOB',
                ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
                office)
            office = re.sub(' CB',
                            ' State Capitol Building\nLansing, MI 48909',
                            office)

            try:
                photo_url = self.get_photo_url(leg_url)[0]
            except (scrapelib.HTTPError, IndexError):
                photo_url = ''
                self.warning('no photo url for %s', name)

            person = Person(name=name,
                            district=district,
                            party=abbr[party],
                            primary_org='lower',
                            image=photo_url)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address',
                                      value=office,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=phone,
                                      note='Capitol Office')
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')

            yield person
Example #31
0
    def scrape(self):
        """Yield a Person for each Chicago council member, then an
        Organization for each committee they sit on.

        Committees are deduplicated across members via ``committee_d``
        and yielded once, after all people.
        """
        committee_d = {}
        # bodies that appear in the data but are not committees
        non_committees = ('City Council', 'Office of the Mayor')

        for councilman, committees in self.councilMembers():
            # rows without a ward/office value are not seated members
            if councilman['Ward/Office'] == "":
                continue

            ward = councilman['Ward/Office']
            if ward not in [
                    "Mayor",
                    "Clerk",
            ]:
                # normalize e.g. "07" -> "Ward 7"
                ward = "Ward {}".format(int(ward))

            p = Person(councilman['Person Name']['label'],
                       district=ward,
                       primary_org="legislature")

            if councilman['Photo']:
                p.image = councilman['Photo']

            # source field name -> (contact detail type, note)
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if councilman[contact_type]:
                    p.add_contact_detail(type=type_,
                                         value=councilman[contact_type],
                                         note=_note)

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['label'],
                                     note='E-mail')

            if councilman['Website']:
                p.add_link(councilman['Website']['url'])
            p.add_source(MEMBERLIST)

            for committee, _, _ in committees:
                committee_name = committee['Legislative Body']['label']
                if committee_name and committee_name not in non_committees:
                    # create each committee organization once, on first sighting
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        o = Organization(committee_name,
                                         classification='committee')
                        o.add_source(
                            "https://chicago.legistar.com/Departments.aspx")
                        committee_d[committee_name] = o

                    o.add_member(p, role=committee["Title"])
            yield p

        for o in committee_d.values():
            yield o
Example #32
0
    def parse_senate(self, div, chamber):
        """Build a Person from one CA Senate member <div>.

        Returns the Person, or None (after logging a warning) when the
        name carries no recognizable party suffix.
        """
        heading = div.xpath('.//h3/text()')[0]
        if heading.endswith(' (R)'):
            party = 'Republican'
        elif heading.endswith(' (D)'):
            party = 'Democratic'
        else:
            self.warning('skipping ' + heading)
            return None
        name = heading.split(' (')[0]

        district = div.xpath(
            './/div[contains(@class, "senator-district")]/div/text()'
        )[0].strip().lstrip('0')
        photo = div.xpath('.//img/@src')[0]

        person = Person(
            name=name,
            party=party,
            district=district,
            primary_org=chamber,
            image=photo,
        )

        person.add_link(div.xpath('.//a/@href')[0])

        # CA senators have working emails, but they're not putting them
        # on their public pages anymore, so construct them instead.
        person.add_contact_detail(
            type='email',
            value=self._construct_email(chamber, name),
            note='Senate Office',
        )

        office_xpath = './/div[contains(@class, "{}")]//p'

        capitol_blocks = div.xpath(
            office_xpath.format('views-field-field-senator-capitol-office'))
        for block in capitol_blocks:
            # Capitol entries are "<address>; <phone>" -- exactly two parts.
            address, phone = block.text_content().split('; ')
            person.add_contact_detail(type='address',
                                      value=address.strip(),
                                      note='Senate Office')
            person.add_contact_detail(type='voice',
                                      value=phone.strip(),
                                      note='Senate Office')

        district_blocks = div.xpath(
            office_xpath.format('views-field-field-senator-district-office'))
        for idx, block in enumerate(district_blocks, start=1):
            note = 'District Office #{}'.format(idx)
            for line in block.text_content().strip().splitlines():
                line = line.strip().replace(u'\xa0', ' ')
                try:
                    # Lines usually pair an address with a phone number.
                    address, phone = line.split('; ')
                    person.add_contact_detail(type='address',
                                              value=address.strip(),
                                              note=note)
                    person.add_contact_detail(type='voice',
                                              value=phone.strip(),
                                              note=note)
                except ValueError:
                    # No "; " separator -- treat the whole line as address.
                    person.add_contact_detail(type='address',
                                              value=line.strip(),
                                              note=note)

        return person
Example #33
0
    def parse_assembly(self, tr, chamber):
        '''
        Given a tr element, get specific data from it.

        Extracts url/district/party/name/address fields via per-field
        xpath recipes, then builds a Person with capitol and district
        offices.  Returns None when no name can be found.
        '''

        strip = methodcaller('strip')

        # td class template: "views-field-field-<title>-<field>" + tail
        xpath = 'td[contains(@class, "views-field-field-%s-%s")]%s'

        # field name -> list of (class fragment, xpath tail) recipes;
        # several recipes may contribute values for the same field
        xp = {
            'url': [('lname-sort', '/a[not(contains(text(), "edit"))]/@href')],
            'district': [('district', '/text()')],
            'party': [('party', '/text()')],
            'name': [('office-information',
                      '/a[not(contains(text(), "edit"))]/text()')],
            'address':
            [('office-information', '/h3/following-sibling::text()'),
             ('office-information', '/p/text()')],
        }

        titles = {'upper': 'senator', 'lower': 'member'}

        # per-field post-processing applied to each raw xpath string
        funcs = {
            'name':
            lambda s: re.sub(  # "Assembly" is misspelled once
                r'Contact Assembl?y Member', '', s).strip(),
            'address':
            parse_address,
        }

        tr_xpath = tr.xpath
        res = collections.defaultdict(list)
        for k, xpath_info in xp.items():
            for vals in xpath_info:
                f = funcs.get(k, lambda _: _)
                vals = (titles[chamber], ) + vals
                vals = map(f, map(strip, tr_xpath(xpath % vals)))
                res[k].extend(vals)

        # Photo.
        try:
            res['image'] = tr_xpath('td/p/img/@src')[0]
        except IndexError:
            pass

        # Remove junk from assembly member names.
        junk = 'Contact Assembly Member '

        try:
            res['name'] = res['name'].pop().replace(junk, '')
        except IndexError:
            # no name found -- nothing to build
            return

        # Normalize party.
        for party in res['party'][:]:
            if party:
                if party == 'Democrat':
                    party = 'Democratic'
                res['party'] = party
                break
            else:
                res['party'] = None

        # strip leading zero
        res['district'] = str(int(res['district'].pop()))

        person = Person(
            name=res['name'],
            district=res.get('district'),
            party=res.get('party'),
            image=res.get('image'),
            primary_org=chamber,
        )

        # Mariko Yamada also didn't have a url that lxml would parse
        # as of 3/22/2013.
        if res['url']:
            person.add_link(res['url'].pop())

        # Addresses.
        addresses = res['address']
        try:
            addresses = map(dict, filter(None, addresses))
        except ValueError:
            # Sometimes legislators only have one address, in which
            # case this awful hack is helpful.
            addresses = map(dict, filter(None, [addresses]))
        addresses = list(addresses)

        for address in addresses:
            # Toss results that don't have required keys.
            if not set(['street', 'city', 'state_zip']) < set(address):
                if address in addresses:
                    addresses.remove(address)

        # Re-key the addresses
        offices = []
        if addresses:
            # Mariko Yamada's addresses wouldn't parse correctly as of
            # 3/23/2013, so here we're forced to test whether any
            # addresses were even found.
            addresses[0].update(type='capitol', name='Capitol Office')
            offices.append(addresses[0])

            # CA reps have working emails, but they're not putting them on
            # their public pages anymore
            offices[0]['email'] = self._construct_email(chamber, res['name'])

            for n, office in enumerate(addresses[1:]):
                office.update(type='district',
                              name='District Office #{}'.format(n + 1))
                offices.append(office)

            for office in offices:
                # Fold address parts into a single multi-line string.
                street = office['street']
                state_zip = re.sub(r'\s+', ' ', office['state_zip'])
                street = '%s\n%s, %s' % (street, office['city'], state_zip)
                office['address'] = street
                office['fax'] = None
                if 'email' not in office:
                    office['email'] = None

                note = office['name']
                person.add_contact_detail(type='address',
                                          value=office['address'],
                                          note=note)
                if office['phone']:
                    person.add_contact_detail(type='voice',
                                              value=office['phone'],
                                              note=note)
                if office['email']:
                    person.add_contact_detail(type='email',
                                              value=office['email'],
                                              note=note)

        return person
Example #34
0
    def scrape_chamber(self, chamber, session):
        """Yield a Person for each Wisconsin legislator in *chamber*.

        Walks the per-district boxes on the session roster, skipping
        vacant seats, and scrapes each member's detail page for party,
        district, photo, and contact information.
        """
        url = 'https://docs.legis.wisconsin.gov/{}/legislators/{}'.format(
            session,
            {
                'upper': 'senate',
                'lower': 'assembly'
            }[chamber],
        )

        body = self.get(url).text
        page = lxml.html.fromstring(body)
        page.make_links_absolute(url)

        for row in page.xpath(
                ".//div[@class='box-content']/div[starts-with(@id,'district')]"
        ):
            if row.xpath(
                    ".//a/@href") and not row.xpath(".//a[text()='Vacant']"):
                # Normalize the scheme to https.  NOTE: the previous
                # str.strip("https://") stripped a *character set*, not
                # the prefix, and could eat leading/trailing URL letters
                # (e.g. a path ending in "s").
                rep_url = row.xpath(".//a[text()='Details']/@href")[0]
                rep_url = "https://" + re.sub(r'^https?://', '', rep_url)
                rep_doc = lxml.html.fromstring(self.get(rep_url).text)
                rep_doc.make_links_absolute(rep_url)

                full_name = rep_doc.xpath(
                    './/div[@id="district"]/h1/text()')[0].replace(
                        "Senator ", "").replace("Representative ", "")

                # Party appears as e.g. "(R - Town)" inside a <small> tag.
                party = rep_doc.xpath('.//div[@id="district"]//small/text()')
                if len(party) > 0:
                    party = PARTY_DICT[party[0].split("-")[0].strip(
                        "(").strip()]
                else:
                    party = None
                district = rep_doc.xpath(
                    './/div[@id="district"]/h3/a/@href')[1]
                district = district.split("/")[-1]
                # normalize away leading zeros
                district = str(int(district))

                # email
                email = rep_doc.xpath("//span[@class='info email']/a/text()")
                if email:
                    email = email[0]
                else:
                    email = ''

                assert party is not None, "{} is missing party".format(
                    full_name)

                person = Person(
                    name=full_name,
                    district=district,
                    primary_org=chamber,
                    party=party,
                )

                img = rep_doc.xpath('.//div[@id="district"]/img/@src')
                if img:
                    person.image = img[0]

                # office ####
                address_lines = rep_doc.xpath(
                    './/span[@class="info office"]/text()')
                address = '\n'.join([
                    line.strip() for line in address_lines
                    if line.strip() != ""
                ])
                person.add_contact_detail(type='address',
                                          value=address,
                                          note='Capitol Office')

                # index [1]: the first text node appears to be a label --
                # TODO confirm against the live page
                phone = rep_doc.xpath(
                    './/span[@class="info telephone"]/text()')
                if phone:
                    phone = re.sub(r'\s+', ' ', phone[1]).strip()
                    person.add_contact_detail(type='voice',
                                              value=phone,
                                              note='Capitol Office')

                fax = rep_doc.xpath('.//span[@class="info fax"]/text()')
                if fax:
                    fax = re.sub(r'\s+', ' ', fax[1]).strip()
                    person.add_contact_detail(type='fax',
                                              value=fax,
                                              note='Capitol Office')

                if email:
                    person.add_contact_detail(type='email',
                                              value=email,
                                              note='Capitol Office')

                person.add_link(rep_url)
                person.add_source(rep_url)

                yield person
Example #35
0
    def get_people(self):
        """Yield a Person for each Miami-Dade elected official listed
        on the county government landing page.

        Commissioners ("District ..." positions) additionally get
        contact details scraped from their individual pages.
        """
        people_base_url = "http://miamidade.gov/wps/portal/Main/government"
        doc = self.lxmlize(people_base_url)
        person_list = doc.xpath("//div[contains(@id,'elected')]//span")
        titles = ["Chairman", "Vice Chair"]
        for person in person_list:
            # each span: first line is the position, middle lines the name
            info = person.text_content().strip().split("\r")
            position = info[0].strip()
            name = " ".join(info[1:-1])
            name = name.replace("Website | Contact", "")
            for title in titles:
                name = name.replace(title, "")
            name = name.strip()
            url = person.xpath(".//a[contains(text(),'Website')]/@href")[0]
            image = person.xpath(".//img/@src")[0]
            pers = Person(name=name,
                          image=image,
                          primary_org='legislature',
                          role=position)
            pers.add_source(people_base_url,
                            note="Miami-Dade government website")
            pers.add_source(url, note="individual's website")

            #the commissioners have consistent site format
            if "district" in position.lower():
                person_doc = self.lxmlize(url)
                contact_rows = person_doc.xpath(
                    "//div[@class='leftContentContainer']//p")
                for line in contact_rows:
                    line_text = line.text_content()
                    if "email" in line_text.lower():
                        email_address = line_text.replace("Email:", "").strip()
                        pers.add_contact_detail(type="email",
                                                value=email_address)
                        continue
                    try:
                        office, phone, fax = line_text.strip().split("\n")
                    except ValueError:
                        #ick, it's all on one line.
                        if "downtown office" in line_text.lower():
                            office = "Downtown Office"
                        elif "district office" in line_text.lower():
                            office = "District Office"
                        else:
                            continue
                        # NOTE(review): hard-coded slices assume a fixed
                        # one-line layout -- fragile; verify against the
                        # live page before changing
                        phone = line_text[15:27]
                        fax = line_text[33:45]

                    if "office" not in office.lower():
                        continue
                        #social is also available in here
                        #but I don't see a place to put it
                    phone = phone.replace("Phone", "").strip()
                    fax = fax.replace("Fax", "").strip()
                    pers.add_contact_detail(
                        type="voice",  #phone is not allowed ????
                        value=phone,
                        note=office.strip())

                    pers.add_contact_detail(
                        type="fax",  #phone is not allowed ????
                        value=fax,
                        note=office.strip())

            yield pers
Example #36
0
    def scrape_table(self, chamber, tbl):
        """Yield a Person for each legislator row in *tbl*.

        The first row is a header and is skipped; "To Be Announced"
        seats are skipped as well.  Each member's detail page supplies
        party, address, phone, fax, email, and photo.
        """
        # skip header row
        for row in tbl.xpath('tr')[1:]:
            leg_a, district, _, _ = row.xpath('td')
            district = district.text
            name = leg_a.text_content().strip()
            if name.lower() == "to be announced":
                continue
            leg_url = leg_a.xpath('a/@href')[0]

            # get details
            html = self.get(leg_url).text
            ldoc = lxml.html.fromstring(html)
            ldoc.make_links_absolute(leg_url)

            party = _get_table_item(ldoc, 'Party Affiliation:').text
            if party == 'Democrat':
                party = 'Democratic'
            addr_lines = _get_table_item(ldoc, 'Annapolis Address:').xpath('text()')
            address = []
            phone = None
            fax = None
            for line in addr_lines:
                if 'Phone:' in line:
                    phone = re.findall(r'Phone: (\d{3}-\d{3}-\d{4})', line)[0]
                elif 'Fax:' in line:
                    # Number oddities: one has two dashes, one has a dash and then a space.
                    line = line.replace('--', '-').replace('- ', '-')
                    fax = re.findall(r'Fax: (\d{3}-\d{3}-\d{4})', line)[0]
                else:
                    address.append(line)
            address = '\n'.join(address)

            email = ldoc.xpath('//a[contains(@href, "mailto:")]/text()')
            if not email:
                email = None
            elif len(email) == 1:
                email = email[0].strip()
            else:
                raise AssertionError('Multiple email links found on page')

            # Reset per row: previously photo_url was only assigned when
            # an image was found, so a member without one either raised
            # NameError or silently reused the previous member's photo.
            photo_url = ''
            img_src = ldoc.xpath('//img[@class="sponimg"]/@src')
            if img_src:
                photo_url = img_src[0]

            # "Last, First" -> "First Last"
            name = ' '.join(name.split(', ')[::-1])

            leg = Person(
                primary_org=chamber,
                district=district,
                name=name,
                party=party,
                image=photo_url,
            )
            leg.add_source(url=leg_url)
            leg.add_link(url=leg_url)

            if address:
                leg.add_contact_detail(
                    type='address',
                    value=address,
                    note='Capitol Office'
                )
            if phone:
                leg.add_contact_detail(
                    type='voice',
                    value=phone,
                    note='Capitol Office'
                )
            if fax:
                leg.add_contact_detail(
                    type='fax',
                    value=fax,
                    note='Capitol Office'
                )
            if email:
                leg.add_contact_detail(
                    type='email',
                    value=email,
                    note='Capitol Office'
                )

            yield leg
Example #37
0
    def scrape_upper(self, chamber):
        """Yield a Person for each Michigan senator.

        Parses the senate roster table; portraits are probed by surname
        (.png then .jpg) against the images directory, and emails are
        pulled from each member's "Contact Me" page.
        """
        url = 'http://www.senate.michigan.gov/senatorinfo_list.html'
        url_to_append = 'http://www.senate.michigan.gov/_images/'
        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        for row in doc.xpath('//table[not(@class="calendar")]//tr')[3:]:
            # member rows have exactly 7 cells
            if len(row) != 7:
                continue

            # party, dist, member, office_phone, office_fax, office_loc
            party, dist, member, contact, phone, fax, loc = row.getchildren()
            if (party.text_content().strip() == ""
                    or 'Lieutenant Governor' in member.text_content()):
                continue

            party = abbr[party.text]
            district = dist.text_content().strip()
            name = member.text_content().strip()
            name = re.sub(r'\s+', " ", name)
            surname = re.split(', | ', name)
            # drop apostrophes (e.g. O'Brien) to match image filenames
            surname[0] = re.sub('[\']', '', surname[0])
            # probe for the portrait: try .png first, then .jpg
            try:
                self.head(url_to_append + surname[0] + '.png')
                photo_url = url_to_append + surname[0] + '.png'
            except scrapelib.HTTPError:
                try:
                    self.head(url_to_append + surname[0] + '.jpg')
                    photo_url = url_to_append + surname[0] + '.jpg'
                except scrapelib.HTTPError:
                    photo_url = None

            if name == 'Vacant':
                self.info('district %s is vacant', district)
                continue

            leg_url = member.xpath('a/@href')[0]
            office_phone = phone.text
            office_fax = fax.text

            # expand building abbreviations into full mailing addresses
            office_loc = loc.text
            office_loc = re.sub(
                ' Farnum Bldg',
                ' Farnum Office Building\n125 West Allegan Street\nLansing, MI 48933',
                office_loc)
            office_loc = re.sub(' Capitol Bldg',
                                ' State Capitol Building\nLansing, MI 48909',
                                office_loc)

            # email addresses aren't on the list page anymore but they
            # are on the page linked off "Contact Me"

            # data has a typo in a row
            contact_url = [
                a for a in row.xpath(".//a")
                if a.text in ('Contact Me', 'Conact Me')
            ][0].get('href')
            contact_html = self.get(contact_url).text
            contact_doc = lxml.html.fromstring(contact_html)

            email = None
            header_email = contact_doc.xpath("//a[@class='header_email']")
            if header_email:
                email = header_email[0].text
            else:
                # not using the most common template, but maybe they
                # dropped their email on the page somewhere
                links = contact_doc.xpath('//a') or []
                text_email = [
                    a for a in links if 'mailto:' in (a.get('href') or '')
                ]
                if text_email:
                    email = text_email[0].text

            person = Person(name=name,
                            district=district,
                            party=party,
                            primary_org='upper',
                            image=photo_url)

            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address',
                                      value=office_loc,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice',
                                      value=office_phone,
                                      note='Capitol Office')
            person.add_contact_detail(type='fax',
                                      value=office_fax,
                                      note='Capitol Office')
            if email:
                person.add_contact_detail(type='email',
                                          value=email,
                                          note='Capitol Office')

            yield person
Example #38
0
    def scrape_details(self, chamber, leg_name, leg_link, role):
        """Scrape one Mississippi member's XML detail record and yield a Person.

        :param chamber: 'upper' or 'lower'; also selects the email domain
            used when the record only provides a bare mailbox name.
        :param leg_name: member name as shown on the roster page.
        :param leg_link: relative path of the member's XML record; falsy
            only when the seat is vacant.
        :param role: role string passed straight through to ``Person``.
        """
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.get(url)
            # The member record is XML, so parse with lxml.etree (not html).
            root = lxml.etree.fromstring(details_page.content)
            party = root.xpath('string(//PARTY)')

            district = root.xpath('string(//DISTRICT)')

            # Photo filename is relative to the record's directory.
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')

            home_address = root.xpath('string(//H_ADDRESS)')
            home_address2 = root.xpath('string(//H_ADDRESS2)')
            home_city = root.xpath('string(//H_CITY)')
            home_zip = root.xpath('string(//H_ZIP)')

            # Assemble a multi-line home (district) address, including the
            # optional second street line only when present.
            home_address_total = ''
            if home_address and home_city:
                if not home_address2:
                    home_address_total = "%s\n%s, MS %s" % (
                        home_address, home_city, home_zip)
                else:
                    home_address_total = "%s\n%s\n%s, MS %s" % (
                        home_address, home_address2, home_city, home_zip)

            # bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            # other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
            cap_room = root.xpath('string(//CAP_ROOM)')

            # Special-case members whose source record omits a party.  The
            # asserts make the scrape fail loudly once the upstream data is
            # fixed, so the hard-coding can be removed.
            if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
                assert not party, (
                    "Remove special-casing for this Democrat without a "
                    "listed party: {}").format(leg_name)
                party = 'Democratic'
            elif leg_name in ('James W. Mathis', 'John Glen Corley'):
                assert not party, (
                    "Remove special-casing for this Republican without"
                    " a listed party: {}").format(leg_name)
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise AssertionError(
                    "A member with no identifiable party was found: {}".format(
                        leg_name))
            leg = Person(primary_org=chamber,
                         district=district,
                         party=party,
                         image=photo,
                         name=leg_name,
                         role=role)
            leg.extras['org_info'] = org_info
            leg.add_source(url)
            leg.add_link(url)

            # Bare mailbox names get the chamber's domain appended;
            # fully-qualified addresses are used as-is.
            if email_name != "":
                if "@" in email_name:
                    email = email_name
                else:
                    email = '%s@%s.ms.gov' % (email_name, {
                        "upper": "senate",
                        "lower": "house"
                    }[chamber])
                leg.add_contact_detail(type='email',
                                       value=email,
                                       note='Capitol Office')

            if capital_phone != "":
                leg.add_contact_detail(type='voice',
                                       value=capital_phone,
                                       note='Capitol Office')

            # Capitol address always gets recorded; the room number is
            # prepended when the record provides one.
            if cap_room != "":
                address = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                address = CAP_ADDRESS
            leg.add_contact_detail(type='address',
                                   value=address,
                                   note='Capitol Office')

            if home_phone != "":
                leg.add_contact_detail(type='voice',
                                       value=home_phone,
                                       note='District Office')

            if home_address_total != "":
                leg.add_contact_detail(type='address',
                                       value=home_address_total,
                                       note='District Office')

            yield leg
        except scrapelib.HTTPError as e:
            # Best-effort: a broken member page shouldn't kill the scrape.
            self.warning(str(e))
Example #39
0
    def _scrape_upper(self, roster_page, roster_url):
        """
        Retrieves a list of members of the upper legislative chamber.

        Yields one ``Person`` per senator listed in a "memdir" table on
        *roster_page*, with capitol/district office contact details.
        """
        # TODO: photo_urls https://senate.texas.gov/members.php
        #       also available on individual member screens
        # TODO: email addresses could be scraped from secondary sources
        #       https://github.com/openstates/openstates/issues/1292

        # One "memdir" table per senator on the roster page.
        for tbl in roster_page.xpath('//table[@class="memdir"]'):
            # Scrape legislator information from roster URL
            leg_a = tbl.xpath('.//a')[0]
            name = leg_a.text
            # Skip vacant districts
            if re.search(r'district \d+ constituent services', name,
                         re.IGNORECASE):
                continue
            leg_url = leg_a.get('href')
            # District numbers are zero-padded in the source; strip the pad.
            district = tbl.xpath(
                './/span[contains(text(), "District:")]')[0].tail.lstrip('0')
            party = tbl.xpath('.//span[contains(text(), "Party:")]')[0].tail

            if party == 'Democrat':
                party = 'Democratic'

            # Create Person object
            person = Person(name=name,
                            district=district,
                            party=party,
                            primary_org='upper')
            person.add_link(leg_url)

            # Scrape office contact information from roster URL
            office_num = 1
            for addr in tbl.xpath('.//td[@headers]'):
                fax = phone = None
                lines = [addr.text]
                for child in addr.getchildren():
                    # when we get to span tag we just ingested a phone #
                    # (the TEL/FAX label follows the number it labels, so
                    # pop the most recently collected line back off).
                    if child.tag == 'span' and child.text:
                        if 'TEL' in child.text:
                            phone = lines.pop()
                        elif 'FAX' in child.text:
                            fax = lines.pop()
                    elif child.tail:
                        lines.append(child.tail)

                # Everything not claimed as TEL/FAX is the street address.
                address = '\n'.join(line.strip() for line in lines if line)
                # Cells whose @headers mention CAP are capitol offices (a
                # senator can have several); the rest are district offices.
                if 'CAP' in addr.get('headers'):
                    office_name = 'Capitol Office #{}'.format(office_num)
                    office_num += 1
                else:
                    office_name = 'District Office'

                # Add office contact information to Person object
                if address:
                    person.add_contact_detail(type='address',
                                              value=address,
                                              note=office_name)
                if phone:
                    person.add_contact_detail(type='voice',
                                              value=phone,
                                              note=office_name)
                if fax:
                    person.add_contact_detail(type='fax',
                                              value=fax,
                                              note=office_name)

            # Add source links to Person object
            person.add_source(roster_url)
            person.add_source(leg_url)
            yield person
Example #40
0
    def scrape_legislator(self, name, chamber, url, contact_page):
        """Scrape one South Dakota legislator's detail page; yield a Person.

        ``contact_page`` is the shared contact roster document, used to
        recover the email address that was removed from detail pages.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        party = doc.xpath("string(//span[contains(@id, 'Party')])").strip()
        if party == 'Democrat':
            party = 'Democratic'

        # Districts are zero-padded in the source markup.
        district = doc.xpath(
            "string(//span[contains(@id, 'District')])").strip().lstrip('0')

        occupation = doc.xpath(
            "string(//span[contains(@id, 'Occupation')])").strip()

        # Exactly one member portrait is expected on the page.
        (photo_url, ) = doc.xpath('//img[contains(@id, "_imgMember")]/@src')

        capitol_phone = doc.xpath(
            "string(//span[contains(@id, 'CapitolPhone')])").strip()

        legislator = Person(primary_org=chamber,
                            image=photo_url,
                            name=name,
                            party=party,
                            district=district
                            )
        legislator.extras['occupation'] = occupation

        if capitol_phone:
            legislator.add_contact_detail(
                type='voice', value=capitol_phone, note='Capitol Office')

        # SD removed email from the detail pages but it's still in the
        # contact page, shared for all congress people
        member_id = re.search(r'Member=(\d+)', url).group(1)

        # Locate this member's profile block on the contact page via the
        # link back to their detail page, then pull the adjacent mailto.
        profile_links = contact_page.xpath(
            '//ul[@id="contact-list"]//a[contains(@href, "Member=%s")]' % (member_id,))
        if profile_links:
            profile_block = profile_links[0].getparent().getparent().getparent()
            email_links = profile_block.xpath(
                './span/span/a[@class="mail-break"]')
            if email_links:
                email = email_links[0].text.strip()
                if email:
                    legislator.add_contact_detail(type='email',
                                                  value=email,
                                                  note='Capitol Office')

        home_lines = [
            line.strip() for line in
            doc.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
            if line.strip()
        ]
        if home_lines:
            legislator.add_contact_detail(type='address',
                                          value="\n".join(home_lines),
                                          note='District Office')
            home_phone = doc.xpath(
                "string(//span[contains(@id, 'HomePhone')])").strip()
            if home_phone:
                legislator.add_contact_detail(type='voice',
                                              value=home_phone,
                                              note='District Office')

        legislator.add_source(url)
        legislator.add_link(url)

        for committee in doc.xpath(
                '//div[@id="divCommittees"]/span/section/table/tbody/tr/td/a'):
            self.scrape_committee(legislator, url, committee, chamber)

        yield legislator
Example #41
0
    def scrape(self, session=None):
        """Yield a Person for each active NJ legislator in the roster DB.

        Reads the Access-database "Roster" and "LegBio" tables (via
        ``access_to_csv``) for *session*, defaulting to the most recent
        legislative session when none is supplied.
        """
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["name"]
            self.info("no session specified, using %s", session)

        year_abr = session[0:4]

        self._init_mdb(int(year_abr))

        roster_csv = self.access_to_csv("Roster")
        bio_csv = self.access_to_csv("LegBio")

        # Map roster key -> portrait URL from the bio table.
        photos = {}
        for rec in bio_csv:
            photos[rec["Roster Key"]] = rec["URLPicture"]

        for rec in roster_csv:
            last_name = rec["LastName"]

            # Assemble the display name from its non-empty parts.  The old
            # concatenate-then-drop-last-character approach silently
            # truncated non-empty suffixes (e.g. "Jr." became "Jr").
            name_parts = (rec["Firstname"], rec["MidName"],
                          last_name, rec["Suffix"])
            full_name = " ".join(part for part in name_parts if part)

            district = str(int(rec["District"]))
            # Expand party letters; unknown values pass through unchanged.
            party = {"R": "Republican", "D": "Democratic"}.get(
                rec["Party"], rec["Party"])

            chamber = rec["House"]
            if chamber == "A":
                chamber = "lower"
            elif chamber == "S":
                chamber = "upper"

            # skip Deceased/Retired members
            if rec["LegStatus"] != "Active":
                continue

            phone = rec["Phone"] or None
            email = rec["Email"] or None

            # Email has been removed from the Access DB, but it's
            # still [email protected] and [email protected] - many
            # reps have these emails on their personal pages even if
            # they're gone from the DB file
            if not email:
                email = self._construct_email(chamber, rec["Sex"], last_name)

            try:
                photo_url = photos[rec["Roster Key"]]
            except KeyError:
                photo_url = ""
                self.warning("no photo url for %s", rec["Roster Key"])
            url = "http://www.njleg.state.nj.us/members/bio.asp?Leg=" + str(
                int(rec["Roster Key"]))
            address = "{0}\n{1}, {2} {3}".format(rec["Address"], rec["City"],
                                                 rec["State"], rec["Zipcode"])
            gender = {"M": "Male", "F": "Female"}[rec["Sex"]]

            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
                image=photo_url,
                gender=gender,
            )

            person.add_link(url)
            person.add_source(url)
            person.add_source("http://www.njleg.state.nj.us/downloads.asp")

            person.add_contact_detail(type="address",
                                      value=address,
                                      note="District Office")
            if phone is not None:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="District Office")
            if email is not None:
                person.add_contact_detail(type="email",
                                          value=email,
                                          note="District Office")

            yield person
Example #42
0
    def scrape_member(self, chamber, member_url):
        """Scrape a single member detail page and return a Person.

        Unlike the other scrapers here this *returns* the Person (it does
        not yield); callers are expected to yield it themselves.
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
        full_name = root.xpath('//h1/span')[0].tail.strip()

        try:
            email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
            email = email.replace('mailto:', '')
        except IndexError:
            # No mailto link on the page; the seat is probably vacant.
            # (Narrowed from a bare `except:` that hid real errors.)
            email = ''
            self.info("seat may be vacant")

        party, district = root.xpath('//h1/span')[1].text.split('-')
        party = party.strip()
        district = clean_district(district.strip())

        if party in ('D', 'Democrat', 'Democratic'):
            party = 'Democratic'
        elif party in ('R', 'Republican'):
            party = 'Republican'
        else:
            party = 'Other'

        leg = Person(primary_org=chamber,
                     district=district,
                     name=full_name,
                     party=party,
                     image=photo_url)
        leg.add_link(member_url)
        leg.add_source(member_url)

        leg.add_contact_detail(type='email',
                               value=email,
                               note='District Office')

        # offices
        for addr in root.xpath('//address/div[@class="contactGroup"]'):
            office_name = addr.xpath(
                '../preceding-sibling::h4/text()')[0].strip()
            if 'District' in office_name:
                note = 'District Office'
            elif 'State' in office_name:
                note = 'Capitol office'
            else:
                # Previously `note` was left unbound here, raising NameError
                # for any other heading; fall back to the heading text.
                note = office_name
            try:
                address = addr.xpath('a')[0].text_content()
                # Collapse runs of whitespace into line breaks (raw string
                # so `\s` is a regex class, not an escape).
                address = re.sub(r'\s{2,}', '\n', address)
                leg.add_contact_detail(type='address',
                                       value=address,
                                       note=note)
            except IndexError:
                self.warning("No address info found in `contactGroup`")

            # "Phone:"/"Fax:" label rows are followed by a row containing
            # the number itself; remember which type we expect next.
            # (Renamed from `next`, which shadowed the builtin.)
            pending = None
            for phonerow in addr.xpath('./div/div'):
                phonerow = phonerow.text_content().strip()
                if phonerow == 'Phone:':
                    pending = 'voice'
                elif phonerow == 'Fax:':
                    pending = 'fax'
                elif pending == 'voice':
                    leg.add_contact_detail(type='voice',
                                           value=phonerow,
                                           note=note)
                    pending = None
                elif pending == 'fax':
                    leg.add_contact_detail(type='fax',
                                           value=phonerow,
                                           note=note)
                    pending = None
                else:
                    self.warning('unknown phonerow %s', phonerow)

        return leg
Example #43
0
    def scrape_table(self, chamber, tbl):
        """Yield a Person for each legislator row of a roster table *tbl*."""
        # skip the header row
        for row in tbl.xpath('tr')[1:]:
            leg_a, district, _, _ = row.xpath('td')
            district = district.text
            name = leg_a.text_content().strip()
            if name.lower() == "to be announced":
                continue
            leg_url = leg_a.xpath('a/@href')[0]

            # Fetch the member's detail page.
            html = self.get(leg_url).text
            ldoc = lxml.html.fromstring(html)
            ldoc.make_links_absolute(leg_url)

            party = _get_table_item(ldoc, 'Party Affiliation:').text
            if party == 'Democrat':
                party = 'Democratic'

            addr_lines = _get_table_item(ldoc,
                                         'Annapolis Address:').xpath('text()')
            # Separate the embedded "Phone:" line from the street address.
            # `phone` starts as None so a listing with no phone line no
            # longer raises NameError below.
            phone = None
            address = []
            for line in addr_lines:
                if 'Phone:' in line:
                    phone = line
                else:
                    address.append(line)
            address = '\n'.join(address)

            if phone:
                found = re.findall(r'Phone: (\d{3}-\d{3}-\d{4})', phone)
                phone = found[0] if found else None
            if not phone:
                self.warning("Missing phone!")

            email = ldoc.xpath('//a[contains(@href, "mailto:")]/text()')
            if not email:
                email = None
            elif len(email) == 1:
                email = email[0].strip()
            else:
                raise AssertionError('Multiple email links found on page')

            # Reset per row: previously a member without a photo reused the
            # previous member's photo_url (or crashed on the first row).
            img_src = ldoc.xpath('//img[@class="sponimg"]/@src')
            photo_url = img_src[0] if img_src else ''

            leg = Person(primary_org=chamber,
                         district=district,
                         name=name,
                         party=party,
                         image=photo_url)
            leg.add_source(url=leg_url)
            leg.add_link(url=leg_url)

            # type ['address', 'email', 'url', 'fax', 'text', 'voice', 'video', 'pager', 'textphone']
            if address:
                leg.add_contact_detail(type='address',
                                       value=address,
                                       note='Capitol Office')
            if phone:
                leg.add_contact_detail(type='voice',
                                       value=phone,
                                       note='Capitol Office')

            if email:
                leg.add_contact_detail(type='email',
                                       value=email,
                                       note='Capitol Office')

            yield leg
Example #44
0
    def scrape_chamber(self, chamber):
        """Scrape all SC members of *chamber* ('upper' or 'lower').

        Yields each committee Organization once (the first time it is
        seen on any member page) and then the Person, with memberships
        linking them.
        """
        if chamber == 'lower':
            url = 'http://www.scstatehouse.gov/member.php?chamber=H'
        else:
            url = 'http://www.scstatehouse.gov/member.php?chamber=S'

        # Committees already yielded, keyed by name, so memberships on
        # later members can reuse the same Organization object.
        seen_committees = {}

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        # Each member link on the roster carries a "code=" query param.
        for a in doc.xpath('//a[contains(@href, "code=")]'):
            full_name = a.text
            leg_url = a.get('href')

            leg_html = self.get(leg_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(leg_url)

            if 'Resigned effective' in leg_html:
                self.info('Resigned')
                continue

            # The page has no semantic markup; elements are matched by
            # their exact inline style attributes.
            party, district, _ = leg_doc.xpath(
                '//p[@style="font-size: 17px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')

            if 'Republican' in party:
                party = 'Republican'
            elif 'Democrat' in party:
                party = 'Democratic'

            # District # - County - Map
            district = district.split()[1]
            try:
                photo_url = leg_doc.xpath(
                    '//img[contains(@src,"/members/")]/@src')[0]
            except IndexError:
                self.warning("No Photo URL for {}".format(full_name))
                photo_url = ''
            person = Person(name=full_name,
                            district=district,
                            party=party,
                            primary_org=chamber,
                            image=photo_url)

            # office address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left; width: 225px;'
                    ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
                capitol_address = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                capitol_phone = phone.strip()

                if capitol_address:
                    person.add_contact_detail(type='address',
                                              value=capitol_address,
                                              note='Capitol Office')

                if capitol_phone:
                    person.add_contact_detail(type='voice',
                                              value=capitol_phone,
                                              note='Capitol Office')
            except IndexError:
                # Best-effort: missing capitol contact info is non-fatal.
                self.warning('no capitol address for {0}'.format(full_name))

            # home address / phone
            try:
                addr_div = leg_doc.xpath(
                    '//div[@style="float: left;'
                    ' width: 225px; margin: 10px 0 0 20px;"]')[0]
                addr = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

                phone = addr_div.xpath(
                    'p[@style="font-size: 13px;'
                    ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
                phone = phone.strip()
                if addr:
                    person.add_contact_detail(type='address',
                                              value=addr,
                                              note='District Office')

                if phone:
                    person.add_contact_detail(type='voice',
                                              value=phone,
                                              note='District Office')
            except IndexError:
                self.warning('no district address for {0}'.format(full_name))

            person.add_link(leg_url)
            person.add_source(url)
            person.add_source(leg_url)

            # committees (skip first link)
            for com in leg_doc.xpath(
                    '//a[contains(@href, "committee.php")]')[1:]:
                # A trailing ", " on the link text means a role suffix
                # follows the committee name (e.g. "Judiciary, V.C.").
                if com.text.endswith(', '):
                    committee, role = com.text_content().rsplit(', ', 1)

                    # known roles
                    role = {
                        'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        'Co 1st V.C.': 'co-first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'
                    }[role]
                else:
                    committee = com.text
                    role = 'member'

                # only yield each committee once
                if committee not in seen_committees:
                    com = Organization(name=committee,
                                       classification='committee',
                                       chamber=chamber)
                    com.add_source(url)
                    seen_committees[committee] = com
                    yield com
                else:
                    com = seen_committees[committee]

                person.add_membership(com, role=role)

            yield person
Example #45
0
    def _parse_person(self, row, chamber, seat_map):
        """Build a Person from one NH roster row.

        Returns None (after a warning) for rows whose district is 0.
        ``seat_map`` maps house seat numbers to profile-photo identifiers.
        """
        # Capture legislator vitals.
        given = row["FirstName"]
        middle = row["MiddleName"]
        family = row["LastName"]
        full_name = re.sub(r"[\s]{2,}", " ",
                           "{} {} {}".format(given, middle, family))

        if chamber == "lower":
            # House districts are qualified by county.
            district = "{} {}".format(row["County"],
                                      int(row["District"])).strip()
        else:
            district = str(int(row["District"])).strip()

        party = self.party_map[row["party"].upper()]
        email = row["WorkEmail"]

        if district == "0":
            self.warning("Skipping {}, district is set to 0".format(full_name))
            return

        person = Person(primary_org=chamber,
                        district=district,
                        name=full_name,
                        party=party)
        person.extras = {
            "first_name": given,
            "middle_name": middle,
            "last_name": family,
        }

        if email:
            # A state-domain address counts as the capitol office contact.
            office = "Capitol" if email.endswith(
                "@leg.state.nh.us") else "District"
            person.add_contact_detail(type="email",
                                      value=email,
                                      note=office + " Office")

        # Capture legislator office contact information.
        district_address = "{}\n{}\n{}, {} {}".format(
            row["Address"], row["address2"], row["city"],
            row["State"], row["Zipcode"]).strip()
        phone = row["Phone"].strip() or None

        if district_address:
            office = "Capitol" if chamber == "upper" else "District"
            person.add_contact_detail(type="address",
                                      value=district_address,
                                      note=office + " Office")
        if phone:
            # The 271 exchange is the Concord state-house switchboard.
            office = "Capitol" if "271-" in phone else "District"
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note=office + " Office")

        # Retrieve legislator portrait.
        profile_url = None
        if chamber == "upper":
            profile_url = self.senate_profile_url.format(row["District"])
        elif chamber == "lower":
            try:
                profile_url = self.house_profile_url.format(
                    seat_map[row["seatno"]])
            except KeyError:
                # Unknown seat number: simply skip the portrait.
                pass

        if profile_url:
            person.image = self._get_photo(profile_url, chamber)
            person.add_source(profile_url)

        return person
Example #46
0
    def scrape_legislators(self, url, chamber):
        """Yield a Person per row of the legislator roster CSV at *url*."""
        raw = self.get(url).text
        raw = raw.replace('"""', '"')  # weird triple quotes
        rows = raw.splitlines()

        fieldnames = ['last_name', 'first_name', 'party', 'district',
                      'address', 'city', 'state', 'zip']
        reader = csv.DictReader(rows, fieldnames)

        district_leg_urls = self._district_legislator_dict()

        # Toss the row headers.
        next(reader)

        for entry in reader:
            if not entry:
                continue

            # District comes as e.g. "HD 12" / "SD 3".
            hd_or_sd, district = entry['district'].split()

            party = {'D': 'Democratic', 'R': 'Republican'}[entry['party']]

            # Get full name properly capped.
            fullname = '%s %s' % (entry['first_name'].title(),
                                  entry['last_name'].title())

            legislator = Person(name=fullname, primary_org=chamber, district=district,
                                party=party, image=entry.get('photo_url', ''))
            legislator.add_source(url)

            # Get any info at the legislator's detail_url.
            deets = {}
            try:
                detail_url = district_leg_urls[hd_or_sd][district]
                deets = self._scrape_details(detail_url)
            except KeyError:
                self.warning(
                    "Couldn't find legislator URL for district {} {}, likely retired; skipping"
                    .format(hd_or_sd, district)
                )
                continue
            except NoDetails:
                self.logger.warning("No details found at %r" % detail_url)
                continue
            else:
                legislator.add_source(detail_url)
                legislator.add_link(detail_url)

            # District office address from the CSV itself.
            address = '\n'.join([
                entry['address'],
                '%s, %s %s' % (entry['city'].title(), entry['state'],
                               entry['zip']),
            ])
            legislator.add_contact_detail(type='address', value=address, note='District Office')

            # Phone/fax/email come from the scraped detail page, when present.
            for cd_type, key in (('voice', 'phone'),
                                 ('fax', 'fax'),
                                 ('email', 'email')):
                value = deets.get(key)
                if value:
                    legislator.add_contact_detail(type=cd_type, value=value,
                                                  note='District Office')

            yield legislator
Example #47
0
    def scrape_legislator(self, chamber, url):
        """Scrape a single New Mexico legislator's detail page.

        :param chamber: 'upper' (Senate) or 'lower' (House); also used as
            the person's primary organization.
        :param url: the legislator's detail-page URL.

        Yields a Person with party, district, photo, email, and the
        Capitol / district office contact details found on the page.
        Returns early (yielding nothing) for vacant seats.
        """
        # Initialize default values for legislator attributes.
        full_name = None
        party = None
        photo_url = None
        email = None
        capitol_address = None
        capitol_phone = None
        district = None
        district_address = None
        district_phone = None

        # The name header is prefixed with a title that we strip off below.
        if chamber == 'upper':
            title_prefix = 'Senator '
        elif chamber == 'lower':
            title_prefix = 'Representative '
        else:
            title_prefix = ''

        # Fallback when a phone number on the page omits its area code.
        santa_fe_area_code = '(505)'

        page = self.lxmlize(url)

        info_node = self.get_node(
            page, '//table[@id="MainContent_formViewLegislator"]')
        if info_node is None:
            raise ValueError('Could not locate legislator data.')

        district_node = self.get_node(
            info_node,
            './/a[@id="MainContent_formViewLegislator_linkDistrict"]')
        if district_node is not None:
            district = district_node.text.strip()

        name_node = self.get_node(
            page, './/span[@id="MainContent_formViewLegislatorName'
            '_lblLegislatorName"]')

        if name_node is not None:
            if name_node.text.strip().endswith(' Vacant'):
                self.warning(
                    'Found vacant seat for {} district {}; skipping'.format(
                        chamber, district))
                return

            # Header looks like "Senator Jane Doe - (D)"; split off the
            # trailing party marker.
            n_head, n_sep, n_party = name_node.text.rpartition(' - ')

            full_name = re.sub(r'^{}'.format(title_prefix), '', n_head.strip())

            if '(D)' in n_party:
                party = 'Democratic'
            elif '(R)' in n_party:
                party = 'Republican'
            elif '(DTS)' in n_party:
                # decline to state = independent
                party = 'Independent'
            else:
                # Report the unrecognized marker itself; the previous code
                # formatted `party`, which is always None on this path.
                raise AssertionError('Unknown party {} for {}'.format(
                    n_party, full_name))

        photo_node = self.get_node(
            info_node,
            './/img[@id="MainContent_formViewLegislator_imgLegislator"]')
        if photo_node is not None:
            photo_url = photo_node.get('src')

        email_node = self.get_node(
            info_node, './/a[@id="MainContent_formViewLegislator_linkEmail"]')
        if email_node is not None and email_node.text:
            email = email_node.text.strip()

        capitol_address_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblCapitolRoom"]')
        if capitol_address_node is not None:
            capitol_address_text = capitol_address_node.text
            if capitol_address_text is not None:
                capitol_address = 'Room {} State Capitol\nSanta Fe, NM 87501'\
                    .format(capitol_address_text.strip())

        capitol_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblCapitolPhone"]')
        if capitol_phone_node is not None:
            capitol_phone_text = capitol_phone_node.text
            if capitol_phone_text:
                capitol_phone_text = capitol_phone_text.strip()
                area_code, phone = extract_phone_number(capitol_phone_text)
                if phone:
                    capitol_phone = '{} {}'.format(
                        area_code.strip() if area_code else santa_fe_area_code,
                        phone)

        district_address_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblAddress"]')
        if district_address_node is not None:
            district_address = '\n'.join(district_address_node.xpath('text()'))

        office_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblOfficePhone"]')

        home_phone_node = self.get_node(
            info_node,
            './/span[@id="MainContent_formViewLegislator_lblHomePhone"]')

        # Prefer the office phone; fall back to the home phone.
        if office_phone_node is not None and office_phone_node.text:
            district_phone_text = office_phone_node.text
        elif home_phone_node is not None and home_phone_node.text:
            district_phone_text = home_phone_node.text
        else:
            district_phone_text = None
        if district_phone_text:
            d_area_code, d_phone = extract_phone_number(district_phone_text)
            # Guard against unparseable numbers and missing area codes the
            # same way the Capitol number is handled above; the previous
            # code crashed on `None.strip()` when no area code was present.
            if d_phone:
                district_phone = '{} {}'.format(
                    d_area_code.strip() if d_area_code else santa_fe_area_code,
                    d_phone)

        person = Person(name=full_name,
                        district=district,
                        party=party,
                        primary_org=chamber,
                        image=photo_url)
        # Only attach the contact details that were actually found.
        if district_address:
            person.add_contact_detail(type='address',
                                      value=district_address,
                                      note='District Office')
        if district_phone:
            person.add_contact_detail(type='voice',
                                      value=district_phone,
                                      note='District Office')
        if capitol_address:
            person.add_contact_detail(type='address',
                                      value=capitol_address,
                                      note='Capitol Office')
        if capitol_phone:
            person.add_contact_detail(type='voice',
                                      value=capitol_phone,
                                      note='Capitol Office')
        if email:
            person.add_contact_detail(type='email',
                                      value=email,
                                      note='Capitol Office')

        person.add_link(url)
        person.add_source(url)

        yield person
    def scrape(self):
        """Scrape Chicago City Council members and committees.

        Merges two data sources: the Legistar API (terms of office and
        committee memberships) and the scraped Legistar web UI (ward
        numbers, photos, contact details).  Yields committee
        Organizations first, then every Person collected.
        """
        body_types = self.body_types()

        # The trailing comma unpack asserts exactly one 'City Council' body.
        city_council, = [body for body in self.bodies()
                         if body['BodyName'] == 'City Council']

        # Group each member's office records (terms) by full name,
        # skipping vacancy placeholder records.
        terms = collections.defaultdict(list)
        for office in self.body_offices(city_council):
            if 'VACAN' not in office['OfficeRecordFullName']:
                terms[office['OfficeRecordFullName'].strip()].append(office)

        # Scrape the web UI for per-member details the API does not expose.
        web_scraper = LegistarPersonScraper(None,None)
        web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
        web_scraper.ALL_MEMBERS = '3:3'

        web_info = {}
        for member, _ in web_scraper.councilMembers({'ctl00$ContentPlaceHolder$lstName' : 'City Council'}):
            web_info[member['Person Name']['label']] = member

        # Members missing from the web UI: fake up defaultdict entries so
        # the lookups below return None instead of raising.
        web_info['Balcer, James'] = collections.defaultdict(lambda : None)
        web_info['Fioretti, Bob'] = collections.defaultdict(lambda : None)
        web_info['Balcer, James']['Ward/Office'] = 11
        web_info['Fioretti, Bob']['Ward/Office'] = 2

        members = {}
        for member, offices in terms.items():
            web = web_info[member]
            p = Person(member)
            for term in offices:
                # NOTE(review): `role` is assigned but never used here.
                role = term['OfficeRecordTitle']
                p.add_term('Alderman',
                           'legislature',
                           district = "Ward {}".format(int(web['Ward/Office'])),
                           start_date = self.toDate(term['OfficeRecordStartDate']),
                           end_date = self.toDate(term['OfficeRecordEndDate']))

            if web['Photo'] :
                p.image = web['Photo']

            # Map web-UI column names to pupa contact-detail (type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if web[contact_type] and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value= web[contact_type],
                                         note=_note)

            if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['label'],
                                     note='E-mail')


            if web['Website']:
                p.add_link(web['Website']['url'])

            # NOTE(review): `term` here is the last office record left over
            # from the loop above -- presumably intentional (most recent
            # term); confirm before refactoring.
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')


            members[member] = p

        # Standing committees: build Organizations and attach memberships.
        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name' : 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                for office in self.body_offices(body):
                    # messed up record for joanna thompson
                    if office['OfficeRecordId'] == 1055:
                        continue

                    role = office['OfficeRecordTitle']
                    if role not in ("Vice Chair", "Chairman"):
                        role = 'Member'

                    # Committee members who are not aldermen get a minimal
                    # Person carrying only source URLs.
                    person = office['OfficeRecordFullName'].strip()
                    if person in members:
                        p = members[person]
                    else:
                        p = Person(person)

                        source_urls = self.person_sources_from_office(office)
                        person_api_url, person_web_url = source_urls
                        p.add_source(person_api_url, note='api')
                        p.add_source(person_web_url, note='web')

                        members[person] = p

                    p.add_membership(body['BodyName'],
                                     role=role,
                                     start_date = self.toDate(office['OfficeRecordStartDate']),

                                     end_date = self.toDate(office['OfficeRecordEndDate']))


                yield o

        # Joint committees carry no memberships here; just emit them.
        for body in self.bodies():
            if body['BodyTypeId'] == body_types['Joint Committee']:
                o = Organization(body['BodyName'],
                                 classification='committee',
                                 parent_id={'name' : 'Chicago City Council'})

                o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
                o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

                yield o

        # Finally emit every person (aldermen and committee-only members).
        for p in members.values():
            yield p
Example #49
0
    def scrape_upper(self, chamber):
        """Yield a Person for each sitting Michigan state senator.

        Parses the senator roster table, then follows each member's
        "Contact Me" page to recover an email address, which no longer
        appears on the roster itself.
        """
        url = 'http://www.senate.michigan.gov/senatorinfo_list.html'
        doc = lxml.html.fromstring(self.get(url).text)

        for row in doc.xpath('//table[not(@class="calendar")]//tr')[3:]:
            # Real data rows carry exactly seven cells.
            if len(row) != 7:
                continue

            (party_cell, dist_cell, member_cell, contact_cell,
             phone_cell, fax_cell, loc_cell) = row.getchildren()

            # Skip filler rows and the Lieutenant Governor's entry.
            if not party_cell.text_content().strip():
                continue
            if 'Lieutenant Governor' in member_cell.text_content():
                continue

            party = abbr[party_cell.text]
            district = dist_cell.text_content().strip()
            name = re.sub(r'\s+', " ", member_cell.text_content().strip())

            if name == 'Vacant':
                self.info('district %s is vacant', district)
                continue

            leg_url = member_cell.xpath('a/@href')[0]

            # Expand the abbreviated building names into full addresses.
            office_loc = loc_cell.text
            office_loc = re.sub(
                ' Farnum Bldg',
                ' Farnum Office Building\n125 West Allegan Street\nLansing, MI 48933',
                office_loc
            )
            office_loc = re.sub(
                ' Capitol Bldg',
                ' State Capitol Building\nLansing, MI 48909',
                office_loc
            )

            # Email only appears on the "Contact Me" page; one row's link
            # text carries the source-data typo "Conact Me".
            contact_url = [
                a for a in row.xpath(".//a")
                if a.text in ('Contact Me', 'Conact Me')][0].get('href')
            contact_doc = lxml.html.fromstring(self.get(contact_url).text)

            email = None
            header_links = contact_doc.xpath("//a[@class='header_email']")
            if header_links:
                email = header_links[0].text
            else:
                # Not the usual template; fall back to any mailto: link
                # dropped somewhere on the page.
                mailto_links = [a for a in (contact_doc.xpath('//a') or [])
                                if 'mailto:' in (a.get('href') or '')]
                if mailto_links:
                    email = mailto_links[0].text

            person = Person(name=name, district=district, party=party,
                            primary_org='upper')
            person.add_link(leg_url)
            person.add_source(leg_url)

            person.add_contact_detail(type='address', value=office_loc,
                                      note='Capitol Office')
            person.add_contact_detail(type='voice', value=phone_cell.text,
                                      note='Capitol Office')
            person.add_contact_detail(type='fax', value=fax_cell.text,
                                      note='Capitol Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='Capitol Office')

            yield person
Example #50
0
    def scrape_member(self, chamber, member_url):
        """Scrape one Arkansas legislator detail page and yield a Person.

        The page's first <h1> carries "<Title> <Name> <(Party)>"; the
        title also reveals the member's actual chamber (covering the
        Representative-Elect / Senator-Elect cases), so the `chamber`
        argument may be overridden.
        """
        page = self.get(member_url).text
        root = lxml.html.fromstring(page)

        name_and_party = root.xpath(
            'string(//div[@class="col-md-12"]/h1[1])').split()

        title = name_and_party[0]
        # Account for Representative-Elect and Senator-Elect, for incoming class
        if title.startswith("Representative"):
            chamber = "lower"
        elif title.startswith("Senator"):
            chamber = "upper"

        full_name = " ".join(name_and_party[1:-1])

        party = name_and_party[-1]

        if party == "(R)":
            party = "Republican"
        elif party == "(D)":
            party = "Democratic"
        elif party == "(G)":
            party = "Green"
        elif party == "(I)":
            party = "Independent"
        elif "-Elect" in title and not party.startswith("("):
            # Member-elect with no party marker yet: the last token is
            # part of the name, not a party.
            self.warning("Member-elect is currently missing a party")
            full_name = " ".join(name_and_party[1:])
            party = ""
        else:
            raise AssertionError("Unknown party ({0}) for {1}".format(
                party, full_name))

        try:
            img = root.xpath('//img[@class="SitePhotos MemberPhoto"]')[0]
            photo_url = "https://www.arkleg.state.ar.us" + img.attrib["src"]
        except IndexError:
            self.warning("No member photo found")
            photo_url = ""

        # Need to figure out a cleaner method for this later
        # info_box = root.xpath('string(//div[@id="bodyContent"]/div[2]/div[2])')
        try:
            district = root.xpath(
                'string(//div[@id="bodyContent"]/div[2]/div[2]/div[3]/div[2])')
        except AttributeError:
            # NOTE(review): a string() XPath should not raise
            # AttributeError, so this guard looks dead; kept for safety.
            self.warning("Member has no district listed; skipping them")
            return

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        person.add_link(member_url)
        person.add_source(member_url)

        # Blank cells come back as empty strings from the string() XPaths;
        # normalize those to None so only real contact details are emitted.
        # (Replaces the previous raise-AttributeError-as-control-flow.)
        phone = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[1]/div[2]/a)'
        )
        if not phone.strip():
            phone = None
        email = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[2]/div[2]/a)'
        )
        if not email.strip():
            email = None
        address = root.xpath(
            'string(//div[@id="bodyContent"]/div[1]/div[1]/p/b)')

        person.add_contact_detail(type="address",
                                  value=address,
                                  note="District Office")
        if phone is not None:
            person.add_contact_detail(type="voice",
                                      value=phone,
                                      note="District Office")
        if email is not None:
            person.add_contact_detail(type="email",
                                      value=email,
                                      note="District Office")

        occupation_check = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[1]/b)'
        )
        if occupation_check == "Occupation:":
            occupation = root.xpath(
                'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[2])'
            )
            # Only record a non-blank value; the previous code stored a
            # blank string in extras before bailing out, leaving it behind.
            if occupation.strip():
                person.extras["occupation"] = occupation

        yield person
    def scrape_chamber(self, chamber=None):
        """Scrape all legislators listed for `chamber` and yield Person objects.

        Relies on `scrape_leg_page` to pre-parse each member's detail
        page into a dict (name, district, party, image, contact fields,
        and optionally committees under 'ctty').
        """
        metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
        for leg in metainf:
            try:
                chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
            except KeyError:
                # Known-bad legislator page upstream; skip it loudly.
                print("")
                print("  ERROR: Bad Legislator page.")
                print("    -> " + "\n    -> ".join(leg['source']))
                print("")
                print("  Added this workaround because of a bad legislator")
                print("  page, while they filled their info out.")
                print("")
                print("  Emailed webmaster. Told to wait.")
                print("   - PRT, Jun 23, 2014")
                print("")
                continue

            person = Person(name=leg['name'],
                            district=leg['district'],
                            party=leg['party'],
                            primary_org=chamber,
                            image=leg['image'])

            for source in leg['source']:
                person.add_source(source)

            try:
                # Committees named 'Joint Legislative ...' belong to the
                # joint chamber; everything else to the member's chamber.
                for ctty in leg['ctty']:
                    flag = 'Joint Legislative'
                    if ctty['name'][:len(flag)] == flag:
                        ctty_chamber = "joint"
                    else:
                        ctty_chamber = chamber

                    # NOTE(review): `comm` is never yielded or saved, so
                    # these memberships may be silently dropped -- confirm
                    # how this scraper persists Organization objects.
                    comm = Organization(name=ctty['name'],
                                        classification="committee",
                                        chamber=ctty_chamber)
                    comm.add_member(person, role="member")

            except KeyError:
                # Member dict has no 'ctty' entry at all.
                self.warn("%s has no scraped Committees" % leg['name'])

            person.add_link(leg['homepage'])

            # Only attach contact details that are actually present.
            if leg['addr']:
                person.add_contact_detail(type='address',
                                          value=leg['addr'],
                                          note='Capitol Office')
            if leg['phone']:
                person.add_contact_detail(type='voice',
                                          value=leg['phone'],
                                          note='Capitol Office')
            if leg['email']:
                person.add_contact_detail(type='email',
                                          value=leg['email'],
                                          note='Capitol Office')
            if leg['fax']:
                person.add_contact_detail(type='fax',
                                          value=leg['fax'],
                                          note='Capitol Office')
            yield person
Example #52
0
    def scrape(self, session=None):
        """Yield a Person for every active New Jersey legislator.

        Reads the Roster and LegBio tables out of the legislature's
        Access database for the session's year; Deceased/Retired
        members are skipped.
        """
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]['name']
            self.info('no session specified, using %s', session)

        # The database is keyed by the session's four-digit year.
        self._init_mdb(session[0:4])

        roster_csv = self.access_to_csv('Roster')
        bio_csv = self.access_to_csv('LegBio')

        # Roster Key -> photo URL, from the bio table.
        photos = {rec['Roster Key']: rec['URLPicture'] for rec in bio_csv}

        for rec in roster_csv:
            # Join the name parts, collapse the doubled space left by an
            # empty middle name, then drop the final character (a
            # trailing space when the suffix is blank -- presumably the
            # common case; verify for suffixed names).
            full_name = " ".join((rec["Firstname"], rec["MidName"],
                                  rec["LastName"], rec["Suffix"]))
            full_name = full_name.replace('  ', ' ')[:-1]

            district = str(int(rec["District"]))
            party = {'R': "Republican", 'D': "Democratic"}.get(
                rec["Party"], rec["Party"])
            chamber = {'A': "lower", 'S': "upper"}.get(
                rec["House"], rec["House"])

            # skip Deceased/Retired members
            if rec["LegStatus"] != 'Active':
                continue

            phone = rec["Phone"] or None

            # Email has been removed from the Access DB, but it's
            # still [email protected] and [email protected] - many
            # reps have these emails on their personal pages even if
            # they're gone from the DB file
            email = rec["Email"] or self._construct_email(chamber,
                                                          rec["LastName"])

            roster_key = rec['Roster Key']
            if roster_key in photos:
                photo_url = photos[roster_key]
            else:
                photo_url = ''
                self.warning('no photo url for %s', roster_key)

            url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
                   str(int(roster_key)))
            address = '{0}\n{1}, {2} {3}'.format(rec['Address'], rec['City'],
                                                 rec['State'], rec['Zipcode'])
            gender = {'M': 'Male', 'F': 'Female'}[rec['Sex']]

            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
                image=photo_url,
                gender=gender,
            )

            person.add_link(url)
            person.add_source(url)
            person.add_source('http://www.njleg.state.nj.us/downloads.asp')

            person.add_contact_detail(type='address',
                                      value=address,
                                      note='District Office')
            if phone is not None:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note='District Office')
            if email is not None:
                person.add_contact_detail(type='email',
                                          value=email,
                                          note='District Office')

            yield person
Example #53
0
    def scrape_details(self, chamber, leg_name, leg_link, role):
        """Scrape one Mississippi member's XML detail record.

        :param chamber: 'upper' or 'lower'; also selects the email domain.
        :param leg_name: display name, used for party special-casing.
        :param leg_link: relative path to the member's XML document.
        :param role: passed straight through to the Person constructor.

        Yields a Person; HTTP errors are logged and swallowed.
        """
        if not leg_link:
            # Vacant post, likely:
            if "Vacancy" in leg_name:
                return
            raise Exception("leg_link is null. something went wrong")
        try:
            url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
            url_root = os.path.dirname(url)
            details_page = self.get(url)
            root = lxml.etree.fromstring(details_page.content)
            party = root.xpath('string(//PARTY)')

            district = root.xpath('string(//DISTRICT)')

            # Photo path is relative to the member document's directory.
            photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

            home_phone = root.xpath('string(//H_PHONE)')

            home_address = root.xpath('string(//H_ADDRESS)')
            home_address2 = root.xpath('string(//H_ADDRESS2)')
            home_city = root.xpath('string(//H_CITY)')
            home_zip = root.xpath('string(//H_ZIP)')

            # Assemble the one- or two-line home (district) address.
            home_address_total = ''
            if home_address and home_city:
                if not home_address2:
                    home_address_total = "%s\n%s, MS %s" % (
                        home_address,
                        home_city,
                        home_zip
                    )
                else:
                    home_address_total = "%s\n%s\n%s, MS %s" % (
                        home_address,
                        home_address2,
                        home_city,
                        home_zip
                    )

            # bis_phone = root.xpath('string(//B_PHONE)')
            capital_phone = root.xpath('string(//CAP_PHONE)')
            # other_phone = root.xpath('string(//OTH_PHONE)')
            org_info = root.xpath('string(//ORG_INFO)')
            email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
            cap_room = root.xpath('string(//CAP_ROOM)')

            # Hard-coded party fixes for members whose XML lacks a party;
            # the asserts force removing the special case once the source
            # data is corrected upstream.
            if leg_name in ('Lataisha Jackson', 'John G. Faulkner', 'Jeffery Harness'):
                assert not party, ("Remove special-casing for this Democrat without a "
                                   "listed party: {}").format(leg_name)
                party = 'Democratic'
            elif leg_name in ('James W. Mathis',
                              'John Glen Corley'):
                assert not party, ("Remove special-casing for this Republican without"
                                   " a listed party: {}").format(leg_name)
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'
            elif party == 'R':
                party = 'Republican'
            else:
                raise AssertionError(
                    "A member with no identifiable party was found: {}".format(leg_name))
            leg = Person(primary_org=chamber,
                         district=district,
                         party=party,
                         image=photo,
                         name=leg_name,
                         role=role
                         )
            leg.extras['org_info'] = org_info
            leg.add_source(url)
            leg.add_link(url)

            # The XML may carry a full address or just a bare mailbox name.
            if email_name != "":
                if "@" in email_name:
                    email = email_name
                else:
                    email = '%s@%s.ms.gov' % (email_name,
                                              {"upper": "senate", "lower": "house"}[chamber])
                leg.add_contact_detail(type='email', value=email, note='Capitol Office')

            if capital_phone != "":
                leg.add_contact_detail(type='voice', value=capital_phone, note='Capitol Office')

            # Capitol address falls back to the bare building address when
            # no room number is listed.
            if cap_room != "":
                address = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
            else:
                address = CAP_ADDRESS
            leg.add_contact_detail(type='address', value=address, note='Capitol Office')

            if home_phone != "":
                leg.add_contact_detail(type='voice', value=home_phone, note='District Office')

            if home_address_total != "":
                leg.add_contact_detail(type='address',
                                       value=home_address_total,
                                       note='District Office')

            yield leg
        except scrapelib.HTTPError as e:
            # Broken member link: log and move on.
            self.warning(str(e))
Example #54
0
    def scrape_senators(self):
        """Yield a Person for each Maine state senator.

        Combines a distributed xlsx spreadsheet (names, addresses,
        phones, emails) with each member's roster web page (photo URL).
        Districts with no roster link are treated as vacant and skipped.
        """
        # Spreadsheet column number for each field we extract.
        mapping = {
            'district': 0,
            'first_name': 2,
            'middle_name': 3,
            'last_name': 4,
            'suffixes': 5,
            'party': 1,
            'street_addr': 6,
            'city': 7,
            'state': 8,
            'zip_code': 9,
            'phone1': 10,
            'phone2': 11,
            'email': 12
        }

        url = ('https://mainelegislature.org/uploads/visual_edit/'
               '128th-senate-members-for-distribution-1.xlsx')
        fn, result = self.urlretrieve(url)

        wb = xlrd.open_workbook(fn)
        sh = wb.sheet_by_index(0)

        LEGISLATOR_ROSTER_URL = \
            'https://mainelegislature.org/senate/128th-senators/9332'
        roster_doc = lxml.html.fromstring(self.get(LEGISLATOR_ROSTER_URL).text)
        roster_doc.make_links_absolute(LEGISLATOR_ROSTER_URL)

        # Row 0 is the header; iterate the data rows.
        for rownum in range(1, sh.nrows):
            # get fields out of mapping
            d = {}
            for field, col_num in mapping.items():
                try:
                    d[field] = str(sh.cell(rownum, col_num).value).strip()
                except IndexError:
                    # This col_num doesn't exist in the sheet.
                    pass
            first_name = d['first_name']
            middle_name = d['middle_name']
            last_name = d['last_name']

            full_name = " ".join((first_name, middle_name, last_name))
            # Collapse the doubled space left by an empty middle name.
            full_name = re.sub(r'\s+', ' ', full_name).strip()

            address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

            # Prefer phone1, fall back to phone2, else no phone at all.
            phone = d['phone1']
            if not phone:
                phone = d['phone2']
            if not phone:
                phone = None

            # Numeric cells come back as floats ("12.0"); keep the
            # integer part only.
            district = d['district'].split('.')[0]
            party = d['party'].split('.')[0]

            # Determine legislator's URL to get their photo
            URL_XPATH = '//li/a[contains(text(), "District {:02d}")]/@href'.format(
                int(district))

            try:
                (leg_url, ) = roster_doc.xpath(URL_XPATH)
            except ValueError:
                self.warning('vacant seat %s', district)
                continue  # Seat is vacant

            html = self.get(leg_url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            # Member portraits are the only .png images on the page.
            xpath = '//img[contains(@src, ".png")]/@src'
            photo_url = doc.xpath(xpath)
            if photo_url:
                photo_url = photo_url.pop()
            else:
                photo_url = None

            person = Person(
                name=full_name,
                district=district,
                image=photo_url,
                primary_org='upper',
                party=party,
            )

            person.add_link(leg_url)
            person.add_source(leg_url)
            person.extras['first_name'] = first_name
            person.extras['middle_name'] = middle_name
            person.extras['last_name'] = last_name

            person.add_contact_detail(type='address',
                                      value=address,
                                      note='District Office')
            if phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(phone),
                                          note='District Phone')
            person.add_contact_detail(type='email',
                                      value=d['email'],
                                      note='District Email')

            yield person
    def scrape(self):
        """Yield Person objects for all current council members (including
        the Mayor and City Clerk), hard-coded former aldermen, and committee
        Organizations.
        """
        committee_d = {}
        # Legistar "legislative bodies" that are not standing committees.
        non_committees = {'City Council', 'Office of the Mayor',
                          'Office of the City Clerk'}

        for councilman, committees in self.councilMembers() :
            # Rows with no ward/office are not seated members; skip them.
            if councilman['Ward/Office'] == "":
                continue

            ward = councilman['Ward/Office']
            if ward not in {"Mayor", "Clerk"} :
                ward = "Ward {}".format(int(ward))
                role = "Alderman"
            else :
                # BUG FIX: previously no Person was constructed on this path,
                # so Mayor/Clerk rows either raised NameError (first row) or
                # attached their contact details to the previous iteration's
                # Person object.
                role = ward
            p = Person(councilman['Person Name']['label'],
                       district=ward,
                       primary_org="legislature",
                       role=role)

            if councilman['Photo'] :
                p.image = councilman['Photo']

            # Legistar column name -> (contact-detail type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if councilman[contact_type]:
                    p.add_contact_detail(type=type_,
                                         value=councilman[contact_type],
                                         note=_note)

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['label'],
                                     note='E-mail')

            if councilman['Website']:
                p.add_link(councilman['Website']['url'])
            p.add_source(councilman['Person Name']['url'], note='web')

            # Build committee Organizations lazily, memoized by name, and
            # register this member on each committee they sit on.
            for committee, _, _ in committees:
                committee_name = committee['Legislative Body']['label']
                if committee_name and committee_name not in non_committees:
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        o = Organization(committee_name,
                                         classification='committee',
                                         parent_id={'name' : 'Chicago City Council'})
                        o.add_source(committee['Legislative Body']['url'],
                                     note='web')
                        committee_d[committee_name] = o

                    o.add_member(p, role=committee["Title"])

            yield p

        # Former aldermen are hard-coded since they no longer appear in the
        # live Legistar roster.
        for name, term in FORMER_ALDERMEN.items() :
            p = Person(name=name,
                       primary_org="legislature",
                       start_date=term['term'][0],
                       end_date=term['term'][1],
                       district="Ward {}".format(term['ward']),
                       role='Alderman')
            if name == 'Chandler, Michael D.' :
                # Chandler served an additional, non-consecutive term.
                p.add_term('Alderman',
                           "legislature",
                           district="Ward {}".format(term['ward']),
                           start_date=datetime.date(2011, 5, 16),
                           end_date=datetime.date(2015, 5, 18))

            p.add_source(term['source'], note='web')
            yield p

        for o in committee_d.values() :
            yield o

        # Hard-coded former and joint committees, sourced from the Legistar
        # Departments page.
        for committee_name in FORMER_COMMITTEES :
            o = Organization(committee_name,
                             classification='committee',
                             parent_id={'name' : 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o

        for joint_committee in JOINT_COMMITTEES :
            o = Organization(joint_committee,
                             classification='committee',
                             parent_id={'name' : 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o
Example #56
0
    def scrape_reps(self):
        """Scrape the Maine House district roster and yield a Person for
        each seated representative.

        Non-voting tribal representatives are not included: they lack
        numbered districts and most of the standard profile fields.
        Vacant districts are logged and skipped.
        """
        url = 'http://www.maine.gov/legis/house/dist_mem.htm'
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # These do not include the non-voting tribal representatives
        # They do not have numbered districts, and lack a good deal of
        # the standard profile information about representatives
        for district in page.xpath('//a[contains(@href, "dist_twn")]/..'):
            if "- Vacant" in district.text_content():
                self.warning("District is vacant: '{}'".format(
                    district.text_content()))
                continue

            # The first anchor's fragment carries the district number.
            _, district_number = district.xpath('a[1]/@href')[0].split('#')

            leg_url = district.xpath('a[2]/@href')[0]
            leg_info = district.xpath('a[2]/text()')[0]

            # e.g. "Representative Jane Doe (D-Portland)"
            INFO_RE = r'''
                    Representative\s
                    (?P<member_name>.+?)
                    \s\(
                    (?P<party>[DRCUIG])
                    -
                    (?P<district_name>.+?)
                    \)
                    '''
            info_search = re.search(INFO_RE, leg_info, re.VERBOSE)

            if not info_search:
                # Some rows put the member link third; fall back to a[3] and
                # parse the party/district from the trailing text node.
                leg_url = district.xpath('a[3]/@href')[0]
                member_name = district.xpath('a[3]/text()')[0]
                mem_info = district.xpath('a[3]/following-sibling::text()')
                party = _party_map[mem_info[0][2]]
                # BUG FIX: this branch previously assigned `district`
                # (clobbering the loop variable) and left `district_name`
                # unbound -- raising NameError below, or silently reusing
                # the previous member's district name.
                district_name = mem_info[0].split('-')[1][:-1]
            else:
                member_name = info_search.group('member_name')
                party = _party_map[info_search.group('party')]
                district_name = info_search.group('district_name')

            # Get the photo url.
            html = self.get(leg_url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(leg_url)
            (photo_url, ) = doc.xpath('//img[contains(@src, ".jpg")]/@src')

            # Add contact information from personal page
            office_address = re.search(r'<B>Address:  </B>(.+?)\n?</?P>', html,
                                       re.IGNORECASE).group(1)

            office_email = doc.xpath(
                '//a[starts-with(@href, "mailto:")]/text()')
            business_phone = re.search(
                r'<B>Business Telephone:  </B>(.+?)</?P>', html, re.IGNORECASE)
            home_phone = re.search(r'<B>Home Telephone:  </B>(.+?)</?P>', html,
                                   re.IGNORECASE)
            cell_phone = re.search(r'<B>Cell Telephone:  </B>(.+?)</?P>', html,
                                   re.IGNORECASE)

            person = Person(
                name=member_name,
                district=district_number,
                primary_org='lower',
                party=party,
                image=photo_url,
            )
            person.extras['district_name'] = district_name

            person.add_link(leg_url)
            person.add_source(leg_url)

            if office_address:
                leg_address = office_address
                person.add_contact_detail(type='address',
                                          value=leg_address,
                                          note='District Office')
            else:
                # If no address for legislator, fall back to the party's
                # State House office address.
                if party == 'Democratic':
                    leg_address = (
                        'House Democratic Office, Room 333 State House, 2 State House Station, '
                        'Augusta, Maine 04333-0002')

                    person.add_contact_detail(type='address',
                                              value=leg_address,
                                              note='Party Office')

                elif party == 'Republican':
                    leg_address = (
                        'House GOP Office, Room 332 State House, 2 State House Station, '
                        'Augusta, Maine 04333-0002')

                    person.add_contact_detail(type='address',
                                              value=leg_address,
                                              note='Party Office')

            if office_email:
                office_email = office_email[0]
                person.add_contact_detail(type='email',
                                          value=office_email,
                                          note='District Office')
            if business_phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(
                                              business_phone.group(1)),
                                          note='Business Phone')
            if home_phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(
                                              home_phone.group(1)),
                                          note='Home Phone')
            if cell_phone:
                person.add_contact_detail(type='voice',
                                          value=clean_phone(
                                              cell_phone.group(1)),
                                          note='Cell Phone')

            yield person
Example #57
0
    def scrape(self):
        """Yield Person objects for all current council members (including
        the Mayor and City Clerk), hard-coded former aldermen, and committee
        Organizations.
        """
        committee_d = {}
        # Legistar "legislative bodies" that are not standing committees.
        non_committees = {
            'City Council', 'Office of the Mayor', 'Office of the City Clerk'
        }

        for councilman, committees in self.councilMembers():
            # Rows with no ward/office are not seated members; skip them.
            if councilman['Ward/Office'] == "":
                continue

            ward = councilman['Ward/Office']
            if ward not in {"Mayor", "Clerk"}:
                ward = "Ward {}".format(int(ward))
                role = "Alderman"
            else:
                # BUG FIX: previously no Person was constructed on this path,
                # so Mayor/Clerk rows either raised NameError (first row) or
                # attached their contact details to the previous iteration's
                # Person object.
                role = ward
            p = Person(councilman['Person Name']['label'],
                       district=ward,
                       primary_org="legislature",
                       role=role)

            if councilman['Photo']:
                p.image = councilman['Photo']

            # Legistar column name -> (contact-detail type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                if councilman[contact_type]:
                    p.add_contact_detail(type=type_,
                                         value=councilman[contact_type],
                                         note=_note)

            if councilman["E-mail"]:
                p.add_contact_detail(type="email",
                                     value=councilman['E-mail']['label'],
                                     note='E-mail')

            if councilman['Website']:
                p.add_link(councilman['Website']['url'])
            p.add_source(councilman['Person Name']['url'], note='web')

            # Build committee Organizations lazily, memoized by name, and
            # register this member on each committee they sit on.
            for committee, _, _ in committees:
                committee_name = committee['Legislative Body']['label']
                if committee_name and committee_name not in non_committees:
                    o = committee_d.get(committee_name, None)
                    if o is None:
                        o = Organization(
                            committee_name,
                            classification='committee',
                            parent_id={'name': 'Chicago City Council'})
                        o.add_source(committee['Legislative Body']['url'],
                                     note='web')
                        committee_d[committee_name] = o

                    o.add_member(p, role=committee["Title"])

            yield p

        # Former aldermen are hard-coded since they no longer appear in the
        # live Legistar roster.
        for name, term in FORMER_ALDERMEN.items():
            p = Person(name=name,
                       primary_org="legislature",
                       start_date=term['term'][0],
                       end_date=term['term'][1],
                       district="Ward {}".format(term['ward']),
                       role='Alderman')
            if name == 'Chandler, Michael D.':
                # Chandler served an additional, non-consecutive term.
                p.add_term('Alderman',
                           "legislature",
                           district="Ward {}".format(term['ward']),
                           start_date=datetime.date(2011, 5, 16),
                           end_date=datetime.date(2015, 5, 18))

            p.add_source(term['source'], note='web')
            yield p

        for o in committee_d.values():
            yield o

        # Hard-coded former and joint committees, sourced from the Legistar
        # Departments page.
        for committee_name in FORMER_COMMITTEES:
            o = Organization(committee_name,
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o

        for joint_committee in JOINT_COMMITTEES:

            o = Organization(joint_committee,
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source("https://chicago.legistar.com/Departments.aspx",
                         note='web')
            yield o
Example #58
0
    def scrape_chamber(self, chamber):
        """Scrape the Arizona member roster for one chamber and yield a
        Person per seated member.

        :param chamber: ``"lower"`` (House) or ``"upper"`` (Senate).
        """
        body_code = {"lower": "H", "upper": "S"}[chamber]
        url = "http://www.azleg.gov/MemberRoster/?body=" + body_code

        # The roster page contains a malformed comment close ("--!>");
        # patch it so lxml can parse the document.
        markup = self.get(url).text.replace("--!>", "-->")
        root = html.fromstring(markup)

        # First row is the table header; skip it.
        for row in root.xpath("//table//tr")[1:]:
            position = ""
            name_td, district_td, party_td, email_td, room_td, phone_td = \
                row.xpath("td")

            # Skip any vacant members.
            if email_td.attrib.get("class") == "vacantmember":
                continue

            link = name_td.xpath("string(a/@href)")
            if len(name_td) == 1:
                name = name_td.text_content().strip()
            else:
                # Extra child elements indicate a leadership position
                # trailing the member's name.
                position = name_td.tail.strip()
                name = name_td[0].text_content().strip()
            if "--" in name:
                name = name.split("--")[0].strip()

            # Member detail page has the same malformed comment close.
            detail_markup = self.get(link).text.replace("--!>", "-->")
            detail_root = html.fromstring(detail_markup)
            detail_root.make_links_absolute(link)

            photos = detail_root.xpath("//img[contains(@src, 'MemberPhoto')]")
            if len(photos) == 1:
                photo_url = photos[0].attrib["src"]
            else:
                self.warning("no photo on " + link)
                photo_url = ""

            district = district_td.text_content().strip()
            party = party_td.text_content().strip()
            email = email_td.text_content().strip()

            # The roster only shows the mailbox name; append the domain.
            if email.startswith("Email: "):
                email = email.replace("Email: ", "").lower() + "@azleg.gov"
            else:
                email = ""

            party = self.get_party(party)
            room = room_td.text_content().strip()
            first_line = ("House of Representatives\n" if chamber == "lower"
                          else "Senate\n")
            address = (first_line + "1700 West Washington\n Room " + room +
                       "\nPhoenix, AZ 85007")

            # Prefix the Phoenix area code when it is missing.
            phone = phone_td.text_content().strip()
            if "602" not in re.findall(r"(\d+)", phone):
                phone = "602-" + phone

            leg = Person(
                primary_org=chamber,
                image=photo_url,
                name=name,
                district=district,
                party=party,
            )
            leg.add_contact_detail(type="address",
                                   value=address,
                                   note="Capitol Office")
            leg.add_contact_detail(type="voice",
                                   value=phone,
                                   note="Capitol Office")
            leg.add_party(party=party)
            leg.add_link(link)

            if email:
                leg.add_contact_detail(type="email", value=email)
            if position:
                leg.add_membership(name_or_org=party, role=position)

            leg.add_source(url)

            # Committee memberships are handled by the committee scraper.
            yield leg
Example #59
0
    def legislators(self, latest_only):
        """Build Person objects for every legislator found via
        ``self._memberships``.

        Returns a dict mapping legislator name to a
        ``(Person, [(chamber, district, term, party), ...])`` tuple; a
        legislator appearing in multiple terms accumulates entries in the
        term list while reusing the same Person object.
        """
        legs = {}

        for member, chamber, term, url in self._memberships(latest_only):
            # Roster row cells: name | ? | ? | district | party.
            name, _, _, district, party = member.xpath("td")
            district = district.text
            detail_url = name.xpath("a/@href")[0]

            if party.text_content().strip() == "":
                self.warning("Garbage party: Skipping!")
                continue

            party = {
                "D": "Democratic",
                "R": "Republican",
                "I": "Independent"
            }[party.text]
            name = name.text_content().strip()

            # inactive legislator, skip them for now
            if name.endswith("*"):
                name = name.strip("*")
                continue

            # Normalize known alternate spellings of a name.
            name = AKA.get(name, name)

            if name in legs:
                # Seen in an earlier term: reuse the Person, record the term.
                p, terms = legs[name]
                terms.append((chamber, district, term, party))
            else:
                p = Person(name, party=party)
                legs[name] = p, [(chamber, district, term, party)]

            p.add_source(url)
            p.add_source(detail_url)
            p.add_link(detail_url)

            # Hard-coded birth dates for known legislators.
            birth_date = BIRTH_DATES.get(name, None)
            if birth_date:
                p.birth_date = birth_date

            leg_html = self.get(detail_url).text
            leg_doc = lxml.html.fromstring(leg_html)
            leg_doc.make_links_absolute(detail_url)

            hotgarbage = ("Senate Biography Information for the 98th General "
                          "Assembly is not currently available.")

            if hotgarbage in leg_html:
                # The legislator's bio isn't available yet.
                self.logger.warning("No legislator bio available for " + name)
                continue

            photo_url = leg_doc.xpath(
                '//img[contains(@src, "/members/")]/@src')[0]
            p.image = photo_url

            # Reset contact details so a legislator re-scraped for a later
            # term doesn't accumulate duplicates.
            p.contact_details = []
            # email
            email = leg_doc.xpath('//b[text()="Email: "]')
            if email:
                p.add_contact_detail(type="email",
                                     value=email[0].tail.strip(),
                                     note="capitol")

            offices = {
                "capitol": '//table[contains(string(), "Springfield Office")]',
                "district": '//table[contains(string(), "District Office")]',
            }

            for location, xpath in offices.items():
                table = leg_doc.xpath(xpath)
                if table:
                    # table[3]: presumably the innermost of the nested tables
                    # matching the string test -- TODO confirm against the
                    # live page markup.
                    for type, value in self._table_to_office(table[3]):
                        if type in ("fax", "voice"
                                    ) and not validate_phone_number(value):
                            continue

                        p.add_contact_detail(type=type,
                                             value=value,
                                             note=location)

        return legs
Example #60
0
    def _scrape_representative(self, url, parties):
        """
        Returns a Person object representing a member of the lower
        legislative chamber.

        Yields ``None`` (and stops) for vacant seats, whose pages are
        "named" after the district number.

        :param url: the member's profile page URL.
        :param parties: dict mapping district number (str) to party name.
        """
        # url = self.get(url).text.replace('<br>', '')
        member_page = self.lxmlize(url)

        photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
        # A bare "/.jpg" src means the member has no photo.
        if photo_url.endswith('/.jpg'):
            photo_url = None

        scraped_name, district_text = member_page.xpath(
            '//div[@class="member-info"]/h2')
        scraped_name = scraped_name.text_content().strip().replace('Rep. ', '')
        scraped_name = ' '.join(scraped_name.split())

        # "Last, First" -> "First Last"
        name = ' '.join(scraped_name.split(', ')[::-1])

        district_text = district_text.text_content().strip()
        district = str(self.district_re.search(district_text).group(1))

        # Vacant house "members" are named after their district numbers:
        if re.match(r'^\d+$', scraped_name):
            yield None
            # BUG FIX: previously execution fell through after the yield and
            # built a bogus Person for the vacant seat (or crashed on the
            # party lookup).
            return

        party = parties[district]

        person = Person(name=name,
                        district=district,
                        party=party,
                        primary_org='lower')

        if photo_url is not None:
            person.image = photo_url

        person.add_link(url)
        person.add_source(url)

        def office_name(element):
            """Returns the office address type."""
            return element.xpath('preceding-sibling::h4[1]/text()')[0] \
                .rstrip(':')

        # One dict per office paragraph: its heading, a normalized type,
        # and the raw paragraph text.
        offices_text = [{
            'name':
            office_name(p_tag),
            'type':
            office_name(p_tag).replace(' Address', '').lower(),
            'details':
            p_tag.text_content()
        } for p_tag in member_page.xpath(
            '//h4/following-sibling::p[@class="double-space"]')]

        for office_text in offices_text:
            details = office_text['details'].strip()

            # A few member pages have blank office listings:
            if details == '':
                continue

            # At the time of writing, this case of multiple district
            # offices occurs exactly once, for the representative at
            # District 43:
            if details.count('Office') > 1:
                district_offices = [
                    district_office.strip() for district_office in re.findall(
                        r'(\w+ Office.+?(?=\w+ Office|$))',
                        details,
                        flags=re.DOTALL)
                ]
                # Appending while iterating is intentional: the split
                # offices are processed by later loop passes.
                # NOTE(review): the combined entry also falls through to the
                # address parsing below, which may duplicate its first
                # office's contact details -- verify against District 43.
                offices_text += [{
                    'name':
                    re.match(r'\w+ Office', office).group(),
                    'type':
                    'district',
                    'details':
                    re.search(r'(?<=Office).+(?=\w+ Office|$)?', office,
                              re.DOTALL).group()
                } for office in district_offices]

            match = self.address_re.search(details)
            if match is not None:
                # Collapse double newlines and trailing spaces in the
                # matched address block.
                address = re.sub(' +$',
                                 '',
                                 match.group().replace('\r', '').replace(
                                     '\n\n', '\n'),
                                 flags=re.MULTILINE)
            else:
                # No valid address found in the details.
                continue

            phone_number = extract_phone(details)
            fax_number = extract_fax(details)

            if address:
                person.add_contact_detail(type='address',
                                          value=address,
                                          note=office_text['name'])
            if phone_number:
                person.add_contact_detail(type='voice',
                                          value=phone_number,
                                          note=office_text['name'])
            if fax_number:
                person.add_contact_detail(type='fax',
                                          value=fax_number,
                                          note=office_text['name'])

        yield person