def scrape_counciler(self, url):
    """Scrape one council member's profile page and yield a Person.

    :param url: absolute URL of the member's profile page
    """
    page = self.lxmlize(url)
    # Trailing commas unpack the single expected match; a page-layout
    # change (zero or multiple matches) fails loudly here.
    who, = page.xpath("//h3[@class='subtitle']/text()")
    district, = page.xpath("//div[@class='right-bar']//h2/text()")
    image, = page.xpath(
        "//div[@class='left-bar']//a[@class='image lightbox']//img"
    )

    member = Person(
        primary_org='legislature',
        name=who,
        district=district,
        image=image.attrib['src']
    )
    member.add_source(url)

    # Contact details are rendered as "Label: value" table cells.
    details = page.xpath("//table[@align='center']//td")
    for detail in details:
        detail = detail.text_content().strip()
        # .strip() always returns a str, so an emptiness check suffices
        # (the previous `detail is None` test was dead code).
        if not detail:
            continue

        type_, value = detail.split(":", 1)
        cdtype = {
            "Home Phone": "voice",
            "Address": "address",
            "Email": "email",
            "Cell Phone": "voice",
        }[type_]
        member.add_contact_detail(type=cdtype, note=type_, value=value)

    yield member
def scrape_alderman(self, ward_num):
    """Return a Person for the alderman representing the given ward."""
    ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
    profile_url = self.alderman_url(ward_url)
    profile_page = self.lxmlize(profile_url)

    # The profile page's only <h1> is the alderman's name.
    name = profile_page.xpath("//h1/text()")[0]

    # Supplying district/role up front lets pupa automatically create a
    # membership linking this person to a post in the jurisdiction's
    # "Board of Aldermen" organization.
    person = Person(
        name=name,
        district="Ward {} Alderman".format(ward_num),
        role="Alderman",
        primary_org="legislature",
    )

    # additional fields
    person.image = profile_page.xpath("//div/img/@src")[0]
    phone = profile_page.xpath("//strong[text()='Phone:']/../text()")[1].strip()
    person.add_contact_detail(type="voice", value=phone)

    # sources
    person.add_source(profile_url, note="profile")
    person.add_source(ward_url, note="ward")

    return person
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Legislator from an HTML table row lxml Element,
    and a link to their profile.

    :param table_row_element: <tr> containing six <td>s in order:
        role, name, district, party, phone, email
    :param chamber: primary organization for the resulting Person
    :returns: (Person, profile_url) tuple
    """
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element,
     party_element, phone_element, email_element) = td_elements

    # Name comes in the form Last, First; kept as-is for the record.
    full_name = name_element.text_content().strip()
    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    if party == 'Democrat':
        party = 'Democratic'
    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()
    # Fails loudly if the name cell doesn't contain exactly one link.
    (profile_url, ) = name_element.xpath('a/@href')

    legislator = Person(primary_org=chamber, name=full_name,
                        district=district, party=party)
    legislator.add_contact_detail(type='address', value=address,
                                  note='Capitol Office')
    legislator.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
    legislator.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')

    return legislator, profile_url
def test_full_person():
    """Round-trip a fully-populated person through the importer."""
    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    person.add_link('http://example.com/link')
    person.add_source('http://example.com/source')

    # run the import
    PersonImporter('jurisdiction-id').import_data([person.as_dict()])

    # fetch the imported row and verify every field survived
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name

    identifier = p.identifiers.all()[0]
    assert identifier.identifier == '1'
    assert identifier.scheme == ''

    other_name = p.other_names.all()[0]
    assert other_name.name == 'Tommy'
    assert other_name.start_date == '1880'

    contact = p.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert p.links.all()[0].url == 'http://example.com/link'
    assert p.sources.all()[0].url == 'http://example.com/source'
def handle_list_item(self, item):
    """Parse one member listing <div> and yield a Person for it."""
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    # e.g. "Jane Doe (01A, DFL)" -> name, district, party abbreviation
    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './div/text()[normalize-space()]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # BUG FIX: previously a failed validation left `phone`/`email`
    # unbound and raised NameError below; initialize and guard instead.
    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
def scrape_chamber(self, chamber):
    """
    Scrapes legislators for the current term only
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    html = lxml.html.fromstring(self.get(url).text)
    html.make_links_absolute(url)

    for row in html.xpath('//div[contains(@class, "row-equal-height")]'):
        img_url = row.xpath('.//img/@src')[0]
        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]

        # Skip members who are no longer serving.
        if any(flag in inner.text_content()
               for flag in ('Resigned', 'Substitute')):
            continue

        name_el = inner.xpath('p/strong')[0]
        name = re.sub(r'\s+', ' ', name_el.text.replace(u'\xa0', ' ').strip())
        party = PARTY[name_el.tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')
        person_url = inner.xpath('p/a/@href')[0]

        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()

        person = Person(name=name, district=district, party=party,
                        primary_org=chamber, image=img_url, role=role)

        phones = get_phones(inner)
        district_phone = phones.get('home') or phones.get('business')
        capitol_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if district_phone:
            person.add_contact_detail(type='voice', value=district_phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if capitol_phone:
            person.add_contact_detail(type='voice', value=capitol_phone,
                                      note='Capitol Office')

        person.add_source(url)
        person.add_link(person_url)
        yield person
def scrape_lower(self, chamber):
    """Scrape Michigan House members from the public roster table."""
    url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
    columns = ["website", "district", "name", "party",
               "location", "phone", "email"]

    doc = lxml.html.fromstring(self.get(url).text)

    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath('.//td')
        if not tds:
            # header/spacer rows carry no <td> cells
            continue

        cells = {column: tds[i] for i, column in enumerate(columns)}

        district = str(int(cells['district'].text_content().strip()))
        party = cells['party'].text_content().strip()
        phone = cells['phone'].text_content().strip()
        email = cells['email'].text_content().strip()
        leg_url = cells['website'].xpath("./a")[0].attrib['href']
        name = cells['name'].text_content().strip()

        if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
            self.warning('District {} appears vacant, and will be skipped'.format(district))
            continue

        # Expand office abbreviations into full mailing addresses.
        office = cells['location'].text_content().strip()
        office = re.sub(
            ' HOB',
            ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
            office
        )
        office = re.sub(
            ' CB',
            ' State Capitol Building\nLansing, MI 48909',
            office
        )

        photo_url = self.get_photo_url(leg_url)
        person = Person(name=name, district=district, party=abbr[party],
                        primary_org='lower',
                        image=photo_url[0] if photo_url else None)

        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
        person.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')
        yield person
def scrape_member_page(self, chamber, url):
    """Scrape an Ohio member-listing page, yielding a Person per member."""
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    member_xpath = (
        "//div[contains(concat(' ', normalize-space(@class), ' '), "
        "' memberModule ')]"
    )
    for member in page.xpath(member_xpath):
        img = member.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        info = member.xpath(".//div[@class='data']")[0]

        link_el = info.xpath(".//a[@class='black']")[0]
        full_name = link_el.text_content()
        if "Vacant" in full_name:
            continue
        homepage = link_el.attrib['href']

        party = info.xpath(".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]

        # The trailing text node is the phone; the rest form the address.
        office_lines = info.xpath("child::text()")
        phone = office_lines.pop(-1)
        office = "\n".join(office_lines)

        h3 = info.xpath("./h3")
        if len(h3):
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            # Fall back to the district number embedded in the style attr.
            district = re.findall(
                r"\d+\.png", member.attrib['style']
            )[-1].split(".", 1)[0]

        full_name = re.sub(r"\s+", " ", full_name).strip()

        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower' else
            'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)

        leg = Person(name=full_name, district=district,
                     party=party, primary_org=chamber, image=img)
        leg.add_contact_detail(type='address', value=office, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')

        self.scrape_homepage(leg, chamber, homepage)

        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def scrape_lower_legislator(self, url, leg_info):
    """Scrape a single House member's detail page and yield a Person.

    :param url: member detail page URL
    :param leg_info: dict from the listing page ('dist', 'office', 'phone')
    """
    page = self.lxmlize(url)

    name = page.xpath(
        '//span[@id="body_FormView5_FULLNAMELabel"]/text()'
    )[0].strip()
    # Placeholder names mean the seat has no sitting member.
    if name.startswith("District ") or name.startswith("Vacant "):
        self.warning("Seat is vacant: {}".format(name))
        return

    photo = page.xpath(
        '//img[contains(@src, "/h_reps/RepPics")]'
    )[0].attrib['src']

    party_flags = {
        "Democrat": "Democratic",
        "Republican": "Republican",
        "Independent": "Independent"
    }
    party_info = page.xpath(
        '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
    )[0].strip()
    party = party_flags[party_info]

    try:
        email = page.xpath(
            '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
        )[0].strip()
    except IndexError:
        email = None

    district = leg_info['dist'].replace('Dist', '').strip()

    person = Person(name=name, party=party, district=district,
                    primary_org='lower', image=photo)

    # Record whichever contact details are actually present.
    for value, detail_type in ((leg_info["office"], "address"),
                               (leg_info["phone"], "voice"),
                               (email, "email")):
        if value:
            person.add_contact_detail(type=detail_type, value=value,
                                      note="District Office")

    person.add_source(url)
    person.add_link(url)
    yield person
def scrape_member(self, chamber, member_url):
    """Scrape one Kentucky legislator's bio page and yield a Person."""
    doc = lxml.html.fromstring(self.get(member_url).text)

    photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]

    # Name label looks like "<title> First ... Last (P)"; drop the
    # leading title and the trailing party abbreviation.
    name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
    full_name = ' '.join(name_pieces[1:-1]).strip()
    party_abbr = name_pieces[-1]
    party = {'(R)': 'Republican',
             '(D)': 'Democratic',
             '(I)': 'Independent'}.get(party_abbr, party_abbr)

    district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

    person = Person(name=full_name, district=district, party=party,
                    primary_org=chamber, image=photo_url)
    person.add_source(member_url)
    person.add_link(member_url)

    address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                  'span[@class="bioText"]/text()'))

    # Only "Annex:" numbers are captured; "(fax)" marks the fax line.
    # NOTE(review): nesting reconstructed from flattened source — confirm
    # non-Annex numbers are intentionally ignored.
    phone = None
    fax = None
    for num in doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()'):
        if num.startswith('Annex: '):
            num = num.replace('Annex: ', '')
            if num.endswith(' (fax)'):
                fax = num.replace(' (fax)', '')
            else:
                phone = num

    # Keep the last @lrc.ky.gov address, if any (same as the old reduce).
    email = None
    for candidate in doc.xpath(
            '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'):
        if '@lrc.ky.gov' in str(candidate):
            email = candidate

    if phone:
        person.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
    if fax:
        person.add_contact_detail(type='fax', value=fax,
                                  note='Capitol Office')
    if email:
        person.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')

    if address.strip() == "":
        self.warning("Missing Capitol Office!!")
    else:
        person.add_contact_detail(type='address', value=address,
                                  note='Capitol Office')

    yield person
def scrape_upper_chamber(self, term):
    """Scrape Puerto Rico senators from the Senate roster page."""
    url = 'https://senado.pr.gov/Pages/Senadores.aspx'
    doc = self.lxmlize(url)

    for link in self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href'):
        senator_page = self.lxmlize(link)
        profile_links = self.get_nodes(senator_page,
                                       '//ul[@class="profiles-links"]/li')

        raw_name = self.get_node(
            senator_page, '//span[@class="name"]').text_content().strip()
        # Strip the honorific; title-case since some names are all-caps.
        name = re.sub(r'^Hon\.', '', raw_name,
                      flags=re.IGNORECASE).strip().title()

        party = profile_links[0].text_content().strip()
        # Translate to English since being an Independent is a universal construct
        if party == "Independiente":
            party = "Independent"

        photo_url = self.get_node(senator_page,
                                  '//div[@class="avatar"]//img/@src')

        seat_type = profile_links[1].text_content().strip()
        if seat_type == "Senador por Distrito":
            district_text = self.get_node(
                senator_page,
                '//div[@class="module-distrito"]//span[@class="headline"]'
            ).text_content()
            district = (district_text.replace('DISTRITO', '', 1)
                        .replace('\u200b', '').strip())
        elif seat_type == "Senador por Acumulación":
            district = "At-Large"

        phone = self.get_node(
            senator_page, '//a[@class="contact-data tel"]'
        ).text_content().strip()
        email = self.get_node(
            senator_page, '//a[@class="contact-data email"]'
        ).text_content().replace('\u200b', '').strip()

        person = Person(primary_org='upper', district=district,
                        name=name, party=party, image=photo_url)
        person.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
        person.add_link(link)
        person.add_source(link)

        yield person
def scrape_senator_page(self, chamber, url):
    """Scrape the Senate roster page, following each profile for details."""
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    portraits = page.xpath(
        "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
        "' portraitContainer ')]")
    for portrait in portraits:
        # Image URL is embedded in the thumbnail's inline style: url(...)
        style = portrait.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
        img = style[style.find('(') + 1:style.find(')')]

        full_name = portrait.xpath(".//div[@class='profileName']/a/text()")[0]
        homepage_url = portrait.xpath(
            ".//a[@class='profileImageLink']")[0].attrib['href']
        district = portrait.xpath(".//div[@class='profileDistrict']"
                                  "/a/text()")[0].split("#")[1]

        if "Vacant" in full_name:
            continue

        # Address, phone, and party live on the member's own page.
        detail_page = lxml.html.fromstring(self.get(homepage_url).text)
        phone = detail_page.xpath("//div[@class='phone']/span/text()")[0]
        address = "\n".join(
            detail_page.xpath("//div[@class='address']/span/text()"))

        party_image = detail_page.xpath(
            '//div[@class="senatorParty"]/img/@src')[0]
        if 'Republican' in party_image:
            party = 'Republican'
        elif 'Democrat' in party_image:
            party = 'Democratic'

        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower' else
            'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)

        leg = Person(name=full_name, district=district,
                     primary_org=chamber, image=img, party=party)
        leg.add_contact_detail(type='address', value=address, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')

        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
def scrape_chamber(self, chamber):
    """Scrape Indiana legislators for the latest session via the IGA API."""
    client = ApiClient(self)
    session = self.latest_session()
    base_url = "http://iga.in.gov/legislative"
    api_base_url = "https://api.iga.in.gov"
    chamber_name = "senate" if chamber == "upper" else "house"

    r = client.get("chamber_legislators", session=session,
                   chamber=chamber_name)
    for leg in client.unpaginate(r):
        firstname = leg["firstName"]
        lastname = leg["lastName"]
        party = leg["party"]
        link = leg["link"]
        api_link = api_base_url + link
        html_link = base_url + link.replace("legislators/",
                                            "legislators/legislator_")

        try:
            html = get_with_increasing_timeout(self, html_link, fail=True,
                                               kwargs={"verify": False})
        except scrapelib.HTTPError:
            self.logger.warning("Legislator's page is not available.")
            continue

        doc = lxml.html.fromstring(html.text)
        doc.make_links_absolute(html_link)

        # The page carries exactly two <address> blocks: mailing address
        # followed by phone.
        address, phone = doc.xpath("//address")
        address = address.text_content().strip()
        address = "\n".join(line.strip() for line in address.split("\n"))
        phone = phone.text_content().strip()

        try:
            district = doc.xpath("//span[@class='district-heading']"
                                 )[0].text.lower().replace("district", "").strip()
        except IndexError:
            self.warning("skipping legislator w/o district")
            continue

        image_link = base_url + link.replace("legislators/",
                                             "portraits/legislator_")

        legislator = Person(primary_org=chamber, district=district,
                            name=" ".join([firstname, lastname]),
                            party=party, image=image_link)
        legislator.add_contact_detail(type="address", note="Capitol Office",
                                      value=address)
        legislator.add_contact_detail(type="voice", note="Capitol Office",
                                      value=phone)
        legislator.add_link(html_link)
        legislator.add_source(html_link)
        legislator.add_source(api_link)
        yield legislator
def scrape_rep(self, url):
    """Scrape one House member's profile page and yield a Person.

    Yields nothing if the member has resigned.

    :param url: member profile page URL
    """
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    main = page.xpath('//div[@id="main-info"]')[0]
    if 'Resigned' in main.text_content():
        print("Member resigned {}".format(url))
        # BUG FIX: `raise StopIteration` inside a generator becomes a
        # RuntimeError under PEP 479 (Python 3.7+); a plain return ends
        # the generator without yielding anything.
        return

    name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
    name = re.sub(r'\s+', ' ', name)

    district_number = page.xpath(
        '//span[contains(text(), "House District:")]'
        '/following-sibling::span/text()')[0].strip()
    # remove anything after first whitespace
    district_number = re.sub(r'\s.*', '', district_number.strip())

    email = None
    email_content = page.xpath('//a[./i[contains(@class,"fa-envelope")]]/text()')
    if email_content and email_content[0].strip():
        email = email_content[0].strip()

    photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

    party = self.get_rep_table_by_header(page, 'Party Affiliation').text.strip()
    party = _party_map[party[0]]  # standardize

    main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
    address = [t.strip() for t in main_p_text if t.strip()][0]

    person = Person(
        name=name,
        district=district_number,
        primary_org='lower',
        party=party,
        image=photo_url,
    )
    person.add_contact_detail(type='address', value=address,
                              note='District Office')
    # email can legitimately be missing; don't record a None detail
    if email:
        person.add_contact_detail(type='email', value=email,
                                  note='District Office')
    person.add_source(url)
    yield person
def handle_list_item(self, row):
    """Build a Senator Person from one roster row; returns None to skip
    blank rows."""
    if not row['First Name']:
        return

    name = '{} {}'.format(row['First Name'], row['Last Name'])
    party = PARTIES[row['Party']]
    leg = Person(name=name, district=row['District'].lstrip('0'),
                 party=party, primary_org='upper', role='Senator',
                 image=self.extra_info[name]['image'])
    leg.add_link(self.extra_info[name]['url'])
    leg.add_contact_detail(type='voice',
                           value=self.extra_info[name]['office_phone'],
                           note='capitol')

    if 'email' in self.extra_info[name]:
        leg.add_contact_detail(type='email',
                               value=self.extra_info[name]['email'],
                               note='capitol')

    row['Zipcode'] = row['Zipcode'].strip()

    # Accommodate for multiple address column naming conventions.
    address1_fields = [row.get('Address'), row.get('Office Building')]
    address2_fields = [row.get('Address2'), row.get('Office Address')]
    row['Address'] = next((a for a in address1_fields if a is not None), False)
    row['Address2'] = next((a for a in address2_fields if a is not None), False)

    # BUG FIX: the old condition was a bare generator expression, which
    # is always truthy, so EVERY address was labeled 'capitol'. Use
    # any() (guarded against a falsy Address2) to detect the two known
    # capitol-area buildings.
    if row['Address2'] and any(
            a in row['Address2'] for a in
            ['95 University Avenue W', '100 Rev. Dr. Martin Luther King']):
        address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                   .format(**row))
        if 'Rm. Number' in row:
            address = '{0} {1}'.format(row['Rm. Number'], address)
        leg.add_contact_detail(type='address', value=address,
                               note='capitol')
    elif row['Address2']:
        address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                   .format(**row))
        leg.add_contact_detail(type='address', value=address,
                               note='district')
    else:
        address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
        leg.add_contact_detail(type='address', value=address,
                               note='district')

    leg.add_source(self.url)
    leg.add_source(self._html_url)

    return leg
def scrape_chamber(self, chamber=None):
    """Yield a Person for each legislator scraped from the listing page."""
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        try:
            chamber = {"House": "lower",
                       "Senate": "upper"}[leg['chamber']]
        except KeyError:
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(leg['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        person = Person(name=leg['name'], district=leg['district'],
                        party=leg['party'], primary_org=chamber,
                        image=leg['image'])

        for source in leg['source']:
            person.add_source(source)

        try:
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber
                # NOTE(review): this Organization is built and populated
                # but never yielded/saved here — confirm that's intended.
                comm = Organization(name=ctty['name'],
                                    classification="committee",
                                    chamber=ctty_chamber)
                comm.add_member(person, role="member")
        except KeyError:
            self.warn("%s has no scraped Committees" % leg['name'])

        person.add_link(leg['homepage'])
        if leg['addr']:
            person.add_contact_detail(type='address', value=leg['addr'],
                                      note='Capitol Office')
        if leg['phone']:
            person.add_contact_detail(type='voice', value=leg['phone'],
                                      note='Capitol Office')
        if leg['email']:
            person.add_contact_detail(type='email', value=leg['email'],
                                      note='Capitol Office')
        if leg['fax']:
            person.add_contact_detail(type='fax', value=leg['fax'],
                                      note='Capitol Office')
        yield person
def scrape_chamber(self, session):
    """Yield a Person for each legislator returned by the Oregon API."""
    session_key = SESSION_KEYS[session]
    response = self.api_client.get('legislators', session=session_key)

    for legislator in response:
        url_name = legislator['WebSiteUrl'].split('/')[-1]
        chamber_name = 'house' if legislator['Chamber'] == 'H' else 'senate'
        img = 'https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg'.format(
            chamber_name, url_name
        )

        party = legislator['Party']
        if party == 'Democrat':
            party = 'Democratic'

        person = Person(
            name='{} {}'.format(legislator['FirstName'], legislator['LastName']),
            primary_org={'S': 'upper', 'H': 'lower'}[legislator['Chamber']],
            party=party,
            district=legislator['DistrictNumber'],
            image=img,
        )
        person.add_link(legislator['WebSiteUrl'])
        person.add_source(legislator['WebSiteUrl'])

        if legislator['CapitolAddress']:
            person.add_contact_detail(type='address',
                                      value=legislator['CapitolAddress'],
                                      note='Capitol Office')
        if legislator['CapitolPhone']:
            person.add_contact_detail(type='voice',
                                      value=legislator['CapitolPhone'],
                                      note='Capitol Office')
        person.add_contact_detail(type='email',
                                  value=legislator['EmailAddress'],
                                  note='Capitol Office')

        yield person
def scrape_chamber(self, chamber, session):
    """Scrape Nevada legislators for a session from the legislature's JSON API.

    :param chamber: 'upper' (Senate) or 'lower' (Assembly)
    :param session: session id, mapped through jurisdiction.session_slugs
    """
    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'
    session_slug = self.jurisdiction.session_slugs[session]

    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (
        chamber_slug, session_slug)
    leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s'
                    % (session_slug, chamber_slug))
    resp = json.loads(self.get(leg_json_url).text)

    for item in resp:
        # empty district
        empty_names = ['District No', 'Vacant']
        if any(name in item['FullName'] for name in empty_names):
            continue

        # Names arrive "Last, First"; flip them.
        last, first = item['FullName'].split(",", 1)
        item['FullName'] = "{first} {last}".format(last=last,
                                                   first=first).strip()
        person = Person(name=item['FullName'],
                        district=item['DistrictNbr'],
                        party=item['Party'],
                        primary_org=chamber,
                        image=item['PhotoURL'])
        leg_url = leg_base_url + item['DistrictNbr']

        # hack: pull the legislator ID out of an inline script call
        html = self.get(leg_url).text
        for line in html.split('\n'):
            if 'GetLegislatorDetails' in line:
                leg_id = line.split(',')[1].split("'")[1]

        # fetch the json used by the detail page
        leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id='
                           .format(session_slug) + leg_id)
        leg_resp = json.loads(self.get(leg_details_url).text)
        details = leg_resp['legislatorDetails']

        address = details['Address1']
        address2 = details['Address2']
        if address2:
            address += ' ' + address2
        address += '\n%s, NV %s' % (details['City'], details['Zip'])
        phone = details['LCBPhone']
        email = details['LCBEmail']

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        # BUG FIX: this was guarded by `if phone:`, so the email detail
        # was added (possibly empty) whenever a phone existed and
        # skipped whenever it didn't.
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        person.add_link(leg_details_url)
        person.add_source(leg_details_url)
        yield person
def handle_list_item(self, item):
    """Parse one roster table row and yield a Person for the member."""
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    name_text = info_nodes[0].xpath('./b/text()')[0]
    url = info_nodes[0].get('href')

    # e.g. "Jane Doe (01A, DFL)" -> name, district, party abbreviation
    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # BUG FIX: previously a failed validation left `phone`/`email`
    # unbound and raised NameError below; initialize and guard instead.
    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_node = info_nodes[1]
    email_text = email_node.text
    email_text = email_text.replace('Email: ', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address)
    if phone:
        rep.add_contact_detail(type='voice', value=phone)
    if email:
        rep.add_contact_detail(type='email', value=email)
    rep.add_source(self.url)

    yield rep
def _scrape_legislator(self, row, chamber):
    """Build a Person from one roster table row, following the member's
    detail page for the mailing address."""
    name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
    name = ' '.join(
        part.strip() for part in name_cell.text_content().split('\n')
        if part.strip()
    )

    party_letter = row.xpath(
        './td[@class="rosterCell partyCell"]/text()')[0].strip()
    party = dict(D='Democratic', R='Republican')[party_letter]

    # Seat cells look like "<chamber abbr> <number>"; keep the number.
    chamber_abbr = self._chamber_map[chamber]
    district = row.xpath('./td[@class="rosterCell seatCell"]'
                         '/text()')[0].replace(chamber_abbr, '').strip()

    try:
        email = row.xpath('./td[@class="rosterCell emailCell"]'
                          '/a/@href')[0].replace('mailto:', '').strip()
    except IndexError:
        email = None

    phone = row.xpath('./td[@class="rosterCell phoneCell"]'
                      '/text()')[0].strip() or None

    details_url = 'https://leg.mt.gov{}'.format(name_cell.attrib['href'])
    details_page = lxml.html.fromstring(self.get(details_url).text)
    raw_address = details_page.xpath(
        '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
        '/p[contains(text(), "Address")]'
    )[0].text_content().replace('Address', '')
    address = '\n'.join(
        part.strip() for part in raw_address.split('\n') if part.strip()
    )

    legislator = Person(name=name, district=district, party=party,
                        primary_org=chamber)
    legislator.add_contact_detail(type='address', value=address,
                                  note='Capitol Office')
    if phone is not None:
        legislator.add_contact_detail(type='voice', value=phone,
                                      note='Capitol Office')
    if email is not None:
        legislator.add_contact_detail(type='email', value=email,
                                      note='E-mail')

    legislator.add_link(details_url)
    legislator.add_source(self._roster_url)

    yield legislator
def get_member(self, session, chamber, kpid):
    """Fetch one member record from the KS API and yield a Person."""
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    # Session -> URL slug used by the member bio pages.
    slug = {'2013-2014': 'b2013_14',
            '2015-2016': 'b2015_16',
            '2017-2018': 'b2017_18',
            '2019-2020': 'b2019_20',
            }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    note = 'Capitol Office'
    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'],
                                  note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def scrape_senator(self, district):
    """Scrape a Maine senator's district page and yield a Person.

    :param district: numeric district used to build the page URL
    """
    link = "https://legislature.maine.gov/District-{}".format(district)
    page = lxml.html.fromstring(self.get(link).text)
    page.make_links_absolute(link)
    main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]

    title = main.xpath('h1')[0].text
    # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
    title_match = re.match(
        r'District (\d+) - State Senator ([^\(]+) \(([DRI])', title)
    _, name, party = title_match.groups()
    name = re.sub(r'\s+', ' ', name.strip())
    party = _party_map[party]

    # Walk the content paragraphs for photo + "Label: value" fields.
    image_url = address = phone = email = None
    for p in main.xpath('p'):
        if p.xpath('.//img') and not image_url:
            image_url = p.xpath('.//img/@src')[0]
            continue
        field, _, value = p.text_content().partition(":")
        value = value.strip()
        if field in ('Address', 'Mailing Address'):
            address = value
        elif field in ('Phone', 'Home Phone'):
            phone = value
        elif field == 'Email':
            email = value

    person = Person(
        name=name,
        district=district,
        image=image_url,
        primary_org='upper',
        party=party,
    )
    person.add_link(link)
    person.add_source(link)

    if address:
        person.add_contact_detail(type='address', value=address,
                                  note='District Office')
    if phone:
        person.add_contact_detail(
            type='voice', value=clean_phone(phone), note='District Phone')
    # BUG FIX: email was added unconditionally, recording a None value
    # whenever the page had no Email field; guard it like the others.
    if email:
        person.add_contact_detail(type='email', value=email,
                                  note='District Email')

    yield person
def scrape_chamber(self, chamber):
    """Scrape North Carolina legislators for one chamber and yield Persons.

    Walks the no-photo member list table, then fetches each member's own
    page for photo, district office, and Capitol office contact details.

    :param chamber: 'lower' (House) or anything else (Senate)
    """
    url = "http://www.ncga.state.nc.us/gascripts/members/"\
        "memberListNoPic.pl?sChamber="
    if chamber == 'lower':
        url += 'House'
    else:
        url += 'Senate'
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute('http://www.ncga.state.nc.us')
    rows = doc.xpath('//div[@id="mainBody"]/table/tr')
    # first row is the table header
    for row in rows[1:]:
        party, district, full_name, counties = row.getchildren()
        # party cell renders as "(D)" / "(R)" — strip parens, map to full name
        party = party.text_content().strip("()")
        party = party_map[party]
        district = district.text_content().replace("District", "").strip()
        # a <span> inside the name cell carries a status notice, if any
        notice = full_name.xpath('span')
        if notice:
            notice = notice[0].text_content()
            # skip resigned legislators
            if 'Resigned' in notice or 'Deceased' in notice:
                continue
        else:
            notice = None
        link = full_name.xpath('a/@href')[0]
        full_name = full_name.xpath('a')[0].text_content()
        # replace non-breaking spaces with plain spaces
        full_name = full_name.replace(u'\u00a0', ' ')
        # scrape legislator page details
        lhtml = self.get(link).text
        ldoc = lxml.html.fromstring(lhtml)
        ldoc.make_links_absolute('http://www.ncga.state.nc.us')
        photo_url = ldoc.xpath('//a[contains(@href, "pictures")]/@href')[0]
        phone = get_table_item(ldoc, 'Phone:') or None
        address = get_table_item(ldoc, 'Address:') or None
        # the mailto link anchors the Capitol contact block; phone and
        # address live in the two preceding table rows
        email = ldoc.xpath('//a[starts-with(@href, "mailto:")]')[0]
        capitol_email = email.text
        capitol_phone = email.xpath('ancestor::tr[1]/preceding-sibling::tr[1]/td/span')[0].text
        capitol_address = email.xpath('ancestor::tr[1]/preceding-sibling::tr[2]/td/text()')
        capitol_address = [x.strip() for x in capitol_address]
        capitol_address = '\n'.join(capitol_address) or None
        capitol_phone = capitol_phone.strip() or None
        # save legislator
        person = Person(name=full_name, district=district,
                        party=party, primary_org=chamber,
                        image=photo_url)
        person.extras['notice'] = notice
        person.add_link(link)
        person.add_source(link)
        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if capitol_address:
            person.add_contact_detail(type='address', value=capitol_address,
                                      note='Capitol Office')
        if capitol_phone:
            person.add_contact_detail(type='voice', value=capitol_phone,
                                      note='Capitol Office')
        if capitol_email:
            person.add_contact_detail(type='email', value=capitol_email,
                                      note='Capitol Office')
        yield person
def scrape_upper_leg_page(self, url, who):
    """Scrape one upper-chamber member's profile page and yield a Person.

    The page's information table is flattened to a list of text strings and
    parsed positionally: each label ("Party:", "District Phone", ...) is
    located with list.index() and the value read from the following entry.

    :param url: the member's profile page URL
    :param who: caller-supplied name; NOTE it is immediately overwritten by
        the name parsed from the page itself
    """
    page = self.lxmlize(url)
    (who, ) = [x
               for x in page.xpath('//tr/td/font/text()')
               if x.strip().startswith("Senator ")
               ]
    who = re.search(r'(?u)^\s*Senator\s*(.*?)\s*$', who).group(1)
    # vacant seats have a placeholder page — nothing to record
    if 'Vacant' in who:
        return
    (district, ) = [x
                    for x in page.xpath('//tr/td/font/text()')
                    if x.strip().startswith("District - ")
                    ]
    district = re.search(
        r'(?u)^\s*District\s*-\s*(.*?)\s*$', district).group(1)
    # flatten the whole "Information:" table to non-empty text fragments
    info = [x.strip()
            for x in page.xpath('//font[contains(text(), "Information:")]/'
                                'ancestor::table[1]//text()')
            if x.strip()
            ]
    parties = {
        "Republican": "Republican",
        "Democrat": "Democratic",
    }
    party_index = info.index("Party:") + 1
    party = parties[info[party_index]]
    phone_index = info.index("District Phone") + 1
    phone = info[phone_index]
    # sanity-check: a US phone number contains exactly 10 digits
    assert sum(c.isdigit() for c in phone) == 10, "Phone number is invalid: {}".format(phone)
    # Address exists for all lines between party and phone
    address = "\n".join(info[party_index + 2:phone_index - 1])
    address = address.replace("\r", "")
    if not address:
        address = "No Address Found"
    fax_index = info.index("Fax") + 1
    fax = info[fax_index]
    assert sum(c.isdigit() for c in fax) == 10, "Fax number is invalid: {}".format(fax)
    email_index = info.index("E-mail Address") + 1
    email = info[email_index]
    assert "@" in email, "Email info is not valid: {}".format(email)
    person = Person(name=who,
                    district=district,
                    party=party,
                    primary_org="upper")
    contacts = [
        (address, "address"),
        (phone, "voice"),
        (email, "email"),
        (fax, "fax"),
    ]
    for value, key in contacts:
        if value:
            person.add_contact_detail(type=key,
                                      value=value,
                                      note="District Office")
    person.add_source(url)
    person.add_link(url)
    yield person
def scrape_chamber(self, chamber):
    """Scrape South Carolina legislators for one chamber.

    Yields Person objects, plus an Organization for each committee the
    first time it is seen (committees are deduplicated across members via
    ``seen_committees``).

    NOTE: page elements are located by their inline style attributes, so
    this is extremely sensitive to site markup changes.
    """
    if chamber == "lower":
        url = "http://www.scstatehouse.gov/member.php?chamber=H"
    else:
        url = "http://www.scstatehouse.gov/member.php?chamber=S"
    # committee name -> Organization, so each committee is yielded once
    seen_committees = {}
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)
    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get("href")
        # drop honorific prefixes
        if full_name.startswith("Senator"):
            full_name = full_name.replace("Senator ", "")
        if full_name.startswith("Representative"):
            full_name = full_name.replace("Representative ", "")
        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)
        if "Resigned effective" in leg_html:
            self.info("Resigned")
            continue
        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')
        if "Republican" in party:
            party = "Republican"
        elif "Democrat" in party:
            party = "Democratic"
        # District # - County - Map
        district = district.split()[1]
        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ""
        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )
        # office address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left; width: 225px;'
                ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
            capitol_address = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()
            phone = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            capitol_phone = phone.strip()
            if capitol_address:
                person.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")
            if capitol_phone:
                person.add_contact_detail(type="voice",
                                          value=capitol_phone,
                                          note="Capitol Office")
        except IndexError:
            self.warning("no capitol address for {0}".format(full_name))
        # home address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left;'
                ' width: 225px; margin: 10px 0 0 20px;"]')[0]
            addr = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()
            phone = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            if addr:
                person.add_contact_detail(type="address", value=addr,
                                          note="District Office")
            if phone:
                person.add_contact_detail(type="voice", value=phone,
                                          note="District Office")
        except IndexError:
            self.warning("no district address for {0}".format(full_name))
        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)
        # committees (skip first link)
        for com in leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]:
            # a trailing ", <abbrev>" on the link text encodes the role
            if com.text.endswith(", "):
                committee, role = com.text_content().rsplit(", ", 1)
                # known roles
                role = {
                    "Treas.": "treasurer",
                    "Secy.": "secretary",
                    "Secy./Treas.": "secretary/treasurer",
                    "V.C.": "vice-chair",
                    "1st V.C.": "first vice-chair",
                    "Co 1st V.C.": "co-first vice-chair",
                    "2nd V.C.": "second vice-chair",
                    "3rd V.C.": "third vice-chair",
                    "Ex.Officio Member": "ex-officio member",
                    "Chairman": "chairman",
                }[role]
            else:
                committee = com.text
                role = "member"
            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification="committee",
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]
            person.add_membership(com, role=role)
        yield person
def scrape_lower_chamber(self, term):
    """Scrape Puerto Rico House members and yield Person objects.

    :param term: unused here; kept for the caller's interface
    """
    # E-mail contact is now hidden behind webforms. Sadness.
    party_map = {'PNP': 'Partido Nuevo Progresista',
                 'PPD': u'Partido Popular Democr\xe1tico',
                 'PIP': u'Partido Independentista Puertorrique\u00F1o',
                 }
    url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx'
    page = self.lxmlize(url)
    member_nodes = self.get_nodes(page, '//li[@class="selectionRep"]')
    for member_node in member_nodes:
        # list entry text is "Hon. <name> ... <district line>"
        member_info = member_node.text_content().strip().split("\n")
        name = re.sub(r'^Hon\.', '', member_info[0]).strip()
        district_text = member_info[-1].strip()
        if district_text == 'Representante por Acumulación':
            district = 'At-Large'
        else:
            district = district_text.replace("Representante del Distrito ", "").strip()
        photo_url = self.get_node(member_node, './/img/@src')
        rep_link = self.get_node(member_node, ".//a/@href")
        rep_page = self.lxmlize(rep_link)
        party_node = self.get_node(rep_page, '//span[@class="partyBio"]')
        # Albelo doesn't seem to have a "partyBio" as an independent, but we
        # expect this to exist for all other members.
        if not party_node and name == "Manuel A. Natal Albelo":
            party = "Independent"
        else:
            party_text = party_node.text_content().strip()
            party = party_map[party_text]
        address = self.get_node(rep_page, '//h6').text.strip().split("\n")[0].strip()
        # Only grabs the first validated phone number found.
        # Typically, representatives have multiple phone numbers.
        phone_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Tel.")]')
        phone = None
        possible_phones = phone_node.text.strip().split("\n")
        for phone_attempt in possible_phones:
            # Don't keep searching phone numbers if a good one is found.
            if phone:
                break
            phone_text = re.sub(r'^Tel\.[\s]*', '', phone_attempt).strip()
            if validate_phone_number(phone_text):
                phone = phone_text
    
        fax_node = self.get_node(
            rep_page,
            '//span[@class="data-type" and contains(text(), "Fax.")]')
        fax = None
        if fax_node:
            fax_text = fax_node.text.strip()
            fax_text = re.sub(r'^Fax\.[\s]*', '', fax_text).strip()
            if validate_phone_number(fax_text):
                fax = fax_text
        person = Person(primary_org='lower',
                        district=district,
                        name=name,
                        party=party,
                        image=photo_url)
        person.add_link(rep_link)
        person.add_source(rep_link)
        person.add_source(url)
        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='Capitol Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='Capitol Office')
        yield person
def scrape(self):
    """Scrape Pittsburgh City Council members and committees via Legistar.

    Combines the Legistar API (terms, addresses, phones) with the Legistar
    web pages (emails), then yields each committee Organization and each
    member Person.
    """
    body_types = self.body_types()
    # trailing-comma unpack asserts exactly one "City Council" body
    city_council, = [body for body in self.bodies()
                     if body["BodyName"] == "City Council"]
    # member full name -> list of office-record (term) dicts
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if "VACAN" not in office["OfficeRecordFullName"]:
            terms[office["OfficeRecordFullName"].strip()].append(office)
    # secondary web scraper used only to recover e-mail addresses,
    # which the API does not expose
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
    web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"
    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False
    web_info = {}
    for member in web_scraper.councilMembers():
        web_info[member["Person Name"]] = member
    members = {}
    for member, offices in terms.items():
        person = Person(member)
        for term in offices:
            role = term["OfficeRecordTitle"]
            person.add_term("Councilmember",
                            "legislature",
                            start_date=self.toDate(term["OfficeRecordStartDate"]),
                            end_date=self.toDate(term["OfficeRecordEndDate"]))
        if member in web_info:
            web = web_info[member]
            if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                person.add_contact_detail(type="email",
                                          value=web["E-mail"]["label"],
                                          note="E-mail")
        # NOTE(review): `term` here is whichever office record the inner
        # loop ended on — presumably the most recent term; confirm ordering.
        person_source_data = self.person_sources_from_office(term)
        person_api_url, person_api_response = person_source_data
        person.add_source(person_api_url, note="api")
        if person_api_response["PersonAddress1"]:
            address = (person_api_response["PersonAddress1"] + ", "
                       + person_api_response["PersonCity1"] + ", "
                       + person_api_response["PersonState1"] + " "
                       + person_api_response["PersonZip1"])
            person.add_contact_detail(type="address",
                                      value=address,
                                      note="Office address")
        if person_api_response["PersonPhone"]:
            person.add_contact_detail(type="voice",
                                      value=person_api_response["PersonPhone"],
                                      note="Office phone")
        if person_api_response["PersonWWW"]:
            person.add_contact_detail(type="url",
                                      value=person_api_response["PersonWWW"],
                                      note="District website")
        members[member] = person
    for body in self.bodies():
        if body["BodyTypeId"] == body_types["Committee"]:
            body_name_clean = body["BodyName"].strip()
            organization = Organization(body_name_clean,
                                        classification="committee",
                                        parent_id={"name": "Pittsburgh City Council"})
            organization.add_source(self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")
            for office in self.body_offices(body):
                role = office["OfficeRecordMemberType"]
                # NOTE(review): the `or role == "Councilmember"` clause is
                # dead — "Councilmember" already fails the `not in` test.
                if role not in ("Vice Chair", "Chair") or role == "Councilmember":
                    role = "Member"
                person = office["OfficeRecordFullName"].strip()
                if person in members:
                    person = members[person]
                else:
                    person = Person(person)
                person.add_membership(body_name_clean,
                                      role=role,
                                      start_date=self.toDate(office["OfficeRecordStartDate"]),
                                      end_date=self.toDate(office["OfficeRecordEndDate"]))
            yield organization
    for person in members.values():
        yield person
def handle_list_item(self, item):
    """Build a Person from one Florida House member-list item.

    Returns the Person, or None for vacant/resigned/pending seats.
    Attempts to recover the member's email by generating candidates from
    their name and matching against the set scraped from the directory PDF.
    """
    link = item.xpath('.//div[contains(@class, "rep_style")]/a')[0]
    name = link.text_content().strip()
    if "Vacant" in name or "Resigned" in name or "Pending" in name:
        return
    party = item.xpath(
        './/div[contains(@class, "party_style")]/text()')[0].strip()
    party = {"D": "Democratic", "R": "Republican"}[party]
    district = item.xpath(
        './/div[contains(@class, "district_style")]/text()')[0].strip()
    leg_url = link.get("href")
    # member photo URL is derived from the MemberId query parameter
    split_url = parse.urlsplit(leg_url)
    member_id = parse.parse_qs(split_url.query)["MemberId"][0]
    image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(
        member_id)
    name = fix_name(name)
    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=image,
    )
    rep.add_link(leg_url)
    rep.add_source(leg_url)
    rep.add_source(self.url)
    self.scrape_page(RepDetail, leg_url, obj=rep)
    # look for email in the list from the PDF directory - ideally
    # we'd find a way to better index the source data which
    # wouldn't require guessing the email, but this does at least
    # confirm that it's correct
    # deal with some stuff that ends up in name that won't work in
    # email, spaces, quotes, high latin1
    email_name = rep.name.replace('"', "").replace("La ", "La").replace("ñ", "n")
    (last, *other) = re.split(r"[-\s,]+", email_name)
    # deal with a missing nickname used in an email address
    if "Patricia" in other:
        other.append("Pat")
    # search through all possible first names and nicknames
    # present - needed for some of the more elaborate concoctions
    found_email = False
    for first in other:
        # NOTE(review): this template literal appears redacted in the
        # source (asterisks); presumably it was a %-format string like
        # "%s.%s@..." — confirm against the original repository.
        email = "*****@*****.**" % (first, last)
        if email in self.member_emails:
            # it's bad if we can't uniquely match emails, so throw an error
            if email in self.claimed_member_emails:
                raise ValueError(
                    "Email address %s matches multiple reps - %s and %s."
                    % (email, rep.name, self.claimed_member_emails[email]))
            self.claimed_member_emails[email] = rep.name
            rep.add_contact_detail(type="email", value=email,
                                   note="Capitol Office")
            rep.add_source(self.directory_pdf_url)
            found_email = True
            break
    if not found_email:
        log.warning("Rep %s does not have an email in the directory PDF."
                    % (rep.name, ))
    return rep
def scrape(self, session=None):
    """Scrape Vermont legislators from the loadAll JSON dump and yield Persons.

    :param session: session key; defaults to the latest session
    """
    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)
    # Load all members via the private API
    legislator_dump_url = (
        'http://legislature.vermont.gov/people/loadAll/{}'.
        format(year_slug))
    json_data = self.get(legislator_dump_url).text
    legislators = json.loads(json_data)['data']
    # Parse the information from each legislator
    for info in legislators:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        # Skip duplicate record for Christopher Mattos (appointed Rep September 2017)
        if info['PersonID'] == "29034":
            self.info("skipping first Christopher Mattos record")
            continue
        # Gather photo URL from the member's page
        member_url = ('http://legislature.vermont.gov/people/single/{}/{}'.
                      format(year_slug, info['PersonID']))
        page = self.lxmlize(member_url)
        (photo_url, ) = page.xpath('//img[@class="profile-photo"]/@src')
        # Also grab their state email address
        state_email = page.xpath(
            '//dl[@class="summary-table profile-summary"]/'
            'dt[text()="Email"]/following-sibling::dd[1]/a/text()')
        if state_email:
            (state_email, ) = state_email
        else:
            state_email = None
        district = info['District'].replace(" District", "")
        leg = Person(
            primary_org=self.CHAMBERS[info['Title']],
            district=district,
            party=info['Party'].replace("Democrat", "Democratic"),
            name="{0} {1}".format(info['FirstName'], info['LastName']),
            image=photo_url
        )
        # every member shares the State House capitol address
        leg.add_contact_detail(
            note="Capitol Office",
            type='address',
            value='Vermont State House\n115 State Street\nMontpelier, VT 05633'
        )
        if state_email:
            leg.add_contact_detail(note="Capitol Office", type='email', value=state_email)
        leg.add_contact_detail(
            note="District Office",
            type='address',
            value="{0}{1}\n{2}, {3} {4}".format(
                info['MailingAddress1'],
                ("\n" + info['MailingAddress2']
                 if info['MailingAddress2'].strip()
                 else ""),
                info['MailingCity'],
                info['MailingState'],
                info['MailingZIP']
            )
        )
        if info['HomePhone']:
            leg.add_contact_detail(note="District Office",
                                   type='voice',
                                   value=info['HomePhone'])
        # first non-empty of the three email fields wins
        district_email = info['Email'] or info['HomeEmail'] or info['WorkEmail']
        if district_email:
            leg.add_contact_detail(note="District Office",
                                   type='email',
                                   value=district_email)
        leg.add_link(member_url)
        leg.add_source(legislator_dump_url)
        leg.add_source(member_url)
        yield leg
def scrape_lower(self, chamber):
    """Scrape Michigan House members from the public rep list and yield Persons.

    :param chamber: unused here; the method always scrapes the House list
    """
    url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
    # column order of the rep-list table
    table = [
        "website",
        "district",
        "name",
        "party",
        "location",
        "phone",
        "email"
    ]
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath('.//td')
        if len(tds) == 0:
            continue
        # map each cell element to its column name
        metainf = {}
        for i in range(0, len(table)):
            metainf[table[i]] = tds[i]
        # int() round-trip strips leading zeros from the district number
        district = str(int(metainf['district'].text_content().strip()))
        party = metainf['party'].text_content().strip()
        phone = metainf['phone'].text_content().strip()
        email = metainf['email'].text_content().strip()
        name = metainf['name'].text_content().strip()
        if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
            self.warning(
                'District {} appears vacant, and will be skipped'.format(
                    district))
            continue
        leg_url = metainf['website'].xpath("./a")[0].attrib['href']
        # expand the site's building abbreviations to full mailing addresses
        office = metainf['location'].text_content().strip()
        office = re.sub(
            ' HOB',
            ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
            office)
        office = re.sub(' CB',
                        ' State Capitol Building\nLansing, MI 48909',
                        office)
        try:
            photo_url = self.get_photo_url(leg_url)[0]
        except (scrapelib.HTTPError, IndexError):
            photo_url = ''
            self.warning('no photo url for %s', name)
        person = Person(name=name,
                        district=district,
                        party=abbr[party],
                        primary_org='lower',
                        image=photo_url)
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
        person.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')
        yield person
def scrape(self):
    """Scrape Chicago council members and their committees from Legistar.

    Yields each member Person, then each distinct committee Organization.
    """
    # committee name -> Organization, built up across members
    committee_d = {}
    # these legislative bodies are not committees
    non_committees = ('City Council', 'Office of the Mayor')
    for councilman, committees in self.councilMembers():
        # rows with no ward/office are placeholders — skip them
        if councilman['Ward/Office'] == "":
            continue
        ward = councilman['Ward/Office']
        if ward not in ["Mayor", "Clerk", ]:
            ward = "Ward {}".format(int(ward))
        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature")
        if councilman['Photo']:
            p.image = councilman['Photo']
        # Legistar column name -> (contact detail type, note)
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)
        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')
        if councilman['Website']:
            p.add_link(councilman['Website']['url'])
        p.add_source(MEMBERLIST)
        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                # create each committee Organization only once
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee')
                    o.add_source(
                        "https://chicago.legistar.com/Departments.aspx")
                    committee_d[committee_name] = o
                o.add_member(p, role=committee["Title"])
        yield p
    for o in committee_d.values():
        yield o
def parse_senate(self, div, chamber):
    """Build a Person from one California senator's listing <div>.

    Returns the Person, or None when the name carries no recognized
    party suffix (treated as a non-member entry and skipped).
    """
    name = div.xpath('.//h3/text()')[0]
    # party is encoded as a suffix on the displayed name
    if name.endswith(' (R)'):
        party = 'Republican'
    elif name.endswith(' (D)'):
        party = 'Democratic'
    else:
        self.warning('skipping ' + name)
        return None
    name = name.split(' (')[0]
    # lstrip('0') drops a leading zero from single-digit districts
    district = div.xpath(
        './/div[contains(@class, "senator-district")]/div/text()'
    )[0].strip().lstrip('0')
    photo_url = div.xpath('.//img/@src')[0]
    person = Person(
        name=name,
        party=party,
        district=district,
        primary_org=chamber,
        image=photo_url,
    )
    url = div.xpath('.//a/@href')[0]
    person.add_link(url)
    # CA senators have working emails, but they're not putting them on
    # their public pages anymore
    email = self._construct_email(chamber, name)
    person.add_contact_detail(type='email', value=email,
                              note='Senate Office')
    office_path = './/div[contains(@class, "{}")]//p'
    for addr in div.xpath(
            office_path.format(
                'views-field-field-senator-capitol-office')):
        note = 'Senate Office'
        # capitol office renders as "address; phone"
        addr, phone = addr.text_content().split('; ')
        person.add_contact_detail(type='address', value=addr.strip(),
                                  note=note)
        person.add_contact_detail(type='voice', value=phone.strip(),
                                  note=note)
    # number district offices sequentially in the contact notes
    n = 1
    for addr in div.xpath(
            office_path.format(
                'views-field-field-senator-district-office')):
        note = 'District Office #{}'.format(n)
        for addr in addr.text_content().strip().splitlines():
            try:
                addr, phone = addr.strip().replace(u'\xa0', ' ').split('; ')
                person.add_contact_detail(type='address', value=addr.strip(),
                                          note=note)
                person.add_contact_detail(type='voice', value=phone.strip(),
                                          note=note)
            except ValueError:
                # line has no "; phone" part — record address only
                addr = addr.strip().replace(u'\xa0', ' ')
                person.add_contact_detail(type='address', value=addr.strip(),
                                          note=note)
        n += 1
    return person
def parse_assembly(self, tr, chamber):
    '''
    Given a tr element, get specific data from it.

    Extracts url/district/party/name/address fields via a table of xpath
    fragments keyed by field name, normalizes them, and returns a Person
    (or None when no name could be found).
    '''
    strip = methodcaller('strip')
    # xpath template: (chamber title, field suffix, trailing path) are
    # interpolated below for each entry in `xp`
    xpath = 'td[contains(@class, "views-field-field-%s-%s")]%s'
    xp = {
        'url': [('lname-sort', '/a[not(contains(text(), "edit"))]/@href')],
        'district': [('district', '/text()')],
        'party': [('party', '/text()')],
        'name': [('office-information', '/a[not(contains(text(), "edit"))]/text()')],
        'address': [('office-information', '/h3/following-sibling::text()'),
                    ('office-information', '/p/text()')],
    }
    titles = {'upper': 'senator', 'lower': 'member'}
    # per-field post-processing; identity for fields not listed
    funcs = {
        'name': lambda s: re.sub(
            # "Assembly" is misspelled once
            r'Contact Assembl?y Member', '', s).strip(),
        'address': parse_address,
    }
    tr_xpath = tr.xpath
    res = collections.defaultdict(list)
    for k, xpath_info in xp.items():
        for vals in xpath_info:
            f = funcs.get(k, lambda _: _)
            vals = (titles[chamber], ) + vals
            vals = map(f, map(strip, tr_xpath(xpath % vals)))
            res[k].extend(vals)
    # Photo.
    try:
        res['image'] = tr_xpath('td/p/img/@src')[0]
    except IndexError:
        pass
    # Remove junk from assembly member names.
    junk = 'Contact Assembly Member '
    try:
        res['name'] = res['name'].pop().replace(junk, '')
    except IndexError:
        return
    # Normalize party.
    for party in res['party'][:]:
        if party:
            if party == 'Democrat':
                party = 'Democratic'
            res['party'] = party
            break
    else:
        res['party'] = None
    # strip leading zero
    res['district'] = str(int(res['district'].pop()))
    person = Person(
        name=res['name'],
        district=res.get('district'),
        party=res.get('party'),
        image=res.get('image'),
        primary_org=chamber,
    )
    # Mariko Yamada also didn't have a url that lxml would parse
    # as of 3/22/2013.
    if res['url']:
        person.add_link(res['url'].pop())
    # Addresses.
    addresses = res['address']
    try:
        addresses = map(dict, filter(None, addresses))
    except ValueError:
        # Sometimes legislators only have one address, in which
        # case this awful hack is helpful.
        addresses = map(dict, filter(None, [addresses]))
    addresses = list(addresses)
    for address in addresses:
        # Toss results that don't have required keys.
        # NOTE(review): this removes items from `addresses` while
        # iterating it, which can skip the element after a removal —
        # confirm whether malformed entries ever appear consecutively.
        if not set(['street', 'city', 'state_zip']) < set(address):
            if address in addresses:
                addresses.remove(address)
    # Re-key the addresses
    offices = []
    if addresses:
        # Mariko Yamada's addresses wouldn't parse correctly as of
        # 3/23/2013, so here we're forced to test whether any
        # addresses were even found.
        addresses[0].update(type='capitol', name='Capitol Office')
        offices.append(addresses[0])
        # CA reps have working emails, but they're not putting them on
        # their public pages anymore
        offices[0]['email'] = self._construct_email(chamber, res['name'])
    for n, office in enumerate(addresses[1:]):
        office.update(type='district',
                      name='District Office #{}'.format(n + 1))
        offices.append(office)
    for office in offices:
        street = office['street']
        state_zip = re.sub(r'\s+', ' ', office['state_zip'])
        street = '%s\n%s, %s' % (street, office['city'], state_zip)
        office['address'] = street
        office['fax'] = None
        if 'email' not in office:
            office['email'] = None
        note = office['name']
        person.add_contact_detail(type='address', value=office['address'],
                                  note=note)
        if office['phone']:
            person.add_contact_detail(type='voice', value=office['phone'],
                                      note=note)
        if office['email']:
            person.add_contact_detail(type='email', value=office['email'],
                                      note=note)
    return person
def scrape_chamber(self, chamber, session):
    """Scrape Wisconsin legislators for one chamber/session and yield Persons.

    Walks the docs.legis district boxes, follows each "Details" link, and
    extracts name, party, district, photo, and Capitol Office contacts.

    :param chamber: 'upper' (senate) or 'lower' (assembly)
    :param session: session slug interpolated into the listing URL
    """
    url = 'https://docs.legis.wisconsin.gov/{}/legislators/{}'.format(
        session, {
            'upper': 'senate',
            'lower': 'assembly'
        }[chamber],
    )
    body = self.get(url).text
    page = lxml.html.fromstring(body)
    page.make_links_absolute(url)
    for row in page.xpath(
            ".//div[@class='box-content']/div[starts-with(@id,'district')]"
    ):
        # skip rows with no links and vacant seats
        if row.xpath(
                ".//a/@href") and not row.xpath(".//a[text()='Vacant']"):
            rep_url = row.xpath(".//a[text()='Details']/@href")[0]
            # Fix: the original used rep_url.strip("https://"), but
            # str.strip removes any of the *characters* "htps:/" from both
            # ends — it could also eat trailing letters of the URL (e.g. a
            # path ending in 's'). Normalize the scheme explicitly instead.
            if rep_url.startswith("http://"):
                rep_url = "https://" + rep_url[len("http://"):]
            elif not rep_url.startswith("https://"):
                rep_url = "https://" + rep_url.lstrip(":/")
            rep_doc = lxml.html.fromstring(self.get(rep_url).text)
            rep_doc.make_links_absolute(rep_url)
            full_name = rep_doc.xpath(
                './/div[@id="district"]/h1/text()')[0].replace(
                "Senator ", "").replace("Representative ", "")
            # party renders as e.g. "(R - Madison)"
            party = rep_doc.xpath('.//div[@id="district"]//small/text()')
            if len(party) > 0:
                party = PARTY_DICT[party[0].split("-")[0].strip(
                    "(").strip()]
            else:
                party = None
            # district number is the last path segment of the second link
            district = rep_doc.xpath(
                './/div[@id="district"]/h3/a/@href')[1]
            district = district.split("/")[-1]
            district = str(int(district))
            # email
            email = rep_doc.xpath("//span[@class='info email']/a/text()")
            if email:
                email = email[0]
            else:
                email = ''
            assert party is not None, "{} is missing party".format(
                full_name)
            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
            )
            img = rep_doc.xpath('.//div[@id="district"]/img/@src')
            if img:
                person.image = img[0]
            # office ####
            address_lines = rep_doc.xpath(
                './/span[@class="info office"]/text()')
            address = '\n'.join([
                line.strip() for line in address_lines
                if line.strip() != ""
            ])
            person.add_contact_detail(type='address', value=address,
                                      note='Capitol Office')
            phone = rep_doc.xpath(
                './/span[@class="info telephone"]/text()')
            if phone:
                # index 1: first text node is the "Telephone:" label
                phone = re.sub(r'\s+', ' ', phone[1]).strip()
                person.add_contact_detail(type='voice', value=phone,
                                          note='Capitol Office')
            fax = rep_doc.xpath('.//span[@class="info fax"]/text()')
            if fax:
                fax = re.sub(r'\s+', ' ', fax[1]).strip()
                person.add_contact_detail(type='fax', value=fax,
                                          note='Capitol Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='Capitol Office')
            person.add_link(rep_url)
            person.add_source(rep_url)
            yield person
def get_people(self):
    """Scrape Miami-Dade elected officials and yield Person objects.

    Commissioner pages (positions containing "district") are scraped
    further for per-office phone/fax and an email address.
    """
    people_base_url = "http://miamidade.gov/wps/portal/Main/government"
    doc = self.lxmlize(people_base_url)
    person_list = doc.xpath("//div[contains(@id,'elected')]//span")
    titles = ["Chairman", "Vice Chair"]
    for person in person_list:
        # entry text is "\r"-separated: position, name parts, trailing links
        info = person.text_content().strip().split("\r")
        position = info[0].strip()
        name = " ".join(info[1:-1])
        # scrub link text and titles that bleed into the name
        name = name.replace("Website | Contact", "")
        for title in titles:
            name = name.replace(title, "")
        name = name.strip()
        url = person.xpath(".//a[contains(text(),'Website')]/@href")[0]
        image = person.xpath(".//img/@src")[0]
        pers = Person(name=name, image=image,
                      primary_org='legislature', role=position)
        pers.add_source(people_base_url, note="Miami-Dade government website")
        pers.add_source(url, note="individual's website")
        #the commissioners have consistent site format
        if "district" in position.lower():
            person_doc = self.lxmlize(url)
            contact_rows = person_doc.xpath(
                "//div[@class='leftContentContainer']//p")
            for line in contact_rows:
                line_text = line.text_content()
                if "email" in line_text.lower():
                    email_address = line_text.replace("Email:", "").strip()
                    pers.add_contact_detail(type="email", value=email_address)
                    continue
                try:
                    # normal layout: office / phone / fax on separate lines
                    office, phone, fax = line_text.strip().split("\n")
                except ValueError:
                    #ick, it's all on one line.
                    if "downtown office" in line_text.lower():
                        office = "Downtown Office"
                    elif "district office" in line_text.lower():
                        office = "District Office"
                    else:
                        continue
                    # fixed character offsets for the one-line layout —
                    # fragile; presumably matches "...Phone<num> Fax<num>"
                    phone = line_text[15:27]
                    fax = line_text[33:45]
                if "office" not in office.lower():
                    continue
                #social is also available in here
                #but I don't see a place to put it
                phone = phone.replace("Phone", "").strip()
                fax = fax.replace("Fax", "").strip()
                pers.add_contact_detail(
                    type="voice",  #phone is not allowed ????
                    value=phone,
                    note=office.strip())
                pers.add_contact_detail(
                    type="fax",  #phone is not allowed ????
                    value=fax,
                    note=office.strip())
        yield pers
def scrape_table(self, chamber, tbl):
    """Scrape one Maryland member-list table and yield Person objects.

    Follows each member's detail page for party, Annapolis address,
    phone, fax, email, and photo.

    :param chamber: primary organization for the Person
    :param tbl: lxml element of the member-list <table>
    """
    # skip first (header) row
    for row in tbl.xpath('tr')[1:]:
        leg_a, district, _, _ = row.xpath('td')
        district = district.text
        name = leg_a.text_content().strip()
        if name.lower() == "to be announced":
            continue
        leg_url = leg_a.xpath('a/@href')[0]
        # get details
        html = self.get(leg_url).text
        ldoc = lxml.html.fromstring(html)
        ldoc.make_links_absolute(leg_url)
        party = _get_table_item(ldoc, 'Party Affiliation:').text
        if party == 'Democrat':
            party = 'Democratic'
        addr_lines = _get_table_item(ldoc, 'Annapolis Address:').xpath('text()')
        address = []
        phone = None
        fax = None
        for line in addr_lines:
            if 'Phone:' in line:
                phone = re.findall(r'Phone: (\d{3}-\d{3}-\d{4})', line)[0]
            elif 'Fax:' in line:
                # Number oddities: one has two dashes, one has a dash and then a space.
                line = line.replace('--', '-').replace('- ', '-')
                fax = re.findall(r'Fax: (\d{3}-\d{3}-\d{4})', line)[0]
            else:
                address.append(line)
        address = '\n'.join(address)
        email = ldoc.xpath('//a[contains(@href, "mailto:")]/text()')
        if not email:
            email = None
        elif len(email) == 1:
            email = email[0].strip()
        else:
            raise AssertionError('Multiple email links found on page')
        # Fix: photo_url was previously assigned only when an image was
        # found, so the first member without one raised NameError and
        # later ones silently reused the previous member's photo.
        photo_url = ''
        img_src = ldoc.xpath('//img[@class="sponimg"]/@src')
        if img_src:
            photo_url = img_src[0]
        # convert "Last, First" to "First Last"
        name = ' '.join(name.split(', ')[::-1])
        leg = Person(
            primary_org=chamber,
            district=district,
            name=name,
            party=party,
            image=photo_url,
        )
        leg.add_source(url=leg_url)
        leg.add_link(url=leg_url)
        if address:
            leg.add_contact_detail(
                type='address',
                value=address,
                note='Capitol Office'
            )
        if phone:
            leg.add_contact_detail(
                type='voice',
                value=phone,
                note='Capitol Office'
            )
        if fax:
            leg.add_contact_detail(
                type='fax',
                value=fax,
                note='Capitol Office'
            )
        if email:
            leg.add_contact_detail(
                type='email',
                value=email,
                note='Capitol Office'
            )
        yield leg
def scrape_upper(self, chamber):
    """Scrape Michigan Senate members and yield a Person per senator.

    Walks the senator list table; probes the Senate image directory for a
    portrait, and follows each "Contact Me" link for an email address.
    """
    url = 'http://www.senate.michigan.gov/senatorinfo_list.html'
    url_to_append = 'http://www.senate.michigan.gov/_images/'
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    for row in doc.xpath('//table[not(@class="calendar")]//tr')[3:]:
        if len(row) != 7:
            continue
        # party, dist, member, office_phone, office_fax, office_loc
        party, dist, member, contact, phone, fax, loc = row.getchildren()
        if (party.text_content().strip() == "" or
                'Lieutenant Governor' in member.text_content()):
            continue

        party = abbr[party.text]
        district = dist.text_content().strip()
        name = member.text_content().strip()
        name = re.sub(r'\s+', " ", name)

        # FIX: check vacancy before probing for a portrait. The original
        # issued up to two HTTP HEAD requests (and parsed a surname) for
        # "Vacant" rows, only to skip them immediately afterwards.
        if name == 'Vacant':
            self.info('district %s is vacant', district)
            continue

        # Portraits are named after the (apostrophe-free) surname; try
        # .png first, then .jpg, else give up.
        surname = re.split(', | ', name)
        surname[0] = re.sub('[\']', '', surname[0])
        try:
            self.head(url_to_append + surname[0] + '.png')
            photo_url = url_to_append + surname[0] + '.png'
        except scrapelib.HTTPError:
            try:
                self.head(url_to_append + surname[0] + '.jpg')
                photo_url = url_to_append + surname[0] + '.jpg'
            except scrapelib.HTTPError:
                photo_url = None

        leg_url = member.xpath('a/@href')[0]
        office_phone = phone.text
        office_fax = fax.text

        # Expand the building abbreviations into full mailing addresses.
        office_loc = loc.text
        office_loc = re.sub(
            ' Farnum Bldg',
            ' Farnum Office Building\n125 West Allegan Street\nLansing, MI 48933',
            office_loc)
        office_loc = re.sub(
            ' Capitol Bldg',
            ' State Capitol Building\nLansing, MI 48909',
            office_loc)

        # Email addresses aren't on the list page anymore but they are on
        # the page linked off "Contact Me". ('Conact Me' covers a typo in
        # one row of the source data.)
        contact_url = [
            a for a in row.xpath(".//a")
            if a.text in ('Contact Me', 'Conact Me')
        ][0].get('href')
        contact_html = self.get(contact_url).text
        contact_doc = lxml.html.fromstring(contact_html)

        email = None
        header_email = contact_doc.xpath("//a[@class='header_email']")
        if header_email:
            email = header_email[0].text
        else:
            # Not using the most common template, but maybe they dropped
            # their email on the page somewhere as a mailto link.
            links = contact_doc.xpath('//a') or []
            text_email = [
                a for a in links
                if 'mailto:' in (a.get('href') or '')
            ]
            if text_email:
                email = text_email[0].text

        person = Person(name=name, district=district, party=party,
                        primary_org='upper', image=photo_url)
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office_loc,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=office_phone,
                                  note='Capitol Office')
        person.add_contact_detail(type='fax', value=office_fax,
                                  note='Capitol Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='Capitol Office')
        yield person
def scrape_details(self, chamber, leg_name, leg_link, role):
    """Scrape one Mississippi member's XML detail record; yield a Person.

    :param chamber: 'upper' or 'lower'.
    :param leg_name: display name from the roster page.
    :param leg_link: relative path to the member's XML document.
    :param role: roster role passed straight through to Person.
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")

    url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
    url_root = os.path.dirname(url)

    # FIX: only the HTTP fetch can raise scrapelib.HTTPError, but the
    # original wrapped the entire method body in the try block. Keep the
    # try minimal so genuine parsing bugs are no longer hidden behind the
    # "warn and move on" handler.
    try:
        details_page = self.get(url)
    except scrapelib.HTTPError as e:
        self.warning(str(e))
        return

    root = lxml.etree.fromstring(details_page.content)
    party = root.xpath('string(//PARTY)')

    district = root.xpath('string(//DISTRICT)')
    photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

    home_phone = root.xpath('string(//H_PHONE)')

    home_address = root.xpath('string(//H_ADDRESS)')
    home_address2 = root.xpath('string(//H_ADDRESS2)')
    home_city = root.xpath('string(//H_CITY)')
    home_zip = root.xpath('string(//H_ZIP)')

    home_address_total = ''
    if home_address and home_city:
        if not home_address2:
            home_address_total = "%s\n%s, MS %s" % (
                home_address, home_city, home_zip)
        else:
            home_address_total = "%s\n%s\n%s, MS %s" % (
                home_address, home_address2, home_city, home_zip)

    # bis_phone = root.xpath('string(//B_PHONE)')
    capital_phone = root.xpath('string(//CAP_PHONE)')
    # other_phone = root.xpath('string(//OTH_PHONE)')
    org_info = root.xpath('string(//ORG_INFO)')
    email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
    cap_room = root.xpath('string(//CAP_ROOM)')

    # Special-case members whose XML omits a party; the asserts make sure
    # the workaround is removed once the source data is fixed.
    if leg_name in ('Lataisha Jackson', 'John G. Faulkner'):
        assert not party, (
            "Remove special-casing for this Democrat without a "
            "listed party: {}").format(leg_name)
        party = 'Democratic'
    elif leg_name in ('James W. Mathis', 'John Glen Corley'):
        assert not party, (
            "Remove special-casing for this Republican without"
            " a listed party: {}").format(leg_name)
        party = 'Republican'
    elif party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    else:
        raise AssertionError(
            "A member with no identifiable party was found: {}".format(
                leg_name))

    leg = Person(primary_org=chamber,
                 district=district,
                 party=party,
                 image=photo,
                 name=leg_name,
                 role=role)
    leg.extras['org_info'] = org_info
    leg.add_source(url)
    leg.add_link(url)

    if email_name != "":
        if "@" in email_name:
            email = email_name
        else:
            # Bare usernames get the chamber-specific state domain.
            email = '%s@%s.ms.gov' % (email_name, {
                "upper": "senate",
                "lower": "house"
            }[chamber])
        leg.add_contact_detail(type='email', value=email,
                               note='Capitol Office')

    if capital_phone != "":
        leg.add_contact_detail(type='voice', value=capital_phone,
                               note='Capitol Office')

    if cap_room != "":
        address = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
    else:
        address = CAP_ADDRESS
    leg.add_contact_detail(type='address', value=address,
                           note='Capitol Office')

    if home_phone != "":
        leg.add_contact_detail(type='voice', value=home_phone,
                               note='District Office')
    if home_address_total != "":
        leg.add_contact_detail(type='address', value=home_address_total,
                               note='District Office')
    yield leg
def _scrape_upper(self, roster_page, roster_url): """ Retrieves a list of members of the upper legislative chamber. """ # TODO: photo_urls https://senate.texas.gov/members.php # also available on individual member screens # TODO: email addresses could be scraped from secondary sources # https://github.com/openstates/openstates/issues/1292 for tbl in roster_page.xpath('//table[@class="memdir"]'): # Scrape legislator information from roster URL leg_a = tbl.xpath('.//a')[0] name = leg_a.text # Skip vacant districts if re.search(r'district \d+ constituent services', name, re.IGNORECASE): continue leg_url = leg_a.get('href') district = tbl.xpath( './/span[contains(text(), "District:")]')[0].tail.lstrip('0') party = tbl.xpath('.//span[contains(text(), "Party:")]')[0].tail if party == 'Democrat': party = 'Democratic' # Create Person object person = Person(name=name, district=district, party=party, primary_org='upper') person.add_link(leg_url) # Scrape office contact information from roster URL office_num = 1 for addr in tbl.xpath('.//td[@headers]'): fax = phone = None lines = [addr.text] for child in addr.getchildren(): # when we get to span tag we just ingested a phone # if child.tag == 'span' and child.text: if 'TEL' in child.text: phone = lines.pop() elif 'FAX' in child.text: fax = lines.pop() elif child.tail: lines.append(child.tail) address = '\n'.join(line.strip() for line in lines if line) if 'CAP' in addr.get('headers'): office_name = 'Capitol Office #{}'.format(office_num) office_num += 1 else: office_name = 'District Office' # Add office contact information to Person object if address: person.add_contact_detail(type='address', value=address, note=office_name) if phone: person.add_contact_detail(type='voice', value=phone, note=office_name) if fax: person.add_contact_detail(type='fax', value=fax, note=office_name) # Add source links to Person object person.add_source(roster_url) person.add_source(leg_url) yield person
def scrape_legislator(self, name, chamber, url, contact_page):
    """Scrape one South Dakota member's detail page; yield a Person.

    :param name: display name (already known from the roster).
    :param chamber: pupa chamber used as ``primary_org``.
    :param url: member detail page URL.
    :param contact_page: pre-fetched lxml doc of the shared contact page,
        used to recover the email address removed from detail pages.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    party = page.xpath("string(//span[contains(@id, 'Party')])")
    party = party.strip()
    if party == 'Democrat':
        party = 'Democratic'

    # Districts are zero-padded on the page ("05" -> "5").
    district = page.xpath("string(//span[contains(@id, 'District')])")
    district = district.strip().lstrip('0')

    occupation = page.xpath(
        "string(//span[contains(@id, 'Occupation')])")
    occupation = occupation.strip()

    (photo_url, ) = page.xpath('//img[contains(@id, "_imgMember")]/@src')

    office_phone = page.xpath(
        "string(//span[contains(@id, 'CapitolPhone')])").strip()

    legislator = Person(primary_org=chamber,
                        image=photo_url,
                        name=name,
                        party=party,
                        district=district
                        )
    legislator.extras['occupation'] = occupation
    if office_phone.strip() != "":
        legislator.add_contact_detail(
            type='voice', value=office_phone, note='Capitol Office')

    # SD removed email from the detail pages but it's still in the
    # contact page, shared for all congress people
    member_id = re.search(r'Member=(\d+)', url).group(1)

    # find the profile block by finding a link inside it to their
    # detail page
    profile_link = contact_page.xpath(
        '//ul[@id="contact-list"]//a[contains(@href, "Member=%s")]'
        % (member_id,))
    if profile_link:
        # look for the adjacent email mailto link
        profile_link = profile_link[0]
        profile_block = profile_link.getparent().getparent().getparent()
        email_link = profile_block.xpath(
            './span/span/a[@class="mail-break"]')
        if email_link:
            email = email_link[0].text
            email = email.lstrip()
            email = email.rstrip()
            if email:
                legislator.add_contact_detail(type='email',
                                              value=email,
                                              note='Capitol Office')

    # District (home) address/phone only added when an address exists.
    home_address = [
        x.strip() for x in
        page.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
        if x.strip()
    ]
    if home_address:
        home_address = "\n".join(home_address)
        home_phone = page.xpath(
            "string(//span[contains(@id, 'HomePhone')])").strip()
        legislator.add_contact_detail(type='address',
                                      value=home_address,
                                      note='District Office')
        if home_phone:
            legislator.add_contact_detail(type='voice',
                                          value=home_phone,
                                          note='District Office')

    legislator.add_source(url)
    legislator.add_link(url)

    # Committee rows link off the member page; memberships are attached
    # by the sibling scrape_committee helper.
    committees = page.xpath(
        '//div[@id="divCommittees"]/span/section/table/tbody/tr/td/a')
    for committee in committees:
        self.scrape_committee(legislator, url, committee, chamber)

    yield legislator
def scrape(self, session=None):
    """Scrape New Jersey legislators from the Access roster database.

    :param session: session name; defaults to the most recent session
        declared on the jurisdiction.
    Yields a Person per active member.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["name"]
        self.info("no session specified, using %s", session)
    year_abr = session[0:4]

    self._init_mdb(int(year_abr))

    roster_csv = self.access_to_csv("Roster")
    bio_csv = self.access_to_csv("LegBio")

    photos = {}
    for rec in bio_csv:
        photos[rec["Roster Key"]] = rec["URLPicture"]

    for rec in roster_csv:
        first_name = rec["Firstname"]
        middle_name = rec["MidName"]
        last_name = rec["LastName"]
        suffix = rec["Suffix"]
        # FIX: join only the non-empty name parts. The old code glued all
        # four parts with spaces, collapsed doubled spaces, then chopped
        # the final character — which removed the trailing space when
        # Suffix was empty but truncated a real suffix ("Jr." -> "Jr").
        full_name = " ".join(
            part for part in (first_name, middle_name, last_name, suffix)
            if part)

        district = str(int(rec["District"]))
        party = rec["Party"]
        if party == "R":
            party = "Republican"
        elif party == "D":
            party = "Democratic"
        # any other code is passed through unchanged

        chamber = rec["House"]
        if chamber == "A":
            chamber = "lower"
        elif chamber == "S":
            chamber = "upper"

        leg_status = rec["LegStatus"]
        # skip Deceased/Retired members
        if leg_status != "Active":
            continue

        phone = rec["Phone"] or None

        email = None
        if rec["Email"]:
            email = rec["Email"]
        # Email has been removed from the Access DB, but it's
        # still [email protected] and [email protected] - many
        # reps have these emails on their personal pages even if
        # they're gone from the DB file
        if not email:
            email = self._construct_email(chamber, rec["Sex"], last_name)

        try:
            photo_url = photos[rec["Roster Key"]]
        except KeyError:
            photo_url = ""
            self.warning("no photo url for %s", rec["Roster Key"])

        url = "http://www.njleg.state.nj.us/members/bio.asp?Leg=" + str(
            int(rec["Roster Key"]))
        address = "{0}\n{1}, {2} {3}".format(rec["Address"], rec["City"],
                                             rec["State"], rec["Zipcode"])
        gender = {"M": "Male", "F": "Female"}[rec["Sex"]]

        person = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            party=party,
            image=photo_url,
            gender=gender,
        )
        person.add_link(url)
        person.add_source(url)
        person.add_source("http://www.njleg.state.nj.us/downloads.asp")
        person.add_contact_detail(type="address", value=address,
                                  note="District Office")
        if phone is not None:
            person.add_contact_detail(type="voice", value=phone,
                                      note="District Office")
        if email is not None:
            person.add_contact_detail(type="email", value=email,
                                      note="District Office")
        yield person
def scrape_member(self, chamber, member_url):
    """Scrape one Massachusetts member's detail page; return a Person.

    :param chamber: pupa chamber used as ``primary_org``.
    :param member_url: absolute URL of the member page.
    """
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    photo_url = root.xpath('//div[@class="thumbPhoto"]/img/@src')[0]
    full_name = root.xpath('//h1/span')[0].tail.strip()

    # FIX: was a bare `except:`; only the [0] index can fail here (no
    # mailto link on the page), which usually means a vacant seat.
    try:
        email = root.xpath('//a[contains(@href, "mailto")]/@href')[0]
        email = email.replace('mailto:', '')
    except IndexError:
        email = ''
        self.info("seat may be vacant")

    # <h1> second span reads e.g. "Democrat - 1st Suffolk".
    party, district = root.xpath('//h1/span')[1].text.split('-')
    party = party.strip()
    district = clean_district(district.strip())

    if party in ('D', 'Democrat', 'Democratic'):
        party = 'Democratic'
    elif party in ('R', 'Republican'):
        party = 'Republican'
    else:
        party = 'Other'

    leg = Person(primary_org=chamber,
                 district=district,
                 name=full_name,
                 party=party,
                 image=photo_url)
    leg.add_link(member_url)
    leg.add_source(member_url)
    leg.add_contact_detail(type='email', value=email, note='District Office')

    # offices
    for addr in root.xpath('//address/div[@class="contactGroup"]'):
        office_name = addr.xpath(
            '../preceding-sibling::h4/text()')[0].strip()
        if 'District' in office_name:
            note = 'District Office'
        elif 'State' in office_name:
            note = 'Capitol office'
        else:
            # FIX: `note` used to be left unbound for any other heading,
            # crashing with UnboundLocalError; fall back to the raw name.
            note = office_name

        # FIX: was a bare `except:`; only the [0] index can raise here.
        try:
            address = addr.xpath('a')[0].text_content()
            address = re.sub(r'\s{2,}', '\n', address)
            leg.add_contact_detail(type='address', value=address, note=note)
        except IndexError:
            self.warning("No address info found in `contactGroup`")

        # Phone/fax rows come as a "Phone:"/"Fax:" label followed by the
        # number in the next row. `expected` (renamed from `next`, which
        # shadowed the builtin) remembers which label we just saw.
        expected = None
        for phonerow in addr.xpath('./div/div'):
            phonerow = phonerow.text_content().strip()
            if phonerow == 'Phone:':
                expected = 'voice'
            elif phonerow == 'Fax:':
                expected = 'fax'
            elif expected == 'voice':
                leg.add_contact_detail(type='voice', value=phonerow,
                                       note=note)
                expected = None
            elif expected == 'fax':
                leg.add_contact_detail(type='fax', value=phonerow,
                                       note=note)
                expected = None
            else:
                self.warning('unknown phonerow %s', phonerow)

    return leg
def scrape_table(self, chamber, tbl):
    """Scrape one Maryland roster table and yield a Person per row.

    :param chamber: pupa chamber name used as ``primary_org``.
    :param tbl: lxml element for the roster <table>; first row is a header.
    """
    # skip header row
    for row in tbl.xpath('tr')[1:]:
        leg_a, district, _, _ = row.xpath('td')
        district = district.text
        name = leg_a.text_content().strip()
        if name.lower() == "to be announced":
            continue
        leg_url = leg_a.xpath('a/@href')[0]

        # fetch the member's detail page
        html = self.get(leg_url).text
        ldoc = lxml.html.fromstring(html)
        ldoc.make_links_absolute(leg_url)

        party = _get_table_item(ldoc, 'Party Affiliation:').text
        if party == 'Democrat':
            party = 'Democratic'

        addr_lines = _get_table_item(ldoc, 'Annapolis Address:').xpath('text()')
        address = []
        # FIX: `phone` was only bound inside the loop, so a page with no
        # "Phone:" line crashed with NameError below (the except clause
        # only caught IndexError). Default it to None.
        phone = None
        for line in addr_lines:
            if 'Phone:' not in line:
                address.append(line)
            else:
                phone = line
        address = '\n'.join(address)

        if phone is not None:
            try:
                phone = re.findall(r'Phone: (\d{3}-\d{3}-\d{4})', phone)[0]
            except IndexError:
                self.warning("Missing phone!")
                phone = None
        else:
            self.warning("Missing phone!")

        email = ldoc.xpath('//a[contains(@href, "mailto:")]/text()')
        if not email:
            email = None
        elif len(email) == 1:
            email = email[0].strip()
        else:
            raise AssertionError('Multiple email links found on page')

        # FIX: photo_url was previously only assigned when an image was
        # present, leaving it unbound (or stale from the previous row)
        # otherwise. Default to None explicitly.
        img_src = ldoc.xpath('//img[@class="sponimg"]/@src')
        photo_url = img_src[0] if img_src else None

        leg = Person(primary_org=chamber,
                     district=district,
                     name=name,
                     party=party,
                     image=photo_url)
        leg.add_source(url=leg_url)
        leg.add_link(url=leg_url)

        # type ['address', 'email', 'url', 'fax', 'text', 'voice',
        #       'video', 'pager', 'textphone']
        if address:
            leg.add_contact_detail(type='address', value=address or None,
                                   note='Capitol Office')
        if phone:
            leg.add_contact_detail(type='voice', value=phone,
                                   note='Capitol Office')
        if email:
            leg.add_contact_detail(type='email', value=email,
                                   note='Capitol Office')

        yield leg
def scrape_chamber(self, chamber):
    """Scrape one South Carolina chamber's roster.

    Yields each committee Organization the first time it is seen, then a
    Person (with memberships attached) for each legislator.
    """
    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/member.php?chamber=H'
    else:
        url = 'http://www.scstatehouse.gov/member.php?chamber=S'

    # committees already yielded, keyed by name, so each is emitted once
    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[contains(@href, "code=")]'):
        full_name = a.text
        leg_url = a.get('href')
        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if 'Resigned effective' in leg_html:
            self.info('Resigned')
            continue

        # The site has no semantic classes; blocks are located by their
        # exact inline style strings.
        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')
        if 'Republican' in party:
            party = 'Republican'
        elif 'Democrat' in party:
            party = 'Democratic'

        # District # - County - Map
        district = district.split()[1]
        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ''
        person = Person(name=full_name, district=district,
                        party=party, primary_org=chamber,
                        image=photo_url)

        # office address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left; width: 225px;'
                ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
            capitol_address = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

            phone = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            capitol_phone = phone.strip()

            if capitol_address:
                person.add_contact_detail(type='address',
                                          value=capitol_address,
                                          note='Capitol Office')
            if capitol_phone:
                person.add_contact_detail(type='voice',
                                          value=capitol_phone,
                                          note='Capitol Office')
        except IndexError:
            self.warning('no capitol address for {0}'.format(full_name))

        # home address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left;'
                ' width: 225px; margin: 10px 0 0 20px;"]')[0]
            addr = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

            phone = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            if addr:
                person.add_contact_detail(type='address',
                                          value=addr,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note='District Office')
        except IndexError:
            self.warning('no district address for {0}'.format(full_name))

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]:
            # Committee links ending in ", <abbrev>" carry a role suffix.
            if com.text.endswith(', '):
                committee, role = com.text_content().rsplit(', ', 1)

                # known roles
                role = {
                    'Treas.': 'treasurer',
                    'Secy.': 'secretary',
                    'Secy./Treas.': 'secretary/treasurer',
                    'V.C.': 'vice-chair',
                    '1st V.C.': 'first vice-chair',
                    'Co 1st V.C.': 'co-first vice-chair',
                    '2nd V.C.': 'second vice-chair',
                    '3rd V.C.': 'third vice-chair',
                    'Ex.Officio Member': 'ex-officio member',
                    'Chairman': 'chairman'
                }[role]
            else:
                committee = com.text
                role = 'member'

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification='committee',
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def _parse_person(self, row, chamber, seat_map):
    """Build a Person from one New Hampshire roster row.

    Returns None (after a warning) for rows whose district is "0".

    :param row: dict-like roster record.
    :param chamber: 'upper' or 'lower'.
    :param seat_map: maps house seat numbers to profile-photo ids.
    """
    # --- vitals ---------------------------------------------------------
    given = row["FirstName"]
    middle = row["MiddleName"]
    family = row["LastName"]
    full_name = re.sub(r"[\s]{2,}", " ",
                       "{} {} {}".format(given, middle, family))

    # House districts are county-qualified; Senate districts are numeric.
    if chamber == "lower":
        district = "{} {}".format(row["County"], int(row["District"])).strip()
    else:
        district = str(int(row["District"])).strip()

    party = self.party_map[row["party"].upper()]
    email = row["WorkEmail"]

    if district == "0":
        self.warning("Skipping {}, district is set to 0".format(full_name))
        return

    person = Person(primary_org=chamber, district=district,
                    name=full_name, party=party)
    person.extras = {
        "first_name": given,
        "middle_name": middle,
        "last_name": family,
    }

    if email:
        # A state-domain address counts as the Capitol office contact.
        note = ("Capitol" if email.endswith("@leg.state.nh.us")
                else "District") + " Office"
        person.add_contact_detail(type="email", value=email, note=note)

    # --- office contact information -------------------------------------
    district_address = "{}\n{}\n{}, {} {}".format(
        row["Address"], row["address2"], row["city"],
        row["State"], row["Zipcode"]).strip()
    phone = row["Phone"].strip() or None

    if district_address:
        note = ("Capitol" if chamber == "upper" else "District") + " Office"
        person.add_contact_detail(type="address", value=district_address,
                                  note=note)
    if phone:
        # The 271- exchange belongs to the State House switchboard.
        note = ("Capitol" if "271-" in phone else "District") + " Office"
        person.add_contact_detail(type="voice", value=phone, note=note)

    # --- portrait --------------------------------------------------------
    profile_url = None
    if chamber == "upper":
        profile_url = self.senate_profile_url.format(row["District"])
    elif chamber == "lower":
        try:
            profile_url = self.house_profile_url.format(
                seat_map[row["seatno"]])
        except KeyError:
            pass  # no seat mapping -> no photo

    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)

    return person
def scrape_legislators(self, url, chamber):
    """Scrape Montana legislators from a headerless CSV roster.

    :param url: URL of the CSV export.
    :param chamber: pupa chamber used as ``primary_org``.
    Yields a Person per member that has a detail page.
    """
    data = self.get(url).text
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()

    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)

    district_leg_urls = self._district_legislator_dict()

    # Toss the row headers.
    next(csv_parser)

    for entry in csv_parser:
        if not entry:
            continue

        # District, e.g. "HD 3" -> chamber prefix + number.
        district = entry['district']
        hd_or_sd, district = district.split()

        # Party.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]

        # Get full name properly capped.
        fullname = '%s %s' % (entry['first_name'].title(),
                              entry['last_name'].title())

        legislator = Person(name=fullname, primary_org=chamber,
                            district=district, party=party,
                            image=entry.get('photo_url', ''))
        legislator.add_source(url)

        # Get any info at the legislator's detail_url. Members without a
        # resolvable/parsable detail page are skipped entirely.
        deets = {}
        try:
            detail_url = district_leg_urls[hd_or_sd][district]
            deets = self._scrape_details(detail_url)
        except KeyError:
            self.warning(
                "Couldn't find legislator URL for district {} {},"
                " likely retired; skipping"
                .format(hd_or_sd, district)
            )
            continue
        except NoDetails:
            # detail_url is always bound here: the lookup above either
            # succeeded or raised KeyError first.
            self.logger.warning("No details found at %r" % detail_url)
            continue
        else:
            legislator.add_source(detail_url)
            legislator.add_link(detail_url)

        # Get the office.
        address = '\n'.join([
            entry['address'],
            '%s, %s %s' % (entry['city'].title(), entry['state'],
                           entry['zip'])
        ])

        legislator.add_contact_detail(type='address', value=address,
                                      note='District Office')

        phone = deets.get('phone')
        fax = deets.get('fax')
        email = deets.get('email')
        if phone:
            legislator.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
        if fax:
            legislator.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
        if email:
            legislator.add_contact_detail(type='email', value=email,
                                          note='District Office')

        yield legislator
def scrape_legislator(self, chamber, url):
    """Scrape one New Mexico member's ASP.NET profile page; yield a Person.

    Returns early (yielding nothing) for vacant seats.

    :param chamber: 'upper' or 'lower'.
    :param url: member profile URL.
    :raises ValueError: if the main legislator table is missing.
    """
    # Initialize default values for legislator attributes.
    full_name = None
    party = None
    photo_url = None
    email = None
    capitol_address = None
    capitol_phone = None
    district = None
    district_address = None
    district_phone = None

    # Title prefix is stripped off the displayed name below.
    if chamber == 'upper':
        title_prefix = 'Senator '
    elif chamber == 'lower':
        title_prefix = 'Representative '
    else:
        title_prefix = ''

    # Fallback area code when the page lists a bare local number.
    santa_fe_area_code = '(505)'

    page = self.lxmlize(url)

    info_node = self.get_node(
        page,
        '//table[@id="MainContent_formViewLegislator"]')
    if info_node is None:
        raise ValueError('Could not locate legislator data.')

    district_node = self.get_node(
        info_node,
        './/a[@id="MainContent_formViewLegislator_linkDistrict"]')

    if district_node is not None:
        district = district_node.text.strip()

    name_node = self.get_node(
        page,
        './/span[@id="MainContent_formViewLegislatorName'
        '_lblLegislatorName"]')

    if name_node is not None:
        if name_node.text.strip().endswith(' Vacant'):
            self.warning(
                'Found vacant seat for {} district {}; skipping'.format(
                    chamber, district))
            return

        # Name is rendered "Title First Last - (P)"; split off the party.
        n_head, n_sep, n_party = name_node.text.rpartition(' - ')

        full_name = re.sub(r'^{}'.format(title_prefix), '', n_head.strip())

        if '(D)' in n_party:
            party = 'Democratic'
        elif '(R)' in n_party:
            party = 'Republican'
        elif '(DTS)' in n_party:
            # decline to state = independent
            party = 'Independent'
        else:
            raise AssertionError('Unknown party {} for {}'.format(
                party, full_name))

    photo_node = self.get_node(
        info_node,
        './/img[@id="MainContent_formViewLegislator_imgLegislator"]')
    if photo_node is not None:
        photo_url = photo_node.get('src')

    email_node = self.get_node(
        info_node,
        './/a[@id="MainContent_formViewLegislator_linkEmail"]')
    if email_node is not None and email_node.text:
        email = email_node.text.strip()

    # Capitol office: the page only lists a room number.
    capitol_address_node = self.get_node(
        info_node,
        './/span[@id="MainContent_formViewLegislator_lblCapitolRoom"]')
    if capitol_address_node is not None:
        capitol_address_text = capitol_address_node.text
        if capitol_address_text is not None:
            capitol_address = 'Room {} State Capitol\nSanta Fe, NM 87501'\
                .format(capitol_address_text.strip())

    capitol_phone_node = self.get_node(
        info_node,
        './/span[@id="MainContent_formViewLegislator_lblCapitolPhone"]')
    if capitol_phone_node is not None:
        capitol_phone_text = capitol_phone_node.text
        if capitol_phone_text:
            capitol_phone_text = capitol_phone_text.strip()
            area_code, phone = extract_phone_number(capitol_phone_text)
            if phone:
                capitol_phone = '{} {}'.format(
                    area_code.strip() if area_code else santa_fe_area_code,
                    phone)

    district_address_node = self.get_node(
        info_node,
        './/span[@id="MainContent_formViewLegislator_lblAddress"]')
    if district_address_node is not None:
        district_address = '\n'.join(district_address_node.xpath('text()'))

    # District phone: prefer the office phone, fall back to home phone.
    office_phone_node = self.get_node(
        info_node,
        './/span[@id="MainContent_formViewLegislator_lblOfficePhone"]')
    home_phone_node = self.get_node(
        info_node,
        './/span[@id="MainContent_formViewLegislator_lblHomePhone"]')

    if office_phone_node is not None and office_phone_node.text:
        district_phone_text = office_phone_node.text
    elif home_phone_node is not None and home_phone_node.text:
        district_phone_text = home_phone_node.text
    else:
        district_phone_text = None
    if district_phone_text:
        d_area_code, d_phone = extract_phone_number(district_phone_text)
        district_phone = '{} {}'.format(d_area_code.strip(), d_phone)

    person = Person(name=full_name,
                    district=district,
                    party=party,
                    primary_org=chamber,
                    image=photo_url)

    if district_address:
        person.add_contact_detail(type='address',
                                  value=district_address,
                                  note='District Office')
    if district_phone:
        person.add_contact_detail(type='voice',
                                  value=district_phone,
                                  note='District Office')
    if capitol_address:
        person.add_contact_detail(type='address',
                                  value=capitol_address,
                                  note='Capitol Office')
    if capitol_phone:
        person.add_contact_detail(type='voice',
                                  value=capitol_phone,
                                  note='Capitol Office')
    if email:
        person.add_contact_detail(type='email',
                                  value=email,
                                  note='Capitol Office')

    person.add_link(url)
    person.add_source(url)

    yield person
def scrape(self):
    """Scrape Chicago City Council members and committees via Legistar.

    Merges the Legistar API office records with data scraped from the
    Legistar web UI, then yields each committee Organization followed by
    every Person.
    """
    body_types = self.body_types()

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    # API side: one list of office (term) records per member name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if 'VACAN' not in office['OfficeRecordFullName']:
            terms[office['OfficeRecordFullName'].strip()].append(office)

    # Web side: scrape the member list page for ward, photo, contact info.
    web_scraper = LegistarPersonScraper(None, None)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    web_info = {}

    for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
        web_info[member['Person Name']['label']] = member

    # Former members missing from the web list: stub records so the merge
    # below still finds a ward (every other field reads as None).
    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)

    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    members = {}

    for member, offices in terms.items():
        web = web_info[member]

        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term('Alderman',
                       'legislature',
                       district="Ward {}".format(int(web['Ward/Office'])),
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        if web['Photo']:
            p.image = web['Photo']

        # Map web-scrape field names to pupa contact-detail types.
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web["E-mail"] and web["E-mail"]["label"] \
                and web["E-mail"]["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        # NOTE(review): `term` here is the last office record from the
        # loop above — sources come from the member's most recent term.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    # Standing committees: yield each Organization and attach memberships
    # to the Person objects built above (creating stubs for non-members).
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL +
                         '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'
                         .format(**body),
                         note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    # Joint committees carry no memberships; just yield the bodies.
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(self.WEB_URL +
                         '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'
                         .format(**body),
                         note='web')

            yield o

    # Finally, yield every Person (members plus committee-only stubs).
    for p in members.values():
        yield p
def scrape_upper(self, chamber):
    """Yield a Person for each Michigan state senator.

    Parses the senate roster table, then follows each member's
    "Contact Me" page to find an email address.
    """
    url = 'http://www.senate.michigan.gov/senatorinfo_list.html'
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    for row in doc.xpath('//table[not(@class="calendar")]//tr')[3:]:
        # Only member rows have exactly seven cells.
        if len(row) != 7:
            continue
        # party, dist, member, office_phone, office_fax, office_loc
        party, dist, member, contact, phone, fax, loc = row.getchildren()
        if (party.text_content().strip() == "" or
                'Lieutenant Governor' in member.text_content()):
            continue

        party = abbr[party.text]
        district = dist.text_content().strip()
        name = member.text_content().strip()
        name = re.sub(r'\s+', " ", name)
        if name == 'Vacant':
            self.info('district %s is vacant', district)
            continue
        leg_url = member.xpath('a/@href')[0]

        office_phone = phone.text
        office_fax = fax.text
        office_loc = loc.text
        # Expand building abbreviations into full mailing addresses.
        office_loc = re.sub(
            ' Farnum Bldg',
            ' Farnum Office Building\n125 West Allegan Street\nLansing, MI 48933',
            office_loc
        )
        office_loc = re.sub(
            ' Capitol Bldg',
            ' State Capitol Building\nLansing, MI 48909',
            office_loc
        )

        # email addresses aren't on the list page anymore but they
        # are on the page linked off "Contact Me"

        # data has a typo in a row
        contact_url = [
            a for a in row.xpath(".//a")
            if a.text in ('Contact Me', 'Conact Me')][0].get('href')
        contact_html = self.get(contact_url).text
        contact_doc = lxml.html.fromstring(contact_html)

        email = None
        header_email = contact_doc.xpath("//a[@class='header_email']")
        if header_email:
            email = header_email[0].text
        else:
            # not using the most common template, but maybe they
            # dropped their email on the page somewhere
            links = contact_doc.xpath('//a') or []
            text_email = [a for a in links
                          if 'mailto:' in (a.get('href') or '')]
            if text_email:
                email = text_email[0].text

        person = Person(name=name, district=district, party=party,
                        primary_org='upper')
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office_loc,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=office_phone,
                                  note='Capitol Office')
        person.add_contact_detail(type='fax', value=office_fax,
                                  note='Capitol Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='Capitol Office')
        yield person
def scrape_member(self, chamber, member_url):
    """Yield a Person scraped from an Arkansas member detail page.

    ``chamber`` may be overridden by the title found on the page
    (handles Representative-Elect / Senator-Elect for the incoming class).
    """
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)

    name_and_party = root.xpath(
        'string(//div[@class="col-md-12"]/h1[1])').split()

    title = name_and_party[0]
    # Account for Representative-Elect and Senator-Elect, for incoming class
    if title.startswith("Representative"):
        chamber = "lower"
    elif title.startswith("Senator"):
        chamber = "upper"

    # Heading is "<Title> <Name words...> (<party letter>)".
    full_name = " ".join(name_and_party[1:-1])

    party = name_and_party[-1]
    if party == "(R)":
        party = "Republican"
    elif party == "(D)":
        party = "Democratic"
    elif party == "(G)":
        party = "Green"
    elif party == "(I)":
        party = "Independent"
    elif "-Elect" in title and not party.startswith("("):
        # Member-elect without a party suffix: the whole remainder of
        # the heading is the name.
        self.warning("Member-elect is currently missing a party")
        full_name = " ".join(name_and_party[1:])
        party = ""
    else:
        raise AssertionError("Unknown party ({0}) for {1}".format(
            party, full_name))

    try:
        img = root.xpath('//img[@class="SitePhotos MemberPhoto"]')[0]
        photo_url = "https://www.arkleg.state.ar.us" + img.attrib["src"]
    except IndexError:
        self.warning("No member photo found")
        photo_url = ""

    # Need to figure out a cleaner method for this later
    # info_box = root.xpath('string(//div[@id="bodyContent"]/div[2]/div[2])')
    try:
        district = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[3]/div[2])')
    except AttributeError:
        self.warning("Member has no district listed; skipping them")
        return

    person = Person(
        name=full_name,
        district=district,
        party=party,
        primary_org=chamber,
        image=photo_url,
    )
    person.add_link(member_url)
    person.add_source(member_url)

    # Phone/email are optional; blank strings are normalized to None.
    try:
        phone = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[1]/div[2]/a)'
        )
        if not phone.strip():
            raise AttributeError
    except AttributeError:
        phone = None
    try:
        email = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[2]/div[2]/a)'
        )
        if not email.strip():
            raise AttributeError
    except AttributeError:
        email = None
    address = root.xpath(
        'string(//div[@id="bodyContent"]/div[1]/div[1]/p/b)')
    person.add_contact_detail(type="address", value=address,
                              note="District Office")
    if phone is not None:
        person.add_contact_detail(type="voice", value=phone,
                                  note="District Office")
    if email is not None:
        person.add_contact_detail(type="email", value=email,
                                  note="District Office")

    # Record the member's occupation in extras when the label confirms
    # the cell really holds an occupation.
    try:
        occupation_check = root.xpath(
            'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[1]/b)'
        )
        if occupation_check == "Occupation:":
            person.extras["occupation"] = root.xpath(
                'string(//div[@id="bodyContent"]/div[2]/div[2]/div[5]/div[2])'
            )
        else:
            raise AttributeError
        if not person.extras["occupation"].strip():
            raise AttributeError
    except AttributeError:
        pass

    yield person
def scrape(self, session=None):
    """Yield a Person for each active New Jersey legislator, read from
    the legislature's Access database exports (Roster + LegBio tables).

    ``session``: session name; defaults to the latest known session.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]['name']
        self.info('no session specified, using %s', session)
    year_abr = session[0:4]

    self._init_mdb(year_abr)

    roster_csv = self.access_to_csv('Roster')
    bio_csv = self.access_to_csv('LegBio')

    # Map roster key -> photo URL from the bio table.
    photos = {}
    for rec in bio_csv:
        photos[rec['Roster Key']] = rec['URLPicture']

    for rec in roster_csv:
        first_name = rec["Firstname"]
        middle_name = rec["MidName"]
        last_name = rec["LastName"]
        suffix = rec["Suffix"]
        full_name = first_name + " " + middle_name + " " + last_name + " " + suffix
        # NOTE(review): this replace appears intended to collapse doubled
        # spaces left by empty middle names — confirm the literal args.
        full_name = full_name.replace(' ', ' ')
        # Drop the trailing character (the leftover space when suffix is
        # empty).  NOTE(review): if suffix is non-empty this trims its
        # last character — verify against the source data.
        full_name = full_name[0:len(full_name) - 1]

        district = str(int(rec["District"]))
        party = rec["Party"]
        if party == 'R':
            party = "Republican"
        elif party == 'D':
            party = "Democratic"
        else:
            party = party

        chamber = rec["House"]
        if chamber == 'A':
            chamber = "lower"
        elif chamber == 'S':
            chamber = "upper"

        leg_status = rec["LegStatus"]
        # skip Deceased/Retired members
        if leg_status != 'Active':
            continue

        phone = rec["Phone"] or None

        email = None
        if rec["Email"]:
            email = rec["Email"]

        # Email has been removed from the Access DB, but it's
        # still [email protected] and [email protected] - many
        # reps have these emails on their personal pages even if
        # they're gone from the DB file
        if not email:
            email = self._construct_email(chamber, last_name)

        try:
            photo_url = photos[rec['Roster Key']]
        except KeyError:
            photo_url = ''
            self.warning('no photo url for %s', rec['Roster Key'])

        url = ('http://www.njleg.state.nj.us/members/bio.asp?Leg=' +
               str(int(rec['Roster Key'])))
        address = '{0}\n{1}, {2} {3}'.format(rec['Address'], rec['City'],
                                             rec['State'], rec['Zipcode'])
        gender = {'M': 'Male', 'F': 'Female'}[rec['Sex']]

        person = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            party=party,
            image=photo_url,
            gender=gender,
        )
        person.add_link(url)
        person.add_source(url)
        person.add_source('http://www.njleg.state.nj.us/downloads.asp')
        person.add_contact_detail(type='address', value=address,
                                  note='District Office')
        if phone is not None:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if email is not None:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        yield person
def scrape_details(self, chamber, leg_name, leg_link, role):
    """Yield a Person built from a Mississippi member's XML detail record.

    ``leg_link`` is the member-detail path; it is empty for vacant
    seats (silently skipped) and any other empty value is an error.
    HTTP errors fetching the detail record are logged and swallowed.
    """
    if not leg_link:
        # Vacant post, likely:
        if "Vacancy" in leg_name:
            return
        raise Exception("leg_link is null. something went wrong")
    try:
        url = 'http://billstatus.ls.state.ms.us/members/%s' % leg_link
        url_root = os.path.dirname(url)
        details_page = self.get(url)
        root = lxml.etree.fromstring(details_page.content)
        party = root.xpath('string(//PARTY)')

        district = root.xpath('string(//DISTRICT)')
        photo = "%s/%s" % (url_root, root.xpath('string(//IMG_NAME)'))

        home_phone = root.xpath('string(//H_PHONE)')

        home_address = root.xpath('string(//H_ADDRESS)')
        home_address2 = root.xpath('string(//H_ADDRESS2)')
        home_city = root.xpath('string(//H_CITY)')
        home_zip = root.xpath('string(//H_ZIP)')

        # Assemble a multi-line home address only when both street and
        # city are present; the second street line is optional.
        home_address_total = ''
        if home_address and home_city:
            if not home_address2:
                home_address_total = "%s\n%s, MS %s" % (
                    home_address,
                    home_city,
                    home_zip
                )
            else:
                home_address_total = "%s\n%s\n%s, MS %s" % (
                    home_address,
                    home_address2,
                    home_city,
                    home_zip
                )

        # bis_phone = root.xpath('string(//B_PHONE)')
        capital_phone = root.xpath('string(//CAP_PHONE)')
        # other_phone = root.xpath('string(//OTH_PHONE)')
        org_info = root.xpath('string(//ORG_INFO)')
        email_name = root.xpath('string(//EMAIL_ADDRESS)').strip()
        cap_room = root.xpath('string(//CAP_ROOM)')

        # Special-case members whose feed omits a party.  The asserts
        # force removal of each special case once the feed is fixed.
        if leg_name in ('Lataisha Jackson', 'John G. Faulkner', 'Jeffery Harness'):
            assert not party, ("Remove special-casing for this Democrat without a "
                               "listed party: {}").format(leg_name)
            party = 'Democratic'
        elif leg_name in ('James W. Mathis', 'John Glen Corley'):
            assert not party, ("Remove special-casing for this Republican without"
                               " a listed party: {}").format(leg_name)
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        else:
            raise AssertionError(
                "A member with no identifiable party was found: {}".format(leg_name))

        leg = Person(primary_org=chamber,
                     district=district,
                     party=party,
                     image=photo,
                     name=leg_name,
                     role=role
                     )
        leg.extras['org_info'] = org_info
        leg.add_source(url)
        leg.add_link(url)

        # Bare usernames become official chamber addresses.
        if email_name != "":
            if "@" in email_name:
                email = email_name
            else:
                email = '%s@%s.ms.gov' % (email_name,
                                          {"upper": "senate", "lower": "house"}[chamber])
            leg.add_contact_detail(type='email', value=email,
                                   note='Capitol Office')
        if capital_phone != "":
            leg.add_contact_detail(type='voice', value=capital_phone,
                                   note='Capitol Office')
        if cap_room != "":
            address = "Room %s\n%s" % (cap_room, CAP_ADDRESS)
        else:
            address = CAP_ADDRESS
        leg.add_contact_detail(type='address', value=address,
                               note='Capitol Office')
        if home_phone != "":
            leg.add_contact_detail(type='voice', value=home_phone,
                                   note='District Office')
        if home_address_total != "":
            leg.add_contact_detail(type='address',
                                   value=home_address_total,
                                   note='District Office')
        yield leg
    except scrapelib.HTTPError as e:
        self.warning(str(e))
def scrape_senators(self):
    """Yield a Person for each Maine senator, joining the distributed
    roster spreadsheet (contact data) with the senate roster web page
    (profile link) and each profile page (photo)."""
    # Column indexes within the distributed .xlsx roster.
    mapping = {
        'district': 0,
        'first_name': 2,
        'middle_name': 3,
        'last_name': 4,
        'suffixes': 5,
        'party': 1,
        'street_addr': 6,
        'city': 7,
        'state': 8,
        'zip_code': 9,
        'phone1': 10,
        'phone2': 11,
        'email': 12
    }
    url = ('https://mainelegislature.org/uploads/visual_edit/'
           '128th-senate-members-for-distribution-1.xlsx')
    fn, result = self.urlretrieve(url)

    wb = xlrd.open_workbook(fn)
    sh = wb.sheet_by_index(0)

    LEGISLATOR_ROSTER_URL = \
        'https://mainelegislature.org/senate/128th-senators/9332'
    roster_doc = lxml.html.fromstring(self.get(LEGISLATOR_ROSTER_URL).text)
    roster_doc.make_links_absolute(LEGISLATOR_ROSTER_URL)

    for rownum in range(1, sh.nrows):
        # get fields out of mapping
        d = {}
        for field, col_num in mapping.items():
            try:
                d[field] = str(sh.cell(rownum, col_num).value).strip()
            except IndexError:
                # This col_num doesn't exist in the sheet.
                pass
        first_name = d['first_name']
        middle_name = d['middle_name']
        last_name = d['last_name']
        full_name = " ".join((first_name, middle_name, last_name))
        full_name = re.sub(r'\s+', ' ', full_name).strip()

        address = "{street_addr}\n{city}, ME {zip_code}".format(**d)

        phone = d['phone1']
        if not phone:
            phone = d['phone2']
        if not phone:
            phone = None

        # Presumably numeric cells come back like "12.0"; keep only the
        # integer part — confirm against the spreadsheet.
        district = d['district'].split('.')[0]
        party = d['party'].split('.')[0]

        # Determine legislator's URL to get their photo
        URL_XPATH = '//li/a[contains(text(), "District {:02d}")]/@href'.format(
            int(district))
        try:
            (leg_url, ) = roster_doc.xpath(URL_XPATH)
        except ValueError:
            self.warning('vacant seat %s', district)
            continue  # Seat is vacant

        html = self.get(leg_url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(leg_url)
        xpath = '//img[contains(@src, ".png")]/@src'
        photo_url = doc.xpath(xpath)
        if photo_url:
            photo_url = photo_url.pop()
        else:
            photo_url = None

        person = Person(
            name=full_name,
            district=district,
            image=photo_url,
            primary_org='upper',
            party=party,
        )
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.extras['first_name'] = first_name
        person.extras['middle_name'] = middle_name
        person.extras['last_name'] = last_name

        person.add_contact_detail(type='address', value=address,
                                  note='District Office')
        if phone:
            person.add_contact_detail(type='voice',
                                      value=clean_phone(phone),
                                      note='District Phone')
        person.add_contact_detail(type='email', value=d['email'],
                                  note='District Email')

        yield person
def scrape(self):
    """Yield Person objects for current and former Chicago aldermen and
    Organization objects for council committees, scraped from the
    Legistar web UI plus hard-coded former-member/committee tables."""
    committee_d = {}
    # Bodies that are not committees and get no Organization.
    non_committees = {'City Council', 'Office of the Mayor',
                      'Office of the City Clerk'}

    for councilman, committees in self.councilMembers() :
        # Skip rows with no ward/office assignment.
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward not in {"Mayor", "Clerk"} :
            ward = "Ward {}".format(int(ward))
            role = "Alderman"
            # NOTE(review): `role` is only assigned on this branch; a
            # Mayor/Clerk row would hit an undefined name below — confirm
            # such rows never reach this point.

        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature",
                   role=role)

        if councilman['Photo'] :
            p.image = councilman['Photo']

        # Web-page field name -> (contact-detail type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value= councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])
        p.add_source(councilman['Person Name']['url'], note='web')

        # Attach committee memberships, creating each committee's
        # Organization on first sight.
        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name' : 'Chicago City Council'})
                    o.add_source(committee['Legislative Body']['url'], note='web')
                    committee_d[committee_name] = o
                o.add_member(p, role=committee["Title"])

        yield p

    # Former aldermen come from a hard-coded table.
    for name, term in FORMER_ALDERMEN.items() :
        p = Person(name=name,
                   primary_org="legislature",
                   start_date=term['term'][0],
                   end_date=term['term'][1],
                   district="Ward {}".format(term['ward']),
                   role='Alderman')
        # Chandler served an additional, earlier term.
        if name == 'Chandler, Michael D.' :
            p.add_term('Alderman',
                       "legislature",
                       district="Ward {}".format(term['ward']),
                       start_date=datetime.date(2011, 5, 16),
                       end_date=datetime.date(2015, 5, 18))
        p.add_source(term['source'], note='web')
        yield p

    for o in committee_d.values() :
        yield o

    for committee_name in FORMER_COMMITTEES :
        o = Organization(committee_name,
                         classification='committee',
                         parent_id={'name' : 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx", note='web')
        yield o

    for joint_committee in JOINT_COMMITTEES :
        o = Organization(joint_committee,
                         classification='committee',
                         parent_id={'name' : 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx", note='web')
        yield o
def scrape_reps(self):
    """Yield a Person for each voting member of the Maine House.

    Walks the district roster page; for each district, parses the member
    link for name/party/district name and the member's personal page for
    a photo and contact details.

    Fix: when the roster text did not match INFO_RE, the fallback branch
    assigned the parsed district name to ``district`` (clobbering the
    loop variable) and left ``district_name`` undefined, raising
    NameError at ``person.extras['district_name']``.  It now assigns
    ``district_name``.
    """
    url = 'http://www.maine.gov/legis/house/dist_mem.htm'
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # These do not include the non-voting tribal representatives
    # They do not have numbered districts, and lack a good deal of
    # the standard profile information about representatives
    for district in page.xpath('//a[contains(@href, "dist_twn")]/..'):
        if "- Vacant" in district.text_content():
            self.warning("District is vacant: '{}'".format(
                district.text_content()))
            continue

        _, district_number = district.xpath('a[1]/@href')[0].split('#')

        leg_url = district.xpath('a[2]/@href')[0]
        leg_info = district.xpath('a[2]/text()')[0]

        # "Representative <name> (<party letter>-<district name>)"
        INFO_RE = r'''
                Representative\s
                (?P<member_name>.+?)
                \s\(
                (?P<party>[DRCUIG])
                -
                (?P<district_name>.+?)
                \)
                '''
        info_search = re.search(INFO_RE, leg_info, re.VERBOSE)
        if not info_search:
            # Some districts list a second, active link; use it and parse
            # the trailing "(<party>-<district>)" sibling text manually.
            leg_url = district.xpath('a[3]/@href')[0]
            member_name = district.xpath('a[3]/text()')[0]
            mem_info = district.xpath('a[3]/following-sibling::text()')
            party = _party_map[mem_info[0][2]]
            # BUGFIX: previously assigned to `district`, which left
            # `district_name` undefined below.
            district_name = mem_info[0].split('-')[1][:-1]
        else:
            member_name = info_search.group('member_name')
            party = _party_map[info_search.group('party')]
            district_name = info_search.group('district_name')

        # Get the photo url.
        html = self.get(leg_url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(leg_url)
        (photo_url, ) = doc.xpath('//img[contains(@src, ".jpg")]/@src')

        # Add contact information from personal page
        office_address = re.search(r'<B>Address: </B>(.+?)\n?</?P>',
                                   html, re.IGNORECASE).group(1)
        office_email = doc.xpath(
            '//a[starts-with(@href, "mailto:")]/text()')
        business_phone = re.search(
            r'<B>Business Telephone: </B>(.+?)</?P>', html, re.IGNORECASE)
        home_phone = re.search(
            r'<B>Home Telephone: </B>(.+?)</?P>', html, re.IGNORECASE)
        cell_phone = re.search(
            r'<B>Cell Telephone: </B>(.+?)</?P>', html, re.IGNORECASE)

        person = Person(
            name=member_name,
            district=district_number,
            primary_org='lower',
            party=party,
            image=photo_url,
        )
        person.extras['district_name'] = district_name

        person.add_link(leg_url)
        person.add_source(leg_url)

        if office_address:
            leg_address = office_address
            person.add_contact_detail(type='address', value=leg_address,
                                      note='District Office')
        else:
            # If no address for legislator, fall back to the party office.
            if party == 'Democratic':
                leg_address = (
                    'House Democratic Office, Room 333 State House, 2 State House Station, '
                    'Augusta, Maine 04333-0002')
                person.add_contact_detail(type='address', value=leg_address,
                                          note='Party Office')
            elif party == 'Republican':
                leg_address = (
                    'House GOP Office, Room 332 State House, 2 State House Station, '
                    'Augusta, Maine 04333-0002')
                person.add_contact_detail(type='address', value=leg_address,
                                          note='Party Office')

        if office_email:
            office_email = office_email[0]
            person.add_contact_detail(type='email', value=office_email,
                                      note='District Office')
        if business_phone:
            person.add_contact_detail(
                type='voice', value=clean_phone(business_phone.group(1)),
                note='Business Phone')
        if home_phone:
            person.add_contact_detail(
                type='voice', value=clean_phone(home_phone.group(1)),
                note='Home Phone')
        if cell_phone:
            person.add_contact_detail(
                type='voice', value=clean_phone(cell_phone.group(1)),
                note='Cell Phone')

        yield person
def scrape(self):
    """Yield Person objects for current and former Chicago aldermen and
    Organization objects for council committees, scraped from the
    Legistar web UI plus hard-coded former-member/committee tables."""
    committee_d = {}
    # Bodies that are not committees and get no Organization.
    non_committees = {
        'City Council', 'Office of the Mayor', 'Office of the City Clerk'
    }

    for councilman, committees in self.councilMembers():
        # Skip rows with no ward/office assignment.
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward not in {"Mayor", "Clerk"}:
            ward = "Ward {}".format(int(ward))
            role = "Alderman"
            # NOTE(review): `role` is only assigned on this branch; a
            # Mayor/Clerk row would hit an undefined name below — confirm
            # such rows never reach this point.

        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature",
                   role=role)

        if councilman['Photo']:
            p.image = councilman['Photo']

        # Web-page field name -> (contact-detail type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])
        p.add_source(councilman['Person Name']['url'], note='web')

        # Attach committee memberships, creating each committee's
        # Organization on first sight.
        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(
                        committee_name,
                        classification='committee',
                        parent_id={'name': 'Chicago City Council'})
                    o.add_source(committee['Legislative Body']['url'],
                                 note='web')
                    committee_d[committee_name] = o
                o.add_member(p, role=committee["Title"])

        yield p

    # Former aldermen come from a hard-coded table.
    for name, term in FORMER_ALDERMEN.items():
        p = Person(name=name,
                   primary_org="legislature",
                   start_date=term['term'][0],
                   end_date=term['term'][1],
                   district="Ward {}".format(term['ward']),
                   role='Alderman')
        # Chandler served an additional, earlier term.
        if name == 'Chandler, Michael D.':
            p.add_term('Alderman',
                       "legislature",
                       district="Ward {}".format(term['ward']),
                       start_date=datetime.date(2011, 5, 16),
                       end_date=datetime.date(2015, 5, 18))
        p.add_source(term['source'], note='web')
        yield p

    for o in committee_d.values():
        yield o

    for committee_name in FORMER_COMMITTEES:
        o = Organization(committee_name,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o

    for joint_committee in JOINT_COMMITTEES:
        o = Organization(joint_committee,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o
def scrape_chamber(self, chamber):
    """Yield a Person for each member of the given Arizona chamber
    ('lower' or 'upper'), scraped from the member roster page."""
    body = {"lower": "H", "upper": "S"}[chamber]
    url = "http://www.azleg.gov/MemberRoster/?body=" + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace("--!>", "-->")
    root = html.fromstring(page)

    path = "//table//tr"
    roster = root.xpath(path)[1:]
    for row in roster:
        position = ""
        name, district, party, email, room, phone, = row.xpath("td")

        if email.attrib.get("class") == "vacantmember":
            continue  # Skip any vacant members.

        link = name.xpath("string(a/@href)")
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            # Presumably a leadership title follows the member link in
            # the cell's tail text — confirm against the page markup.
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if "--" in name:
            name = name.split("--")[0].strip()

        linkpage = self.get(link).text
        linkpage = linkpage.replace("--!>", "-->")
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

        if len(photos) != 1:
            self.warning("no photo on " + link)
            photo_url = ""
        else:
            photo_url = photos[0].attrib["src"]

        district = district.text_content().strip()
        party = party.text_content().strip()
        email = email.text_content().strip()

        if email.startswith("Email: "):
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == "lower":
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = (address + "1700 West Washington\n Room " + room +
                   "\nPhoenix, AZ 85007")

        phone = phone.text_content().strip()
        # Prefix the Phoenix area code when missing.
        if "602" not in re.findall(r"(\d+)", phone):
            phone = "602-" + phone

        leg = Person(
            primary_org=chamber,
            image=photo_url,
            name=name,
            district=district,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address,
                               note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone,
                               note="Capitol Office")
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type="email", value=email)

        if position:
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #              district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def legislators(self, latest_only):
    """Build Person objects for Illinois legislators.

    Returns a dict mapping legislator name -> (Person, terms), where
    ``terms`` is a list of (chamber, district, term, party) tuples
    accumulated across membership rows.
    """
    legs = {}

    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath("td")
        district = district.text
        detail_url = name.xpath("a/@href")[0]
        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue
        party = {
            "D": "Democratic",
            "R": "Republican",
            "I": "Independent"
        }[party.text]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith("*"):
            name = name.strip("*")
            continue

        name = AKA.get(name, name)

        if name in legs:
            # Seen before: append this term to the existing record.
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        hotgarbage = ("Senate Biography Information for the 98th General "
                      "Assembly is not currently available.")
        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning("No legislator bio available for " + name)
            continue

        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url

        # Reset contact details before (re-)populating from this page.
        p.contact_details = []
        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            p.add_contact_detail(type="email",
                                 value=email[0].tail.strip(),
                                 note="capitol")

        offices = {
            "capitol": '//table[contains(string(), "Springfield Office")]',
            "district": '//table[contains(string(), "District Office")]',
        }

        for location, xpath in offices.items():
            table = leg_doc.xpath(xpath)
            if table:
                # NOTE(review): table[3] picks the fourth matching table;
                # this looks tied to the page's nesting — confirm if the
                # markup changes.
                for type, value in self._table_to_office(table[3]):
                    # Drop malformed phone/fax numbers.
                    if type in ("fax", "voice"
                                ) and not validate_phone_number(value):
                        continue

                    p.add_contact_detail(type=type, value=value,
                                         note=location)

    return legs
def _scrape_representative(self, url, parties):
    """
    Returns a Person object representing a member of the lower
    legislative chamber.

    ``parties`` maps district number (str) -> party name.

    Fix: for a vacant seat (the "member" name is just the district
    number) the generator previously yielded None and then FELL
    THROUGH, also constructing and yielding a bogus Person for the
    vacancy; it now returns immediately after yielding None.
    """
    # url = self.get(url).text.replace('<br>', '')
    member_page = self.lxmlize(url)

    photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
    if photo_url.endswith('/.jpg'):
        # A bare "/.jpg" path means there is no real photo.
        photo_url = None

    scraped_name, district_text = member_page.xpath(
        '//div[@class="member-info"]/h2')
    scraped_name = scraped_name.text_content().strip().replace('Rep. ', '')
    scraped_name = ' '.join(scraped_name.split())

    # Names are listed "Last, First"; flip to "First Last".
    name = ' '.join(scraped_name.split(', ')[::-1])

    district_text = district_text.text_content().strip()
    district = str(self.district_re.search(district_text).group(1))

    # Vacant house "members" are named after their district numbers:
    if re.match(r'^\d+$', scraped_name):
        yield None
        return  # BUGFIX: stop here; do not emit a Person for a vacancy

    party = parties[district]

    person = Person(name=name, district=district, party=party,
                    primary_org='lower')

    if photo_url is not None:
        person.image = photo_url

    person.add_link(url)
    person.add_source(url)

    def office_name(element):
        """Returns the office address type."""
        return element.xpath('preceding-sibling::h4[1]/text()')[0] \
            .rstrip(':')

    # One entry per office paragraph: display name, normalized type,
    # and the raw details text.
    offices_text = [{
        'name': office_name(p_tag),
        'type': office_name(p_tag).replace(' Address', '').lower(),
        'details': p_tag.text_content()
    } for p_tag in member_page.xpath(
        '//h4/following-sibling::p[@class="double-space"]')]

    for office_text in offices_text:
        details = office_text['details'].strip()

        # A few member pages have blank office listings:
        if details == '':
            continue

        # At the time of writing, this case of multiple district
        # offices occurs exactly once, for the representative at
        # District 43:
        if details.count('Office') > 1:
            district_offices = [
                district_office.strip()
                for district_office in re.findall(
                    r'(\w+ Office.+?(?=\w+ Office|$))', details,
                    flags=re.DOTALL)
            ]
            # Queue the split-out offices for later iterations.
            offices_text += [{
                'name': re.match(r'\w+ Office', office).group(),
                'type': 'district',
                'details': re.search(
                    r'(?<=Office).+(?=\w+ Office|$)?', office,
                    re.DOTALL).group()
            } for office in district_offices]

        match = self.address_re.search(details)
        if match is not None:
            address = re.sub(
                ' +$', '',
                match.group().replace('\r', '').replace('\n\n', '\n'),
                flags=re.MULTILINE)
        else:
            # No valid address found in the details.
            continue

        phone_number = extract_phone(details)
        fax_number = extract_fax(details)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note=office_text['name'])
        if phone_number:
            person.add_contact_detail(type='voice', value=phone_number,
                                      note=office_text['name'])
        if fax_number:
            person.add_contact_detail(type='fax', value=fax_number,
                                      note=office_text['name'])

    yield person