def scrape_committees(self, session):
    """Scrape OR committees for *session* from the OLIS API.

    Yields one Organization per committee, with members resolved to
    legislator names via index_legislators() where possible.
    """
    session_key = SESSION_KEYS[session]
    committees_response = self.api_client.get('committees', session=session_key)
    legislators = index_legislators(self, session_key)
    for committee in committees_response:
        org = Organization(
            chamber={'S': 'upper', 'H': 'lower', 'J': 'legislature'}[committee['HouseOfAction']],
            name=committee['CommitteeName'],
            classification='committee')
        org.add_source(
            'https://olis.leg.state.or.us/liz/{session}'
            '/Committees/{committee}/Overview'.format(session=session_key,
                                                      committee=committee['CommitteeName']))
        members_response = self.api_client.get('committee_members', session=session_key,
                                               committee=committee['CommitteeCode'])
        for member in members_response:
            try:
                member_name = legislators[member['LegislatorCode']]
            except KeyError:
                # Fix: Logger.warn() is deprecated; warning() is the supported API.
                logger.warning('Legislator {} not found in session {}'.format(
                    member['LegislatorCode'], session_key))
                # Fall back to the raw code so the membership is still recorded.
                member_name = member['LegislatorCode']
            org.add_member(member_name, role=member['Title'] if member['Title'] else '')
        yield org
def scrape_reps_comm(self):
    """Scrape Maine House committees from the static hsecoms.htm page.

    Yields one Organization per committee with its member roster.
    """
    # As of 1/27/15, the committee page has the wrong
    # session number (126th) at the top, but
    # has newly elected people, so we're rolling with it.
    url = 'http://legislature.maine.gov/house/hsecoms.htm'
    page = self.get(url).text
    root = lxml.html.fromstring(page)
    count = 0
    # Committee headings sit in odd-numbered <center> elements (1,3,5,...).
    for n in range(1, 12, 2):
        path = 'string(//body/center[%s]/h1/a)' % (n)
        comm_name = root.xpath(path)
        committee = Organization(chamber='lower', name=comm_name, classification='committee')
        count = count + 1
        # The Nth committee's members live in the Nth top-level <ul>.
        path2 = '/html/body/ul[%s]/li/a' % (count)
        for el in root.xpath(path2):
            rep = el.text
            if rep.find('(') != -1:
                mark = rep.find('(')
                # Drop the 15-char "Representative " prefix and the "(...)" tail.
                # NOTE(review): the 15-char slice assumes that exact prefix — confirm.
                rep = rep[15: mark].strip()
            if 'chair' in rep.lower():
                role = 'chair'
                # Strip the trailing ", Chair" marker from the name itself.
                rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
            else:
                role = 'member'
            committee.add_member(rep, role)
        committee.add_source(url)
        yield committee
def _scrape_upper_committee(self, name, url2):
    """Scrape an upper-chamber committee's assignments page.

    The first listed member is the Chairman, the second the Vice-Chairman;
    everyone else is a plain member. Yields the populated Organization.
    """
    cat = "Assignments.asp"
    url3 = url2.replace("default.asp", cat)

    committee = Organization(name, chamber="upper", classification="committee")
    committee.add_source(url2)

    page = self.lxmlize(url3)
    members = page.xpath('//table[@id="table38"]//font/a/b')
    # Fix: iterate by index instead of comparing against members[0]/members[1],
    # which raised IndexError on committees listing fewer than two members.
    for position, link in enumerate(members):
        if position == 0:
            role = "Chairman"
        elif position == 1:
            role = "Vice-Chairman"
        else:
            role = "member"
        member_name = link.xpath('string()')
        member_name = member_name.replace('Senator ', '')
        # Fix: raw string — '[\s]' is an invalid escape (SyntaxWarning on 3.12+).
        member_name = re.sub(r'\s{2,}', ' ', member_name).strip()
        committee.add_member(member_name, role)
    yield committee
def _scrape_lower_special_committees(self):
    """Scrape the LA House special-committees accordion page.

    Yields an Organization per committee, with normalized member names
    and roles attached.
    """
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    accordion = page.xpath('//div[@class="accordion"]')[0]
    for heading in accordion.xpath('./h3'):
        raw_name = heading.xpath('string()').strip()
        committee_name = self._normalize_committee_name(raw_name)

        # "Joint ..." committees span both chambers.
        if committee_name.startswith('Joint'):
            chamber = 'legislature'
        else:
            chamber = 'lower'

        committee = Organization(committee_name, chamber=chamber,
                                 classification='committee')
        committee.add_source(url)

        member_rows = heading.xpath('./following-sibling::div[@class="pane"]'
                                    '//tr[@class="linkStyle2"]')
        for member_row in member_rows:
            raw_member = member_row.xpath('normalize-space(string(./th[1]))')
            raw_role = member_row.xpath('normalize-space(string(./th[2]))')
            committee.add_member(self._normalize_member_name(raw_member),
                                 self._normalize_member_role(raw_role))

        yield committee
def test_committee_add_member_person():
    """Adding a Person to a committee records ids and role on the membership."""
    committee = Organization('Defense', classification='committee')
    person = Person('John Adams')
    committee.add_member(person, role='chairman')
    membership = committee._related[0]
    assert membership.person_id == person._id
    assert membership.organization_id == committee._id
    assert membership.role == 'chairman'
def scrape_chamber(self, chamber):
    """Scrape AZ committees for one chamber from the AZ API.

    Only the latest session is scraped. Yields populated Organizations.
    """
    session = self.latest_session()
    session_id = session_metadata.session_id_meta_data[session]
    body_code = 'S' if chamber == 'upper' else 'H'

    client = AZClient()
    committees = client.list_committees(
        sessionId=session_id,
        includeOnlyCommitteesWithAgendas='false',
        legislativeBody=body_code,
    )
    for entry in committees.json():
        org = Organization(name=entry['CommitteeName'], chamber=chamber,
                           classification='committee')
        details = client.get_standing_committee(
            sessionId=session_id,
            legislativeBody=body_code,
            committeeId=entry['CommitteeId'],
            includeMembers='true',
        )
        for member in details.json()[0]['Members']:
            full_name = u'{} {}'.format(member['FirstName'], member['LastName'])
            org.add_member(full_name, role=parse_role(member))
        org.add_source(details.url)
        org.add_source(committees.url)
        yield org
def handle_page(self):
    """Parse a FL Senate committee detail page into an Organization.

    Appropriations subcommittees get the Appropriations committee as their
    parent instead of a chamber of their own.
    """
    name = self.doc.xpath('//h2[@class="committeeName"]')[0].text
    if name.startswith('Appropriations Subcommittee'):
        name = name.replace('Appropriations ', '')
        # NOTE(review): parent classification here is 'upper', not 'committee' —
        # confirm downstream parent resolution expects a chamber classification.
        parent = {'name': 'Appropriations', 'classification': 'upper'}
        chamber = None
    else:
        if name.startswith('Committee on'):
            name = name.replace('Committee on ', '')
        parent = None
        chamber = 'upper'
    comm = Organization(name=name, classification="committee",
                        chamber=chamber,
                        parent_id=parent,
                        )
    # Officers are rendered as <dt>role:</dt><dd>name</dd> pairs.
    for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
        role = dt.text.replace(': ', '').strip().lower()
        member = dt.xpath('./following-sibling::dd')[0].text_content()
        member = self.clean_name(member)
        comm.add_member(member, role=role)
    # Plain members are listed separately as <li> items.
    for ul in self.doc.xpath('//div[@id="members"]/ul/li'):
        member = self.clean_name(ul.text_content())
        comm.add_member(member)
    comm.add_source(self.url)
    yield comm
def scrape_interim_committee(self, link, name):
    """Build and return a WV interim (joint) committee from its roster page."""
    url = re.sub(r'\s+', '', link.attrib['href'])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    if 'Subcommittee' in name:
        # Prefer a manually-mapped parent; otherwise derive it from the name.
        parent = WVCommitteeScraper.subcommittee_parent_map.get(name)
        if parent is None:
            parent = name.partition('Subcommittee')[0].strip()
        comm = Organization(
            name=name,
            classification='committee',
            parent_id={'name': parent, 'classification': 'joint'},
        )
    else:
        comm = Organization(name=name, classification='committee', chamber='joint')
    comm.add_source(url)

    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        member = member_link.text_content().strip()
        member = re.sub(r'^Delegate\s+', '', member)
        member = re.sub(r'^Senator\s+', '', member)
        role = member_link.getnext().text or 'member'
        comm.add_member(member, role.strip())

    return comm
def scrape_approp_subcommittees(self):
    """Yield MI Senate Appropriations subcommittees with member roles."""
    URL = 'http://www.senate.michigan.gov/committee/appropssubcommittee.html'
    doc = lxml.html.fromstring(self.get(URL).text)

    # Role markers appended to names: (C) chair, (VC) vice chair,
    # (MVC) minority vice chair. The trim count includes the leading space.
    suffixes = (
        ('(C)', 'chairman', 4),
        ('(VC)', 'vice chairman', 5),
        ('(MVC)', 'minority vice chairman', 6),
    )

    for heading in doc.xpath('//strong'):
        com = Organization(
            name=heading.text.strip(),
            parent_id=self._senate_appropriations,
            classification='committee',
        )
        com.add_source(URL)

        roster = heading.getnext().tail.replace('Senators', '').strip()
        for senator in re.split(', | and ', roster):
            role = 'member'
            for marker, title, trim in suffixes:
                if senator.endswith(marker):
                    role = title
                    senator = senator[:-trim]
                    break
            com.add_member(senator, role=role)

        yield com
def scrape_committee(self, chamber, name, url):
    """Scrape one committee page; yields the committee unless it is empty."""
    page = lxml.html.fromstring(self.get(url).text)

    # Joint committees override whatever chamber was passed in.
    if page.xpath("//h3[. = 'Joint Committee']"):
        chamber = 'joint'

    heading = page.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" in heading:
        # Subcommittees link to their parent by name/chamber reference.
        comm = Organization(
            name=heading,
            classification='committee',
            parent_id={'classification': chamber, 'name': name})
    else:
        comm = Organization(
            chamber=chamber, name=name, classification='committee')
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'member=')]"):
        member = link.text.strip()
        # The role label sits in the preceding table cell.
        mtype = link.xpath("string(../preceding-sibling::td[1])")
        mtype = mtype.strip(": \r\n\t").lower()
        comm.add_member(member, mtype)

    if comm._related:
        yield comm
    else:
        self.warning('not saving %s, appears to be empty' % name)
def scrape_committee(self, term, href, name):
    """Scrape a committee roster page, inferring the chamber from the URL."""
    page = lxml.html.fromstring(self.get(href).text)
    page.make_links_absolute(href)

    member_links = page.xpath("//div[@class='view-content']"
                              "//a[contains(@href, 'members')]")

    if '/joint/' in href:
        chamber = 'legislature'
    elif '/senate/' in href:
        chamber = 'upper'
    elif '/house/' in href:
        chamber = 'lower'
    else:
        # Interim committees and others caused duplicate-committee issues; skip.
        self.warning('Failed to identify chamber for {}; skipping'.format(href))
        return

    committee = Organization(name, chamber=chamber, classification='committee')
    role_map = {
        "Legislative Members": "member",
        "Chairman": "chair",
        "Vice Chairman": "member",
    }
    for anchor in member_links:
        person = anchor.text
        pane_title = anchor.xpath(
            "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
        role = role_map[pane_title]
        # Skip empty anchors and district-heading links.
        if person is None or person.startswith("District"):
            continue
        person = person.replace('Senator ', '').replace('Representative ', '')
        committee.add_member(person, role=role)

    committee.add_source(href)
    yield committee
def scrape_approp_subcommittees(self, url):
    """Yield Appropriations subcommittees parsed from the given page."""
    doc = lxml.html.fromstring(self.get(url).text)

    for heading in doc.xpath('//strong'):
        com = Organization(
            name=heading.text.strip(),
            parent_id={
                'name': 'Appropriations',
                'classification': 'committee',
            },
            classification='committee',
        )
        com.add_source(url)

        roster = heading.getnext().tail.replace('Senators', '').strip()
        for senator in re.split(', | and ', roster):
            # Trailing markers: (C) chair, (VC) vice chair, (MVC) minority vice chair.
            if senator.endswith('(C)'):
                senator, role = senator[:-4], 'chairman'
            elif senator.endswith('(VC)'):
                senator, role = senator[:-5], 'vice chairman'
            elif senator.endswith('(MVC)'):
                senator, role = senator[:-6], 'minority vice chairman'
            else:
                role = 'member'
            com.add_member(senator, role=role)

        yield com
def scrape(self):
    """Scrape MO campaign-finance committees via vowel-based candidate search.

    For each search-result row, yields a Person (the candidate) and an
    Organization (the committee) linked by a membership.
    """
    url = 'http://www.mec.mo.gov/EthicsWeb/CampaignFinance/CF11_SearchComm.aspx'

    for letter in ['a', 'e', 'i', 'o', 'u', 'y']:
        print("Searching '{}'".format(letter))
        initial = self.get(url).text
        parsed = lxml.html.fromstring(initial)

        page_n = 0
        # Seed the ASP.NET form state from the initial page load.
        data = get_form_data(parsed, first_time=True)
        data['ctl00$ContentPlaceHolder$txtCandLast'] = letter

        while True:
            page_n += 1
            print("Page: {}".format(page_n))

            # NOTE(review): the PageIndex cookie is always "1" even as page_n
            # advances — confirm paging actually works via the posted form data.
            r = self.post(url, data=data, cookies=dict(PageIndex=str(1)))
            output = lxml.html.fromstring(r.text)

            rows = output.cssselect('#ctl00_ContentPlaceHolder_grvSearch tr')
            # NOTE(review): `r` is re-bound from the HTTP response to each row.
            for r in rows:
                tds = r.cssselect('td')
                # Data rows have more than 3 cells; header/pager rows do not.
                if len(tds) > 3:
                    name = tds[2].text_content().strip()
                    _registrant = Person(
                        name=name,
                        source_identified=True
                    )
                    committee_name = tds[1].text_content().strip()
                    _office = Organization(
                        name=committee_name,
                        classification='Committee',
                        # parent_id=self.jurisdiction._state,
                        source_identified=True
                    )
                    _office.add_member(
                        _registrant,
                        role='committee candidate',
                        label='candidate for {n}'.format(n=_office.name),
                    )
                    yield _registrant
                    yield _office

            # Stop when the "next page" link is absent.
            if not output.xpath("//*[@id='ctl00_ContentPlaceHolder_grvSearch_ctl28_lbtnNextPage']"):
                print(output.xpath("//*[@id='ctl00_ContentPlaceHolder_grvSearch_ctl28_lbtnNextPage']"))
                break
            data = get_form_data(output)
def scrape_senate_committee(self, url):
    """Scrape one Senate committee page; yields the populated Organization."""
    doc = lxml.html.fromstring(self.get(url).text)

    headers = doc.xpath('(//div[@class="row"])[2]//h1')
    assert len(headers) == 1
    committee_name = ' '.join(headers[0].xpath('./text()'))
    committee_name = re.sub(r'\s+Committee.*$', '', committee_name)

    com = Organization(chamber='upper', name=committee_name,
                       classification='committee')

    for item in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
        label = item.text_content()
        # NOTE(review): stripping "Representative " in a Senate scraper —
        # preserved from the original; confirm against the live page.
        member_name = item.xpath('./a/text()')[0].replace('Representative ', '')
        if 'Committee Chair' in label:
            role = 'chair'
        elif 'Minority Vice' in label:
            role = 'minority vice chair'
        elif 'Vice' in label:
            role = 'majority vice chair'
        else:
            role = 'member'
        com.add_member(member_name, role=role)

    com.add_source(url)
    yield com
def scrape_committee(self, name, url, chamber):
    """Build and return a committee Organization from a member listing page."""
    org = Organization(name=name, chamber=chamber, classification='committee')
    org.add_source(url)

    doc = lxml.html.fromstring(self.get(url).text)
    for entry in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        entry = entry.replace('Representative ', '')
        entry = entry.replace('Senator ', '')
        entry = entry.strip()

        role = 'member'
        if ' (' in entry:
            # Names carry their role parenthetically, e.g. "Jane Doe (Chair)".
            entry, raw_role = entry.split(' (')
            if 'Vice-Chair' in raw_role:
                role = 'vice-chair'
            elif 'Co-Chair' in raw_role:
                role = 'co-chair'
            elif 'Chair' in raw_role:
                role = 'chair'
            else:
                raise Exception('unknown role: %s' % raw_role)

        org.add_member(entry, role)

    return org
def scrape_lower_committee(self, name, url):
    """Scrape a lower-chamber committee roster page.

    The first member link on the page is treated as the chair. Returns the
    populated Organization.
    """
    page = self.lxmlize(url)

    committee = Organization(chamber='lower', name=name,
                             classification="committee")
    committee.add_source(url)

    seen = set()
    member_links = self.get_nodes(
        page,
        '//div[@class="mod-inner"]//a[contains(@href, "mem")]')

    for member_link in member_links:
        member_name = member_link.text
        if member_name is None:
            continue

        # Figure out if this person is the chair.
        if member_link == member_links[0]:
            member_role = 'chair'
        else:
            member_role = 'member'

        # Fix: the original tested `name` (the committee name, never added to
        # `seen`) here, so duplicate members were never actually filtered.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
def scrape_page(self, link, chamber=None):
    """Scrape a committee roster page reached from the committee list."""
    page = self.lxmlize(link.attrib['href'])
    committee_name = link.text

    role_map = {
        "Chair": "chair",
        "Vice Chair": "vice-chair",
        "Vice-Chair": "vice-chair",
    }

    committee = Organization(committee_name, chamber=chamber,
                             classification='committee')
    committee.add_source(link.attrib['href'])

    roster = page.xpath('//div[@class="members"]/'
                        'div[@class="roster-item"]')
    for row in roster:
        details = row.xpath('.//div[@class="member-details"]')[0]
        # The page uses erratic whitespace inside names; collapse it.
        person = ' '.join(details.xpath('./h4')[0].text_content().strip().split())
        if not person:
            continue
        role_nodes = details.xpath('./span[@class="member-role"]')
        role = role_map[role_nodes[0].text] if role_nodes else 'member'
        committee.add_member(person, role=role)

    yield committee
def scrape_committees_pdf(self, year, chamber, filename, url):
    """Parse MT committee rosters out of a committee-list PDF.

    Yields an Organization per committee found, deriving member roles from
    the Ch/VCh/MVCh markers in each legislator's parenthetical.
    """
    # The 2015 House PDF is broken and needs manual text repair first.
    if chamber == 'lower' and year == '2015':
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type='text-nolayout').decode()

    # Repair committee names that the PDF extraction wrapped across lines.
    for hotgarbage, replacement in (
        (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
         'Judicial Branch, Law Enforcement, and Justice'),
        (r'Natural Resources and\s+Transportation',
         'Natural Resources and Transportation'),
        (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
         'Federal Relations, Energy, and Telecommunications')
    ):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

    comm = None
    for line in lines:
        # Replace Unicode variants with ASCII equivalents
        line = line.replace(" ", " ").replace("‐", "-")

        if 'Subcommittees' in line:
            self.warning("Currently, we're skipping subcommittees")
            # https://github.com/openstates/openstates/issues/2099
            break

        if is_committee_name(line):
            # New committee starts: emit the previous one if it got members.
            if comm and comm._related:
                yield comm
            committee = line.strip()
            comm = Organization(name=committee, chamber=chamber,
                                classification='committee')
            comm.add_source(url)
        elif is_legislator_name(line):
            name, party = line.rsplit('(', 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            # Role markers inside the parenthetical: Ch / VCh / MVCh.
            if re.search(' Ch', party):
                role = 'chair'
            elif ' VCh' in party:
                role = 'vice chair'
            elif ' MVCh' in party:
                role = 'minority vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)

    # Emit the final committee, if it has members.
    if comm._related:
        yield comm
def scrape(self, session=None):
    """Scrape WY committees for a session from the LSO JSON API.

    Yields Organization objects with members, WY-specific identifiers, and
    assorted extras from the committee-detail payloads.
    """
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # com_types = ['J', 'SE', 'O']
    # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'

    url = 'https://wyoleg.gov/LsoService/api/committees/{}'.format(session)
    response = self.get(url)
    coms_json = json.loads(response.content.decode('utf-8'))

    for row in coms_json:
        com_url = 'https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}'.format(
            session, row['ownerID'])

        com_response = self.get(com_url)
        com = json.loads(com_response.content.decode('utf-8'))

        # WY doesn't seem to have any house/senate only committees that I can find
        committee = Organization(
            name=com['commName'], chamber='legislature',
            classification='committee')

        for member in com['commMembers']:
            role = 'chairman' if member['chairman'] == 'Chairman' else 'member'
            committee.add_member(member['name'], role)

        # some WY committees have non-legislators appointed to the member by the Governor
        # but the formatting is super inconsistent
        if com['otherMembers']:
            committee.extras['other_members'] = com['otherMembers']

        # Stash WY-specific metadata for downstream consumers.
        committee.extras['wy_id'] = com['commID']
        committee.extras['wy_code'] = com['ownerID']
        committee.extras['wy_type_code'] = com['type']
        committee.extras['budget'] = com['budget']

        if com['statAuthority']:
            committee.extras['statutory_authority'] = com['statAuthority']

        if com['number']:
            committee.extras['seat_distribution'] = com['number']

        committee.add_identifier(
            scheme='WY Committee ID', identifier=str(com['commID']))
        committee.add_identifier(
            scheme='WY Committee Code', identifier=str(com['ownerID']))

        if com['description']:
            committee.add_identifier(
                scheme='Common Name', identifier=com['description'])

        source_url = 'http://wyoleg.gov/Committees/{}/{}'.format(
            session, com['ownerID'])

        committee.add_source(source_url)

        yield committee
def scrape_chamber(self, chamber=None):
    """Scrape Hawaii legislators for one chamber; yields Person objects.

    Committee Organizations are constructed per assignment but are neither
    sourced nor yielded here (see NOTE below).
    """
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        try:
            chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
        except KeyError:
            # Known-bad legislator pages are skipped with a loud console trail.
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(leg['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        person = Person(name=leg['name'], district=leg['district'],
                        party=leg['party'], primary_org=chamber,
                        image=leg['image'])

        for source in leg['source']:
            person.add_source(source)

        try:
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                # NOTE(review): these committee Organizations are created and
                # given a member but never sourced or yielded — confirm whether
                # they are persisted elsewhere or silently dropped.
                comm = Organization(name=ctty['name'],
                                    classification="committee",
                                    chamber=ctty_chamber)
                comm.add_member(person, role="member")
        except KeyError:
            self.warn("%s has no scraped Committees" % leg['name'])

        person.add_link(leg['homepage'])

        # Capitol-office contact details, added only when present.
        if leg['addr']:
            person.add_contact_detail(type='address', value=leg['addr'],
                                      note='Capitol Office')
        if leg['phone']:
            person.add_contact_detail(type='voice', value=leg['phone'],
                                      note='Capitol Office')
        if leg['email']:
            person.add_contact_detail(type='email', value=leg['email'],
                                      note='Capitol Office')
        if leg['fax']:
            person.add_contact_detail(type='fax', value=leg['fax'],
                                      note='Capitol Office')
        yield person
def scrape(self, session=None):
    """Scrape NJ committees and memberships from the Access database export."""
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]['name']
        self.info('no session specified, using %s', session)
    year_abr = session[0:4]

    self._init_mdb(year_abr)
    members_csv = self.access_to_csv('COMember')
    info_csv = self.access_to_csv('Committee')

    org_dictionary = {}

    # Committee info table: code -> Organization.
    for rec in info_csv:
        abrv = rec["Code"]
        comm_name = rec["Description"]

        # "A" prefix = Assembly (lower), "S" prefix = Senate (upper).
        if abrv[0] == "A":
            chamber = "lower"
        elif abrv[0] == "S":
            chamber = "upper"

        org = Organization(name=comm_name, chamber=chamber,
                           classification='committee')
        org.add_source('http://www.njleg.state.nj.us/downloads.asp')
        org_dictionary[abrv] = org

    # Membership table; position codes map to member roles.
    POSITIONS = {'C': 'chair', 'V': 'vice-chair', '': 'member'}
    for member_rec in members_csv:
        # assignment=P means they are active, assignment=R means removed
        if member_rec['Assignment_to_Committee'] != 'P':
            continue
        org = org_dictionary[member_rec["Code"]]
        role = POSITIONS[member_rec["Position_on_Committee"]]
        # Names are stored "Last, First"; flip to "First Last".
        member_name = ' '.join(member_rec["Member"].split(', ')[::-1])
        org.add_member(member_name, role=role)

    for org in org_dictionary.values():
        yield org
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape individual committee page and add members"""
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(link)

    # A "Committee" breadcrumb link marks the page as a subcommittee.
    is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))

    if is_subcommittee:
        # All TN subcommittees are just the name of the parent committee with " Subcommittee"
        # at the end
        parent_committee_name = re.sub(r'\s*(Study )?Subcommittee\s*', '', committee_name)
        comm = Organization(
            committee_name,
            classification='committee',
            parent_id=self.parents[parent_committee_name]
        )
    else:
        comm = Organization(
            committee_name,
            chamber=chamber,
            classification='committee',
        )
        # Remember this committee's id so its subcommittees can reference it
        # as parent later in the run.
        self.parents[committee_name] = comm._id

    OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                     'following-sibling::div/ul/li/a'
    MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                    'following-sibling::div/ul/li/a'
    # Officers first, then plain members; both share the same markup shape.
    for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)):
        # Names may be split across the anchor text and an inner <span>.
        member_name = ' '.join([
            x.strip() for x in
            a.xpath('text()') + a.xpath('span/text()')
            if x.strip()
        ])
        role = a.xpath('small')
        if role:
            role = role[0].xpath('text()')[0].strip()
        else:
            role = 'member'
        # Vacant seats are listed with a "(Vacant)" role marker; skip them.
        if '(Vacant)' in role:
            continue

        comm.add_member(member_name, role)

    comm.add_link(link)
    comm.add_source(link)

    return comm
def _scrape_standing_committees(self):
    """Scrapes the Standing Committees page of the Nebraska state legislature."""
    main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
    page = self.lxmlize(main_url)

    committee_nodes = self.get_nodes(
        page,
        '//a[@class="accordion-switch"][contains(text(), "Standing Committees")]'
        '/ancestor::div[@class="panel panel-leg"]//div[@class="list-group"]'
        '/a[@class="list-group-item"]')

    for committee_node in committee_nodes:
        committee_page_url = committee_node.attrib['href']
        committee_page = self.lxmlize(committee_page_url)

        name_text = self.get_node(
            committee_page,
            '//div[@class="container view-front"]/div[@class="row"]/'
            'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]')
        # Drop the trailing "Committee" word and rejoin the rest
        # (replaces the original manual concatenate-then-chop loop).
        committee_name = ' '.join(name_text.split()[:-1])

        org = Organization(name=committee_name, chamber='legislature',
                           classification='committee')

        members = self.get_nodes(
            committee_page,
            '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
            'div[@class="block-box"][1]/ul[@class="list-unstyled '
            'feature-content"]/li/a/text()[normalize-space()]')

        for member in members:
            # Fix: the original pattern r'\Sen\.\s+' matched ANY non-space
            # character followed by "en." — the literal "Sen." prefix is meant.
            member_name = re.sub(r'Sen\.\s+', '', member)
            member_name = re.sub(r', Chairperson', '', member_name).strip()
            if 'Chairperson' in member:
                member_role = 'Chairperson'
            else:
                member_role = 'member'
            org.add_member(member_name, member_role)

        org.add_source(main_url)
        org.add_source(committee_page_url)

        yield org
def scrape(self):
    """Scrape Toronto committees across sessions, merging per-term instances.

    Instances sharing a committee code are folded into one Organization;
    the first instance is canonical and later names become aliases.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue
        # When there are no meetings scheduled and was no way to deduce committee code.
        if not code:
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                # First instance is canonical: create the Organization here.
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            # TODO: Scrape non-councillor members
            meeting_id = self.referenceMeetingId(inst['code'], inst['term'])
            if meeting_id:
                seen_posts = []
                membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                for councillor in self.councillorMembers(membership_url):
                    o.add_member(councillor['name'], councillor['role'])
                    # Create one post per distinct role on this committee.
                    if councillor['role'] not in seen_posts:
                        o.add_post(
                            role=councillor['role'],
                            label=councillor['role'],
                            # TODO: More specific divisions for some committee?
                            division_id=self.jurisdiction.division_id,
                        )
                        seen_posts.append(councillor['role'])

            extras['tmmis_decision_body_ids'].append({inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
def scrape(self):
    """Scrape DC Council committees; committees with no members are skipped."""
    com_url = 'http://dccouncil.us/committees'
    doc = lxml.html.fromstring(self.get(com_url).text)
    doc.make_links_absolute(com_url)

    committee_links = set(
        doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

    for anchor in committee_links:
        url = anchor.attrib['href']
        name = anchor.text_content().strip()

        comm_page = lxml.html.fromstring(self.get(url).text)
        comm_page.make_links_absolute(url)

        # classify these as belonging to the legislature
        committee = Organization(name=name, classification='committee',
                                 chamber='legislature')

        summary_nodes = comm_page.xpath('//p[@class="page-summary"]')
        if summary_nodes:
            committee.extras['summary'] = \
                summary_nodes[0].text_content().strip()

        chair = comm_page.xpath(
            "//h4[text()='Chairperson']/following-sibling::p")
        chair_name = self.remove_title(chair[0].text_content().strip())
        committee.add_member(chair_name, role="chair")

        member_items = comm_page.xpath(
            "//h4[text()='Councilmembers']/following-sibling::ul")[0].xpath("./li")
        for item in member_items:
            member_name = self.remove_title(item.text_content().strip())
            if member_name != chair_name:
                committee.add_member(member_name)

        committee.add_source(url)
        committee.add_link(url, note='Official Website')

        if not committee._related:
            self.warning('empty committee: %s;', name)
        else:
            yield committee
def scrape_committee(self, chamber, url):
    """Return a committee Organization, or None when no members were found."""
    doc = lxml.html.fromstring(self.get(url).text)
    committee_name = doc.xpath('//title/text()')[0]

    com = Organization(committee_name, chamber=chamber,
                       classification='committee')
    com.add_source(url)

    profile_links = doc.xpath('//a[contains(@href, "/Legislators/Profile")]')
    for anchor in profile_links:
        # A sibling <span> carries the member's role, when present.
        spans = anchor.xpath('../span')
        role = spans[0].text.lower() if spans else 'member'
        com.add_member(anchor.text, role)

    # Implicitly returns None when the page listed no members.
    if profile_links:
        return com
def scrape(self, chamber=None):
    """Scrape UT committees from the legislature's JSON feeds."""
    committees_url = 'http://le.utah.gov/data/committees.json'
    committees = self.get(committees_url).json()['committees']

    people_url = 'http://le.utah.gov/data/legislators.json'
    people = self.get(people_url).json()['legislators']

    # The committee JSON only has legislator IDs, not names.
    ids_to_names = {person['id']: person['formatName'] for person in people}

    for committee in committees:
        name = committee['description']
        if name.endswith(' Committee'):
            name = name[:len(name) - len(' Committee')]
        elif name.endswith(' Subcommittee'):
            name = name[:len(name) - len(' Subcommittee')]

        if name.startswith('House '):
            name = name[len('House '):]
            chamber = 'lower'
        elif name.startswith('Senate '):
            name = name[len('Senate '):]
            chamber = 'upper'
        else:
            chamber = 'legislature'

        org = Organization(
            chamber=chamber,
            name=name,
            classification='committee'
        )
        org.add_source(committees_url)
        org.add_source(people_url)
        org.add_link(committee['link'])

        for member in committee['members']:
            try:
                member_name = ids_to_names[member['id']]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: " +
                    member['id']
                )
            # NOTE(review): after a KeyError this reuses the previous
            # member_name — preserved from the original; confirm whether a
            # `continue` was intended.
            org.add_member(member_name, role=member['position'])

        yield org
def scrape_lower_committee(self, link, name):
    """Build and return a WV House committee from its roster page."""
    url = re.sub(r'\s+', '', link.attrib['href'])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    comm = Organization(name=name, chamber='lower', classification='committee')
    comm.add_source(url)

    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        member = member_link.text_content().strip()
        member = re.sub(r'^Delegate\s+', '', member)
        # The element following the link holds the member's role, if any.
        role = member_link.getnext().text or 'member'
        comm.add_member(member, role.strip())

    return comm
def scrape_upper_committee(self, url):
    """Scrape a PR Senate committee page, translating roles to English."""
    doc = self.lxmlize(url)

    inner_content = self.get_node(doc, '//section[@class="inner-content"]')
    comm_name = self.get_node(inner_content, './/h2').text.strip()

    # Strip the Spanish "Committee ..." prefixes (order matters: the longer
    # "Especial" variants must be removed before the bare "Comisión ").
    for prefix in (u"Comisión de ", u"Comisión sobre ", u"Comisión para ",
                   u"Comisión Especial para el Estudio de ",
                   u"Comisión Especial para ", u"Comisión "):
        comm_name = comm_name.replace(prefix, "")
    comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
    comm_name = comm_name[0].upper() + comm_name[1:]

    comm = Organization(comm_name, chamber='upper', classification='committee')
    comm.add_source(url)

    for member in self.get_nodes(inner_content, './/li'):
        name_parts = member.text.split("-")
        name = name_parts[0].replace("Hon. ", "").strip()

        if len(name_parts) <= 1:
            comm.add_member(name)
            continue

        title = name_parts[1].strip()
        # Translate titles to English for parity with other states.
        if "President" in title:
            title = 'chairman'
        elif title.startswith("Vicepresident"):
            title = 'vicechairman'
        elif title.startswith("Secretari"):
            title = 'secretary'
        else:
            raise AssertionError("Unknown member type: {}".format(title))
        comm.add_member(name, title)

    yield comm
def scrape(self, session=None):
    """Scrape NJ committee rosters out of the legislature's Access DB dumps."""
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["name"]
        self.info("no session specified, using %s", session)
    year_abr = session[0:4]

    self._init_mdb(year_abr)
    members_csv = self.access_to_csv("COMember")
    info_csv = self.access_to_csv("Committee")

    orgs_by_code = {}

    # Committee info table: build one Organization per committee code.
    for row in info_csv:
        code = row["Code"]
        # "A" prefix = Assembly, "S" prefix = Senate.
        if code[0] == "A":
            chamber = "lower"
        elif code[0] == "S":
            chamber = "upper"
        org = Organization(
            name=row["Description"], chamber=chamber,
            classification="committee"
        )
        org.add_source("http://www.njleg.state.nj.us/downloads.asp")
        orgs_by_code[code] = org

    # Membership table; position codes map to roles.
    role_codes = {"C": "chair", "V": "vice-chair", "": "member"}
    for row in members_csv:
        # assignment=P means they are active, assignment=R means removed
        if row["Assignment_to_Committee"] == "P":
            org = orgs_by_code[row["Code"]]
            # Names are stored "Last, First"; flip the order.
            full_name = " ".join(row["Member"].split(", ")[::-1])
            org.add_member(full_name,
                           role=role_codes[row["Position_on_Committee"]])

    for org in orgs_by_code.values():
        yield org
def scrape(self, chamber=None):
    """Build UT committees from the legislature's committee/legislator JSON."""
    committees_url = 'http://le.utah.gov/data/committees.json'
    committees = self.get(committees_url).json()['committees']
    people_url = 'http://le.utah.gov/data/legislators.json'
    people = self.get(people_url).json()['legislators']

    # Committee JSON references members by ID only; map IDs to names once.
    ids_to_names = {p['id']: p['formatName'] for p in people}

    for committee in committees:
        name = committee['description']
        for suffix in (' Committee', ' Subcommittee'):
            if name.endswith(suffix):
                name = name[:-len(suffix)]
                break

        if name.startswith('House '):
            chamber = 'lower'
            name = name[len('House '):]
        elif name.startswith('Senate '):
            chamber = 'upper'
            name = name[len('Senate '):]
        else:
            chamber = 'legislature'

        org = Organization(chamber=chamber, name=name,
                           classification='committee')
        org.add_source(committees_url)
        org.add_source(people_url)
        org.add_link(committee['link'])

        for member in committee['members']:
            try:
                member_name = ids_to_names[member['id']]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: " +
                    member['id'])
            # NOTE(review): after a KeyError this reuses the previous
            # member_name — preserved from the original; confirm whether a
            # `continue` was intended.
            org.add_member(member_name, role=member['position'])

        yield org
def scrape(self):
    """Scrape DC Council committees (legacy dccouncil.us layout)."""
    com_url = 'http://dccouncil.us/committees'
    doc = lxml.html.fromstring(self.get(com_url).text)
    doc.make_links_absolute(com_url)

    # DC spells "committe(e)" two different ways in their HTML class names,
    # so both spellings must be queried.
    comms = doc.xpath('//li[contains(@class,"node_committee-on")]/a')
    comms += doc.xpath('//li[contains(@class,"node_committe-on")]/a')

    for anchor in comms:
        url = anchor.attrib['href']
        name = anchor.text_content().strip()

        comm_page = lxml.html.fromstring(self.get(url).text)
        comm_page.make_links_absolute(url)

        committee = Organization(name=name, classification='committee',
                                 chamber='upper')

        chair = comm_page.xpath(
            "//h3[text()='Committee Chair']/following-sibling::p")
        chair_name = chair[0].text_content().strip()
        committee.add_member(chair_name, role="chair")

        member_items = comm_page.xpath(
            "//h3[text()='Councilmembers']/following-sibling::ul")[0].xpath("./li")
        for item in member_items:
            member_name = item.text_content().strip()
            if member_name != chair_name:
                committee.add_member(member_name)

        committee.add_source(url)

        if not committee._related:
            self.warning('empty committee: %s;', name)
        else:
            yield committee
def scrape_lower_committee(self, name, parent, url):
    """Yield a lower (or joint) chamber committee scraped from *url*.

    When *parent* is given, the Organization is built for the parent with
    the subcommittee recorded via ``parent_id``.
    """
    page = self.curl_lxmlize(url)
    if 'Joint' in name or (parent and 'Joint' in parent):
        chamber = 'joint'
    else:
        chamber = 'lower'
    if parent:
        comm = Organization(name=parent, chamber=chamber,
                            classification='committee')
        subcomm = Organization(name=name, parent_id=comm,
                               classification='committee')
    else:
        comm = Organization(name=name, chamber=chamber,
                            classification='committee')
        # BUG FIX: subcomm was previously unbound in this branch, so the
        # empty-roster check below raised NameError for top-level committees.
        subcomm = None
    comm.add_source(url)
    xpath = "//a[contains(@href, 'District')]"
    for link in page.xpath(xpath):
        member = link.xpath('string()').strip()
        member = re.sub(r'\s+', ' ', member)
        if not member or member == 'House District Maps':
            continue
        match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member)
        member = match.group(4).strip()
        role = match.group(1) or 'member'
        comm.add_member(member, role.lower())
    if not comm._related:
        if subcomm is not None and subcomm.name == 'test':
            # Whoopsie, prod data.
            return
        raise Exception('no members for %s (%s)' % (
            comm.name, subcomm.name if subcomm is not None else name))
    yield comm
def scrape_house_committees(self):
    """Yield Michigan House committees from the committee drop-down list."""
    base_url = "http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey="
    html = self.get("http://house.mi.gov/mhrpublic/committee.aspx").text
    doc = lxml.html.fromstring(html)
    # get values out of drop down
    for opt in doc.xpath("//option"):
        name = opt.text
        # skip invalid choice
        if opt.text in ("Statutory Committees", "Select One"):
            continue
        if "have not been created" in opt.text:
            self.warning("no committees yet for the house")
            return
        com_url = base_url + opt.get("value")
        com_html = self.get(com_url).text
        cdoc = lxml.html.fromstring(com_html)
        com = Organization(chamber="lower", name=name,
                           classification="committee")
        com.add_source(com_url)
        # BUG FIX: removed a dead loop over '//a[starts-with(@id,
        # "memberLink")]' on the listing page — it only clobbered `name`
        # and had no other effect.
        members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
        for mem in members:
            name = mem.xpath("./a")
            if name:
                name = name[0].text.strip()
            else:
                # this is a blank row
                continue
            text = mem.xpath("./span")[0].text
            if "Committee Chair" in text:
                role = "chair"
            elif "Vice-Chair" in text:
                role = "vice chair"
            else:
                role = "member"
            com.add_member(name, role=role)
        yield com
def scrape_current(self, chamber):
    """Scrape current Kansas committees for one chamber via the KS API.

    Upper-chamber runs also collect "special" committees, which are
    emitted with a 'legislature' chamber. Committees with no members
    are logged and skipped.
    """
    if chamber == "upper":
        chambers = ["special_committees", "senate_committees"]
    else:
        chambers = ["house_committees"]
    committee_request = self.get(ksapi.url + "ctte/").text
    committee_json = json.loads(committee_request)
    for com_type in chambers:
        committees = committee_json["content"][com_type]
        for committee_data in committees:
            # set to joint if we are using the special_committees
            com_chamber = (
                "legislature" if com_type == "special_committees" else chamber
            )
            committee = Organization(
                committee_data["TITLE"],
                chamber=com_chamber,
                classification="committee",
            )
            com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
            try:
                detail_json = self.get(com_url).text
            except scrapelib.HTTPError:
                # best-effort: a failed detail fetch skips this committee
                self.warning("error fetching committee %s" % com_url)
                continue
            details = json.loads(detail_json)["content"]
            for chair in details["CHAIR"]:
                if chair.get("FULLNAME", None):
                    chair_name = chair["FULLNAME"]
                else:
                    # derive a display name from the KPID slug when the
                    # record lacks FULLNAME
                    chair_name = self.parse_kpid(chair["KPID"])
                    self.warning("no FULLNAME for %s", chair["KPID"])
                committee.add_member(chair_name, "chairman")
            for vicechair in details["VICECHAIR"]:
                committee.add_member(vicechair["FULLNAME"], "vice-chairman")
            for rankedmember in details["RMMEM"]:
                committee.add_member(rankedmember["FULLNAME"], "ranking member")
            for member in details["MEMBERS"]:
                committee.add_member(member["FULLNAME"])
            if not committee._related:
                self.warning(
                    "skipping blank committee %s" % committee_data["TITLE"]
                )
            else:
                committee.add_source(com_url)
                yield committee
def scrape_lower_committee(self, name, parent, url):
    """Yield a lower (or joint) chamber committee scraped from *url*.

    When *parent* is given, the Organization is built for the parent with
    the subcommittee recorded via ``parent_id``.
    """
    page = self.curl_lxmlize(url)
    if "Joint" in name or (parent and "Joint" in parent):
        chamber = "joint"
    else:
        chamber = "lower"
    if parent:
        comm = Organization(
            name=parent, chamber=chamber, classification="committee"
        )
        subcomm = Organization(
            name=name, parent_id=comm, classification="committee"
        )
    else:
        comm = Organization(name=name, chamber=chamber, classification="committee")
        # BUG FIX: subcomm was previously unbound in this branch, so the
        # empty-roster check below raised NameError for top-level committees.
        subcomm = None
    comm.add_source(url)
    xpath = "//a[contains(@href, 'District')]"
    for link in page.xpath(xpath):
        member = link.xpath("string()").strip()
        member = re.sub(r"\s+", " ", member)
        if not member or member == "House District Maps":
            continue
        match = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", member)
        member = match.group(4).strip()
        role = match.group(1) or "member"
        member = member.replace("Representative ", "")
        comm.add_member(member, role.lower())
    if not comm._related:
        if subcomm is not None and subcomm.name == "test":
            # Whoopsie, prod data.
            return
        raise Exception("no members for %s (%s)" % (
            comm.name, subcomm.name if subcomm is not None else name))
    yield comm
def scrape_current(self, chamber):
    """Scrape current Kansas committees for one chamber via the KS API.

    Upper-chamber runs also collect 'special' committees, which are
    emitted with a 'legislature' chamber. Committees with no members
    are logged and skipped.
    """
    if chamber == 'upper':
        chambers = ['special_committees', 'senate_committees']
    else:
        chambers = ['house_committees']
    committee_request = self.get(ksapi.url + 'ctte/').text
    committee_json = json.loads(committee_request)
    for com_type in chambers:
        committees = committee_json['content'][com_type]
        for committee_data in committees:
            # set to joint if we are using the special_committees
            com_chamber = ('legislature' if com_type == 'special_committees'
                           else chamber)
            committee = Organization(
                committee_data['TITLE'],
                chamber=com_chamber,
                classification='committee',
            )
            com_url = ksapi.url + 'ctte/%s/' % committee_data['KPID']
            try:
                detail_json = self.get(com_url).text
            except scrapelib.HTTPError:
                # best-effort: a failed detail fetch skips this committee
                self.warning("error fetching committee %s" % com_url)
                continue
            details = json.loads(detail_json)['content']
            for chair in details['CHAIR']:
                if chair.get('FULLNAME', None):
                    chair_name = chair['FULLNAME']
                else:
                    # derive a display name from the KPID slug when the
                    # record lacks FULLNAME
                    chair_name = self.parse_kpid(chair['KPID'])
                    self.warning('no FULLNAME for %s', chair['KPID'])
                committee.add_member(chair_name, 'chairman')
            for vicechair in details['VICECHAIR']:
                committee.add_member(vicechair['FULLNAME'], 'vice-chairman')
            for rankedmember in details['RMMEM']:
                committee.add_member(rankedmember['FULLNAME'], 'ranking member')
            for member in details['MEMBERS']:
                committee.add_member(member['FULLNAME'])
            if not committee._related:
                self.warning('skipping blank committee %s' %
                             committee_data['TITLE'])
            else:
                committee.add_source(com_url)
                yield committee
def scrape(self, session=None):
    """Yield committees for a legislative session.

    For each committee listing, yields one Organization per chamber table
    plus a combined 'joint' Organization containing every member.
    """
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)
    list_url = self.urls["list"] % (session, )
    committees = {}
    page = self.get(list_url).text
    page = lxml.html.fromstring(page)
    for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
        committees[el.text.strip()] = el.get("href")
    for c in committees:
        self.info(c)
        detail_url = self.urls["detail"] % (committees[c], )
        page = self.get(detail_url).text
        page = lxml.html.fromstring(page)
        # Strip a leading "NN-" ordering prefix from the committee name.
        # (FIX: raw string — '\d' in a plain literal is an invalid escape
        # sequence and warns on modern Python.)
        if re.match(r'\d{1,2}-', c):
            c = c.split('-', 1)[1]
        jcomm = Organization(name=c.strip(), chamber='joint',
                             classification='committee')
        for table in page.xpath(
                ".//table[contains(@id, 'CommitteeMembers')]"):
            rows = table.xpath(".//tr")
            # header row's first cell says "Senator"/"Delegate" etc.
            chamber = rows[0].xpath('.//td')[0].text_content().strip()
            chamber = 'upper' if chamber == 'Senator' else 'lower'
            comm = Organization(name=c.strip(), chamber=chamber,
                                classification='committee')
            for row in rows[1:]:
                tds = row.xpath('.//td')
                name = tds[0].text_content().strip()
                role = ('chairman'
                        if tds[3].text_content().strip() == 'Chairman'
                        else 'member')
                comm.add_member(name, role)
                jcomm.add_member(name, role)
            comm.add_source(detail_url)
            yield comm
        jcomm.add_source(detail_url)
        yield jcomm
def scrape_upper_committee(self, link, name):
    """Build and return an upper-chamber committee from its roster page."""
    url = re.sub(r"\s+", "", link.attrib["href"])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    comm = Organization(name=name, chamber="upper", classification="committee")
    comm.add_source(url)

    # Every member profile link carries a "?member=" query string.
    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        member = member_link.text_content().strip()
        member = re.sub(r"^Delegate\s+", "", member)
        # the element immediately after the link holds the role, if any
        role = member_link.getnext().text or "member"
        comm.add_member(member, role.strip())
    return comm
def _scrape_select_special_committees(self):
    """Scrapes the Select and Special Committees page of the Nebraska state
    legislature."""
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    page = self.lxmlize(main_url)
    committee_nodes = self.get_nodes(
        page,
        '//a[contains(@class, "accordion-switch")]'
        '/ancestor::div[@class="panel panel-leg"]')
    for committee_node in committee_nodes:
        committee_name = self.get_node(
            committee_node,
            './/h2[@class="panel-title"]/text()[normalize-space()]')
        if committee_name is None:
            # some panels wrap the title in an extra <a>
            committee_name = self.get_node(
                committee_node,
                './/h2[@class="panel-title"]/a/text()[normalize-space()]')
        org = Organization(name=committee_name, chamber='legislature',
                           classification='committee')
        org.add_source(main_url)
        members = self.get_nodes(
            committee_node,
            './/a[@class="list-group-item"]'
            '/text()[normalize-space()]')
        for member in members:
            # BUG FIX: pattern was r'\Sen\.\s+' — '\S' matches ANY
            # non-space character, so it could also consume the final
            # letter of a preceding word. Match the literal "Sen." prefix.
            member_name = re.sub(r'Sen\.\s+', '', member)
            member_name = re.sub(r', Chairperson', '', member_name).strip()
            if 'Chairperson' in member:
                member_role = 'Chairperson'
            else:
                member_role = 'member'
            org.add_member(member_name, member_role)
        if not org._related:
            self.warning('No members found in {} committee.'.format(
                org.name))
        else:
            yield org
def scrape(self, chamber=None):
    """Scrape Utah committees from the state's JSON feeds.

    Yields one Organization per committee; chamber is inferred from a
    leading "House "/"Senate " prefix on the committee description.
    """
    committees_url = "http://le.utah.gov/data/committees.json"
    committees = self.get(committees_url).json()["committees"]
    people_url = "http://le.utah.gov/data/legislators.json"
    people = self.get(people_url).json()["legislators"]
    # The committee JSON only has legislator IDs, not names
    ids_to_names = {}
    for person in people:
        ids_to_names[person["id"]] = person["formatName"]
    for committee in committees:
        name = committee["description"]
        if name.endswith(" Committee"):
            name = name[: len(name) - len(" Committee")]
        elif name.endswith(" Subcommittee"):
            name = name[: len(name) - len(" Subcommittee")]
        if name.startswith("House "):
            name = name[len("House ") :]
            chamber = "lower"
        elif name.startswith("Senate "):
            name = name[len("Senate ") :]
            chamber = "upper"
        else:
            chamber = "legislature"
        c = Organization(chamber=chamber, name=name, classification="committee")
        c.add_source(committees_url)
        c.add_source(people_url)
        c.add_link(committee["link"])
        for member in committee["members"]:
            try:
                member_name = ids_to_names[member["id"]]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: " + member["id"]
                )
                # BUG FIX: previously fell through with member_name unbound
                # (NameError on first miss, stale name afterwards); skip the
                # unknown member instead.
                continue
            c.add_member(member_name, role=member["position"])
        yield c
def scrape_committee(self, comm_num):
    """Scrape one committee page by number.

    Yields an Organization with chair/member posts, memberships, and a
    description pulled from the page's 'About' section.
    """
    url = self.committee_url(comm_num)
    page = self.lxmlize(url)
    # get title
    comm_name = page.xpath("//h1/text()")[0]
    # create object
    comm = Organization(name=comm_name, classification="committee",
                        chamber="legislature")
    comm.add_source(url=url)
    # add posts
    comm.add_post(label="chair", role="chair")
    # FIXME do we need a separate post for each member?
    # FIXME is member an appropriate name?
    comm.add_post(label="member", role="member")
    # helper for finding other nodes
    landmark_node = page.xpath("//h2[text()='Committee Members']")[0]
    # add memberships
    member_names = landmark_node.xpath(
        "following-sibling::div/ul/li/a/text()")
    fl_names = [HumanName.name_firstandlast(name) for name in member_names]
    # FIX: removed a debug print() of the scrubbed names — scrapers should
    # not write diagnostics to stdout; use self.info/logging if needed.
    chair_name, *other_names = fl_names
    # NOTE(review): 'Lewis Reed' is deliberately excluded from membership —
    # presumably an ex-officio listing; confirm the intent.
    if chair_name not in {'Lewis Reed'}:
        comm.add_member(chair_name, role="chair")
    for name in other_names:
        if name not in {'Lewis Reed'}:
            comm.add_member(name, role="member")
    # add description
    about_node = page.xpath("//h2[text()='About']")[0]
    (description, ) = about_node.xpath(
        "parent::div//div[@class='content-block']/p[2]/text()")
    description = description.strip()
    comm.extras = {"description": description}
    yield comm
def scrape_house_committees(self):
    """Yield lower-chamber committees from the MN House membership list."""
    listing_url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'
    listing = lxml.html.fromstring(self.get(listing_url).text)
    for heading in listing.xpath('//h2[@class="commhighlight"]'):
        roster_url = heading.xpath(
            'following-sibling::p[1]/a[text()="Members"]/@href')[0]
        committee = Organization(heading.text, chamber='lower',
                                 classification='committee')
        committee.add_source(roster_url)
        roster = lxml.html.fromstring(self.get(roster_url).text)
        # Each legislator sits in their own table; the first row's second
        # column holds all the info we need.
        for cell in roster.xpath('//table/tr[1]/td[2]/p/b[1]'):
            # the name is the tail string after the last child element
            full = cell.text_content()
            leading = cell.text
            if leading and full != leading:
                full = full.replace(leading, '')
            # a role, when present, lives inside a nested <b> tag;
            # remove it from the name once extracted
            nested = cell.xpath('b/*/text()')
            if nested:
                role = nested[0]
                full = full.replace(role, '')
            else:
                role = 'member'
            committee.add_member(full.split(' (')[0], role)
        yield committee
def scrape_senate_committee(self, url):
    """Yield an upper-chamber committee scraped from its bio page."""
    doc = lxml.html.fromstring(self.get(url).text)
    com_name = doc.xpath('//a[contains(@href, "committee_bio")]/text()')[0]
    # an <h4> committee_bio link marks this page as a subcommittee
    parent = doc.xpath('//h4//a[contains(@href, "committee_bio")]/text()')
    if parent:
        self.log('%s is subcommittee of %s', com_name, parent[0])
        com = Organization(
            com_name,
            chamber='upper',
            classification='committee',
            parent_id={'name': parent[0], 'classification': 'upper'},
        )
    else:
        com = Organization(com_name, chamber='upper',
                           classification='committee')
    for anchor in doc.xpath(
            '//div[@id="members"]//a[contains(@href, "member_bio")]'):
        member = anchor.text_content().strip()
        if not member:
            continue
        # a preceding <b> label distinguishes leadership roles
        label = anchor.xpath('.//preceding-sibling::b/text()')
        if not label:
            position = 'member'
        elif label[0] == 'Chair:':
            position = 'chair'
        elif label[0] == 'Vice Chair:':
            position = 'vice chair'
        elif label[0] == 'Ranking Minority Member:':
            position = 'ranking minority member'
        else:
            raise ValueError('unknown position: %s' % label[0])
        # drop any trailing " (party/district)" suffix
        com.add_member(member.split(' (')[0], position)
    com.add_source(url)
    yield com
def _scrape_lower_standing_committee(self, committee_name, url):
    """Yield a lower-chamber standing committee with its roster."""
    doc = self.lxmlize(url)
    org = Organization(committee_name, chamber="lower",
                       classification="committee")
    org.add_source(url)
    member_rows = doc.xpath(
        '//table[@id="body_ListView1_itemPlaceholderContainer"]'
        '/tr[@class="linkStyle2"]')
    for member_row in member_rows:
        # first cell links to the member, second carries the role
        raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
        raw_role = member_row.xpath('normalize-space(string(./td[2]))')
        org.add_member(self._normalize_member_name(raw_name),
                       self._normalize_member_role(raw_role))
    yield org
def scrape_lower_committee(self, name, url):
    """Return a lower-chamber committee with de-duplicated members."""
    page = self.lxmlize(url)
    committee = Organization(chamber='lower', name=name,
                             classification="committee")
    committee.add_source(url)
    seen = set()
    member_links = self.get_nodes(
        page,
        '//div[@class="commlinks"]//a[contains(@href, "mem")]')
    for member_link in member_links:
        member_name = None
        member_role = None
        member_text = member_link.text
        if member_text is not None:
            member = member_text.strip()
            member = re.sub(r'\s+', ' ', member)
            member_name, member_role = self._parse_name(member)
        if member_name is None:
            continue
        # Figure out if this person is the chair.
        role_type = self.get_node(
            member_link,
            '../../preceding-sibling::div[1]/text()')
        if role_type in (['Chair'], ['Co-Chair']):
            member_role = 'chair'
        else:
            member_role = 'member'
        # BUG FIX: the dedup guard previously tested the *committee* name
        # ("if name not in seen"), which is never added to `seen`, so the
        # check never fired and members could be added twice.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)
    return committee
def scrape_committee(self, term, href, name):
    """Yield one committee, inferring its chamber from the URL path."""
    doc = lxml.html.fromstring(self.get(href).text)
    doc.make_links_absolute(href)
    member_links = doc.xpath("//div[@class='view-content']"
                             "//a[contains(@href, 'members')]")
    if '/joint/' in href:
        chamber = 'legislature'
    elif '/senate/' in href:
        chamber = 'upper'
    elif '/house/' in href:
        chamber = 'lower'
    else:
        # interim committees and others were causing duplicate committee
        # issues, skipping
        self.warning(
            'Failed to identify chamber for {}; skipping'.format(href))
        return
    cttie = Organization(name, chamber=chamber, classification='committee')
    heading_to_role = {
        "Legislative Members": "member",
        "Chairman": "chair",
        "Vice Chairman": "member",
    }
    for anchor in member_links:
        person = anchor.text
        heading = anchor.xpath(
            "ancestor::div/h2[@class='pane-title']/text()")[0].strip()
        role = heading_to_role[heading]
        # skip placeholder/district links
        if person is None or person.startswith("District"):
            continue
        person = person.replace('Senator ', '').replace('Representative ', '')
        cttie.add_member(person, role=role)
    cttie.add_source(href)
    yield cttie
def scrape_comm(self, url, chamber):
    """Yield committees for one chamber from the JSON committee listing."""
    rows = self.post(url).json()['Data']
    for row in rows:
        comm_name = row['CommitteeName']
        committee = Organization(name=comm_name, chamber=chamber,
                                 classification='committee')
        chair_man = str(row['ChairName'])
        vice_chair = str(row['ViceChairName'])
        comm_id = row['CommitteeId']
        comm_url = self.get_comm_url(chamber, comm_id, comm_name)
        members = self.scrape_member_info(comm_url)
        # the feed uses the literal string 'None' for vacant seats
        if vice_chair != 'None':
            committee.add_member(vice_chair, role='Vice-Chair')
        if chair_man != 'None':
            committee.add_member(chair_man, role='Chairman')
        for member in members:
            # vice_chair and chair_man already added.
            if chair_man not in member and vice_chair not in member:
                collapsed = " ".join(member.split())
                if collapsed:
                    committee.add_member(collapsed)
        committee.add_source(comm_url)
        committee.add_source(url)
        yield committee
def scrape_comm(self, chamber):
    """Yield committees for one chamber from the MS XML membership feed."""
    url = "http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml" % chamber
    response = self.get(url)
    root = lxml.etree.fromstring(response.content)
    # the feed is keyed by 'h'/'s'; translate to openstates chambers
    chamber = "lower" if chamber == "h" else "upper"
    for node in root.xpath("//COMMITTEE"):
        org = Organization(node.xpath("string(NAME)"), chamber=chamber,
                           classification="committee")
        chair = node.xpath("string(CHAIR)").replace(", Chairman", "")
        if len(chair) > 0:
            org.add_member(chair, role="Chairman")
        vice = node.xpath("string(VICE_CHAIR)").replace(", Vice-Chairman", "")
        if len(vice) > 0:
            org.add_member(vice, role="Vice-Chairman")
        roster = node.xpath("string(MEMBERS)").split(";")
        if "" in roster:
            roster.remove("")
        for person in roster:
            org.add_member(person.strip())
        org.add_source(url)
        yield org
def scrape(self):
    """Yield DC Council committees, classified under 'legislature'."""
    listing_url = 'http://dccouncil.us/committees'
    listing = lxml.html.fromstring(self.get(listing_url).text)
    listing.make_links_absolute(listing_url)
    # dc spelled committe(e) two different ways
    # IN THEIR HTML CLASS NAMES!
    anchors = listing.xpath('//li[contains(@class,"node_committee-on")]/a')
    anchors += listing.xpath('//li[contains(@class,"node_committe-on")]/a')
    for anchor in anchors:
        url = anchor.attrib['href']
        name = anchor.text_content().strip()
        detail = lxml.html.fromstring(self.get(url).text)
        detail.make_links_absolute(url)
        # classify these as belonging to the legislature
        committee = Organization(name=name, classification='committee',
                                 chamber='legislature')
        chair_nodes = detail.xpath(
            "//h3[text()='Committee Chair']/following-sibling::p")
        chair_name = chair_nodes[0].text_content().strip()
        committee.add_member(chair_name, role="chair")
        council = detail.xpath(
            "//h3[text()='Councilmembers']/following-sibling::ul")[0]
        for item in council.xpath("./li"):
            person = item.text_content().strip()
            # the chair also appears in the list; avoid double-adding
            if person != chair_name:
                committee.add_member(person)
        committee.add_source(url)
        if not committee._related:
            self.warning('empty committee: %s;', name)
        else:
            yield committee
def scrape_lower_committee(self, committee_name, url):
    """Yield a lower-chamber committee if it has at least one member."""
    page = self.lxmlize(url)
    committee_name = committee_name.strip()
    org = Organization(committee_name, chamber='lower',
                       classification='committee')
    org.add_source(url)
    info_node = self.get_node(
        page,
        './/div[@id = "dnn_ctr1109_ViewWebCommission_WebCommission1_'
        'pnlCommission"]')
    # This will likely capture empty text nodes as well.
    raw_members = self.get_nodes(
        info_node,
        './/div[@class="two-cols com"]/div[@class="col"]//text()'
        '[normalize-space() and preceding-sibling::br]')
    added = 0
    for raw in raw_members:
        cleaned = re.sub(r'Hon\.\s*', '', raw).strip()
        # Skip empty nodes.
        if not cleaned:
            continue
        cleaned, title = self._match_title(cleaned)
        if title is not None:
            org.add_member(cleaned, title)
        else:
            org.add_member(cleaned)
        added += 1
    if added > 0:
        yield org
def scrape(self, chamber=None):
    """Yield Utah interim committees from the 2015 listing page."""
    listing_url = ("http://le.utah.gov/asp/interim/Main.asp"
                   "?ComType=All&Year=2015&List=2#Results")
    page = self.lxmlize(listing_url)
    for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
        comm_name = comm_link.text.strip()
        if "House" in comm_name:
            chamber = "lower"
        elif "Senate" in comm_name:
            chamber = "upper"
        else:
            chamber = "legislature"
        # Drop leading "House" or "Senate" from name
        comm_name = re.sub(r"^(House|Senate) ", "", comm_name)
        org = Organization(name=comm_name, chamber=chamber,
                           classification='committee')
        detail = self.lxmlize(comm_link.attrib['href'])
        for mbr_link in detail.xpath("//table[@class='memberstable']//a"):
            person = mbr_link.text.strip()
            # strip "(R)"/"(D)" suffixes and chamber title prefixes
            person = re.sub(r' \([A-Z]\)$', "", person)
            person = re.sub(r'^Sen. ', "", person)
            person = re.sub(r'^Rep. ', "", person)
            role = mbr_link.tail.strip().strip(",").strip()
            org.add_member(person, role if role else "member")
        org.add_source(listing_url)
        org.add_source(comm_link.get('href'))
        yield org
def scrape_committees(self, session):
    """Yield Oregon committees for *session* via the OLIS API."""
    session_key = SESSION_KEYS[session]
    committees_response = self.api_client.get("committees", session=session_key)
    legislators = index_legislators(self, session_key)
    for committee in committees_response:
        org = Organization(
            chamber={
                "S": "upper",
                "H": "lower",
                "J": "legislature"
            }[committee["HouseOfAction"]],
            name=committee["CommitteeName"],
            classification="committee",
        )
        org.add_source("https://olis.leg.state.or.us/liz/{session}"
                       "/Committees/{committee}/Overview".format(
                           session=session_key,
                           committee=committee["CommitteeName"]))
        members_response = self.api_client.get(
            "committee_members",
            session=session_key,
            committee=committee["CommitteeCode"],
        )
        for member in members_response:
            try:
                member_name = legislators[member["LegislatorCode"]]
            except KeyError:
                # FIX: logger.warn is a deprecated alias of logger.warning
                logger.warning("Legislator {} not found in session {}".format(
                    member["LegislatorCode"], session_key))
                # fall back to the raw code so the membership is still kept
                member_name = member["LegislatorCode"]
            org.add_member(member_name,
                           role=member["Title"] if member["Title"] else "")
        yield org
def scrape_senate_committee(self, url):
    """Scrape one MN Senate committee bio page and yield the committee.

    An <h4> committee_bio link marks the page as a subcommittee, in which
    case the parent is recorded via ``parent_id``.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    com_name = doc.xpath('//a[contains(@href, "committee_bio")]/text()')[0]
    parent = doc.xpath('//h4//a[contains(@href, "committee_bio")]/text()')
    if parent:
        self.log("%s is subcommittee of %s", com_name, parent[0])
        com = Organization(
            com_name,
            chamber="upper",
            classification="committee",
            parent_id={"name": parent[0], "classification": "upper"},
        )
    else:
        com = Organization(com_name, chamber="upper", classification="committee")
    for link in doc.xpath('//div[@id="members"]//a[contains(@href, "member_bio")]'):
        name = link.text_content().strip()
        if name:
            # a preceding <b> label distinguishes leadership roles
            position = link.xpath(".//preceding-sibling::b/text()")
            if not position:
                position = "member"
            elif position[0] == "Chair:":
                position = "chair"
            elif position[0] == "Vice Chair:":
                position = "vice chair"
            elif position[0] == "Ranking Minority Member:":
                position = "ranking minority member"
            else:
                raise ValueError("unknown position: %s" % position[0])
            # drop any trailing " (party/district)" suffix
            name = name.split(" (")[0]
            com.add_member(name.strip(), position)
    com.add_source(url)
    yield com
def scrape_upper_committee(self, name, url):
    """Yield an upper-chamber committee; raises if no members are found."""
    page = lxml.html.fromstring(self.get(url).text)
    comm = Organization(name=name, chamber="upper", classification="committee")
    comm.add_source(url)
    for link in page.xpath("//a[contains(@href, 'biographies')]"):
        member = link.xpath("string()").strip()
        member = re.sub(r"\s+", " ", member)
        if not member:
            continue
        # trailing text after the link names the role, when present
        role = link.tail
        if not role:
            role = "member"
        elif "Vice Chair" in role:
            role = "vice chair"
        elif "Chair" in role:
            role = "chair"
        member = member.replace("Senator ", "")
        comm.add_member(member, role=role)
    if not comm._related:
        # BUG FIX: Exception("...%s", name) passed the name as a second
        # argument and never formatted the message; interpolate instead.
        raise Exception("no members for %s" % comm.name)
    yield comm
def scrape_committee(self, chamber, name, url):
    """Yield one committee (or subcommittee) scraped from its member page."""
    doc = lxml.html.fromstring(self.get(url).text)
    # an explicit header overrides the caller-supplied chamber
    if doc.xpath("//h3[. = 'Joint Committee']"):
        chamber = "joint"
    heading = doc.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" not in heading:
        org = Organization(chamber=chamber, name=name,
                           classification="committee")
    else:
        org = Organization(
            name=heading,
            classification="committee",
            parent_id={
                "classification": chamber,
                "name": name
            },
        )
    org.add_source(url)
    for link in doc.xpath("//a[contains(@href, 'member=')]"):
        person = link.text.strip()
        # the cell before the link carries the member type label
        mtype = link.xpath("string(../preceding-sibling::td[1])")
        org.add_member(person, mtype.strip(": \r\n\t").lower())
    if not org._related:
        self.warning("not saving %s, appears to be empty" % name)
    else:
        yield org
def scrape_senate_comm(self):
    """Yield Maine Senate standing committees, detecting chairs."""
    url = ('http://legislature.maine.gov/committee-information/'
           'standing-committees-of-the-senate')
    doc = lxml.html.fromstring(self.get(url).text)
    for heading in doc.xpath('//p/strong'):
        org = Organization(chamber='upper',
                           name=heading.text.strip(':'),
                           classification='committee')
        org.add_source(url)
        # Members follow the heading in successive paragraphs; stop at the
        # first paragraph that carries no link.
        par = heading.getparent().getnext()
        while True:
            links = par.xpath('a')
            if not links:
                break
            member, chair = self.senate_committee_pattern.search(
                links[0].text).groups()
            org.add_member(member, 'chair' if chair is not None else 'member')
            par = par.getnext()
        yield org
def scrape_committee(self, chamber, com_name, url):
    """Scrape one MD committee page and return the Organization.

    Pages with 'stab=04' use grid tables (one per full committee or
    subcommittee); others use 'noncogrid' tables or a flat 'spco' list.
    NOTE(review): when subcommittee tables are present, `com` is rebound
    to each subcommittee in turn, so only the last Organization built is
    returned — confirm this is the intended behavior.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    com = Organization(chamber=chamber, name=com_name,
                       classification='committee')
    com.add_source(url)
    if 'stab=04' in url:
        for table in doc.xpath('//table[@class="grid"]'):
            rows = table.xpath('tr')
            sub_name = rows[0].getchildren()[0].text.strip()
            # new table - subcommittee
            if sub_name != 'Full Committee':
                sub_name = sub_name.replace("Subcommittee", "").strip()
                com = Organization(
                    name=sub_name, classification='committee',
                    parent_id=self.parents[(chamber, com_name)])
                com.add_source(url)
            for row in rows[1:]:
                name = row.getchildren()[0].text_content().strip()
                # define_role() splits a leadership suffix off the name
                name, role = define_role(name)
                com.add_member(name, role)
        return com
    else:
        table_source = doc.xpath('//table[@class="noncogrid"]')
        if table_source != []:
            for table in table_source:
                row = table.xpath('tr/td/a[contains(@href, "sponpage")]/text()')
                sub_name_source = table.xpath('tr/th/text()')
                if "Subcommittee" in sub_name_source[0]:
                    sub_name = sub_name_source[0]
                    sub_name = sub_name.replace("Subcommittee", "").strip()
                    com = Organization(
                        name=sub_name, classification='committee',
                        parent_id=self.parents[(chamber, com_name)])
                    com.add_source(url)
                for name in row:
                    name, role = define_role(name)
                    com.add_member(name, role)
            return com
        else:
            # flat member list fallback
            row = doc.xpath('//table[@class="spco"]/tr[1]/td/text()')
            for name in row:
                name, role = define_role(name)
                com.add_member(name, role)
            return com
def get_joint_committees_data(self, name, url):
    """Return a joint committee Organization scraped from an Idaho page.

    Senate members are read from the first column of each section,
    House members from the second. When a member entry contains a
    second text chunk, it is treated as the member's role.
    """
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    org = Organization(name=name, chamber='joint', classification="committee")
    table = html.xpath("//section[@class=' row-equal-height no-padding']")
    for td in table:
        # first column: senators
        senate_members = td.xpath('div[1]/div/div/div[2]/div/p/strong')
        if (len(senate_members) > 0):
            member_string = list(senate_members[0].itertext())
            if (len(member_string) > 1):
                # first chunk is the name, second the role
                name = member_string[0]
                role = member_string[1]
                # strip titles, commas, and non-breaking spaces
                for ch in ['Sen.', ',', u'\u00a0']:
                    name = name.replace(ch, ' ').strip()
                    role = role.replace(ch, ' ').strip()
                org.add_member(name, role=role)
            else:
                name = member_string[0].replace('Sen.', ' ').strip()
                for ch in ['Sen.', ',', u'\u00a0']:
                    name = name.replace(ch, ' ').strip()
                org.add_member(name)
        # second column: representatives
        house_members = list(
            td.xpath('div[2]/div/div/div[2]/div/p/strong'))
        if (len(house_members) > 0):
            member_string = list(house_members[0].itertext())
            if (len(member_string) > 1):
                name = member_string[0].replace('Rep.', ' ').strip()
                role = member_string[1].replace(',', ' ').strip()
                for ch in ['Rep.', ',', u'\u00a0']:
                    name = name.replace(ch, ' ').strip()
                    role = role.replace(ch, ' ').strip()
                org.add_member(name, role=role)
            else:
                name = member_string[0].replace('Rep.', ' ').strip()
                for ch in ['Rep.', ',', u'\u00a0']:
                    name = name.replace(ch, ' ').strip()
                org.add_member(name)
    org.add_source(url)
    return org