def get_jlfc(self, name, url):
    """Gets info for the Joint Legislative Oversight Committee"""
    jlfc_page = self.urlopen(url)
    html = lxml.html.fromstring(jlfc_page)
    committee = Committee('joint', name)
    # Members for each chamber sit in the first <p> following an
    # <h3> whose text contains "Senate" / "House".
    member_path = '//h3[contains(text(), "%s")]/following-sibling::p[1]'
    for chamber in ('Senate', 'House'):
        members = html.xpath(member_path % chamber)[0]\
            .text_content().split('\r\n')
        for member in members:
            if member.strip():
                # Each entry looks like "Name, Role"; the comma split is
                # splatted into add_member as (name[, role]) after
                # replacing non-breaking spaces.
                committee.add_member(
                    *member.replace(u'\xa0', ' ').split(','),
                    chamber=_REV_CHAMBERS[chamber.lower()])
    committee.add_source(url)
    self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape joint committee assignments for one term.

    Members from both houses are listed together on the source site,
    so the 'lower' pass is skipped entirely.
    """
    if chamber == 'lower':
        # Committee members from both houses are listed
        # together. So, we'll only scrape once.
        return None

    # Even though each term spans two years, committee
    # memberships don't appear to change. So we only
    # need to scrape the first year of the term.
    year = None
    for t in self.metadata["terms"]:
        if term == t["name"]:
            year = t["start_year"]
            break
    if not year:
        raise NoDataForPeriod(term)

    list_url = self.urls["list"] % (year, )
    committees = {}
    with self.urlopen(list_url) as page:
        page = lxml.html.fromstring(page)
        for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
            committees[el.text] = el.get("href")

    for c in committees:
        self.log(c)
        detail_url = self.urls["detail"] % (committees[c], )
        with self.urlopen(detail_url) as page:
            page = lxml.html.fromstring(page)
            # FIX: raw string — '\d' in a plain literal is an invalid
            # escape (DeprecationWarning on Python 3.6+).
            # Strips a leading "NN-" numeric prefix from the name.
            if re.match(r'\d{1,2}-', c):
                c = c.split('-', 1)[1]

            comm = Committee('joint', c.strip())

            for table in page.xpath(
                    ".//table[contains(@id, 'CommitteeMembers')]"):
                rows = table.xpath(".//tr")
                # Header row's first cell reads "Senator"/"Representative".
                chamber = rows[0].xpath('.//td')[0].text_content().strip()
                chamber = 'upper' if chamber == 'Senator' else 'lower'
                for row in rows[1:]:
                    tds = row.xpath('.//td')
                    name = tds[0].text_content().strip()
                    role = ('chairman' if tds[3].text_content().strip()
                            == 'Chairman' else 'member')
                    comm.add_member(name, role, chamber=chamber)

            comm.add_source(detail_url)
            self.save_committee(comm)
def scrape_house_special(self, scraped_committees):
    """Scrape LA House special committees not already seen elsewhere.

    ``scraped_committees`` holds names already captured from the full
    committee listing; those are skipped.  Returns a dict mapping
    committee name -> Committee object.
    """
    url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
    text = self.get(url).text
    page = lxml.html.fromstring(text)
    page.make_links_absolute('http://house.louisiana.gov')

    committees = {}
    for el in page.xpath("//a[contains(@href,'../H_Cmtes/')]"):
        comm_name = el.xpath('normalize-space(string())')
        comm_name = self.normalize_committee_name(comm_name)
        # skip committees that have already been scraped from
        # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
        if comm_name not in scraped_committees:
            comm_url = el.get('href').replace('../', '')
            committees[comm_name] = comm_url

    for name, url in committees.items():
        # "Joint ..." committees override the lower-chamber default.
        chamber = 'joint' if name.startswith('Joint') else 'lower'
        committee = Committee(chamber, name)
        committee.add_source(url)
        text = self.get(url).text
        page = lxml.html.fromstring(text)
        page.make_links_absolute('http://house.louisiana.gov')
        for row in page.xpath('//table[@id="table1"]//tbody/tr'):
            member_info = row.xpath('./td')
            mname = member_info[0].xpath('normalize-space(string())')
            mtype = member_info[1].xpath('normalize-space(string())')
            if mtype == 'Chairman':
                mtype = 'chairman'
            # NOTE(review): "Co-Chairmain" appears misspelled — presumably
            # it matches the source site's own typo; confirm before fixing.
            elif mtype == 'Co-Chairmain':
                mtype = 'co-chairmain'
            elif mtype == 'Vice Chair':
                mtype = 'vice chair'
            elif mtype == 'Ex Officio':
                mtype = 'ex officio'
            elif mtype == 'Interim Member':
                mtype = 'interim'
            else:
                mtype = 'member'
            committee.add_member(mname, mtype)
        # Replace the URL value with the built Committee object.  Keys
        # are unchanged, so mutating during .items() iteration is safe.
        committees[name] = committee

    return committees
def get_joint_committees_data(self, name, url):
    """Scrape members of a joint committee from its page and save it.

    Senate members live under the first column div, House members under
    the second; each entry is "<strong>Name[, Role]</strong>".
    """
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    committee = Committee('joint', name)
    table = html.xpath("//section[@class=' row-equal-height no-padding']")
    for td in table:
        senate_members = td.xpath('div[1]/div/div/div[2]/div/p/strong')
        if (len(senate_members) > 0):
            member_string = list(senate_members[0].itertext())
            # Two text chunks means "Name" plus ", Role".
            if (len(member_string) > 1):
                # NOTE(review): encode() yields bytes on Python 3, which
                # would break the str replace below — this code assumes
                # Python 2 string semantics; confirm the runtime.
                name = member_string[0].encode('ascii', 'ignore')
                for ch in ['\r\n', 'Sen.']:
                    if ch in name:
                        name = name.replace(ch, '').strip()
                role = member_string[1].encode('ascii', 'ignore')
                for ch in ['\r\n', ',']:
                    if ch in role:
                        role = role.replace(ch, '').strip()
                committee.add_member(name, role=role, chamber='senate')
            else:
                name = member_string[0].encode('ascii', 'ignore')
                for ch in ['\r\n', 'Sen.']:
                    if ch in name:
                        name = name.replace(ch, '').strip()
                committee.add_member(name, chamber='senate')

        house_members = list(
            td.xpath('div[2]/div/div/div[2]/div/p/strong'))
        if (len(house_members) > 0):
            member_string = list(house_members[0].itertext())
            if (len(member_string) > 1):
                name = member_string[0].encode('ascii', 'ignore')
                for ch in ['\r\n', 'Rep.']:
                    if ch in name:
                        name = name.replace(ch, '').strip()
                role = member_string[1].encode('ascii', 'ignore')
                for ch in ['\r\n', ',']:
                    if ch in role:
                        role = role.replace(ch, '').strip()
                committee.add_member(name, role=role, chamber='house')
            else:
                name = member_string[0].encode('ascii', 'ignore')
                # House-only branch also strips non-breaking spaces.
                for ch in ['\r\n', u'\xa0', 'Rep.']:
                    if ch in name:
                        name = name.replace(ch, '').strip()
                committee.add_member(name, chamber='house')
    committee.add_source(url)
    self.save_committee(committee)
def scrape_senate_committees(self, term_name, chamber):
    """Scrape senate standing committees for each year of a term.

    ``term_name`` looks like "2011-2012"; each two-digit year maps to
    a com-standing.htm index page.  Future years are skipped.
    """
    years = [t[2:] for t in term_name.split('-')]

    for year in years:
        # Don't scrape a session that hasn't started yet.
        if int(year) > int(str(dt.datetime.now().year)[2:]):
            self.log("Not running session %s, it's in the future." % (
                term_name
            ))
            continue

        url = '{base}{year}info/com-standing.htm'.format(
            base=self.senate_url_base, year=year)
        page_string = self.urlopen(url)
        page = lxml.html.fromstring(page_string)
        ps = page.xpath('id("mainContent")/table/*[3]/p')
        for p in ps:
            links = p.xpath('a[1]')
            if not links:
                continue
            a = links[0]
            committee_name = a.text_content().strip()
            committee_url = a.attrib.get('href')
            # "Joint" committees get their own chamber tag.
            if 'joint' in committee_name.lower():
                c = "joint"
            else:
                c = chamber

            committee = Committee(c, committee_name)
            committee_page_string = self.urlopen(committee_url)
            committee_page = lxml.html.fromstring(
                committee_page_string)
            lis = committee_page.xpath(
                "//div[@id='mainContent']/ul/ul[1]/li")
            if len(lis) == 0:
                # Fall back to any list item on the page.
                lis = committee_page.xpath(
                    "//div[@id='mainContent']//li")
                # This MIGHT cause issues.

            for li in lis:
                # Entries look like "Name, District, Role"; the role
                # (if present) is the third comma-separated chunk.
                mem_parts = li.text_content().strip().split(',')
                mem_name = mem_parts[0]
                mem_role = 'member'
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower()
                committee.add_member(mem_name, role=mem_role)
            committee.add_source(url)
            committee.add_source(committee_url)
            self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape VT committees for one chamber from the legislative directory.

    Only the latest term is supported by the source site.
    """
    self.validate_term(term, latest_only=True)

    chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]
    url = ('http://www.leg.state.vt.us/lms/legdir/comms.asp?Body=%s' %
           chamber_abbr)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for li in page.xpath("//li"):
            # Strip the room number from the committee name
            comm_name = re.match(r'[^\(]+',
                                 li.text_content()).group(0).strip()
            # Strip chamber from beginning of committee name
            comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '',
                               comm_name)
            # normalize case of committee name
            comm_name = comm_name.title()

            comm = Committee(chamber, comm_name)
            comm.add_source(url)

            for tr in li.xpath("../../following-sibling::tr"):
                name = tr.text_content().strip()
                # Break when we reach the next committee
                if 'COMMITTEE' in name:
                    break
                # FIX: raw strings — '\w', '\s', '\.' inside a plain
                # string literal are invalid escapes (DeprecationWarning
                # on Python 3.6+, SyntaxError in future versions).
                match = re.search(
                    r'^([\w\s\.]+),\s+'
                    r'(Chair|Vice Chair|Vice-Chair|Ranking Member|Clerk)$',
                    name)
                if match:
                    name = match.group(1)
                    mtype = match.group(2).lower()
                else:
                    mtype = 'member'
                # Drop trailing "of <Town>" qualifiers, except on names
                # known to legitimately contain "of".
                if not name.startswith(DOUBLED_NAMES):
                    name = re.sub(r'of [\w\s\.]+$', '', name)
                comm.add_member(name, mtype)

            self.save_committee(comm)
def _scrape_standing_committees(self):
    """Scrapes the Standing Committees page of the Nebraska state
    legislature and saves each committee with its members."""
    main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
    page = self.lxmlize(main_url)

    committee_nodes = self.get_nodes(
        page,
        '//div[@class="main-content"]/div[@class="panel panel-leg"][1]/'
        'div[@class="list-group"]/a[@class="list-group-item"]')

    for committee_node in committee_nodes:
        committee_page_url = committee_node.attrib['href']
        committee_page = self.lxmlize(committee_page_url)

        name_text = self.get_node(
            committee_page,
            '//div[@class="container view-front"]/div[@class="row"]/'
            'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]')
        # Drop the trailing word ("Committee") and rejoin the rest.
        committee_name = ' '.join(name_text.split()[:-1])

        committee = Committee('upper', committee_name)

        members = self.get_nodes(
            committee_page,
            '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
            'div[@class="block-box"][1]/ul[@class="list-unstyled '
            'feature-content"]/li/a/text()[normalize-space()]')

        for member in members:
            # BUG FIX: the old pattern r'\Sen\.\s+' matched *any*
            # non-whitespace character followed by "en." — so it could
            # eat the first letter of other words too.  Anchor and
            # match the literal "Sen." title instead.
            member_name = re.sub(r'^Sen\.\s+', '', member)
            member_name = re.sub(r', Chairperson', '', member_name).strip()
            if 'Chairperson' in member:
                member_role = 'Chairperson'
            else:
                member_role = 'member'
            committee.add_member(member_name, member_role)

        committee.add_source(main_url)
        committee.add_source(committee_page_url)
        self.save_committee(committee)
def scrape_upper_committee(self, link, name):
    """Build (but do not save) an upper-chamber committee from its page."""
    # Collapse stray whitespace that sometimes appears inside the href.
    url = re.sub(r'\s+', '', link.attrib['href'])

    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    comm = Committee('upper', name)
    comm.add_source(url)

    # Each member is an anchor whose href carries a "?member=" query.
    for anchor in doc.xpath('//a[contains(@href, "?member=")]'):
        member = re.sub(r'^Delegate\s+', '', anchor.text_content().strip())
        role = anchor.getnext().text or 'member'
        comm.add_member(member, role.strip())

    return comm
def scrape_joint_committees(self, term, session):
    """Scrape DE joint committees from the legislature's sidebar page."""
    url = "http://legis.delaware.gov/legislature.nsf/testside.html?OpenPage&BaseTarget=right"
    page = self.lxmlize(url)
    joint_comms = page.xpath("//a[text()='Joint Committees']")
    # The list of committees immediately follows the heading link.
    comm_list = joint_comms[0].getnext()
    for li in comm_list.xpath("./li/a"):
        comm_name = li.text
        comm_link = li.attrib["href"]
        if comm_name.strip(
        ) == "Sunset":  #I don't even want to go into it.
            # The Sunset committee's members live on a different site;
            # the assert fires if the index ever starts linking there
            # directly, signalling this workaround can be removed.
            new_link = "http://legis.delaware.gov/Sunset/"\
                "Sunset.nsf/general+Info/JSC+Members?opendocument"
            assert new_link != comm_link, "Remove Sunset Committee special casing"
            comm_link = new_link

        committee = Committee("joint", comm_name)
        committee.add_source(comm_link)
        comm_page = self.lxmlize(comm_link)
        people = comm_page.xpath("//a/b")
        # Titles and party markers to scrub out of member names.
        things_to_replace = [
            "Senator", "Representative", "(D)", "(R)",
            "House Minority Whip", "House Majority Whip",
            "Senate Minority Whip", "Senate Majority Whip",
            "House Minority Leader", "House Majority Leader",
            "Senate Minority Leader", "Senate Majority Leader",
            "President Pro Tempore", "Speaker of the House"
        ]
        for person in people:
            person_name = person.text_content()
            for thing in things_to_replace:
                person_name = person_name.replace(thing, "")
            person_name = person_name.strip().strip(",")
            role = "Member"
            # Roles come either as a trailing "(Role)" or as a
            # ", Chair"/", Vice-Chair" suffix in the text.
            if person_name.strip()[-1] == ")":
                person_name, role = person_name.rsplit("(", 1)
                role = role.replace(")", "").strip()
            elif ", Vice-Chair" in person_name:
                role = "Vice-Chair"
                person_name = person_name.replace(", Vice-Chair", "")
            elif ", Chair" in person_name:
                role = "Chair"
                person_name = person_name.replace(", Chair", "")
            person_name = person_name.strip().strip(",").strip()
            committee.add_member(person_name, role)
        self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape VT committees for the 2011-2012 term (the only one the
    source layout supports)."""
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]
    url = ('http://www.leg.state.vt.us/lms/legdir/comms.asp?Body=%s' %
           chamber_abbr)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for li in page.xpath("//li"):
            # Strip the room number from the committee name
            # NOTE(review): assumes li.text is non-empty; an <li> whose
            # text lives only in child elements would crash — confirm.
            comm_name = re.match(r'[^\(]+', li.text).group(0).strip()
            # Strip chamber from beginning of committee name
            comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '',
                               comm_name)
            comm_name = comm_name.title()

            comm = Committee(chamber, comm_name)
            comm.add_source(url)

            for tr in li.xpath("../../following-sibling::tr"):
                # Break when we reach the next committee
                if tr.xpath("th/li"):
                    break
                name = tr.xpath("string()").strip()
                # FIX: raw strings — '\w', '\s', '\.' in a plain string
                # literal are invalid escapes on Python 3.6+.
                match = re.search(
                    r'^([\w\s\.]+),\s+'
                    r'(Chair|Vice Chair|Ranking Member|Clerk)$', name)
                if match:
                    name = match.group(1)
                    mtype = match.group(2).lower()
                else:
                    mtype = 'member'
                # Drop trailing "of <Town>" qualifiers.
                name = re.sub(r'of [\w\s\.]+$', '', name)
                comm.add_member(name, mtype)

            self.save_committee(comm)
def scrape_page(self, a, chamber, term):
    """Scrape one committee detail page reached via index anchor ``a``.

    Tries two known table layouts in turn; warns if neither yields
    any members.
    """
    page, text = self.lxmlize(a.attrib['href'])
    committee = a.text_content()
    # The page embeds a Twitter widget; capture the account if present.
    twitter_ids = re.findall("setUser\('(.*)'\)", text)
    twitter_id = twitter_ids[0] if twitter_ids != [] else None
    # NOTE(review): ", Vice-Chair" maps to plain "member" while
    # ", Chair" maps to "chair" — confirm this asymmetry is intended.
    roles = {", Chair": "chair", ", Vice-Chair": "member"}
    committee = Committee(chamber, committee, twitter=twitter_id)
    committee.add_source(a.attrib['href'])

    # Layout 1: member tables (width 545/540) with profile links.
    tables = page.xpath("//table[@width='545' or @width='540']")
    added = False
    for table in tables:
        people = table.xpath(
            ".//a[contains(@href, 'MemberDetailPage')]/text()")
        for person in [x.strip() for x in people]:
            role = "member"
            # Peel a recognized role suffix off the name, if any.
            for flag in roles:
                if person.endswith(flag):
                    role = roles[flag]
                    person = person[:-len(flag)].strip()
            committee.add_member(person, role)
            added = True
    if added:
        self.save_committee(committee)
        return

    # Layout 2: a width-466 table whose text mentions "committee members".
    tables = page.xpath("//table[@width='466']")
    added = False
    for table in tables:
        if "committee members" in table.text_content().lower():
            for person in table.xpath(".//td/text()"):
                person = person.strip()
                if person != "":
                    committee.add_member(person, "member")
                    added = True
    if added:
        self.save_committee(committee)
        return

    self.warning("Unable to scrape!")
def scrape_senate_committee(self, name, url):
    """Scrape one senate committee's member assignments and save it."""
    # Member assignments live on a sibling page of the committee home.
    url = url.replace('Default.asp', 'Assignments.asp')

    committee = Committee('upper', name)
    committee.add_source(url)

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)
        for anchor in page.xpath('//table[@bordercolor="#EBEAEC"]/tr/td/font/a'):
            member = anchor.xpath('string()').replace('Senator ', '').strip()
            committee.add_member(member)

    self.save_committee(committee)
def scrape_house_sub_committee(self, sub_committee_name, url):
    """Scrape a house subcommittee's member list and save it."""
    find_expr = "//div[@class='col1']/ul[position()<3]/li"

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        com = Committee('lower', sub_committee_name)

        for el in page.xpath(find_expr):
            # Entries read "Name, Role"; anything after the first comma
            # is the role, defaulting to plain membership.
            parts = [piece.strip() for piece in el.text_content().split(',', 1)]
            if len(parts) > 1:
                member_name, role = parts
            else:
                member_name, role = parts[0], 'member'
            if member_name != "":
                com.add_member(member_name, role)

        com.add_source(url)
        self.save_committee(com)
def scrape_senate_comm(self):
    """Scrape RI senate committees from the membership link list."""
    base = 'http://webserver.rilin.state.ri.us'
    linklist = self.scrape_comm_list('ComMemS')
    if linklist is None:
        return

    for anchor in linklist:
        comm_name = anchor.text
        self.log(comm_name)
        # Skip committees we deliberately ignore.
        if comm_name in COMM_BLACKLIST:
            self.log("XXX: Blacklisted")
            continue

        url = base + anchor.attrib['href']
        self.log("url " + url)

        committee = Committee('upper', comm_name)
        self.add_members(committee, url)
        committee.add_source(url)
        self.save_committee(committee)
def scrape_current(self, chamber, term):
    """Scrape current KS committees for one chamber via the ksapi JSON feed.

    Upper-chamber runs also pick up "special" committees, which are
    recorded with a 'joint' chamber.  Committees with no members are
    skipped with a warning.
    """
    if chamber == 'upper':
        chambers = ['special_committees', 'senate_committees']
    else:
        chambers = ['house_committees']

    with self.urlopen(ksapi.url + 'ctte/') as committee_request:
        committee_json = json.loads(committee_request)

        for com_type in chambers:
            committees = committee_json['content'][com_type]

            for committee_data in committees:
                # set to joint if we are using the special_committees
                com_chamber = ('joint' if com_type == 'special_committees'
                               else chamber)

                committee = Committee(com_chamber, committee_data['TITLE'])

                com_url = ksapi.url + 'ctte/%s/' % committee_data['KPID']
                try:
                    detail_json = self.urlopen(com_url)
                except scrapelib.HTTPError:
                    # Best-effort: skip committees whose detail fetch fails.
                    self.warning("error fetching committee %s" % com_url)
                    continue
                details = json.loads(detail_json)['content']
                for chair in details['CHAIR']:
                    committee.add_member(chair['FULLNAME'], 'chairman')
                for vicechair in details['VICECHAIR']:
                    committee.add_member(vicechair['FULLNAME'],
                                         'vice-chairman')
                for rankedmember in details['RMMEM']:
                    committee.add_member(rankedmember['FULLNAME'],
                                         'ranking member')
                for member in details['MEMBERS']:
                    committee.add_member(member['FULLNAME'])

                if not committee['members']:
                    self.warning('skipping blank committee %s' %
                                 committee_data['TITLE'])
                else:
                    committee.add_source(com_url)
                    self.save_committee(committee)
def scrape_upper_committee(self, name, url):
    """Scrape an NY senate committee page, de-duplicating member names."""
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    comm = Committee('upper', name)
    comm.add_source(url)

    member_div = page.xpath("//div[@class = 'committee-members']")[0]

    # The chair is listed separately next to a "Chair:" label.
    xpath = '//label[contains(., "Chair:")]/following-sibling::a/text()'
    chair = page.xpath(xpath)
    if chair:
        comm.add_member(chair.pop().strip(), 'chair')

    # Track names already added (e.g. the chair) to avoid duplicates.
    seen = set([member['name'] for member in comm['members']])
    for link in member_div.xpath(".//a"):
        if not link.text:
            try:
                # On one vice chair, the text was nested differently.
                member = link[0].tail.strip()
            except (IndexError, AttributeError):
                continue
        else:
            member = link.text.strip()

        next_elem = link.getnext()
        if (next_elem is not None and
                next_elem.tag == 'a' and
                next_elem.attrib['href'] == link.attrib['href']):
            # Sometimes NY is cool and splits names across a
            # couple links
            member = "%s %s" % (member, next_elem.text.strip())

        member = re.sub(r'\s+', ' ', member)
        if member in seen or not member:
            continue
        seen.add(member)
        name, role = parse_name(member)
        comm.add_member(name, role)

    # Only persist committees that actually have members.
    if comm['members']:
        self.save_committee(comm)
def scrape_senate_comm(self):
    """Scrape Maine senate committees from the legislature's WP site.

    Joint committees (pages listing both Senate and House rosters)
    are skipped here.
    """
    url = 'http://legisweb1.mainelegislature.org/wp/senate/legislative-committees/'
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    committee_urls = doc.xpath('//address/a/@href')
    for committee_url in committee_urls:
        # Exclude the committee listing document
        if committee_url.endswith('.docx'):
            continue

        html = self.get(committee_url).text
        doc = lxml.html.fromstring(html)

        (committee_name, ) = \
            doc.xpath('//h1[contains(@class, "entry-title")]/text()')
        # Strip any parenthesized suffix from the title.
        committee_name = re.sub(r'\(.*?\)', "", committee_name)

        # A page listing both chambers' members is a joint committee.
        is_joint = (re.search(r'(?s)Committee Members.*Senate:.*House:.*',
                              html))
        if is_joint:
            continue

        committee = Committee('upper', committee_name)
        committee.add_source(committee_url)

        members = doc.xpath('//address/a/text()')
        if not members:
            # Fallback layout: member links inside paragraphs.
            members = doc.xpath('//p/a/text()')
        for member in members:
            if member.isspace():
                continue
            # Normalize "Senator X of Town[, Chair]" to a bare name.
            member = re.sub(r'^Senator ', "", member)
            member = re.sub(r' of .*', "", member)
            if member.endswith(", Chair"):
                role = 'chair'
                member = re.sub(r', Chair', "", member)
            else:
                role = 'member'
            committee.add_member(member, role)

        self.save_committee(committee)
def scrape(self, term, chambers):
    """Scrape NJ committees from the legislature's Access database dumps."""
    year_abr = term[0:4]
    self._init_mdb(year_abr)

    members_csv = self.access_to_csv('COMember')
    info_csv = self.access_to_csv('Committee')

    # Committee info database: committee code -> Committee object.
    comm_dictionary = {}
    for rec in info_csv:
        abrv = rec["Code"]
        # "A..." codes are Assembly (lower), "S..." are Senate (upper).
        if abrv[0] == "A":
            chamber = "lower"
        elif abrv[0] == "S":
            chamber = "upper"

        comm = Committee(chamber, rec["Description"],
                         comm_type=rec["Type"],
                         aide=rec["Aide"],
                         contact_info=rec["Phone"])
        comm.add_source('http://www.njleg.state.nj.us/downloads.asp')
        comm_dictionary[abrv] = comm

    # Committee member database.
    POSITIONS = {
        'C': 'chair',
        'V': 'vice-chair',
        '': 'member',
    }
    for member_rec in members_csv:
        # assignment=P means they are active, assignment=R means removed
        if member_rec['Assignment_to_Committee'] != 'P':
            continue
        committee = comm_dictionary[member_rec["Code"]]
        role = POSITIONS[member_rec["Position_on_Committee"]]
        committee.add_member(member_rec["Member"], role=role)
        self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape NC standing and select committees for the given chamber."""
    base_url = 'http://www.ncga.state.nc.us/gascripts/Committees/Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails='

    # Each chamber has both a "Standing" and a "Select" listing page.
    ctype_lists = {
        'upper': ['Senate%20Standing', 'Senate%20Select'],
        'lower': ['House%20Standing', 'House%20Select'],
    }

    for ctype in ctype_lists[chamber]:
        with self.urlopen(base_url + ctype) as data:
            doc = lxml.html.fromstring(data)
            doc.make_links_absolute(base_url + ctype)
            for anchor in doc.xpath('//ul/li/a'):
                committee = Committee(chamber, anchor.text)
                url = anchor.get('href')
                self.scrape_committee(committee, url)
                committee.add_source(url)
                self.save_committee(committee)
def scrape_committees(self, chamber):
    """Scrape one chamber's committee table.

    Each row holds the committee's descriptive fields in its first
    cell and the membership list in its second.
    """
    url = _COMMITTEE_URL % _CHAMBERS[chamber]
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    table = html.xpath(
        'body/section[2]/div/div/div/section[2]/div[2]/div/div/div/div'
    )[1:]
    for row in table:
        # committee name, description, hours of operation,
        # secretary and office_phone
        # NOTE(review): encode() returns bytes on Python 3; later string
        # ops assume Python 2 semantics — confirm the runtime.
        text = list(row[0].xpath('div')[0].itertext())
        attributes = [
            list(
                value.replace(u'\xa0', ' ').replace(
                    'Secretary:', '').encode('ascii', 'ignore')
                for value in text
                if 'Email:' not in value and value != '\n'
                and 'Phone:' not in value)
        ]
        # Trim trailing room info off any "... Room ..." attribute.
        for i in range(len(attributes[0])):
            if 'Room' in attributes[0][i]:
                attributes[0][i] = attributes[0][i].split(
                    'Room')[0].replace(', ', ' ')

        # Which field layout applies depends on how many cells we got.
        if len(attributes[0]) > 5:
            com = dict(zip(_TD_ONE, attributes[0]))
        else:
            com = dict(zip(_TD_TWO, attributes[0]))
        committee = Committee(chamber, **com)
        committee.add_source(url)

        # membership
        for td in row[1].xpath('div'):
            td_text = list(td.itertext())
            members = list(
                value for value in td_text
                if value != ' ' and value != '\n' and value != ',')
            role = "member"
            for member in members:
                # A "Chair"/"Vice Chair" token sets the role for the
                # *next* name in the sequence, then resets to member.
                if (member in ['Chair', 'Vice Chair']):
                    role = member.lower()
                    continue
                else:
                    committee.add_member(member, role=role)
                    role = "member"
        self.save_committee(committee)
def scrape_session(self, term, chambers, session):
    """Scrape GA committees for a session via the SOAP committee service.

    Committees are cached by code; one seen again under a different
    chamber with "joint" in its description is re-tagged as joint.
    """
    sid = self.metadata['session_details'][session]['_guid']
    committees = self.cservice.GetCommitteesBySession(
        sid)['CommitteeListing']
    for committee in committees:
        cid = committee['Id']
        committee = self.cservice.GetCommittee(cid)

        name, typ, guid, code, description = [
            committee[x]
            for x in ['Name', 'Type', 'Id', 'Code', 'Description']
        ]
        chamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "joint"
        }[typ]
        ctty = None
        if code in self.ctty_cache:
            ctty = self.ctty_cache[code]
            # Same code, different chamber, "joint" in the description:
            # this is really a joint committee.  Otherwise treat the
            # cache entry as stale and rebuild below.
            if (ctty['chamber'] != chamber) and (
                    description and 'joint' in description.lower()):
                ctty['chamber'] = 'joint'
            else:
                ctty = None

        if ctty is None:
            ctty = Committee(chamber, name, code=code, _guid=guid,
                             description=description)
            self.ctty_cache[code] = ctty

        members = committee['Members']['CommitteeMember']
        for member in members:
            name = "{First} {Last}".format(
                **dict(member['Member']['Name']))
            role = member['Role']
            ctty.add_member(name, role, _guid=member['Member']['Id'])

        ctty.add_source(self.csource)
        self.save_committee(ctty)
def scrape_committee(self, chamber, url):
    """Scrape one committee page: ranking members (with titles), then
    the regular membership."""
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        name = doc.xpath('//span[@class="committeeShortName"]/text()')[0]
        com = Committee(chamber, name)
        com.add_source(url)

        # Titles and names appear in two parallel lists, same order.
        ranking = zip(doc.xpath('//p[@class="rankingMemberTitle"]/text()'),
                      doc.xpath('//p[@class="rankingMemberName"]/a/text()'))
        for title, member in ranking:
            com.add_member(member, title)

        # Everyone else is a plain member.
        for member in doc.xpath(
                '//div[@class="committeeRegularMembers"]//a/text()'):
            com.add_member(member)

        self.save_committee(com)
def get_committee_obj(self):
    """Create, populate, and return the Committee for this page."""
    name = self.get_name()
    url = self.get_url()
    parent_name = self.get_parent_name()

    # A parent name means this page describes a subcommittee of it.
    if parent_name is None:
        committee_name, subcommittee = name, None
    else:
        committee_name, subcommittee = parent_name, name

    self.committee = Committee('upper', committee_name,
                               subcommittee=subcommittee)
    self.add_members()
    self.add_sources()
    return self.committee
def _scrape_lower_standing_committee(self, committee_name, url):
    """Scrape a lower-chamber standing committee's roster and save it."""
    page = self.lxmlize(url)

    committee = Committee('lower', committee_name)
    committee.add_source(url)

    rows = page.xpath('//table[@id="body_ListView1_itemPlaceholder'
                      'Container"]/tr[@class="linkStyle2"]')
    for row in rows:
        # Re-join the name pieces produced by name_tools.split,
        # skipping empty fragments.
        raw_name = row.xpath('normalize-space(string(./td[1]/a))')
        member_name = ' '.join(filter(None, name_tools.split(raw_name)))
        raw_role = row.xpath('normalize-space(string(./td[2]))')
        committee.add_member(member_name,
                             self._normalize_member_role(raw_role))

    self.save_committee(committee)
def scrape_committee(self, chamber, name, url):
    """Scrape a committee page; the page itself may mark it as joint."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        # An explicit "Joint Committee" heading overrides the chamber.
        if page.xpath("//h3[. = 'Joint Committee']"):
            chamber = 'joint'

        comm = Committee(chamber, name)
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'member=')]"):
            member = link.text.strip()
            # The preceding cell holds the role label (e.g. "Chair:").
            mtype = link.xpath("string(../preceding-sibling::td[1])")
            comm.add_member(member, mtype.strip(": \r\n\t").lower())

        self.save_committee(comm)
def scrape_upper_committee(self, url):
    """Scrape an upper-chamber committee roster page and save it.

    Member cells read "Name (party-district)"; everything before the
    "(" is the name.  Committees with no members are not saved.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        comm_name = page.xpath("string(//div[@class='contentheading'])")
        committee = Committee('upper', comm_name)
        committee.add_source(url)

        for el in page.xpath('//table/tr/td'):
            sen_name = el.xpath('string(a[@class="senatorLN"])')
            # BUG FIX: the old code sliced with sen_name.find('('),
            # which returns -1 when there is no "(" and silently
            # dropped the name's last character.  partition() keeps
            # the whole string in that case.
            full_name = sen_name.partition('(')[0].strip()
            if full_name:
                committee.add_member(full_name)

        if committee['members']:
            self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape KY committees for one chamber.

    An 'upper' run also re-invokes itself for joint committees; any
    chamber other than upper/lower is treated as joint and pulls the
    interim/statutory pages.
    """
    if chamber == 'upper':
        urls = ["http://www.lrc.ky.gov/committee/standing_senate.htm"]
        # also invoke joint scraper
        self.scrape('joint', term)
    elif chamber == 'lower':
        urls = ["http://www.lrc.ky.gov/committee/standing_house.htm"]
    else:
        urls = [
            "http://www.lrc.ky.gov/committee/interim.htm",
            "http://www.lrc.ky.gov/committee/statutory.htm"
        ]
        chamber = 'joint'

    for url in urls:
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Committee links appear under several href path patterns.
        links = []
        cttypages = [
            "//a[contains(@href, 'standing/')]",
            "//a[contains(@href, 'interim')]",
            "//a[contains(@href, 'statutory')]"
        ]
        for exp in cttypages:
            linkz = page.xpath(exp)
            links = links + linkz

        for link in links:
            # Drop the trailing "(H)"/"(S)" marker, title-case, and
            # strip periods from the committee name.
            name = re.sub(r'\s+\((H|S)\)$', '',
                          link.text).strip().title()
            name = name.replace(".", "")
            comm = Committee(chamber, name)
            comm_url = link.attrib['href'].replace('home.htm',
                                                   'members.htm')
            self.scrape_members(comm, comm_url)
            # Only persist committees that actually have members.
            if comm['members']:
                self.save_committee(comm)
def scrape_house_committees(self):
    """Scrape MI house committees via the committee drop-down list."""
    base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
    html = self.get('http://house.mi.gov/mhrpublic/committee.aspx').text
    doc = lxml.html.fromstring(html)

    # get values out of drop down
    for opt in doc.xpath('//option'):
        name = opt.text
        # skip invalid choice
        if opt.text in ('Statutory Committees', 'Select One'):
            continue
        if 'have not been created' in opt.text:
            self.warning('no committees yet for the house')
            return
        com_url = base_url + opt.get('value')
        com_html = self.get(com_url).text
        cdoc = lxml.html.fromstring(com_html)
        com = Committee(chamber='lower', committee=name)
        com.add_source(com_url)

        # FIX: removed a dead loop here that iterated `doc` (the
        # committee *list* page, not the detail page) and clobbered
        # `name` without ever using the result.

        # all links to http:// pages in servicecolumn2 are legislators
        members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
        for mem in members:
            name = mem.xpath('./a')
            if name:
                name = name[0].text.strip()
            else:
                # this is a blank row
                continue
            text = mem.xpath('./span')[0].text
            if 'Committee Chair' in text:
                role = 'chair'
            elif 'Vice-Chair' in text:
                role = 'vice chair'
            else:
                role = 'member'
            com.add_member(name, role=role)

        self.save_committee(com)
def get_jmfc(self, name, url):
    """Gets the Joint Millennium Fund Committee info.

    Each table row holds a senate member and a house member side by
    side as "Sen./Rep. Last, First".
    """
    jfmc_page = self.urlopen(url)
    html = lxml.html.fromstring(jfmc_page)
    committee = Committee('joint', name)
    table = html.xpath('//table')[2]
    for row in table.xpath('tbody/tr'):
        senate, house = [
            td.text.replace('\r\n', ' ').replace(u'\xa0', ' ')
            for td in row.xpath('td')
        ]
        # BUG FIX: str.strip('Sen.') treats its argument as a *set* of
        # characters {'S','e','n','.'} stripped from both ends, which
        # mangles names ending in those letters (e.g. "Anne" -> "Ann").
        # Remove the literal title prefix instead.
        sen_data = re.sub(r'^\s*Sen\.', '', senate).strip().split(',')
        hou_data = re.sub(r'^\s*Rep\.', '', house).strip().split(',')
        # Only add entries that actually have a non-empty second field.
        if len(sen_data) > 1 and sen_data[1].strip() != "":
            committee.add_member(*sen_data)
        if len(hou_data) > 1 and hou_data[1].strip() != "":
            committee.add_member(*hou_data)
    committee.add_source(url)
    self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape KY standing committees for one chamber; other chambers
    are ignored."""
    urls = {
        'upper': "http://www.lrc.ky.gov/org_adm/committe/standing_senate.htm",
        'lower': "http://www.lrc.ky.gov/org_adm/committe/standing_house.htm",
    }
    if chamber not in urls:
        return
    url = urls[chamber]

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for link in page.xpath("//a[contains(@href, 'standing/')]"):
            # Drop the trailing "(H)"/"(S)" chamber marker.
            name = re.sub(r'\s+\((H|S)\)$', '', link.text).strip()
            comm = Committee(chamber, name)
            members_url = link.attrib['href'].replace('home.htm',
                                                      'members.htm')
            self.scrape_members(comm, members_url)
            self.save_committee(comm)