def scrape(self, chamber, term):
    """Scrape Alberta Assembly committees from the membership-list page.

    Committee divs come in (header, spacer, content) groups; section
    headers are filtered out first, then the remaining divs are consumed
    three at a time.
    """
    url = "http://www.assembly.ab.ca/net/index.aspx?p=membership_list"
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    committees = doc.xpath('//div[@id="_ctl0_Panel_committees"]')
    divs = committees[0].xpath("div")[1:]
    # Drop section-header divs; iterate over a copy since we mutate the list.
    for div in divs[:]:
        if "class" in div.attrib and div.attrib["class"] == "committeetype_header":
            divs.remove(div)
    divs = iter(divs)
    while True:
        try:
            name, _, content = itertools.islice(divs, 3)
        # BUG FIX: the original Py2-style `except ValueError, StopIteration:`
        # caught only ValueError and *bound* it to the name StopIteration;
        # use a tuple so both exceptions terminate the loop.
        except (ValueError, StopIteration):
            break
        # Name text carries a 4-char prefix (bullet/numbering) — strip it.
        committee_name = name.text_content()[4:]
        committee = Committee("lower", committee_name)
        for td in content.xpath("table/descendant::td"):
            if td.xpath('a[contains(@href, "number")]'):
                name = td.xpath("a")[0].text_content()
                role = (td.xpath("a")[0].tail or "").strip("() ")
                committee.add_member(name, role or "member")
        xpath = 'table/descendant::td/a[contains(@href, "committees")]/@href'
        committee_url = content.xpath(xpath).pop()
        committee.add_source(url)
        committee.add_source(committee_url)
        self.save_committee(committee)
def scrape_committee(self, name, url, chamber):
    """Build one committee from its page; roles are parsed from a
    trailing ' (Role)' suffix on each member's name."""
    com = Committee(chamber, name)
    com.add_source(url)
    doc = lxml.html.fromstring(self.get(url).text)
    for raw in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        legislator = raw.replace('Representative ', '')
        legislator = legislator.replace('Senator ', '').strip()
        role = 'member'
        if ' (' in legislator:
            legislator, role_text = legislator.split(' (')
            # Check most-specific markers first so "Vice-Chair" doesn't
            # fall through to the plain "Chair" case.
            for marker, slug in (('Vice-Chair', 'vice-chair'),
                                 ('Co-Chair', 'co-chair'),
                                 ('Chair', 'chair')):
                if marker in role_text:
                    role = slug
                    break
            else:
                raise Exception('unknown role: %s' % role_text)
        com.add_member(legislator, role)
    self.save_committee(com)
def select_special_comm(self):
    """Scrape Nebraska select/special committees.

    The committee name is in a bare <h2> (no detail page) or in <h2>/<a>
    when the committee has its own page; member handling is the same.
    """
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)
        boxes = page.xpath(
            '/html/body/div[@id="wrapper"]/div[@id="content"]'
            '/div[@id="content_text"]/div[@class="content_box_container"]'
            '/div[@class="content_box"]')
        for comm_names in boxes:
            name = comm_names.xpath('h2')[0].text
            # FIX: use `is not None` (identity) instead of `!= None`.
            if name is not None:
                chair_role = 'Chairperson'
            else:
                # Linked committees keep their name inside the anchor.
                name = comm_names.xpath('h2/a')[0].text
                # Original used lowercase for this branch; preserved.
                chair_role = 'chairperson'
            committee = Committee('upper', name)
            committee.add_source(main_url)
            for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                senator = senators[0].text
                if 'Chairperson' in senator:
                    role = chair_role
                    # Strip "Sen. " prefix and ", Chairperson" suffix.
                    senator = senator[5:-13]
                else:
                    role = 'member'
                    senator = senator[5:-1]
                committee.add_member(senator, role)
            self.save_committee(committee)
def scrape(self, term, chambers):
    """Scrape DC Council committees from each 'committee-on' page."""
    com_url = 'http://www.dccouncil.washington.dc.us/committees'
    listing = lxml.html.fromstring(self.urlopen(com_url))
    listing.make_links_absolute(com_url)
    committee_urls = set(listing.xpath('//a[contains(@href, "committee-on")]/@href'))
    for url in committee_urls:
        doc = lxml.html.fromstring(self.urlopen(url))
        # The heading is usually an <h1>, occasionally an <h2>.
        try:
            name = doc.xpath('//h1/text()')[0].replace('Committee on ', '')
        except IndexError:
            name = doc.xpath('//h2/text()')[0].replace('Committee on ', '')
        # skip link to Committees page
        if name == 'Committees':
            continue
        com = Committee('upper', name)
        chair_path = '//h3[text()="Committee Chair"]/following-sibling::p'
        for chair in doc.xpath(chair_path):
            com.add_member(chair.text_content(), role='chairperson')
        member_path = '//h3[text()="Councilmembers"]/following-sibling::ul//a'
        for member in doc.xpath(member_path):
            com.add_member(member.text_content(), role='member')
        com.add_source(url)
        self.save_committee(com)
def scrape_joint_committee(self, url):
    """Scrape a joint committee roster, deduplicating (name, title) pairs;
    the committee is saved only if it has members."""
    doc = lxml.html.fromstring(self.urlopen(url))
    heading = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
    comm = Committee('joint', heading[0].strip())
    comm.add_source(url)
    member_links = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                         doc.xpath('//a[contains(@href, "Senators")]'))
    recorded = set()
    for link in member_links:
        context = link.getparent().text_content()
        # A "Title: ..." parent labels the member's role.
        title = context.split(':')[0].strip() if ':' in context else 'member'
        member = link.text.split(' (')[0].strip()
        if (member, title) in recorded:
            continue
        recorded.add((member, title))
        comm.add_member(member, title)
    if comm['members']:
        self.save_committee(comm)
def scrape_senate_committee(self, url):
    """Scrape a senate committee page; a member's role is text that
    trails the member's link (`.tail`)."""
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    name = doc.xpath('//h3/text()')[0]
    name = name.replace(' Committee', '')
    com = Committee(chamber='upper', committee=name)
    for member in doc.xpath('//div[@id="committeeright"]//a'):
        # ROBUSTNESS: anchors without text (e.g. image links) would have
        # crashed on None.strip(); skip them.
        member_name = (member.text or '').strip()
        if not member_name:
            continue
        # don't add clerks
        if member_name == 'Committee Clerk':
            continue
        # skip phone links; href may also be missing entirely
        if (member.get("href") or "").startswith("tel:"):
            continue
        # ROBUSTNESS: .tail is None when no role annotation follows.
        tail = member.tail or ''
        if 'Committee Chair' in tail:
            role = 'chair'
        elif 'Majority Vice' in tail:
            role = 'majority vice chair'
        elif 'Minority Vice' in tail:
            role = 'minority vice chair'
        else:
            role = 'member'
        com.add_member(member_name, role=role)
    com.add_source(url)
    self.save_committee(com)
def scrape_senate_committee(self, term, link):
    """Scrape a Minnesota senate committee roster.

    Rows of the form "Position: Name" switch the role, which then sticks
    for subsequent plain-name rows (that is the page's layout convention).
    """
    with self.urlopen(link) as html:
        doc = lxml.html.fromstring(html)
        # strip first 30 and last 10:
        # "Minnesota Senate Committees - __________ Committee"
        committee_name = doc.xpath('//title/text()')[0][30:-10]
        com = Committee('upper', committee_name)
        # BUG FIX: `role` was unbound (NameError) when the very first row
        # had no "Position:" prefix; default to 'member' until switched.
        role = 'member'
        # first id=bio table is members
        for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
            row = fix_whitespace(row.text_content())
            # switch role
            if ':' in row:
                position, name = row.split(': ')
                role = position.lower().strip()
            else:
                name = row
            # add the member
            com.add_member(name.strip(), role)
        com.add_source(link)
        self.save_committee(com)
def standing_comm(self):
    """Scrape Nebraska standing committees from each committee detail page."""
    main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)
        links_xpath = ('/html/body/div[@id="wrapper"]/div[@id="content"]'
                       '/div[@id="content_text"]/div[@class="content_box_container"]'
                       '/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a')
        for comm_links in page.xpath(links_xpath):
            detail_link = comm_links.attrib['href']
            with self.urlopen(detail_link) as detail_page:
                detail_page = lxml.html.fromstring(detail_page)
                name_xpath = ('/html/body[@class="home blog"]/div[@id="page"]'
                              '/div[@id="content"]/div[@class="content_header"]'
                              '/div[@class="content_header_right"]/a')
                name = detail_page.xpath(name_xpath)[0].text
                # IDIOM FIX: drop the trailing word ("Committee") and rejoin
                # with str.join instead of a manual concatenation loop.
                comm_name = ' '.join(name.split()[:-1])
                committee = Committee('upper', comm_name)
                members_xpath = ('/html/body[@class="home blog"]/div[@id="page"]'
                                 '/div[@id="sidebar"]/ul[1]/li[1]/ul/li/a')
                for senators in detail_page.xpath(members_xpath):
                    senator = senators.text
                    if 'Chairperson' in senator:
                        role = 'Chairperson'
                        # Strip "Sen. " prefix and ", Chairperson" suffix.
                        senator = senator[6:-13]
                    else:
                        role = 'member'
                        senator = senator[6:-1]
                    committee.add_member(senator, role)
                committee.add_source(main_url)
                committee.add_source(detail_link)
                self.save_committee(committee)
def scrape_committee(self, chamber, term, name, url):
    """Scrape one committee: a 'Members:' text run plus Chair/Vice Chair."""
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    mlist = page.xpath("//strong[contains(., 'Members:')]")[0].tail
    mlist = re.sub(r'\s+', ' ', mlist)
    committee = Committee(chamber, name)
    committee.add_source(url)
    # split on periods not preceded by capital letters (keeps initials intact)
    for member in re.split('(?<![A-Z])[.,] ', mlist):
        member = re.sub(r'R\.M\.(M\.)?$', '', member.strip()).strip()
        if member:
            committee.add_member(member)
    # NOTE(review): contains(., 'Chair:') also matches "Vice Chair:"; this
    # relies on document order putting the chair first — confirm on the page.
    chair = page.xpath("//strong[contains(., 'Chair:')]")[0]
    # ROBUSTNESS: .tail is None when nothing follows the <strong>.
    chair_name = (chair.tail or '').strip()
    if chair_name:
        committee.add_member(chair_name, 'chair')
    vc = page.xpath("//strong[contains(., 'Vice Chair:')]")[0]
    vc_name = (vc.tail or '').strip()
    if vc_name:
        committee.add_member(vc_name, 'vice chair')
    self.save_committee(committee)
def scrape_lower_committee(self, name, url):
    """Scrape a lower-chamber committee: officers in the 'directiva' div,
    remaining members as tails of bullet <img> tags."""
    com = Committee('lower', name)
    com.add_source(url)
    doc = self.lxmlize(url)
    contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')
    # all members are tails of images (they use img tags for bullets)
    # first three members are in the directiva div
    chair = directiva.xpath('b[text()="Presidente:"]/following-sibling::img[1]')
    vchair = directiva.xpath('b[text()="Vice Presidente:"]/following-sibling::img[1]')
    sec = directiva.xpath('b[text()="Secretario(a):"]/following-sibling::img[1]')
    member = 0
    if chair and chair[0].tail is not None:
        com.add_member(clean_spaces(chair[0].tail), 'chairman')
        member += 1
    if vchair and vchair[0].tail is not None:
        com.add_member(clean_spaces(vchair[0].tail), 'vice chairman')
        member += 1
    # BUG FIX: the original tested `if sec and sec is not None:` — the
    # second clause is a tautology; check the element's tail like the
    # chair/vice-chair branches so clean_spaces never receives None.
    if sec and sec[0].tail is not None:
        com.add_member(clean_spaces(sec[0].tail), 'secretary')
        member += 1
    for img in reps.xpath('.//img'):
        member_name = clean_spaces(img.tail)
        if member_name is not None:
            com.add_member(member_name)
            member += 1
    # Only save committees that actually have members.
    if member > 0:
        self.save_committee(com)
def _scrape_upper_committee(self, name, url2): cat = "Assignments.asp" url3 = "".join((url2, cat)) committee = Committee('upper', name) committee.add_source(url2) page = self.lxmlize(url3) members = page.xpath('//table[@id="table38"]//font/a/b') for link in members: role = "member" if link == members[0]: role = "Chairman" if link == members[1]: role = "Vice-Chairman" name = link.xpath('string()') name = name.replace('Senator ', '') name = re.sub('[\s]{2,}', ' ', name).strip() committee.add_member(name, role) self.save_committee(committee)
def scrape_house_committee(self, committee_name, link): """Scrape individual committee page and add members""" html = self.urlopen(link) doc = lxml.html.fromstring(html) subcommittee = False for h1 in doc.xpath('//h1/text()'): if 'subcommittee' in h1.lower(): subcommittee = True subcomm_name = ('Subcommittee' if subcommittee else None) if subcommittee: committee_name = committee_name.replace(' Subcommittee', '') com = Committee('lower', committee_name, subcomm_name) find_expr = "//div[@class='col1']/ul[position()<3]/li/a" for a in doc.xpath(find_expr): name = a.text role = (a.tail or '').strip(', ') or 'member' if name: com.add_member(name, role) com.add_source(link) if com['members']: self.save_committee(com)
def scrape_lower_committee(self, name, parent, url): page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) if "Joint" in name or (parent and "Joint" in parent): chamber = "joint" else: chamber = "lower" if parent: comm = Committee(chamber, parent, subcommittee=name) else: comm = Committee(chamber, name) comm.add_source(url) for link in page.xpath("//a[contains(@href, 'District')]"): member = link.xpath("string()").strip() member = re.sub(r"\s+", " ", member) if not member: continue match = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", member) member = match.group(4).strip() role = match.group(1) or "member" comm.add_member(member, role.lower()) self.save_committee(comm)
def scrape_upper_committee(self,url): filename, resp = self.urlretrieve(url) root = lxml.etree.fromstring( convert_pdf(filename,'xml')) for link in root.xpath('/pdf2xml/page'): comm = None for line in link.findall('text'): text = line.findtext('b') if text is not None and text.startswith('Comisi'): comm = Committee('upper',text); comm.add_source(url) else: if line.text and line.text.startswith('Hon.'): line_text = line.text.replace(u'–','-') name_split = line_text.split(u'-',1) title = 'member' # print name_split if len(name_split) >= 2: name_split[1] = name_split[1].strip() if name_split[1] == 'Presidenta' or name_split[1] == 'Presidente': title = 'chairman' elif name_split[1] == 'Vicepresidente' or name_split[1] == 'Vicepresidenta': title = 'vicechairman' elif name_split[1] == 'Secretaria' or name_split[1] == 'Secretario': title = 'secretary' # if title != 'member': # print name_split[0] if name_split[0] != 'VACANTE': comm.add_member(name_split[0].replace('Hon.',''),title) self.save_committee(comm) os.remove(filename);
def scrape_upper(self): url = "http://senadopr.us/Lists/Listado%20de%20Comisiones/Comisiones%20del%20Senado.aspx" with self.urlopen(url) as html: doc = lxml.html.fromstring(html) doc.make_links_absolute(url) table = doc.xpath( '//table[@id="{C05AFE0D-D977-4033-8D7B-C43ABF948A4A}-{3E52C91B-AFC8-4493-967A-C8A47AC4E7B6}"]' ) for link in table[0].iterchildren("tr"): td_column = list(link) name = td_column[0].find("a") if name is not None: com_source = name.get("href") # if committee does not have a url use the default. if com_source == "http://senadopr.us/": com_source = url com_name = name.text # check the committee name to see if it's a join one. if td_column[1].text == "Comisi\xf3n Conjunta": chamber = "joint" else: chamber = "upper" com = Committee(chamber, com_name) com.add_source(com_source) com.add_member(clean_spaces(td_column[2].find("a").text), "chairman") self.save_committee(com)
def scrape_comm(self, chamber, term_name):
    """Scrape Mississippi committee membership from the per-chamber XML feed."""
    url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
    comm_page = self.urlopen(url)
    root = lxml.etree.fromstring(comm_page.bytes)
    chamber = 'lower' if chamber == 'h' else 'upper'
    for mr in root.xpath('//COMMITTEE'):
        comm = Committee(chamber, mr.xpath('string(NAME)'))
        # Chair and vice-chair carry their titles inline; strip them.
        chair = mr.xpath('string(CHAIR)').replace(", Chairman", "")
        if chair:
            comm.add_member(chair, role="Chairman")
        vice_chair = mr.xpath('string(VICE_CHAIR)').replace(", Vice-Chairman", "")
        if vice_chair:
            comm.add_member(vice_chair, role="Vice-Chairman")
        members = mr.xpath('string(MEMBERS)').split(";")
        if "" in members:
            members.remove("")
        for leg in members:
            comm.add_member(leg.strip())
        comm.add_source(url)
        self.save_committee(comm)
def scrape_lower_committee(self, name, parent, url): page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) if 'Joint' in name or (parent and 'Joint' in parent): chamber = 'joint' else: chamber = 'lower' if parent: comm = Committee(chamber, parent, subcommittee=name) else: comm = Committee(chamber, name) comm.add_source(url) xpath = "//a[contains(@href, 'District')]" for link in page.xpath(xpath): member = link.xpath('string()').strip() member = re.sub(r'\s+', ' ', member) if not member: continue match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member) member = match.group(4).strip() role = match.group(1) or 'member' comm.add_member(member, role.lower()) self.save_committee(comm)
def scrape_comm(self, url, chamber): data = self.post(url).json()['Data'] for item in data: comm_name = item['CommitteeName'] committee = Committee(chamber, comm_name) chair_man = str(item['ChairName']) vice_chair = str(item['ViceChairName']) comm_id = item['CommitteeId'] comm_url = self.get_comm_url(chamber, comm_id, comm_name) members = self.scrape_member_info(comm_url) if vice_chair != 'None': committee.add_member(vice_chair, 'Vice-Chair') if chair_man != 'None': committee.add_member(chair_man, 'Chairman') for member in members: # vice_chair and chair_man already added. if chair_man not in member and vice_chair not in member: member = " ".join(member.split()) if member: committee.add_member(member) committee.add_source(comm_url) committee.add_source(url) self.save_committee(committee)
def scrape_reps_comm(self): url = 'http://www.maine.gov/legis/house/hsecoms.htm' with self.urlopen(url) as page: root = lxml.html.fromstring(page) count = 0 for n in range(1, 12, 2): path = 'string(//body/center[%s]/h1/a)' % (n) comm_name = root.xpath(path) committee = Committee('lower', comm_name) count = count + 1 path2 = '/html/body/ul[%s]/li/a' % (count) for el in root.xpath(path2): rep = el.text if rep.find('(') != -1: mark = rep.find('(') rep = rep[15: mark] committee.add_member(rep) committee.add_source(url) self.save_committee(committee)
def scrape(self, chamber, term): for t in self.metadata['terms']: if t['name'] == term: session = t['sessions'][-1] sessionsuffix = 'th' if str(session)[-1] == '1': sessionsuffix = 'st' elif str(session)[-1] == '2': sessionsuffix = 'nd' elif str(session)[-1] == '3': sessionsuffix = 'rd' insert = str(session) + sessionsuffix + str(term[0:4]) chamber_letter = {'lower':'A', 'upper':'S'}[chamber] url = 'http://www.leg.state.nv.us/Session/%s/Committees/%s_Committees/' % ( insert, chamber_letter) page = self.urlopen(url) root = lxml.html.fromstring(page) for com_a in root.xpath('//strong/a'): com_url = url + com_a.get('href') if com_a.text == 'Committee of the Whole': continue com = Committee(chamber, com_a.text) com.add_source(com_url) self.scrape_comm_members(chamber, com, com_url) self.save_committee(com)
def scrape_approp_subcommittees(self, url): html = self.urlopen(url) doc = lxml.html.fromstring(html) for strong in doc.xpath('//strong'): com = Committee(chamber='upper', committee='Appropriations', subcommittee=strong.text.strip()) com.add_source(url) legislators = strong.getnext().tail.replace('Senators', '').strip() for leg in re.split(', | and ', legislators): if leg.endswith('(C)'): role = 'chairman' leg = leg[:-4] elif leg.endswith('(VC)'): role = 'vice chairman' leg = leg[:-5] elif leg.endswith('(MVC)'): role = 'minority vice chairman' leg = leg[:-6] else: role = 'member' com.add_member(leg, role=role) self.save_committee(com)
def scrape_committee(self, term, chambers, href, name): page = self.get(href).text page = lxml.html.fromstring(page) page.make_links_absolute(href) members = page.xpath("//div[@class='view-content']" "//a[contains(@href, 'members')]") if "/joint/" in href: chamber = "joint" elif "/senate/" in href: chamber = "upper" elif "/house/" in href: chamber = "lower" else: print "XXX: Fail! %s" % (href) return cttie = Committee(chamber, name) for a in members: member = a.text role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0] role = {"Legislative Members": "member", "Chairman": "chair", "Vice Chairman": "member"}[role] if member is None or member.startswith("District"): continue cttie.add_member(member, role=role) cttie.add_source(href) self.save_committee(cttie)
def scrape_assembly(self): """Scrape Assembly Committees""" assembly_committees_url = "http://assembly.state.ny.us/comm/" with self.urlopen(assembly_committees_url) as html: doc = lxml.html.fromstring(html) standing_committees, subcommittees, legislative_commissions, task_forces = doc.cssselect('#sitelinks ul') committee_paths = set([l.get('href') for l in standing_committees.cssselect("li a[href]") if l.get("href").startswith('?sec=mem')]) for committee_path in committee_paths: committee_url = assembly_committees_url+committee_path with self.urlopen(committee_url) as chtml: cdoc = lxml.html.fromstring(chtml) for h in cdoc.cssselect("#content .pagehdg"): if h.text: committee_name = h.text.split('Committee Members')[0].strip() break committee = Committee("lower", committee_name) committee.add_source(committee_url) members = cdoc.cssselect("#sitelinks")[0] first = 1 for member in members.iter('span'): member = member.xpath('li/a')[0].text if first == 1: committee.add_member(member, 'chair') first = 0 else: committee.add_member(member) self.save_committee(committee)
def scrape_senate_committee(self, url): html = self.urlopen(url) doc = lxml.html.fromstring(html) name = doc.xpath('//h6/text()')[0] com = Committee(chamber='upper', committee=name) for member in doc.xpath('//div[@id="committeelist"]//a'): member_name = member.text.strip() # don't add clerks if member_name == 'Committee Clerk': continue if 'Committee Chair' in member.tail: role = 'chair' elif 'Majority Vice' in member.tail: role = 'majority vice chair' elif 'Minority Vice' in member.tail: role = 'minority vice chair' else: role = 'member' com.add_member(member_name, role=role) com.add_source(url) self.save_committee(com)
def standing_comm(self): main_url = "http://www.nebraskalegislature.gov/committees/standing-committees.php" with self.urlopen(main_url) as page: page = lxml.html.fromstring(page) for comm_links in page.xpath( '//div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a' ): detail_link = comm_links.attrib["href"] with self.urlopen(detail_link) as detail_page: detail_page = lxml.html.fromstring(detail_page) name = detail_page.xpath( '//div[@id="content"]/div[@class="content_header"]/div[@class="content_header_right"]/a' )[0].text name = name.split() name = name[0:-1] comm_name = "" for x in range(len(name)): comm_name += name[x] + " " comm_name = comm_name[0:-1] committee = Committee("upper", comm_name) for senators in detail_page.xpath('//div[@id="sidebar"]/ul[1]/li[1]/ul/li/a'): senator = senators.text if "Chairperson" in senator: role = "Chairperson" senator = senator[6:-13].strip() else: role = "member" senator = senator[6:].strip() committee.add_member(senator, role) committee.add_source(main_url) committee.add_source(detail_link) self.save_committee(committee)
def scrape_upper_committee(self, name, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) comm = Committee('upper', name) comm.add_source(url) member_div = page.xpath("//div[@class = 'committee-members']")[0] seen = set() for link in member_div.xpath(".//a"): if not link.text: continue member = link.text.strip() next_elem = link.getnext() if (next_elem is not None and next_elem.tag == 'a' and next_elem.attrib['href'] == link.attrib['href']): # Sometimes NY is cool and splits names across a # couple links member = "%s %s" % (member, next_elem.text.strip()) member = re.sub(r'\s+', ' ', member) if member in seen or not member: continue seen.add(member) name, role = parse_name(member) comm.add_member(name, role) self.save_committee(comm)
def scrape_committee(self, chamber, url): html = self.urlopen(url) doc = lxml.html.fromstring(html) name = doc.xpath('//span[@class="committeeShortName"]/text()') if len(name) == 0: self.warning("Had to skip this malformed page.") return # Because of http://www.malegislature.gov/Committees/Senate/S29 this # XXX: hack had to be pushed in. Remove me ASAP. This just skips # malformed pages. name = name[0] com = Committee(chamber, name) com.add_source(url) # get both titles and names, order is consistent titles = doc.xpath('//p[@class="rankingMemberTitle"]/text()') names = doc.xpath('//p[@class="rankingMemberName"]/a/text()') for title, name in zip(titles, names): com.add_member(name, title) for member in doc.xpath('//div[@class="committeeRegularMembers"]//a/text()'): com.add_member(member) if com['members']: self.save_committee(com)
def scrape_lower_committee(self, name, url): com = Committee("lower", name) com.add_source(url) with self.urlopen(url) as html: doc = lxml.html.fromstring(html) contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]') # all members are tails of images (they use img tags for bullets) # first three members are in the directiva div # pres, vpres, secretary, _ = directiva.xpath('.//img') chair = directiva.xpath('b[text()="Presidente:"]/following-sibling::img[1]') vchair = directiva.xpath('b[text()="Vice Presidente:"]/following-sibling::img[1]') sec = directiva.xpath('b[text()="Secretario(a):"]/following-sibling::img[1]') member = 0 if chair: com.add_member(clean_spaces(chair[0].tail), "chairman") ++member if vchair: com.add_member(clean_spaces(vchair[0].tail), "vice chairman") ++member if sec: com.add_member(clean_spaces(sec[0].tail), "secretary") ++member for img in reps.xpath(".//img"): com.add_member(clean_spaces(img.tail)) ++member if member > 0: self.save_committee(com)
def _scrape_lower_special_committees(self): url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx' page = self.lxmlize(url) committee_list = page.xpath('//table[@id="table106"]//div[@class=' '"exBody1A"]/div[@class="accordion"]')[0] headers = committee_list.xpath('./h3') for header in headers: committee_name_text = header.xpath('string()') committee_name = committee_name_text.strip() committee_name = self._normalize_committee_name(committee_name) chamber = 'joint' if committee_name.startswith('Joint') else 'lower' committee = Committee(chamber, committee_name) committee.add_source(url) committee_memberlist = header.xpath('./following-sibling::div[' '@class="pane"]//tr[@class="linkStyle2"]') for row in committee_memberlist: member_name = row.xpath('normalize-space(string(./td[1]))') member_name = ' '.join(filter(None, name_tools.split(member_name))) member_role = row.xpath('normalize-space(string(./td[2]))') member_role = self._normalize_member_role(member_role) committee.add_member(member_name, member_role) self.save_committee(committee)
def select_special_comm(self): main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php" with self.urlopen(main_url) as page: page = lxml.html.fromstring(page) for comm_names in page.xpath('//div[@class="content_box"]'): name = comm_names.xpath("h2")[0].text if name != None: committee = Committee("upper", name) committee.add_source(main_url) for senators in comm_names.xpath('ul[@class="nobullet"]/li'): senator = senators[0].text if "Chairperson" in senator: role = "Chairperson" senator = senator[5:-13].strip() else: role = "member" senator = senator[5:].strip() committee.add_member(senator, role) self.save_committee(committee) else: name = comm_names.xpath("h2/a")[0].text committee = Committee("upper", name) committee.add_source(main_url) for senators in comm_names.xpath('ul[@class="nobullet"]/li'): senator = senators[0].text if "Chairperson" in senator: role = "chairperson" senator = senator[5:-13].strip() else: role = "member" senator = senator[5:].strip() committee.add_member(senator, role) self.save_committee(committee)
def scrape_joint_committee(self, url): html = self.urlopen(url) doc = lxml.html.fromstring(html) name = doc.xpath('//h1/text()') or doc.xpath('//h2/text()') name = name[0] comm = Committee('joint', name) comm.add_source(url) members = chain(doc.xpath('//a[contains(@href, "MemberId")]'), doc.xpath('//a[contains(@href, "Senators")]')) for a in members: parent_content = a.getparent().text_content() if ':' in parent_content: title = parent_content.split(':')[0].strip() else: title = 'member' comm.add_member(a.text.split(' (')[0].strip(), title) self.save_committee(comm)
def scrape(self, chamber, term): urls = { 'upper': 'http://legis.delaware.gov/LIS/LIS%s.nsf/SCommittees', 'lower': 'http://legis.delaware.gov/LIS/LIS%s.nsf/HCommittees' } # Mapping of term names to session numbers (see metatdata). term2session = {"2011-2012": "146"} session = term2session[term] url = urls[chamber] % (session, ) self.log(url) page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) committees = {} for row in page.xpath('//td[@width="96%"]/table/tr[@valign="top"]'): link = row.xpath('td/font/a[contains(@href, "opendocument")]')[0] committees[link.text] = link.attrib['href'] self.log(link.attrib['href']) for c in committees: url = committees[c] page = lxml.html.fromstring(self.urlopen(url)) page.make_links_absolute(url) committee = Committee(chamber, c) committee.add_source(url) for tr in page.xpath('//td[@width="96%"]/table/tr'): role_section = tr.xpath('td/b/font') if (len(role_section) > 0): role = re.sub(r's?:$', '', role_section[0].text).lower() for member in tr.xpath('td/font/a'): committee.add_member(member.text, role) self.save_committee(committee)
def scrape_house_committees(self, term):
    """Scrape MN House committees and their member rosters."""
    url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'
    doc = lxml.html.fromstring(self.urlopen(url))
    for heading in doc.xpath('//h2[@class="commhighlight"]'):
        members_url = heading.xpath(
            'following-sibling::p[1]/a[text()="Members"]/@href')[0]
        com = Committee('lower', heading.text)
        com.add_source(members_url)
        mdoc = lxml.html.fromstring(self.urlopen(members_url))
        # each legislator in their own table
        # first row, second column contains all the info
        for ltable in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):
            # name is the element's full text minus its leading .text part
            name = ltable.text_content()
            leading = ltable.text
            if leading and name != leading:
                name = name.replace(leading, '')
            # role is inside a nested b tag
            role = ltable.xpath('b/*/text()')
            if role:
                # if there was a role, remove it from name
                role = role[0]
                name = name.replace(role, '')
            else:
                role = 'member'
            name = name.split(' (')[0]
            com.add_member(name, role)
        # save
        self.save_committee(com)
def scrape_committee(self, chamber, name, url): with self.urlopen(url) as page: page = lxml.html.fromstring(page) if page.xpath("//h3[. = 'Joint Committee']"): chamber = 'joint' comm = Committee(chamber, name) comm.add_source(url) for link in page.xpath("//a[contains(@href, 'member=')]"): member = link.text.strip() mtype = link.xpath("string(../preceding-sibling::td[1])") mtype = mtype.strip(": \r\n\t").lower() comm.add_member(member, mtype) if not comm['members']: self.warning('not saving %s, appears to be empty' % name) else: self.save_committee(comm)
def get_jfac(self, name, url): """gets membership info for the Joint Finance and Appropriations Committee.""" with self.urlopen(url) as jfac_page: html = lxml.html.fromstring(jfac_page) table = html.xpath('body/table/tr/td[2]/table')[0] committee = Committee('joint', name) for row in table.xpath('tr')[1:]: senate, house = row.xpath('td/strong') senate = senate.text.replace(u'\xa0', ' ') house = house.text.replace(u'\xa0', ' ') if ',' in senate: committee.add_member(*senate.split(','), chamber='upper') else: committee.add_member(senate, chamber='upper') if ',' in house: committee.add_member(*house.split(','), chamber='lower') else: committee.add_member(house, chamber='lower') committee.add_source(url) self.save_committee(committee)
def _scrape_committee(self, committee_name, link, chamber): """Scrape individual committee page and add members""" page = self.get(link).text page = lxml.html.fromstring(page) page.make_links_absolute(link) is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]')) if is_subcommittee: com = Committee(chamber, re.sub(r'\s*Subcommittee\s*', '', committee_name), committee_name) else: com = Committee(chamber, committee_name) OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \ 'following-sibling::div/ul/li/a' MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \ 'following-sibling::div/ul/li/a' HOUSE_SEARCH = '//h2[contains(text(), "House Members")]/' \ 'following-sibling::div/ul/li/a' SENATE_SEARCH = '//h2[contains(text(), "House Members")]/' \ 'following-sibling::div/ul/li/a' for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)): member_name = ' '.join([ x.strip() for x in a.xpath('text()') + a.xpath('span/text()') if x.strip() ]) role = a.xpath('small') if role: role = role[0].xpath('text()')[0].strip() else: role = 'member' com.add_member(member_name, role) com.add_source(link) self.save_committee(com)
def scrapeProcedural(self, chamber, page, url): comm_count = 1 for comm_names in page.xpath('//div[@class="content"][1]/p/a'): name = re.sub('[^A-Za-z0-9]+', ' ', comm_names.text).replace(' ', '') comm = Committee(chamber, name) members_path = '//div[@class="content"][1]/table[@class="p"][%s]//tr/td[2]/a' % ( str(comm_count)) for members in comm_names.xpath(members_path): member = members.text member = re.sub('[^A-Za-z0-9]+', ' ', member) role = members.tail if (role != None) and ('Chairman' in role): role = 'Chairman' else: role = 'Member' comm.add_member(member, role) comm.add_source(url) self.save_committee(comm) comm_count += 1
def scrape_senate_comm(self): url = ('http://legislature.maine.gov/committee-information/' 'standing-committees-of-the-senate') html = self.get(url).text doc = lxml.html.fromstring(html) headings = doc.xpath('//p/strong') for heading in headings: committee = Committee('upper', heading.text.strip(':')) committee.add_source(url) par = heading.getparent().getnext() while True: link = par.xpath('a') if len(link) == 0: break res = self.senate_committee_pattern.search(link[0].text) name, chair = res.groups() committee.add_member( name, 'chair' if chair is not None else 'member') par = par.getnext() self.save_committee(committee)
def scrape_senate_committee(self, name, url): url = url.replace('Default.asp', 'Assignments.asp') committee = Committee('upper', name) committee.add_source(url) text = self.urlopen(url) page = lxml.html.fromstring(text) links = page.xpath('//table[@bordercolor="#EBEAEC"]/tr/td/font/a') for link in links: role = "member" if link.tail: role = link.tail.strip().strip("() ") name = link.xpath('string()') name = name.replace('Senator ', '').strip() committee.add_member(name, role) self.save_committee(committee)
def scrape(self, term, chambers):
    """Scrape Utah interim committees for ``term``.

    A leading "House" or "Senate" in the committee name determines the
    chamber; anything else is treated as a joint committee.
    """
    self.validate_term(term, latest_only=True)
    url = "http://le.utah.gov/asp/interim/Main.asp?ComType=All&Year=2014&List=2#Results"
    page = self.lxmlize(url)

    for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
        comm_name = comm_link.text.strip()

        # BUG FIX: this was previously two independent `if` statements,
        # so a "House" committee fell through to the second statement's
        # `else` branch and was misclassified as "joint".
        if "House" in comm_name:
            chamber = "lower"
        elif "Senate" in comm_name:
            chamber = "upper"
        else:
            chamber = "joint"

        # Drop leading "House" or "Senate" from name
        comm_name = re.sub(r"^(House|Senate) ", "", comm_name)
        comm = Committee(chamber, comm_name)

        committee_page = self.lxmlize(comm_link.attrib['href'])
        for mbr_link in committee_page.xpath(
                "//table[@class='memberstable']//a"):
            name = mbr_link.text.strip()
            # Role, if any, trails the member link (e.g. ", Chair").
            role = mbr_link.tail.strip().strip(",").strip()
            comm.add_member(name, role if role else "member")

        comm.add_source(url)
        comm.add_source(comm_link.get('href'))
        self.save_committee(comm)
def select_special_comm(self):
    """Scrape Nebraska's select/special committees.

    Each committee sits in a "content_box" div; the heading is either a
    plain <h2> or an <h2><a>, and the two cases also use different
    capitalization for the chair role ('Chairperson' vs 'chairperson').
    Member strings are trimmed by fixed offsets: [5:] drops a leading
    prefix (presumably "Sen. " -- confirm) and [-13] drops a trailing
    ", Chairperson" suffix.
    """
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)
        for comm_names in page.xpath(
            '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"]'
        ):
            name = comm_names.xpath('h2')[0].text
            if name != None:
                # Plain-text heading variant.
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath(
                        'ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'Chairperson'
                        # Strip leading prefix and trailing chair suffix.
                        senator = senator[5:-13].strip()
                    else:
                        role = 'member'
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
            else:
                # Linked heading variant; note lowercase chair role here.
                name = comm_names.xpath('h2/a')[0].text
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath(
                        'ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'chairperson'
                        senator = senator[5:-13].strip()
                    else:
                        role = 'member'
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
def scrape_assembly(self):
    """Scrape NY Assembly standing committees and their membership.

    Only standing-committee links (the '?sec=mem' ones) are followed;
    subcommittees, commissions and task forces are unpacked from the
    sitelinks lists but intentionally not scraped here.
    """
    assembly_committees_url = "http://assembly.state.ny.us/comm/"
    with self.urlopen(assembly_committees_url) as html:
        doc = lxml.html.fromstring(html)
        # Four <ul> groups under #sitelinks, in page order.
        standing_committees, subcommittees, legislative_commissions, task_forces = doc.cssselect(
            '#sitelinks ul')
        committee_paths = set([
            l.get('href')
            for l in standing_committees.cssselect("li a[href]")
            if l.get("href").startswith('?sec=mem')
        ])
        for committee_path in committee_paths:
            committee_url = assembly_committees_url + committee_path
            with self.urlopen(committee_url) as chtml:
                cdoc = lxml.html.fromstring(chtml)
                # First non-empty page heading gives the committee name.
                # NOTE(review): if no heading has text, committee_name
                # carries over from the previous page -- verify.
                for h in cdoc.cssselect("#content .pagehdg"):
                    if h.text:
                        committee_name = h.text.split(
                            'Committee Members')[0].strip()
                        break
                committee = Committee("lower", committee_name)
                committee.add_source(committee_url)
                members = cdoc.cssselect("#sitelinks")[0]
                # The first listed member is the chair by page convention.
                first = 1
                for member in members.iter('span'):
                    member = member.xpath('li/a')[0].text
                    if first == 1:
                        committee.add_member(member, 'chair')
                        first = 0
                    else:
                        committee.add_member(member)
                self.save_committee(committee)
def scrape_lower_committee(self, name, url):
    """Build a lower-chamber Committee from its roster page.

    Returns the populated Committee; the caller is responsible for
    saving it.
    """
    page = self.lxmlize(url)

    committee = Committee('lower', name)
    committee.add_source(url)

    # Member names already added, used to skip duplicate links.
    seen = set()

    member_links = self.get_nodes(
        page,
        '//div[@class="commlinks"]//a[contains(@href, "mem")]')

    for member_link in member_links:
        member_name = None
        member_role = None

        member_text = member_link.text
        if member_text is not None:
            member = member_text.strip()
            member = re.sub(r'\s+', ' ', member)
            member_name, member_role = self._parse_name(member)

        if member_name is None:
            continue

        # Figure out if this person is the chair.
        role_type = self.get_node(
            member_link,
            '../../preceding-sibling::div[1]/text()')
        if role_type in (['Chair'], ['Co-Chair']):
            member_role = 'chair'
        else:
            member_role = 'member'

        # BUG FIX: the duplicate check previously tested the committee
        # name (`name`) against `seen`, which is always absent, so
        # repeated member links were all added. Test the member name.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
def scrape_chamber(self, url, orig_chamber):
    """Scrape committees listed at ``url`` for the given chamber.

    Joint committees are cached in ``self.joint_coms`` keyed by name so
    that both chambers' passes accumulate members into one Committee
    object (and that committee is saved once per encounter).
    """
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    for a in doc.xpath('//a[contains(@href, "committee.aspx")]'):
        com_name = a.text
        com_url = a.get('href')
        com_html = self.urlopen(com_url)
        com_data = lxml.html.fromstring(com_html)
        # "Joint" in the name overrides the caller-supplied chamber.
        if 'Joint' in com_name:
            chamber = 'joint'
        else:
            chamber = orig_chamber
        if chamber == 'joint':
            # Reuse (or create) the shared joint-committee object.
            if com_name not in self.joint_coms:
                self.joint_coms[com_name] = Committee(chamber, com_name)
            com = self.joint_coms.get(com_name)
            self.joint_coms[com_name] = com
        else:
            com = Committee(chamber, com_name)
        for a in com_data.xpath('//a[contains(@href, "Member=")]'):
            member = a.text
            # Role text lives in a sibling <span>, e.g. "Chair".
            role = a.xpath('../following-sibling::span/text()')
            if role:
                role = role[0].lower().replace(u'\xa0', ' ')
                # skip former members
                if 'until' in role:
                    continue
            else:
                role = 'member'
            com.add_member(member, role)
        com.add_source(com_url)
        self.save_committee(com)
def scrape_house_committees(self):
    """Scrape Michigan House committees via the committee drop-down."""
    base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
    html = self.urlopen('http://house.mi.gov/mhrpublic/committee.aspx')
    doc = lxml.html.fromstring(html)

    # get values out of drop down
    for opt in doc.xpath('//option'):
        name = opt.text
        # skip invalid choice
        if opt.text in ('Statutory Committees', 'Select One'):
            continue
        if 'have not been created' in opt.text:
            self.warning('no committees yet for the house')
            return
        com_url = base_url + opt.get('value')
        com_html = self.urlopen(com_url)
        cdoc = lxml.html.fromstring(com_html)
        com = Committee(chamber='lower', committee=name)
        com.add_source(com_url)

        # BUG FIX: a stray loop over the listing page's "memberLink"
        # anchors used to run here; it only clobbered a local variable
        # (and could crash on text-less anchors), so it was removed.

        # all links to http:// pages in servicecolumn2 are legislators
        for a in cdoc.xpath(
                '//div[@class="servicecolumn2"]//a[starts-with(@href, "http")]'):
            member = a.text.strip()
            # The sibling <span> carries the member's role text.
            text = a.xpath('following-sibling::span/text()')[0]
            if 'Committee Chair' in text:
                role = 'chair'
            elif 'Vice-Chair' in text:
                role = 'vice chair'
            else:
                role = 'member'
            com.add_member(member, role=role)

        self.save_committee(com)
def scrape_lower_committee(self, name, parent, url):
    """Scrape a House (sub)committee roster page and save it.

    When ``parent`` is given, the Committee is created for the parent
    with ``name`` as its subcommittee; "Joint" in either name forces the
    joint chamber.
    """
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    if 'Joint' in name or (parent and 'Joint' in parent):
        chamber = 'joint'
    else:
        chamber = 'lower'

    if parent:
        comm = Committee(chamber, parent, subcommittee=name)
    else:
        comm = Committee(chamber, name)
    comm.add_source(url)

    # Member links all point at per-district pages.
    xpath = "//a[contains(@href, 'District')]"
    for link in page.xpath(xpath):
        member = link.xpath('string()').strip()
        member = re.sub(r'\s+', ' ', member)
        if not member or member == 'House District Maps':
            continue
        # Optional "(Co-)(Vice )Chair" prefix, then "Rep. <name>".
        # NOTE(review): a non-matching string would raise AttributeError
        # on .group() -- relies on the page format holding.
        match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member)
        member = match.group(4).strip()
        role = match.group(1) or 'member'
        comm.add_member(member, role.lower())

    if not comm['members']:
        if comm['subcommittee'] == 'test':
            # Whoopsie, prod data.
            return
        raise Exception('no members for %s (%s)' %
                        (comm['committee'], comm['subcommittee']))

    self.save_committee(comm)
def scrape_senate_committees(self, term_name, chamber):
    """Scrape Senate standing committees for each year of a term.

    ``term_name`` looks like "2013-2014"; the two-digit year suffixes
    index the per-year committee listing pages. Future years are
    skipped.
    """
    years = [t[2:] for t in term_name.split('-')]

    for year in years:
        # Skip years that have not started yet (compare 2-digit years).
        if int(year) > int(str(dt.datetime.now().year)[2:]):
            self.log("Not running session %s, it's in the future." % (
                term_name))
            continue
        url = '{base}{year}info/com-standing.htm'.format(
            base=self.senate_url_base, year=year)
        page_string = self.urlopen(url)
        page = lxml.html.fromstring(page_string)
        ps = page.xpath('id("mainContent")/table/*[3]/p')
        for p in ps:
            # Each committee paragraph leads with its link.
            links = p.xpath('a[1]')
            if not links:
                continue
            a = links[0]
            committee_name = a.text_content().strip()
            committee_url = a.attrib.get('href')
            committee = Committee(chamber, committee_name)
            committee_page_string = self.urlopen(committee_url)
            committee_page = lxml.html.fromstring(
                committee_page_string)
            lis = committee_page.xpath(
                "//div[@id='mainContent']/ul/ul[1]/li")
            if len(lis) == 0:
                # Fallback to any list item under the content div.
                lis = committee_page.xpath("//div[@id='mainContent']//li")
                # This MIGHT cause issues.
            for li in lis:
                # Format: "Name, District, Role" -- role is optional.
                mem_parts = li.text_content().strip().split(',')
                mem_name = mem_parts[0]
                mem_role = 'member'
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower()
                committee.add_member(mem_name, role=mem_role)
            committee.add_source(url)
            committee.add_source(committee_url)
            self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape Arizona standing committees for one chamber of a term.

    Committee and session IDs are read from each committee link's query
    string; committees with no members are logged and skipped.
    """
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod

    url = 'http://www.azleg.gov/StandingCom.asp'
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    chamber_name = dict(upper="Senate",
                        lower="House of Representatives")[chamber]
    # Rows following the bolded chamber heading hold the committees.
    xpath = '//strong[contains(text(), "%s")]/../../following-sibling::tr/td'
    tds = doc.xpath(xpath % chamber_name)

    for td in tds:
        name = td.text_content().strip()
        source_url = td.xpath('a/@href')[0]
        query = urlparse.urlparse(source_url).query
        params = dict(urlparse.parse_qsl(query))
        c_id = params['Committee_ID']
        # NOTE(review): this shadows the session_id fetched above with
        # the per-committee value from the link -- confirm intended.
        session_id = params['Session_ID']
        c = Committee(chamber, name, session=session,
                      az_committee_id=c_id)
        c.add_source(source_url)
        #for some reason they don't always have any info on the committees'
        try:
            self.scrape_com_info(session, session_id, c_id, c)
        except HTTPError:
            pass
        if not c['members']:
            msg = 'No members found: not saving {committee}.'
            self.logger.warning(msg.format(**c))
            continue
        self.save_committee(c)
def scrape_senate_committee(self, committee_name, link):
    """Collect the members listed on one Senate committee page."""
    com = Committee('upper', committee_name)

    with self.urlopen(link) as page:
        doc = lxml.html.fromstring(page)
        # Members live in the first two lists of the left column.
        for item in doc.xpath("//div[@class='col1']/ul[position()<3]/li"):
            # "Name, Role" -- the role part is optional.
            parts = [p.strip() for p in item.text_content().split(',', 1)]
            if len(parts) == 2:
                who, role = parts
            else:
                who, role = parts[0], 'member'
            if who:
                com.add_member(who, role)

    com.add_source(link)
    self.save_committee(com)
def scrape(self, chamber, term):
    """Fetch Washington's active committees for the biennium and save
    those belonging to ``chamber``."""
    biennium = "%s-%s" % (term[0:4], term[7:9])
    url = "%s/GetActiveCommittees?biennium=%s" % (self._base_url, biennium)

    resp = self.urlopen(url)
    root = lxml.etree.fromstring(resp.bytes)

    chamber_of = {'House': 'lower', 'Senate': 'upper'}
    for node in xpath(root, "//wa:Committee"):
        agency = xpath(node, "string(wa:Agency)")
        if chamber_of[agency] != chamber:
            continue

        committee = Committee(
            chamber,
            xpath(node, "string(wa:Name)"),
            _code=xpath(node, "string(wa:Id)"),
            office_phone=xpath(node, "string(wa:Phone)"))
        self.scrape_members(committee, agency)
        committee.add_source(url)

        # Only persist committees that actually have members.
        if committee['members']:
            self.save_committee(committee)
def scrape_upper_committee(self, name, url):
    """Scrape an upper-chamber committee's member list, deduplicated."""
    with self.urlopen(url) as page:
        doc = lxml.html.fromstring(page)

        comm = Committee('upper', name)
        comm.add_source(url)

        container = doc.xpath("//div[@class = 'committee-members']")[0]
        added = set()
        for anchor in container.xpath(".//a"):
            if not anchor.text:
                continue
            member = anchor.text.strip()
            # Skip blanks and members already recorded.
            if not member or member in added:
                continue
            added.add(member)
            comm.add_member(member)

        self.save_committee(comm)
def scrape_upper_committee(self, name, url):
    """Scrape an upper-chamber committee roster page and save it.

    Raises if the page yields no members so that silent scraper
    breakage is noticed.
    """
    page = lxml.html.fromstring(self.urlopen(url))

    comm = Committee('upper', name)
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'biographies')]"):
        member = link.xpath("string()").strip()
        member = re.sub(r'\s+', ' ', member)
        if not member:
            continue
        # The role, when present, trails the anchor as tail text.
        # BUG FIX: strip the tail first so a whitespace-only tail falls
        # back to 'member' instead of becoming the role string.
        role = (link.tail or '').strip()
        if not role:
            role = 'member'
        elif 'Vice Chair' in role:
            role = 'vice chair'
        elif 'Chair' in role:
            role = 'chair'
        comm.add_member(member, role=role)

    if not comm['members']:
        # BUG FIX: the name was previously passed as a second Exception
        # argument instead of being %-formatted into the message.
        raise Exception('no members for %s' % name)

    self.save_committee(comm)
def scrape(self, term, chambers):
    """Scrape DC Council committees (the Council is unicameral; all
    committees are stored as 'upper')."""
    com_url = 'http://www.dccouncil.washington.dc.us/committees'
    listing = lxml.html.fromstring(self.urlopen(com_url))

    # Deduplicate committee page links.
    for url in set(listing.xpath('//a[contains(@href, "committee-on")]/@href')):
        doc = lxml.html.fromstring(self.urlopen(url))

        title = doc.xpath('//h1/text()')[0].replace('Committee on ', '')
        com = Committee('upper', title)

        chair_path = '//h3[text()="Committee Chair"]/following-sibling::p'
        for node in doc.xpath(chair_path):
            com.add_member(node.text_content(), role='chairperson')

        member_path = '//h3[text()="Councilmembers"]/following-sibling::p/a'
        for node in doc.xpath(member_path):
            com.add_member(node.text_content(), role='member')

        com.add_source(url)
        self.save_committee(com)
def scrape_senate_comm(self):
    """Scrape Maine Senate standing committees from the legacy page."""
    url = 'http://www.maine.gov/legis/senate/Senate-Standing-Committees.html'
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # committee titles
        for span in doc.xpath('//span[@style="FONT-SIZE: 11pt"]'):
            title = span.text_content().strip()
            # some contain COMMITTEE ON & some are blank, drop those
            if not title or title.startswith('COMMITTEE'):
                continue

            # titlecase committee name
            com = Committee('upper', title.title())
            com.add_source(url)

            # up two and get ul sibling
            for li in span.xpath('../../following-sibling::ul[1]/li'):
                # "Name of Town" -> keep only the name part.
                com.add_member(li.text_content().strip().split(' of ')[0])

            self.save_committee(com)
def scrape(self, term, chambers):
    """Scrape NC standing and select committees for both chambers."""
    base_url = ('http://www.ncga.state.nc.us/gascripts/Committees/Committees.asp'
                '?bPrintable=true&sAction=ViewCommitteeType&sActionDetails=')
    chamber_slugs = {
        'upper': ['Senate%20Standing', 'Senate%20Select'],
        'lower': ['House%20Standing', 'House%20Select'],
    }

    for chamber in chambers:
        for slug in chamber_slugs[chamber]:
            listing_url = base_url + slug
            doc = lxml.html.fromstring(self.urlopen(listing_url))
            doc.make_links_absolute(listing_url)

            for anchor in doc.xpath('//ul/li/a'):
                name = anchor.text
                # skip committee of whole Senate
                if 'Whole Senate' in name:
                    continue
                url = anchor.get('href')
                committee = Committee(chamber, name)
                self.scrape_committee(committee, url)
                committee.add_source(url)
                self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape Nevada committees for a chamber of the given term.

    The committee listing URL embeds a session label such as
    "77th2013"; some sessions override it via the metadata key
    '_committee_session'.
    """
    # Find the latest session belonging to this term.
    for t in self.metadata['terms']:
        if t['name'] == term:
            session = t['sessions'][-1]
    # Ordinal suffix for the session number (1st/2nd/3rd/Nth).
    sessionsuffix = 'th'
    if str(session)[-1] == '1':
        sessionsuffix = 'st'
    elif str(session)[-1] == '2':
        sessionsuffix = 'nd'
    elif str(session)[-1] == '3':
        sessionsuffix = 'rd'
    insert = str(session) + sessionsuffix + str(term[0:4])
    chamber_letter = {'lower':'A', 'upper':'S'}[chamber]
    # Metadata may override the computed session label.
    insert = self.metadata['session_details'][session].get(
        '_committee_session', insert
    )
    url = 'http://www.leg.state.nv.us/Session/%s/Committees/%s_Committees/' % (
        insert, chamber_letter)
    if insert in ['28th2014Special']:
        raise NoDataForPeriod(insert)
    page = self.urlopen(url)
    root = lxml.html.fromstring(page)
    for com_a in root.xpath('//strong/a'):
        # Committee links are relative to the listing directory.
        com_url = url + com_a.get('href')
        if com_a.text == 'Committee of the Whole':
            continue
        com = Committee(chamber, com_a.text)
        com.add_source(com_url)
        self.scrape_comm_members(chamber, com, com_url)
        self.save_committee(com)
def scrape_lower_committee(self, committee_name, url):
    """Scrape a lower-chamber commission page; save only if any member
    was found."""
    doc = self.lxmlize(url)

    committee = Committee('lower', committee_name.strip())
    committee.add_source(url)

    info_node = self.get_node(
        doc,
        './/div[@id = "dnn_ctr1109_ViewWebCommission_WebCommission1_'
        'pnlCommission"]')

    # This will likely capture empty text nodes as well.
    raw_members = self.get_nodes(
        info_node,
        './/div[@class="two-cols com"]/div[@class="col"]//text()'
        '[normalize-space() and preceding-sibling::br]')

    added = 0
    for raw in raw_members:
        cleaned = re.sub(r'Hon\.\s*', '', raw).strip()
        # Skip empty nodes.
        if not cleaned:
            continue

        member, title = self._match_title(cleaned)
        if title is None:
            committee.add_member(member)
        else:
            committee.add_member(member, title)
        added += 1

    if added:
        self.save_committee(committee)
def scrape_committees(self, chamber, url):
    """Scrape Maryland committees (and subcommittees) from msa.md.gov.

    Subcommittee links split the member list: when a subcommittee
    heading link appears, the committee accumulated so far is saved and
    a fresh Committee for the subcommittee takes over (``cur_com``).
    """
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    # distinct URLs containing /com/
    committees = set([
        l.get('href') for l in doc.xpath('//li/a')
        if l.get('href', '').find('/com/') != -1
    ])
    for com in committees:
        com_url = 'http://www.msa.md.gov' + com
        chtml = self.urlopen(com_url)
        cdoc = lxml.html.fromstring(chtml)
        # First non-empty h2/h3 heading is the committee name.
        # NOTE(review): if no heading has text, committee_name carries
        # over from the previous page -- verify.
        for h in cdoc.xpath('//*[self::h2 or self::h3]'):
            if h.text:
                committee_name = h.text
                break
        # non committees
        if 'DEFUNCT' in committee_name or 'ORGANIZATION' in committee_name:
            continue
        cur_com = Committee(chamber, committee_name)
        cur_com.add_source(com_url)
        for l in cdoc.xpath('//a[@href]'):
            txt = l.text or ''
            if ' SUBCOMMITTEE' in txt or 'OVERSIGHT COMMITTEE' in txt:
                # Flush the current committee, switch to subcommittee.
                self.save_committee(cur_com)
                cur_com = Committee(chamber, committee_name, l.text)
                cur_com.add_source(com_url)
            elif 'html/msa' in l.get('href'):
                # Member biography links.
                prev = l.getprevious()
                name = l.text
                if name.endswith(','):
                    name = name[:-1]
                cur_com.add_member(name)
        self.save_committee(cur_com)
def select_special_comm(self):
    """Scrape Nebraska's select/special committees.

    Boxes with a plain <h2> heading use 'Chairperson' as the chair role;
    boxes whose heading is a link use lowercase 'chairperson'. Member
    strings are trimmed by fixed offsets (leading prefix / trailing
    chair suffix).
    """
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    doc = lxml.html.fromstring(self.get(main_url).text)

    for box in doc.xpath('//div[@class="content_box"]'):
        title = box.xpath('h2')[0].text
        if title is None:
            # Linked-heading variant uses the lowercase chair role.
            title = box.xpath('h2/a')[0].text
            chair_role = 'chairperson'
        else:
            chair_role = 'Chairperson'

        committee = Committee('upper', title)
        committee.add_source(main_url)

        for item in box.xpath('ul[@class="nobullet"]/li'):
            text = item[0].text
            if 'Chairperson' in text:
                # Drop leading prefix and trailing ", Chairperson".
                committee.add_member(text[5:-13].strip(), chair_role)
            else:
                committee.add_member(text[5:].strip(), 'member')

        if committee['members']:
            self.save_committee(committee)
        else:
            self.warning('no members in %s', committee['committee'])
def scrapeStanding(self, chamber, page, url):
    """Scrape standing committees from an already-fetched listing page.

    Committee headings (<p><span>) and member tables are paired purely
    by ordinal position (``comm_count``); each member's role is looked
    up by row position (``member_count``) in a second xpath, falling
    back to position-based roles (1st = Chairman, 2nd = Vice-Chairman)
    when no role text is present.
    """
    comm_count = 1
    for comm_names in page.xpath('//div[@class="content"][1]/p//span'):
        # Replace punctuation runs with spaces, then remove the spaces.
        # NOTE(review): this concatenates the words of the committee
        # name -- confirm that is intentional.
        name = re.sub('[^A-Za-z0-9]+', ' ', comm_names.text).replace(' ', '')
        comm = Committee(chamber, name)
        member_count = 1
        members_path = '//div[@class="content"][1]/table[@class="p"][%s]//tr/td[2]' % (str(comm_count))
        for members in comm_names.xpath(members_path):
            memberName = members.xpath('a')[0].text
            if memberName == None:
                #special case for Randy Boehning under Goverment and Vetran Affairs in House
                memberName = members.xpath('a')[1].text
            memberName = re.sub('[^A-Za-z0-9]+', ' ', memberName)
            #role
            # Role text trails the member link in the row addressed by
            # the current committee/member ordinals.
            role_path = '//div[@class="content"][1]/table[@class="p"][%s]//tr[%s]/td[2]/a' % (comm_count, member_count)
            role_text = page.xpath(role_path)[0].tail
            if role_text != None:
                if "Vice" in role_text:
                    role = "Vice-Chairman"
                elif "Chairman" in role_text:
                    role = "Chairman"
                else:
                    role = "Member"
                comm.add_member(memberName, role)
            else:
                # No role text: infer the role from the row position.
                if member_count == 1:
                    role = "Chairman"
                elif member_count == 2:
                    role = "Vice-Chairman"
                else:
                    role = "Member"
                comm.add_member(memberName, role)
            member_count += 1
        comm.add_source(url)
        self.save_committee(comm)
        comm_count += 1