def scrape_senate(self):
    """Scrape NY Senate committees and their memberships.

    Walks the committee index at /committees, then each committee page,
    pulling the chair from ``.committee-chair`` links ("Sen. Name") and
    plain members from ``.committee-members`` links.
    """
    senate_url = "http://www.nysenate.gov"
    senate_committees_url = senate_url + "/committees"
    with self.urlopen(senate_committees_url) as html:
        doc = lxml.html.fromstring(html)
        # distinct committee-page paths linked from the index
        committee_paths = set(l.get("href") for l in doc.cssselect("li a")
                              if l.get("href", "").find("/committee/") != -1)

    for committee_path in committee_paths:
        committee_url = senate_url + committee_path
        with self.urlopen(committee_url) as chtml:
            cdoc = lxml.html.fromstring(chtml)

            # Reset per page: previously the name could leak over from
            # the prior committee when a page had no usable heading.
            committee_name = None
            for h in cdoc.cssselect(".committee_name"):
                if h.text:
                    committee_name = h.text
                    break
            if committee_name is None:
                # no name found -- skip rather than mislabel the
                # committee with a stale name (or crash on first page)
                continue

            committee = Committee("upper", committee_name)
            committee.add_source(committee_url)

            # chair link text looks like "Sen. Jane Doe"
            for l in cdoc.cssselect(".committee-chair a[href]"):
                if ("/senator/" in l.get("href") and l.text
                        and l.text.startswith("Sen.")):
                    committee.add_member(l.text.split('Sen. ', 1)[1],
                                         "chair")

            for l in cdoc.cssselect(".committee-members a[href]"):
                if "/senator/" in l.get("href"):
                    committee.add_member(l.text)

            self.save_committee(committee)
def scrape_senate_committee(self, term, link):
    """Scrape one Minnesota Senate committee page.

    ``link`` is the committee page URL. Members come from the first
    table with id="bio", one row per member; a row of the form
    "Role: Name" switches the role applied from that row onward.
    """
    with self.urlopen(link) as html:
        doc = lxml.html.fromstring(html)

        # Title is "Minnesota Senate Committees - X Committee";
        # strip the fixed 30-char prefix and 10-char suffix.
        committee_name = doc.xpath('//title/text()')[0][30:-10]

        com = Committee('upper', committee_name)

        # Default role: previously `role` was unbound (NameError) if
        # the first row carried no "Role:" prefix.
        role = 'member'

        # first id=bio table is members
        for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
            row = fix_whitespace(row.text_content())

            # switch role
            if ':' in row:
                position, name = row.split(': ')
                role = position.lower().strip()
            else:
                name = row

            # add the member
            com.add_member(name, role)

        com.add_source(link)
        self.save_committee(com)
def scrape_reps_comm(self, chamber, year):
    """Scrape Ohio House committees (ids 87-123).

    Committee pages are addressed by a numeric committeeId; ids below
    92 are joint legislative committees.
    """
    save_chamber = chamber

    # id range for House committees on their website
    # (old comment said "senate" -- this is the House site)
    for comm_id in range(87, 124):
        chamber = save_chamber
        comm_url = ('http://www.house.state.oh.us/index.php?option='
                    'com_displaycommittees&task=2&type=Regular&'
                    'committeeId=' + str(comm_id))

        with self.urlopen(comm_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            comm_name = root.xpath(
                'string(//table/tr[@class="committeeHeader"]/td)')
            comm_name = comm_name.replace("/", " ")

            # joint legislative committees
            if comm_id < 92:
                chamber = "joint_legislation"

            committee = Committee(chamber, comm_name)

            path = ('/html/body[@id="bd"]/div[@id="ja-wrapper"]/'
                    'div[@id="ja-containerwrap-f"]/div[@id="ja-container"]/'
                    'div[@id="ja-mainbody-f"]/div[@id="ja-contentwrap"]/'
                    'div[@id="ja-content"]/table/tr[position() >=3]')
            for el in root.xpath(path):
                # two members per row; skip blank cells instead of
                # adding empty-string members
                for rep in (el.xpath('string(td[1]/a)'),
                            el.xpath('string(td[4]/a)')):
                    rep = rep.strip()
                    if rep:
                        committee.add_member(rep)

            committee.add_source(comm_url)
            self.save_committee(committee)
def scrape(self, chamber, year):
    """Scrape Maryland House committees from the MD manual site."""
    # TODO: scrape senate committees
    house_url = 'http://www.msa.md.gov/msa/mdmanual/06hse/html/hsecom.html'
    with self.urlopen(house_url) as html:
        doc = lxml.html.fromstring(html)
        # distinct URLs containing /com/
        committees = set(l.get('href') for l in doc.cssselect('li a')
                         if l.get('href', '').find('/com/') != -1)

    for com in committees:
        com_url = 'http://www.msa.md.gov' + com
        with self.urlopen(com_url) as chtml:
            cdoc = lxml.html.fromstring(chtml)

            # Reset per page so a page without a usable heading cannot
            # inherit the previous committee's name (or crash first
            # time through).
            committee_name = None
            for h in cdoc.cssselect('h2, h3'):
                if h.text:
                    committee_name = h.text
                    break
            if committee_name is None:
                continue

            cur_com = Committee('lower', committee_name)
            cur_com.add_source(com_url)
            for l in cdoc.cssselect('a[href]'):
                if ' SUBCOMMITTEE' in (l.text or ''):
                    # subcommittee header: save what we have, then start
                    # a subcommittee of the current committee
                    self.save_committee(cur_com)
                    cur_com = Committee('lower', l.text, committee_name)
                    cur_com.add_source(com_url)
                elif 'html/msa' in l.get('href'):
                    cur_com.add_member(l.text)
            self.save_committee(cur_com)
def scrape_reps_comm(self, chamber, term):
    """Scrape Ohio House committees (ids 87-123; ids < 92 are joint)."""
    original_chamber = chamber

    # committee ids used by the House site
    for comm_id in range(87, 124):
        chamber = original_chamber
        comm_url = (
            "http://www.house.state.oh.us/index.php?option="
            "com_displaycommittees&task=2&type=Regular&"
            "committeeId=%d" % comm_id
        )
        with self.urlopen(comm_url) as page:
            page = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            header = page.xpath(
                'string(//table/tr[@class="committeeHeader"]/td)')
            comm_name = header.replace("/", " ")

            # low ids are joint committees regardless of requested chamber
            if comm_id < 92:
                chamber = "joint"

            committee = Committee(chamber, comm_name)
            committee.add_source(comm_url)

            # member links point at legislator district pages
            for link in page.xpath("//a[contains(@href, 'district')]"):
                member = link.text
                if member and member.strip():
                    committee.add_member(member.strip())

            self.save_committee(committee)
def scrape_reps_comm(self, chamber, session):
    """Scrape Maine House committees from the static hsecoms.htm page."""
    url = 'http://www.maine.gov/legis/house/hsecoms.htm'
    with self.urlopen(url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

        count = 0
        # committee headings live in every other <center> block
        for n in range(1, 12, 2):
            path = 'string(//body/center[%s]/h1/a)' % (n)
            comm_name = root.xpath(path)
            committee = Committee(chamber, comm_name)
            count = count + 1

            # member list for the nth committee is the nth <ul>
            path2 = '/html/body/ul[%s]/li/a' % (count)
            for el in root.xpath(path2):
                rep = el.text
                if rep is None:
                    # empty link: previously crashed on rep.find()
                    continue
                if rep.find('(') != -1:
                    # strip the fixed 15-char "Representative " prefix
                    # and the trailing "(...)" annotation
                    mark = rep.find('(')
                    rep = rep[15: mark]
                committee.add_member(rep)
            committee.add_source(url)

            self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape Utah standing committees for the 2011-2012 term.

    Raises NoDataForPeriod for any other term.
    """
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

    url = ("http://le.utah.gov/asp/interim/standing.asp?house=%s"
           % chamber_abbr)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
            comm_name = comm_link.text.strip()

            # Drop leading "House" or "Senate" from name
            comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

            comm = Committee(chamber, comm_name)

            for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):
                name = mbr_link.text.strip()

                # an italic element right after the link carries the
                # member's role (renamed from `type`, which shadowed
                # the builtin)
                next_el = mbr_link.getnext()
                if next_el is not None and next_el.tag == 'i':
                    role = next_el.text.strip()
                else:
                    role = 'member'

                comm.add_member(name, role)

            self.save_committee(comm)
def scrape_house_committees(self, term):
    """Scrape Minnesota House committees and their member lists."""
    url = 'http://www.house.leg.state.mn.us/comm/commemlist.asp'

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        for heading in doc.xpath('//h2[@class="commhighlight"]'):
            # the "Members" link right after the heading points at the
            # roster page
            members_url = heading.xpath(
                'following-sibling::p[1]/a[text()="Members"]/@href')[0]

            com = Committee('lower', heading.text)
            com.add_source(members_url)

            with self.urlopen(members_url) as member_html:
                mdoc = lxml.html.fromstring(member_html)

                # each legislator sits in their own table; the first
                # row's second column holds everything we need
                for entry in mdoc.xpath('//table/tr[1]/td[2]/p/b[1]'):
                    # the member's name is the element's full text
                    member_name = entry.text_content()

                    # a role, when present, lives in a nested <b> tag
                    roles = entry.xpath('b/*/text()')
                    if not roles:
                        role = 'member'
                    else:
                        role = roles[0]
                        # strip the role text back out of the name
                        member_name = member_name.replace(role, '')
                    com.add_member(member_name, role)

            # save
            self.save_committee(com)
def scrape_assembly(self):
    """Scrape Assembly Committees"""
    assembly_committees_url = "http://assembly.state.ny.us/comm/"

    with self.urlopen(assembly_committees_url) as html:
        doc = lxml.html.fromstring(html)
        # the four #sitelinks lists, in page order
        (standing_committees, subcommittees,
         legislative_commissions, task_forces) = doc.cssselect(
            '#sitelinks ul')
        # standing-committee membership pages use ?sec=mem... paths
        committee_paths = set(
            link.get('href')
            for link in standing_committees.cssselect("li a[href]")
            if link.get("href").startswith('?sec=mem'))

    for committee_path in committee_paths:
        committee_url = assembly_committees_url + committee_path
        with self.urlopen(committee_url) as chtml:
            cdoc = lxml.html.fromstring(chtml)
            for heading in cdoc.cssselect("#content .pagehdg"):
                if heading.text:
                    committee_name = heading.text.split(
                        'Committee Members')[0].strip()
                    break

            committee = Committee("lower", committee_name)
            committee.add_source(committee_url)

            members = cdoc.cssselect("#sitelinks")[0]

            # the first span holds the chair; the rest are members
            is_first = True
            for member in members.iter('span'):
                member = member.xpath('li/a')[0].text
                if is_first:
                    committee.add_member(member, 'chair')
                    is_first = False
                else:
                    committee.add_member(member)
            self.save_committee(committee)
def scrape_house(self):
    """Scrape Louisiana House members' committee assignments.

    The roster page lists each representative with the committees he
    or she sits on; committees accumulate in ``comm_cache`` so each is
    saved once with its full membership.
    """
    url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
    comm_cache = {}

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)

        for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
            cells = row.xpath('td')

            name = cells[0].xpath('string()').strip()
            if name.startswith('Vacant'):
                continue

            font = cells[1].xpath('font')[0]
            committees = []

            # committee names are the font's text plus the text/tail
            # of each <br> inside it
            if font.text:
                committees.append(font.text.strip())
            for br in font.xpath('br'):
                if br.text:
                    committees.append(br.text.strip())
                if br.tail:
                    # strip here too -- unstripped tails broke the
                    # ", Chairman"-style suffix checks below
                    committees.append(br.tail.strip())

            for comm_name in committees:
                mtype = 'member'
                # NOTE(review): "Co-Chairmain" matches the site's own
                # spelling -- confirm against the live page before
                # "fixing" the typo; it is a runtime string
                if comm_name.endswith(', Chairman'):
                    mtype = 'chairman'
                    comm_name = comm_name.replace(', Chairman', '')
                elif comm_name.endswith(', Co-Chairmain'):
                    mtype = 'co-chairmain'
                    comm_name = comm_name.replace(', Co-Chairmain', '')
                elif comm_name.endswith(', Vice Chair'):
                    mtype = 'vice chair'
                    comm_name = comm_name.replace(', Vice Chair', '')
                elif comm_name.endswith(', Ex Officio'):
                    mtype = 'ex officio'
                    comm_name = comm_name.replace(', Ex Officio', '')

                if comm_name.startswith('Joint'):
                    chamber = 'joint'
                else:
                    chamber = 'lower'

                try:
                    committee = comm_cache[comm_name]
                except KeyError:
                    committee = Committee(chamber, comm_name)
                    committee.add_source(url)
                    comm_cache[comm_name] = committee

                committee.add_member(name, mtype)

    for committee in comm_cache.values():
        self.save_committee(committee)
def scrape(self, chamber, year):
    """Scrape PA committee assignments from the member-information pages.

    Only 2009 data is supported; any other year raises NoDataForPeriod.
    Committees (and subcommittees) are deduplicated in a dict keyed on
    (chamber, committee, subcommittee) so each is saved once.
    """
    if year != '2009':
        raise NoDataForPeriod(year)

    if chamber == 'upper':
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/senators_ca.cfm')
    else:
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/representatives_ca.cfm')

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        committees = {}

        # each legislator's row contains a bio link plus one link per
        # committee assignment
        for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
            name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
            # drop the last 4 chars -- presumably a fixed suffix after
            # the name; TODO confirm against the live page
            name = name[0:-4]

            for link in li.xpath("a"):
                if not link.tail:
                    continue

                committee_name = link.tail.strip()
                committee_name = re.sub(r"\s+", " ", committee_name)
                subcommittee_name = None
                role = 'member'

                # trailing italic text carries subcommittee and/or role,
                # e.g. ", Subcommittee on X - Chair"
                rest = link.xpath('string(../i)')
                if rest:
                    match = re.match(r',\s+(Subcommittee on .*)\s+-',
                                     rest)
                    if match:
                        subcommittee_name = match.group(1)
                        role = rest.split('-')[1].strip()
                    else:
                        role = rest.replace(', ', '').strip()

                # EAFP dedupe on the full (chamber, committee, sub) key
                try:
                    committee = committees[(chamber, committee_name,
                                            subcommittee_name)]
                except KeyError:
                    committee = Committee(chamber, committee_name)

                    if subcommittee_name:
                        committee['subcommittee'] = subcommittee_name

                    committees[(chamber, committee_name,
                                subcommittee_name)] = committee

                committee.add_member(name, role)

        for committee in committees.values():
            self.save_committee(committee)
def scrape_senate(self):
    """Scrape Senate Committees"""
    # committees come from the openlegislation models, keyed by name
    for raw_name, comm in nyss_openlegislation.models.committees.items():
        # title-case the name but keep the conjunction lowercase
        display_name = raw_name.title().replace('And', 'and')

        committee = Committee('upper', display_name)
        for member in comm.members:
            committee.add_member(member.fullname)

        self.save_committee(committee)
def scrape_joint_comm(self, chamber, session):
    """Scrape Maine joint committees from the commlist.xls spreadsheet.

    Downloads the workbook, then walks it row by row; consecutive rows
    sharing a committee name belong to the same committee.
    """
    fileurl = 'http://www.maine.gov/legis/house/commlist.xls'
    joint = urllib.urlopen(fileurl).read()
    # write in *binary* mode -- text mode ('w') corrupts .xls bytes on
    # some platforms, and the old code also leaked the file handle
    with open('me_joint.xls', 'wb') as f:
        f.write(joint)

    wb = xlrd.open_workbook('me_joint.xls')
    sh = wb.sheet_by_index(0)

    cur_comm_name = ''
    chamber = 'joint'
    committee = None

    for rownum in range(1, sh.nrows):
        comm_name = sh.cell(rownum, 0).value

        # name parts: first / middle / last / Jr-Sr suffix
        first_name = sh.cell(rownum, 3).value
        middle_name = sh.cell(rownum, 4).value
        last_name = sh.cell(rownum, 5).value
        jrsr = sh.cell(rownum, 6).value
        full_name = (first_name + " " + middle_name + " " +
                     last_name + " " + jrsr)

        party = sh.cell(rownum, 7).value
        legalres = sh.cell(rownum, 8).value
        address1 = sh.cell(rownum, 9).value
        address2 = sh.cell(rownum, 10).value
        town = sh.cell(rownum, 11).value
        state = sh.cell(rownum, 12).value
        zipcode = int(sh.cell(rownum, 13).value)
        phone = str(sh.cell(rownum, 14).value)
        home_email = sh.cell(rownum, 15).value
        leg_email = sh.cell(rownum, 16).value

        leg_chamber = sh.cell(rownum, 2).value
        chair = sh.cell(rownum, 1).value
        role = "member"
        if chair == 1:
            # e.g. "House Chair" / "Senate Chair"
            role = leg_chamber + " " + "Chair"

        if comm_name != cur_comm_name:
            # a new committee starts on this row
            cur_comm_name = comm_name
            committee = Committee(chamber, comm_name)
            committee.add_source(fileurl)

        # single add_member call -- the old if/else duplicated this
        # call verbatim in both branches
        committee.add_member(full_name, role=role, party=party,
                             legalres=legalres, address1=address1,
                             address2=address2, town=town, state=state,
                             zipcode=zipcode, phone=phone,
                             home_email=home_email, leg_email=leg_email)
        # NOTE(review): saved per-row as in the original; if
        # save_committee is not idempotent this should move after the
        # loop -- confirm its semantics
        self.save_committee(committee)
def scrape_comm(self, chamber, term_name):
    """Scrape Mississippi committee membership from the XML feed.

    ``chamber`` arrives as the feed's "h"/"s" code and is normalized
    to lower/upper for the Committee objects.
    """
    url = "http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml" % chamber
    with self.urlopen(url) as comm_page:
        root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())

        if chamber == "h":
            chamber = "lower"
        else:
            chamber = "upper"

        for mr in root.xpath("//committee"):
            name = mr.xpath("string(name)")
            comm = Committee(chamber, name)

            # chair / vice chair carry their role as a name suffix
            chair = mr.xpath("string(chair)")
            chair = chair.replace(", Chairman", "")
            if len(chair) > 0:
                comm.add_member(chair, role="Chairman")

            vice_chair = mr.xpath("string(vice_chair)")
            vice_chair = vice_chair.replace(", Vice-Chairman", "")
            if len(vice_chair) > 0:
                comm.add_member(vice_chair, role="Vice-Chairman")

            # plain members are a semicolon-separated list; strip and
            # skip empties (the old leg[0] == " " test raised
            # IndexError on empty entries)
            for leg in mr.xpath("string(members)").split(";"):
                leg = leg.strip()
                if leg:
                    comm.add_member(leg)

            comm.add_source(url)
            self.save_committee(comm)
def scrape_senate_committee(self, name, url):
    """Scrape one Louisiana Senate committee's membership page."""
    # the member roster lives on the Assignments page, not Default
    url = url.replace('Default.asp', 'Assignments.asp')

    committee = Committee('upper', name)

    with self.urlopen(url) as text:
        page = lxml.html.fromstring(text)

        member_links = page.xpath(
            '//table[@bordercolor="#EBEAEC"]/tr/td/font/a')
        for member_link in member_links:
            # link text is "Senator Jane Doe" -- keep just the name
            member = member_link.xpath('string()')
            member = member.replace('Senator ', '').strip()
            committee.add_member(member)

    self.save_committee(committee)
def scrape_committee(self, chamber, term, name, url):
    """Scrape a committee page listing members, chair and vice chair."""
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        # member names follow the "Members:" label, comma-separated
        member_text = page.xpath(
            "//strong[contains(., 'Members:')]")[0].tail
        member_text = re.sub(r'\s+', ' ', member_text)

        committee = Committee(chamber, name)
        committee.add_source(url)

        for raw_member in member_text.split(','):
            # drop a trailing "R.M." / "R.M.M." suffix
            member = re.sub(r'R\.M\.(M\.)?$', '', raw_member.strip())
            committee.add_member(member.strip())

        # "Chair:" also substring-matches "Vice Chair:", so document
        # order matters: [0] is the plain chair
        chair_name = page.xpath(
            "//strong[contains(., 'Chair:')]")[0].tail.strip()
        if chair_name:
            committee.add_member(chair_name, 'chair')

        vice_name = page.xpath(
            "//strong[contains(., 'Vice Chair:')]")[0].tail.strip()
        if vice_name:
            committee.add_member(vice_name, 'vice chair')

        self.save_committee(committee)
def scrape_senate_comm(self, chamber, term):
    """Scrape Ohio Senate standing committees from their detail pages."""
    committee_slugs = [
        "agriculture",
        "education",
        "energy-and-public-utilities",
        "environment-and-natural-resources",
        "finance-and-financial-institutions",
        "government-oversight",
        "health-human-services-and-aging",
        "highways-and-transportation",
        "insurance-commerce-and-labor",
        "judiciary-civil-justice",
        "judiciary-criminal-justice",
        "reference",
        "rules",
        "state-and-local-government-and-veterans-affairs",
        "ways-and-means-and-economic-development",
    ]

    for slug in committee_slugs:
        comm_url = ("http://www.ohiosenate.gov/committees/standing/detail/"
                    "%s.html" % slug)

        with self.urlopen(comm_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

            # slug -> display name: dashes to spaces, title case
            committee = Committee(chamber,
                                  slug.replace("-", " ").title())
            committee.add_source(comm_url)

            for cell in root.xpath("//table/tr/td"):
                sen_name = cell.xpath('string(a[@class="senatorLN"])')
                # text looks like "Name (...)"; keep the part before
                # the paren (find() == -1 on empty cells yields "")
                paren = sen_name.find("(")
                full_name = sen_name[0:paren].strip()
                if full_name:
                    committee.add_member(full_name)

            self.save_committee(committee)
def scrape(self, chamber, term):
    """Scrape Maryland House or Senate committees from the MD manual."""
    com_url = {
        'lower': 'http://www.msa.md.gov/msa/mdmanual/06hse/html/hsecom.html',
        'upper': 'http://www.msa.md.gov/msa/mdmanual/05sen/html/sencom.html',
    }
    # joint: http://www.msa.md.gov/msa/mdmanual/07leg/html/ga.html

    with self.urlopen(com_url[chamber]) as html:
        doc = lxml.html.fromstring(html)
        # distinct URLs containing /com/
        committees = set(l.get('href') for l in doc.cssselect('li a')
                         if l.get('href', '').find('/com/') != -1)

    for com in committees:
        com_url = 'http://www.msa.md.gov' + com
        with self.urlopen(com_url) as chtml:
            cdoc = lxml.html.fromstring(chtml)

            # reset per page so a missing heading can't inherit the
            # previous committee's name
            committee_name = None
            for h in cdoc.cssselect('h2, h3'):
                if h.text:
                    committee_name = h.text
                    break
            if committee_name is None:
                continue

            cur_com = Committee(chamber, committee_name)
            cur_com.add_source(com_url)

            for l in cdoc.cssselect('a[href]'):
                if ' SUBCOMMITTEE' in (l.text or ''):
                    # subcommittee header: flush the current committee,
                    # start a new one for the subcommittee.
                    # NOTE(review): argument order here is
                    # (parent_name, subcommittee) -- verify against
                    # Committee's signature; a sibling scraper passes
                    # them the other way around.
                    self.save_committee(cur_com)
                    cur_com = Committee(chamber, committee_name, l.text)
                    cur_com.add_source(com_url)
                elif 'html/msa' in l.get('href'):
                    name = l.text
                    if not name:
                        # link without text: previously crashed on
                        # name.endswith()
                        continue
                    if name.endswith(','):
                        name = name[:-1]
                    # an immediately-preceding <i> marks the member as
                    # ex-officio
                    prev = l.getprevious()
                    if prev is not None and prev.tag == 'i':
                        cur_com.add_member(name, 'ex-officio')
                    else:
                        cur_com.add_member(name)
            self.save_committee(cur_com)
def scrape(self, chamber, year):
    """Example scraper demonstrating the Committee API."""
    com = Committee('lower', 'Committee on Finance')
    com.add_source('http://example.com')

    # can optionally specify role
    com.add_member('Lou Adams', 'chairman')
    com.add_member('Bill Smith')

    # can also specify subcommittees
    subcom = Committee('lower', 'Finance Subcommittee on Banking',
                       'Committee on Finance')
    # these two calls previously targeted `com`, leaving the
    # subcommittee empty and double-adding to the parent
    subcom.add_source('http://example.com')
    subcom.add_member('Bill Smith')
def scrape_assem_comm(self, chamber, insert, year, session):
    """Scrape Nevada Assembly committees for one session.

    NOTE(review): formatting reconstructed from a collapsed source.
    The per-session branches below encode page layouts that changed
    between sessions 71-74+; verify against archived pages before
    altering any of them.
    """
    committees = self.scrape_comm(chamber, insert, session)
    for committee in committees:
        leg_url = ('http://www.leg.state.nv.us/Session/' + insert +
                   '/Committees/A_Committees/' + committee)
        with self.urlopen(leg_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
            comm_name = root.xpath(
                'string(/html/body/div[@id="content"]/h1)')

            # special cases for each session to grab the name
            if session == 73:
                comm_name = root.xpath(
                    'string(/html/body/div[@id="content"]/h1)')
            elif session == 72:
                comm_name = root.xpath('string(/html/body/h2[1]/font)')
            elif session == 71:
                comm_name = root.xpath('string(/html/body/h2)')
            elif committee == 'NR.cfm' and session != 72 and session != 71:
                comm_name = root.xpath(
                    'string(/html/body/div[@id="content"]/h2)')

            # Marking for grabbing only the name of the committee:
            # skip a leading "Assembly " (9 chars) when present
            startmark = comm_name.find("Assembly")
            if startmark == -1:
                startmark = 0
            else:
                startmark = 9

            # for sessions after 73, cut before the session number
            endmark = comm_name.find(str(session))
            if session <= 73:
                comm_name = comm_name[startmark: len(comm_name)]
            else:
                comm_name = comm_name[startmark: endmark - 3]
            comm_name = comm_name.replace(' \r\n ', '')

            # EPE in 2005/2007 carries two explanatory notes
            if committee == 'EPE.cfm' and (year == '2005' or
                                           year == '2007'):
                note1 = root.xpath(
                    'string(/html/body/div[@id="content"]/ul[1]/li[1])')
                note2 = root.xpath(
                    'string(/html/body/div[@id="content"]/ul[1]/li[2])')
                comm = Committee(chamber, comm_name, note1 = note1,
                                 note2 = note2)
            else:
                comm = Committee(chamber, comm_name)
            count = 0

            # special case: EPE 2009 has two co-chairs, each with a
            # second role on the paired committee
            if committee == 'EPE.cfm' and year == '2009':
                special_name1 = root.xpath(
                    'string(/html/body/div[@id="content"]/p/a[1])')
                special_name1 = (special_name1.split()[0] + " " +
                                 special_name1.split()[1])
                name1_2ndrole = "Constitutional Amendments Vice Chair"
                special_name2 = root.xpath(
                    'string(/html/body/div[@id="content"]/p/a[2])')
                special_name2 = (special_name2.split()[0] + " " +
                                 special_name2.split()[1])
                name2_2ndrole = "Elections Procedures and Ethics Vice Chair"
                comm.add_member(
                    special_name1,
                    role="Elections Procedures and Ethics Chair",
                    name1_2ndrole = name1_2ndrole)
                # NOTE(review): "Admendments" typo preserved -- it is a
                # runtime string; confirm downstream before correcting
                comm.add_member(
                    special_name2,
                    role="Constitutional Admendments Chair",
                    name2_2ndrole = name2_2ndrole)

            # paths for grabbing names
            if session == 73 or session == 71:
                path = '//li'
            elif session == 72:
                path = '/html/body/ul/li'
            else:
                path = '/html/body/div[@id="content"]/ul/li'

            # grabbing names
            for mr in root.xpath(path):
                name = mr.xpath('string(a)')
                name = name.strip()
                if session == 72 or session == 71:
                    # older layouts put name + role suffix in bare li
                    # text; scrub the suffixes and stray characters
                    name = mr.xpath('string()')
                    name = name.replace('\r\n', '')
                    name = name.replace(' -Vice Chair', '')
                    name = name.replace(' -Chair', '')
                    name = name.replace('-Chair', '')
                    name = name.replace('\u', '')
                    name = name.replace('\u00a0', '')
                    # NOTE(review): the next replace looks like a no-op
                    # in the collapsed source -- possibly meant to
                    # collapse doubled spaces; confirm
                    name = name.replace(' ', ' ')
                count = count + 1
                # first two list items are chair / vice chair, except
                # for EPE and the session-72 layout
                if count == 1 and committee[0:3] != 'EPE' and session != 72:
                    role = 'Chair'
                elif count == 2 and committee[0:3] != 'EPE' and session != 72:
                    role = 'Vice Chair'
                else:
                    role = 'member'
                if len(name) > 0:
                    comm.add_member(name, role = role)
            comm.add_source(leg_url)
            self.save_committee(comm)
def scrape_senate_comm(self, chamber, insert, session):
    """Scrape Nevada Senate committees for one session.

    Fixes chair/vice-chair detection: the old code compared
    ``committee[0:3]`` (three chars) against ``'EPE.cfm'`` (seven
    chars), which can never be equal, so the EPE exclusion never
    applied. The assembly scraper's ``!= 'EPE'`` form is used instead.
    """
    committees = self.scrape_comm(chamber, insert, session)
    for committee in committees:
        leg_url = ('http://www.leg.state.nv.us/Session/' + insert +
                   '/Committees/S_Committees/' + committee)

        with self.urlopen(leg_url) as page:
            root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
            comm_name = root.xpath(
                'string(/html/body/div[@id="content"]/center/h2)')

            # special cases for each session to grab the name
            if session == 73:
                comm_name = root.xpath(
                    'string(/html/body/div[@id="content"]/h2[1])')
            elif session == 72:
                comm_name = root.xpath('string(/html/body/h2[1]/font)')
            elif session == 71:
                comm_name = root.xpath('string(/html/body/h2)')
            elif committee == 'NR.cfm' and session != 72 and session != 71:
                comm_name = root.xpath(
                    'string(/html/body/div[@id="content"]/h2)')

            # keep only the committee name: drop a leading "Senate "
            # (7 chars) when present ...
            startmark = comm_name.find("Senate")
            if startmark == -1:
                startmark = 0
            else:
                startmark = 7

            # ... and, for sessions after 73, cut before the session
            # number
            endmark = comm_name.find(str(session))
            if session <= 73:
                comm_name = comm_name[startmark: len(comm_name)]
            else:
                comm_name = comm_name[startmark: endmark - 3]

            comm = Committee(chamber, comm_name)
            count = 0

            # member-list location varies by session layout
            if session == 73 or session == 71:
                path = '//li'
            elif session == 72:
                path = '/html/body/ul/li'
            else:
                path = '/html/body/div[@id="content"]/ul/li'

            for mr in root.xpath(path):
                name = mr.xpath('string(a)')
                name = name.replace(' \r\n ', '')
                if session == 72:
                    # session-72 layout: whole li text is name plus a
                    # role suffix
                    name = mr.xpath('string()')
                    name = name.replace('\r\n', '')
                    name = name.replace(' -Vice Chair', '')
                    name = name.replace(' -Chair', '')
                count = count + 1
                # first two entries are chair / vice chair, except EPE
                # (fixed comparison -- was != 'EPE.cfm', always true)
                if count == 1 and committee[0:3] != 'EPE':
                    role = 'Chair'
                elif count == 2 and committee[0:3] != 'EPE':
                    role = 'Vice Chair'
                else:
                    role = 'member'
                comm.add_member(name, role)
            comm.add_source(leg_url)
            self.save_committee(comm)