def scrape(self, chamber, term_name):
    year = term_name[0:4]
    if int(year) < 2001:
        raise NoDataForPeriod(year)

    # Odd start years offset from session 76, even years from session 26.
    if ((int(year) - 2010) % 2) == 1:
        session = ((int(year) - 2010) / 2) + 76
    elif ((int(year) - 2010) % 2) == 0 and int(year) >= 2010:
        session = ((int(year) - 2010) / 2) + 26
    else:
        raise NoDataForPeriod(term_name)

    self.scrape_legislators(chamber, session, year, term_name)
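# For reference, the year-to-session arithmetic above pulled out as a
# standalone helper -- a minimal sketch, not part of the scraper itself,
# with a hypothetical name. Odd start years offset from session 76 and
# even years from session 26 (Python 2 integer division assumed):
#
#   2010 -> 26, 2011 -> 76, 2012 -> 27, 2013 -> 77
def _session_for_year(year):
    offset = year - 2010
    if offset % 2 == 1:
        return offset / 2 + 76
    if offset % 2 == 0 and year >= 2010:
        return offset / 2 + 26
    raise ValueError("no session known for %s" % year)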
def scrape(self, chamber, year):
    if year not in metadata['sessions']:
        raise NoDataForPeriod(year)

    self.scrape_session(chamber, year)
    for sub in metadata['session_details'][year]['sub_sessions']:
        self.scrape_session(chamber, sub)
def scrape(self, chamber, session):
    if session != '2011':
        raise NoDataForPeriod(session)

    # Start by building the subject map.
    self.scrape_subjects(chamber, session)

    url = "http://webserver1.lsb.state.ok.us/WebApplication3/WebForm1.aspx"
    form_page = lxml.html.fromstring(self.urlopen(url))

    if chamber == 'upper':
        chamber_letter = 'S'
    else:
        chamber_letter = 'H'

    values = [('cbxSessionId', self.session_id_map[session]),
              ('cbxActiveStatus', 'All'),
              ('RadioButtonList1', 'On Any day'),
              ('Button1', 'Retrieve')]

    for bill_type in self.bill_types:
        values.append(('lbxTypes', chamber_letter + bill_type))

    # Replay the ASP.NET hidden form fields (viewstate etc.) in the POST.
    for hidden in form_page.xpath("//input[@type='hidden']"):
        values.append((hidden.attrib['name'], hidden.attrib['value']))

    page = self.urlopen(url, "POST", urllib.urlencode(values))
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for link in page.xpath("//a[contains(@href, 'BillInfo')]"):
        bill_id = link.text.strip()
        self.scrape_bill(chamber, session, bill_id, link.attrib['href'])
def _scrape_upper_chamber(self, year):
    # We only have data back to 2005.
    if int(year) < 2005:
        raise NoDataForPeriod(year)

    self.info('Scraping bills from upper chamber.')

    year2 = "%02d" % (int(year) % 100)

    # Save the root URL, since we'll use it later.
    bill_root = 'http://www.senate.mo.gov/{}info/BTS_Web/'.format(year2)
    index_url = bill_root + 'BillList.aspx?SessionType=R'

    index_page = self.get(index_url).text
    index_page = lxml.html.fromstring(index_page)

    # Each bill is in its own table (nested within a larger table).
    bill_tables = index_page.xpath('//a[@id]')
    if not bill_tables:
        return

    for bill_table in bill_tables:
        # Search the anchor's id for the BillID the MO Senate site uses.
        if re.search(r'dgBillList.*hlBillNum', bill_table.attrib['id']):
            self._parse_senate_billpage(
                bill_root + bill_table.attrib.get('href'), year)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        chamber_abbrev = 'sen'
        title_abbrev = 'sen'
    else:
        chamber_abbrev = 'hse'
        title_abbrev = 'del'

    url = ("http://www.legis.state.wv.us/districts/maps/%s_dist.cfm" %
           chamber_abbrev)
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    view_url = '%smemview' % title_abbrev
    for link in page.xpath("//a[contains(@href, '%s')]" % view_url):
        name = link.xpath("string()").strip()
        leg_url = urlescape(link.attrib['href'])

        if name in ['Members', 'Senate Members', 'House Members',
                    'Vacancy']:
            continue

        self.scrape_legislator(chamber, term, name, leg_url)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

    url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
            comm_name = comm_link.text.strip()

            # Drop leading "House" or "Senate" from the name.
            comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

            comm = Committee(chamber, comm_name)

            for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):
                name = mbr_link.text.strip()

                # An italic element after the link carries the member's
                # role, if any.
                next_el = mbr_link.getnext()
                if next_el is not None and next_el.tag == 'i':
                    mtype = next_el.text.strip()
                else:
                    mtype = 'member'

                comm.add_member(name, mtype)

            self.save_committee(comm)
def scrape(self, chamber, term):
    # Pennsylvania doesn't make member lists easily available
    # for previous sessions, unfortunately.
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    leg_list_url = legislators_url(chamber)

    with self.urlopen(leg_list_url) as page:
        page = lxml.html.fromstring(page)

        for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
            # Link text ends with a party abbreviation such as " (R)";
            # strip it from the name and pull the party letter out of it.
            full_name = link.text[0:-4]
            district = re.search(r"District (\d+)", link.tail).group(1)

            party = link.text[-2]
            if party == 'R':
                party = 'Republican'
            elif party == 'D':
                party = 'Democratic'

            legislator = Legislator(term, chamber, district,
                                    full_name, party=party)
            legislator.add_source(leg_list_url)
            self.save_legislator(legislator)
def scrape_house(self, year):
    if int(year) < 2000 or int(year) > dt.date.today().year:
        raise NoDataForPeriod(year)

    bill_page_url = ('%s/BillList.aspx?year=%s' %
                     (self.senate_base_url, year))
    self.parse_house_billpage(bill_page_url, year)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber]

    url = ("http://www.in.gov/cgi-bin/legislative/listing/"
           "listing-2.pl?data=alpha&chamber=%s" % chamber_name)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for link in page.xpath("//div[@id='col2']/p/a"):
            name = link.text.strip()
            details = link.getnext().text.strip()

            party = details.split(',')[0]
            if party == 'Democrat':
                party = 'Democratic'

            district = re.search(r'District (\d+)', details).group(1)
            district = district.lstrip('0')

            leg = Legislator(term, chamber, district, name,
                             '', '', '', party)
            leg.add_source(url)
            self.save_legislator(leg)
def scrape(self, chamber, term):
    if term != '27':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        chamber_abbr = 'S'
        url = 'http://senate.legis.state.ak.us/'
        search = 'senator'
    else:
        chamber_abbr = 'H'
        url = 'http://house.legis.state.ak.us/'
        search = 'rep'

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        seen = set()
        for link in page.xpath("//a[contains(@href, '%s')]" % search):
            name = link.text

            # Members of the leadership are linked up to three times:
            # one image link and two text links. Don't scrape them
            # more than once.
            if not name or link.attrib['href'] in seen:
                continue
            seen.add(link.attrib['href'])

            self.scrape_legislator(chamber, term,
                                   link.xpath('string()').strip(),
                                   link.attrib['href'])
def scrape(self, chamber, session):
    if session != '2011 Regular Session':
        raise NoDataForPeriod(session)

    url = "http://www.lrc.ky.gov/legislative_calendar/index.aspx"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for div in page.xpath("//div[@style = 'MARGIN-LEFT: 20px']"):
            date = div.xpath("string(../../span[1])").strip()

            try:
                time, location = div.xpath("string(span[1])").split(',')
            except ValueError:
                # No meetings.
                continue

            when = "%s %s" % (date, time)
            when = datetime.datetime.strptime(when,
                                              "%A, %B %d, %Y %I:%M%p")
            when = self._tz.localize(when)

            desc = div.xpath("string(span[2])").strip()
            event = Event(session, when, 'committee:meeting',
                          desc, location=location)
            event.add_source(url)
            self.save_event(event)
def scrape(self, chamber, year):
    year = int(year)
    session = self.getSession(year)

    if year < 1999:
        raise NoDataForPeriod(year)

    # Terms are two years long and start on odd years, so for an even
    # year use the previous odd year.
    if year % 2 == 0:
        year -= 1

    if year == 1999:
        base_bill_url = 'http://data.opi.mt.gov/bills/BillHtml/'
    else:
        base_bill_url = 'http://data.opi.mt.gov/bills/%d/BillHtml/' % year

    index_page = ElementTree(
        lxml.html.fromstring(self.urlopen(base_bill_url)))

    bill_urls = []
    for bill_anchor in index_page.findall('//a'):
        # Skip line-item veto documents; see 2009 HB 645.
        if bill_anchor.text.find("govlineveto") == -1:
            # House bills start with H, Senate bills start with S.
            if chamber == 'lower' and bill_anchor.text.startswith('H'):
                bill_urls.append("%s%s" % (base_bill_url, bill_anchor.text))
            elif chamber == 'upper' and bill_anchor.text.startswith('S'):
                bill_urls.append("%s%s" % (base_bill_url, bill_anchor.text))

    for bill_url in bill_urls:
        bill = self.parse_bill(bill_url, session, chamber)
        self.save_bill(bill)
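# A quick illustration of the odd-year folding above: Montana keys its
# bill directories by the odd start year of each two-year term, so even
# years map back one year. Hypothetical helper name, same arithmetic:
def _montana_index_year(year):
    # 1999 -> 1999, 2000 -> 1999, 2009 -> 2009, 2010 -> 2009
    return year - 1 if year % 2 == 0 else year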
def scrape_senate(self, year):
    # We only have data from 2005 to the present.
    if int(year) < 2005 or int(year) > dt.date.today().year:
        raise NoDataForPeriod(year)

    year2 = "%02d" % (int(year) % 100)

    # The year is mixed into the directory name; save a root URL,
    # since we'll use it later.
    bill_root = 'http://www.senate.mo.gov/%sinfo/BTS_Web/' % year2
    index_url = bill_root + 'BillList.aspx?SessionType=R'

    with self.urlopen(index_url) as index_page:
        index_page = lxml.html.fromstring(index_page)

        # Each bill is in its own table (nested in a larger table).
        bill_tables = index_page.xpath('//a[@id]')
        if not bill_tables:
            return

        for bill_table in bill_tables:
            # Search the anchor's id for the BillID the MO Senate
            # site uses.
            if re.search(r'dgBillList.*hlBillNum', bill_table.attrib['id']):
                self.parse_senate_billpage(
                    bill_root + bill_table.attrib.get('href'), year)
def scrape_senate(self, year):
    # We only have data from 2005 to the present.
    if int(year) < 2005 or int(year) > dt.date.today().year:
        raise NoDataForPeriod(year)

    year2 = "%02d" % (int(year) % 100)

    # The year is mixed into the directory name; save a root URL,
    # since we'll use it later.
    bill_root = self.senate_root + '/' + year2 + 'info/BTS_Web/'
    index_url = bill_root + 'BillList.aspx?SessionType=R'

    with self.urlopen(index_url) as index_page:
        index_page = BeautifulSoup(index_page)

        # Each bill is in its own table (nested in a larger table).
        bill_tables = index_page.findAll(id="Table2")
        if not bill_tables:
            return

        for bill_table in bill_tables:
            # Search the whole table string for the BillID the MO
            # Senate site uses.
            m = re.search(r"BillID=(\d*)", str(bill_table))
            if m:
                bill_web_id = m.group(1)
                bill_url = (bill_root + '/Bill.aspx?SessionType=R&BillID=' +
                            bill_web_id)
                self.parse_senate_billpage(bill_url, year)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/senators_ca.cfm')
    else:
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/representatives_ca.cfm')

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        committees = {}

        for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
            name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
            name = name[0:-4]

            for link in li.xpath("a"):
                if not link.tail:
                    continue

                committee_name = link.tail.strip()
                committee_name = re.sub(r"\s+", " ", committee_name)
                subcommittee_name = None
                role = 'member'

                rest = link.getnext().text
                if rest:
                    match = re.match(r',\s+(Subcommittee on .*)\s+-', rest)
                    if match:
                        subcommittee_name = match.group(1)
                        role = rest.split('-')[1].strip().lower()
                    else:
                        role = rest.replace(', ', '').strip().lower()

                    if role == 'chairman':
                        role = 'chair'

                try:
                    committee = committees[(chamber, committee_name,
                                            subcommittee_name)]
                except KeyError:
                    committee = Committee(chamber, committee_name)
                    committee.add_source(url)
                    if subcommittee_name:
                        committee['subcommittee'] = subcommittee_name
                    committees[(chamber, committee_name,
                                subcommittee_name)] = committee

                committee.add_member(name, role)

        for committee in committees.values():
            self.save_committee(committee)
def scrape(self, chamber, session):
    if session != '27':
        raise NoDataForPeriod(session)

    if chamber == 'other':
        return

    year, year2 = None, None
    for term in self.metadata['terms']:
        if term['sessions'][0] == session:
            year = str(term['start_year'])
            year2 = str(term['end_year'])
            break

    # Full calendar year.
    date1 = '0101' + year[2:]
    date2 = '1231' + year[2:]

    url = ("http://www.legis.state.ak.us/basis/"
           "get_hearing.asp?session=%s&Chamb=B&Date1=%s&Date2=%s&"
           "Comty=&Root=&Sel=1&Button=Display" % (session, date1, date2))

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        path = "//font[starts-with(., '(H)') or starts-with(., '(S)')]"
        for font in page.xpath(path):
            match = re.match(r'^\((H|S)\)(.+)$', font.text)

            chamber = {'H': 'lower', 'S': 'upper'}[match.group(1)]
            comm = match.group(2).strip().title()

            next_row = font.xpath("../../following-sibling::tr[1]")[0]

            when = next_row.xpath("string(td[1]/font)").strip()
            when = datetime.datetime.strptime(when + " " + year,
                                              "%b %d %A %I:%M %p %Y")
            when = self._tz.localize(when)

            where = next_row.xpath("string(td[2]/font)").strip()

            description = "Committee Meeting\n"
            description += comm

            event = Event(session, when, 'committee:meeting',
                          description, location=where)

            # Attach the agenda link, if any.
            links = font.xpath(
                "../../td/font/a[contains(@href, 'get_documents')]")
            if links:
                event['link'] = links[0].attrib['href']

            event.add_source(url)
            self.save_event(event)
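# A worked example of the strptime call above (a sketch; the literal
# string illustrates what "%b %d %A %I:%M %p %Y" accepts, not a value
# captured from the site):
import datetime

when = datetime.datetime.strptime("Jan 19 Wednesday 1:30 PM 2011",
                                  "%b %d %A %I:%M %p %Y")
assert when == datetime.datetime(2011, 1, 19, 13, 30)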
def scrape(self, chamber, term):
    if term != self.metadata['terms'][-1]['name']:
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        self.scrape_senate()
    else:
        self.scrape_house()
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == "upper":
        self.scrape_upper()
    elif chamber == "lower":
        self.scrape_lower()
def scrape(self, chamber, term):
    if term != '2009-2010':
        raise NoDataForPeriod(term)

    if chamber == "upper":
        self.scrape_senate()
    elif chamber == "lower":
        self.scrape_assembly()
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        self.scrape_senators(chamber, term)
    else:
        self.scrape_reps(chamber, term)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'lower':
        self.scrape_lower(term)
    else:
        self.scrape_upper(term)
def scrape(self, chamber, session):
    if session != '2010':
        raise NoDataForPeriod(session)

    if chamber == 'lower':
        self.scrape_house_weekly_schedule(session)

    self.scrape_committee_schedule(session, chamber)
def scrape(self, chamber, term):
    if term != '20112012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        self.scrape_upper_committees(term)
    else:
        self.scrape_lower_committees(term)
def scrape(self, chamber, term):
    # Data available for this term only.
    if term != '2010':
        raise NoDataForPeriod(term)

    if chamber == "upper":
        self.scrape_senate()
    elif chamber == "lower":
        self.scrape_house()
def scrape(self, chamber, session):
    # Data available for this session only.
    if year_from_session(session) != 2010:
        raise NoDataForPeriod(session)

    if chamber == 'upper':
        self.scrape_senate(session)
    elif chamber == 'lower':
        self.scrape_house(session)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        chamber_name = 'senate'
    else:
        chamber_name = 'house'

    url = "http://www.legis.iowa.gov/Legislators/%s.aspx" % chamber_name
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    table = page.xpath('//table[@class="legis"]')[0]
    for link in table.xpath(".//a[contains(@href, 'legislator.aspx')]"):
        name = link.text.strip()
        leg_url = link.get('href')
        district = link.xpath("string(../../td[2])")
        party = link.xpath("string(../../td[3])")
        email = link.xpath("string(../../td[5])")

        if party == 'Democrat':
            party = 'Democratic'

        pid = re.search(r"PID=(\d+)", link.attrib['href']).group(1)
        photo_url = ("http://www.legis.iowa.gov/getPhotoPeople.aspx"
                     "?GA=84&PID=%s" % pid)

        leg = Legislator(term, chamber, district, name, party=party,
                         email=email, photo_url=photo_url, url=url)
        leg.add_source(url)

        leg_page = lxml.html.fromstring(self.urlopen(link.attrib['href']))

        comm_path = "//a[contains(@href, 'committee')]"
        for comm_link in leg_page.xpath(comm_path):
            comm = comm_link.text.strip()

            # A trailing parenthetical, e.g. "(Chair)", is the
            # membership type.
            match = re.search(r'\((.+)\)$', comm)
            if match:
                comm = re.sub(r'\((.+)\)$', '', comm).strip()
                mtype = match.group(1).lower()
            else:
                mtype = 'member'

            if comm.endswith('Appropriations Subcommittee'):
                sub = re.match(r'^(.+) Appropriations Subcommittee$',
                               comm).group(1)
                leg.add_role('committee member', term, chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub, position=mtype)
            else:
                leg.add_role('committee member', term, chamber=chamber,
                             committee=comm, position=mtype)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    office_code = {'upper': 'S', 'lower': 'H'}[chamber]

    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = urllib2.urlopen(leg_url)
    page = csv.DictReader(page)

    for row in page:
        if office_code != row['office code']:
            continue

        district = row['dist'].lstrip('0')

        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        office_address = "%s, Room %s\nHartford, CT 06106-1591" % (
            row['capitol street address'], row['room number'])

        leg = Legislator(term, chamber, district, name,
                         first_name=row['first name'],
                         last_name=row['last name'],
                         middle_name=row['middle initial'],
                         suffixes=row['suffix'],
                         party=party,
                         email=row['email'],
                         url=row['URL'],
                         office_address=office_address,
                         office_phone=row['capitol phone'])
        leg.add_source(leg_url)

        for comm_code in row['committee codes'].split(';'):
            if comm_code:
                comm_name = self._committee_names[comm_code]
                leg.add_role('committee member', term,
                             chamber='joint', committee=comm_name)

        self.save_legislator(leg)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    # Map the term's start year onto a legislature number:
    # 2011 -> session 125.
    session = ((int(term[0:4]) - 2009) / 2) + 124

    if chamber == 'upper':
        self.scrape_senators(chamber, session, term)
    elif chamber == 'lower':
        self.scrape_reps(chamber, session, term)
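# The session arithmetic above worked in isolation (Python 2 integer
# division assumed): the start year of the '2011-2012' term maps to
# session 125.
term = '2011-2012'
session = ((int(term[0:4]) - 2009) / 2) + 124
assert session == 125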
def _scrape_lower_chamber(self, year):
    # We only have data back to 2000.
    if int(year) < 2000:
        raise NoDataForPeriod(year)

    self.info('Scraping bills from lower chamber.')

    bill_page_url = '{}/BillList.aspx?year={}'.format(
        self._senate_base_url, year)
    self._parse_house_billpage(bill_page_url, year)
def scrape(self, chamber, year):
    if year != '2010':
        raise NoDataForPeriod(year)

    if chamber == 'upper':
        url = 'http://legis.state.nm.us/lcs/leg.aspx?T=S'
    else:
        url = 'http://legis.state.nm.us/lcs/leg.aspx?T=R'

    self.scrape_legislator_data(url, chamber)