def scrape_bills(self, chamber, year):
    self.log("Getting bill list for %s %s" % (chamber, year))

    if chamber == 'upper':
        min_id = self.upper_min_id
        max_id = self.upper_max_id
    elif chamber == 'lower':
        min_id = self.lower_min_id
        max_id = self.lower_max_id

    for id in range(min_id, max_id):
        bill_info_url = 'http://dlr.leg.wa.gov/billsummary/default.aspx'\
            '?year=%s&bill=%s' % (year, id)

        with self.soup_context(bill_info_url) as soup:
            self.log('Opened %s' % id)

            bill_id = soup.find(
                'span', id='ctl00_contentRegion_lblShortBillID').string
            bill_title = soup.find(
                'span', id='ctl00_contentRegion_lblBriefDescription').string
            self.log('bill_id: %s, bill_title: %s' % (bill_id, bill_title))

            session_name = self._session_dict[year]

            bill = Bill(session_name, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            self._scrape_bill_docs(soup, bill)
            self._scrape_bill_sponsors(soup, bill)
            self._scrape_bill_votes(soup, bill, chamber)

            self.add_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    with self.soup_context(bill_url) as bill_page:
        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc,
                    bill_url=bill_url, bill_lr=bill_lr,
                    official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor
        sponsor_tag = bill_page.find(id="hlSponsor")
        bill_sponsor = sponsor_tag.i.font.contents[0]
        bill_sponsor_link = sponsor_tag['href']
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and cosponsor_tag.get('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
    page = parse(url).getroot()

    # Bill
    name = page.cssselect('#legislation h1')[0].text_content().strip()
    bill = Bill(session, chamberName, number, name)

    # Sponsorships
    for a in page.cssselect("#sponsors a"):
        bill.add_sponsor('', a.text_content().strip())

    # Actions
    for row in page.cssselect('#history tr')[1:]:
        date = row[0].text_content().strip()
        action_text = row[1].text_content().strip()
        if '/' not in date:
            continue
        if action_text.startswith('Senate'):
            bill.add_action('upper', action_text, date)
        elif action_text.startswith('House'):
            bill.add_action('lower', action_text, date)

    # Versions
    for a in page.cssselect('#versions a'):
        bill.add_version(a.text_content(),
                         urlparse.urljoin(url, a.get('href')))

    self.add_bill(bill)
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
    page = parse(url).getroot()

    # Grab the interesting tables on the page.
    tables = page.cssselect('center table')

    # Bill
    name = tables[0].text_content().split('-', 1)[1]
    bill = Bill(session, chamberName, number, name)

    # Sponsorships
    for a in tables[1].cssselect('a'):
        bill.add_sponsor('', a.text_content().strip())

    # Actions
    center = page.cssselect('center table center')[0]
    for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
        date = row[0].text_content().strip()
        action_text = row[1].text_content().strip()
        if '/' not in date:
            continue
        if action_text.startswith('Senate'):
            bill.add_action('upper', action_text, date)
        elif action_text.startswith('House'):
            bill.add_action('lower', action_text, date)

    # Versions
    for a in center.cssselect('table')[-1].cssselect('a'):
        bill.add_version(a.text_content(),
                         urlparse.urljoin(url, a.get('href')))

    self.add_bill(bill)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    page = parse(url).getroot()

    # Grab the interesting tables on the page.
    tables = page.cssselect('table')

    # Bill
    name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
    bill = Bill(session, chamberName, number, name)

    # Versions
    bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

    # Sponsorships
    for a in tables[2].cssselect('a'):
        bill.add_sponsor('', a.text_content().strip())

    # Actions
    for row in tables[-1].cssselect('tr'):
        senate_date = row[0].text_content().strip()
        action_text = row[1].text_content().strip()
        house_date = row[2].text_content().strip()
        if '/' not in senate_date and '/' not in house_date:
            continue
        if senate_date:
            bill.add_action('upper', action_text, senate_date)
        if house_date:
            bill.add_action('lower', action_text, house_date)

    self.add_bill(bill)
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sb1.htm"
    page = parse(url).getroot()

    # Bill
    try:
        name = page.cssselect("#legislation h1")[0].text_content().strip()
    except IndexError:
        name = "Unknown"
    bill = Bill(session, chamberName, number, name)

    # Sponsorships
    for a in page.cssselect("#sponsors a"):
        bill.add_sponsor("", a.text_content().strip())

    self.parse_votes(url, page, chamberName, bill)

    # Actions
    for row in page.cssselect("#history tr")[1:]:
        date = row[0].text_content().strip()
        action_text = row[1].text_content().strip()
        if "/" not in date:
            continue
        if action_text.startswith("Senate"):
            bill.add_action("upper", action_text, date)
        elif action_text.startswith("House"):
            bill.add_action("lower", action_text, date)

    # Versions
    for a in page.cssselect("#versions a"):
        bill.add_version(a.text_content(),
                         urlparse.urljoin(url, a.get("href")))

    self.add_bill(bill)
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sb1.htm"
    page = parse(url).getroot()

    # Grab the interesting tables on the page.
    tables = page.cssselect("center table")

    # Bill
    name = tables[0].text_content().split("-", 1)[1]
    bill = Bill(session, chamberName, number, name)

    # Sponsorships
    for a in tables[1].cssselect("a"):
        bill.add_sponsor("", a.text_content().strip())

    self.parse_votes_2001_2004(url, page, chamberName, bill)

    # Actions
    center = page.cssselect("center table center")[0]
    for row in center.cssselect("table")[-2].cssselect("tr")[2:]:
        date = row[0].text_content().strip()
        action_text = row[1].text_content().strip()
        if "/" not in date:
            continue
        if action_text.startswith("Senate"):
            bill.add_action("upper", action_text, date)
        elif action_text.startswith("House"):
            bill.add_action("lower", action_text, date)

    # Versions
    for a in center.cssselect("table")[-1].cssselect("a"):
        bill.add_version(a.text_content(),
                         urlparse.urljoin(url, a.get("href")))

    self.add_bill(bill)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    page = parse(url).getroot()

    # Grab the interesting tables on the page.
    tables = page.cssselect("table")

    # Bill
    name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
    bill = Bill(session, chamberName, number, name)

    # Versions
    bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

    # Sponsorships
    for a in tables[2].cssselect("a"):
        bill.add_sponsor("", a.text_content().strip())

    self.parse_votes_1999(url, page, chamberName, bill)

    # Actions
    for row in tables[-1].cssselect("tr"):
        senate_date = row[0].text_content().strip()
        action_text = row[1].text_content().strip()
        house_date = row[2].text_content().strip()
        if "/" not in senate_date and "/" not in house_date:
            continue
        if senate_date:
            bill.add_action("upper", action_text, senate_date)
        if house_date:
            bill.add_action("lower", action_text, house_date)

    self.add_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into CSV files.
    """
    bill_detail_url_base = \
        'https://www.revisor.leg.state.mn.us/revisor/pages/search_status/'
    bill_detail_url = urlparse.urljoin(bill_detail_url_base, bill_detail_url)

    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.soup_context(bill_detail_url) as bill_soup:
        bill_id = self.extract_bill_id(bill_soup)
        bill_title = self.extract_bill_title(bill_soup)
        bill = Bill(session, chamber, bill_id, bill_title)

        # get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the bill
        # details page in a link titled, "Bill Text".
        version_url_base = 'https://www.revisor.leg.state.mn.us'
        bill_version_link = self.extract_bill_version_link(bill_soup)
        version_detail_url = urlparse.urljoin(version_url_base,
                                              bill_version_link)

        with self.soup_context(version_detail_url) as version_soup:
            # MN bills can have multiple versions. Get them all, and loop
            # over the results, adding each one.
            bill_versions = self.extract_bill_versions(version_soup)
            for version in bill_versions:
                version_name = version['name']
                version_url = urlparse.urljoin(version_url_base,
                                               version['url'])
                bill.add_version(version_name, version_url)

        # grab primary and cosponsors
        # MN uses "Primary Author" to name a bill's primary sponsor.
        # Everyone else listed will be added as a 'cosponsor'.
        sponsors = self.extract_bill_sponsors(bill_soup)
        primary_sponsor = sponsors[0]
        cosponsors = sponsors[1:]
        bill.add_sponsor('primary', primary_sponsor)
        for leg in cosponsors:
            bill.add_sponsor('cosponsor', leg)

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(bill_soup, chamber)
        for action in bill_actions:
            action_chamber = action['action_chamber']
            action_date = action['action_date']
            action_text = action['action_text']
            bill.add_action(action_chamber, action_text, action_date)

        self.add_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url,
                  version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.soup_context(bill_detail_url) as bill_soup:
        bill_id = self.extract_bill_id(bill_soup)
        bill_title = self.extract_bill_title(bill_soup)
        bill = Bill(session, chamber, bill_id, bill_title)

        # Get all versions of the bill.
        # Versions of a bill are on a separate page, linked to from the
        # column labeled, "Bill Text", on the search results page.
        with self.soup_context(version_list_url) as version_soup:
            # MN bills can have multiple versions. Get them all, and loop
            # over the results, adding each one.
            self.debug("Extracting bill versions from: " + version_list_url)
            bill_versions = self.extract_bill_versions(version_soup)
            for version in bill_versions:
                version_name = version['name']
                version_url = urlparse.urljoin(VERSION_URL_BASE,
                                               version['url'])
                bill.add_version(version_name, version_url)

        # grab primary and cosponsors
        # MN uses "Primary Author" to name a bill's primary sponsor.
        # Everyone else listed will be added as a 'cosponsor'.
        sponsors = self.extract_bill_sponsors(bill_soup)
        primary_sponsor = sponsors[0]
        cosponsors = sponsors[1:]
        bill.add_sponsor('primary', primary_sponsor)
        for leg in cosponsors:
            bill.add_sponsor('cosponsor', leg)

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(bill_soup, chamber)
        for action in bill_actions:
            action_chamber = action['action_chamber']
            action_date = action['action_date']
            action_text = action['action_text']
            bill.add_action(action_chamber, action_text, action_date)

        self.add_bill(bill)
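# The extract_* methods used by both get_bill_info variants are defined
# elsewhere on the scraper. A minimal sketch of one of them, assuming the
# sponsor names sit in anchors under an element the status page marks as
# the authors list -- the id used here is an assumption for illustration,
# not MN's confirmed markup:

def extract_bill_sponsors(self, bill_soup):
    # First entry is the "Primary Author"; the rest are cosponsors.
    sponsors = []
    for a in bill_soup.find(id='authors').findAll('a'):
        sponsors.append(a.contents[0].strip())
    return sponsors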
def scrape1995(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
    page = parse(url).getroot()

    # Bill
    name = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
    bill = Bill(session, chamberName, number, name)

    # Versions
    bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

    # Sponsorships
    rows = page.cssselect('center table tr')
    for row in rows:
        if row.text_content().strip() == 'Sponsor and CoSponsors':
            continue
        if row.text_content().strip() == 'Links / Committees / Status':
            break
        for a in row.cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

    # Actions
    # The actions are in a pre table that looks like:
    """
             SENATE                        HOUSE
             -------------------------------------
    1/13/95  Read 1st time                 2/6/95
    1/31/95  Favorably Reported
    2/1/95   Read 2nd Time                 2/7/95
    2/3/95   Read 3rd Time
    2/3/95   Passed/Adopted
    """
    actions = page.cssselect('pre')[0].text_content().split('\n')
    actions = actions[2:]
    for action in actions:
        senate_date = action[:22].strip()
        action_text = action[23:46].strip()
        house_date = action[46:].strip()
        if '/' not in senate_date and '/' not in house_date:
            continue
        if senate_date:
            bill.add_action('upper', action_text, senate_date)
        if house_date:
            bill.add_action('lower', action_text, house_date)

    self.add_bill(bill)
def scrape1995(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
    page = parse(url).getroot()

    # Bill
    name = page.cssselect("h3 br")[0].tail.split("-", 1)[1].strip()
    bill = Bill(session, chamberName, number, name)

    # Versions
    bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

    # Sponsorships
    rows = page.cssselect("center table tr")
    for row in rows:
        if row.text_content().strip() == "Sponsor and CoSponsors":
            continue
        if row.text_content().strip() == "Links / Committees / Status":
            break
        for a in row.cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

    # Actions
    # The actions are in a pre table that looks like:
    """
             SENATE                        HOUSE
             -------------------------------------
    1/13/95  Read 1st time                 2/6/95
    1/31/95  Favorably Reported
    2/1/95   Read 2nd Time                 2/7/95
    2/3/95   Read 3rd Time
    2/3/95   Passed/Adopted
    """
    actions = page.cssselect("pre")[0].text_content().split("\n")
    actions = actions[2:]
    for action in actions:
        senate_date = action[:22].strip()
        action_text = action[23:46].strip()
        house_date = action[46:].strip()
        if "/" not in senate_date and "/" not in house_date:
            continue
        if senate_date:
            bill.add_action("upper", action_text, senate_date)
        if house_date:
            bill.add_action("lower", action_text, house_date)

    self.add_bill(bill)
def scrape_bill(self, chamber, session, billid, histurl, year):
    if year[0] != 'R':
        session = year
    else:
        session = self.metadata['session_details'][year][
            'sub_sessions'][int(year[0]) - 1]

    with self.urlopen_context(histurl) as data:
        soup = BS(cleansource(data))
        basicinfo = soup.findAll('div', id='bhistleft')[0]
        hist = basicinfo.table

        sponsor = None
        title = None
        for b in basicinfo.findAll('b'):
            if b.next.startswith('SUMMARY'):
                title = b.findNextSiblings(text=True)[0].strip()
            elif b.next.startswith('SPONSOR'):
                for a in b.findNextSiblings('a'):
                    if not issponsorlink(a):
                        break
                    sponsor = cleansponsor(a.contents[0])

        bill = Bill(session, chamber, billid, title)

        if sponsor:
            bill.add_sponsor('primary', sponsor)

        for row in hist.findAll('tr'):
            link = row.td.a
            vlink = urlbase % link['href']
            vname = link.contents[0].strip()
            bill.add_version(vname, vlink)

        history = soup.findAll('div', id='bhisttab')[0].table
        rows = history.findAll('tr')[1:]
        for row in rows:
            tds = row.findAll('td')
            if len(tds) < 2:
                # This is not actually an action
                continue
            date, action = row.findAll('td')[:2]
            date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
            action = action.contents[0].strip()
            if 'House' in action:
                actor = 'lower'
            elif 'Senate' in action:
                actor = 'upper'
            else:
                # for lack of a better guess
                actor = chamber
            bill.add_action(actor, action, date)

    self.add_bill(bill)
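# cleansource, issponsorlink, cleansponsor and urlbase are module-level
# helpers that are not shown here. A rough sketch under assumptions: the
# href test and the base URL are guesses for illustration, not the
# scraper's confirmed values.

urlbase = 'http://www.legis.state.wv.us%s'  # assumed format string

def issponsorlink(a):
    # Assumption: sponsor anchors link to legislator lookup pages, while
    # the links that follow them do not.
    return 'sponsor' in a.get('href', '').lower()

def cleansponsor(sponsor):
    # Normalize "SMITH," style names to "Smith".
    return sponsor.strip().strip(',').title()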
def scrape_session(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "HB"
    else:
        bill_abbr = "SB"

    bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
        session.replace(' ', ''))
    self.log("Getting bill list for %s, %s" % (session, chamber))

    try:
        base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
    except:
        # this session doesn't exist for this year
        return

    bill_list_link_re = re.compile(r'.*%s\d+ht.htm$' % bill_abbr)

    for link in base_bill_list.findAll('a', href=bill_list_link_re):
        bill_list = self.soup_parser(self.urlopen(link['href']))
        bill_link_re = re.compile(r'.*billhtm/%s.*.htm' % bill_abbr)

        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.find(text=True).strip()

            bill_info_url = bill_link['href']
            bill_info = self.soup_parser(self.urlopen(bill_info_url))

            bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                '&nbsp;', ' ').strip().split(' -- ')

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)
            bill.add_sponsor('primary', primary_sponsor)

            status_re = re.compile(r'.*billsta/%s.*.htm' %
                                   bill_abbr.lower())
            status_link = bill_info.find('a', href=status_re)
            if status_link:
                self.parse_status(bill, status_link['href'])

            text_find = bill_info.find(
                text="Bill Text (If you are having trouble viewing")
            if text_find:
                text_link_re = re.compile(r'.*\.htm')
                for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                    version_name = text_link.previous.strip()
                    bill.add_version(version_name, text_link['href'])

            self.add_bill(bill)
def parse_bill(self, chamber, session, special, link):
    bill_number = link.contents[0]
    bill_type = re.search('type=(B|R|)', link['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_number)

    bill_info_url = info_url(chamber, session, special, bill_type,
                             bill_number)

    with self.soup_context(bill_info_url) as info_page:
        title_label = info_page.find(text='Short Title:')
        title = title_label.findNext().contents[0]

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(bill_info_url)

        self.parse_bill_versions(bill, info_page)

        self.parse_history(bill, history_url(chamber, session, special,
                                             bill_type, bill_number))

        self.parse_votes(bill, vote_url(chamber, session, special,
                                        bill_type, bill_number))

        self.add_bill(bill)
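# bill_abbr, info_url, history_url and vote_url are module-level helpers.
# A minimal sketch of two of them, assuming the PA billinfo URL layout;
# treat the query parameters here as assumptions, not confirmed constants:

def bill_abbr(chamber):
    # 'H' for House bills ("HB ..."), 'S' for Senate ("SB ...")
    return 'H' if chamber == 'lower' else 'S'

def info_url(chamber, session, special, bill_type, number):
    return ('http://www.legis.state.pa.us/cfdocs/billinfo/billinfo.cfm'
            '?syear=%s&sind=%s&body=%s&type=%s&bn=%s' % (
                session, special, bill_abbr(chamber), bill_type, number))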
def scrape1997(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1997_98/leg/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = []
        for table in page.cssselect('center table'):
            if table.get('border') == '5':
                tables.append(table)

        # Bill
        name = page.cssselect(
            'tr > td > font > b')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        for a in tables[0].cssselect('a'):
            if a.text_content().strip() == 'Current':
                break
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in tables[1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except
    for votes, which are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)

    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)

    bill = Bill(session, chamber, bill_id, bill_name.strip(),
                status_url=url)

    actions = extract_actions(s)
    for chamber, action, date in actions:
        # kwargs are permitted if we have 'em
        bill.add_action(chamber, action, date)

    sponsor_dict = extract_sponsors_from_actions(
        [action[1] for action in actions])
    for type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(type, name)

    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)

    return bill
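# get_text and the extract_* functions above are module helpers defined
# elsewhere. A minimal, assumed sketch of get_text, which flattens a
# BeautifulSoup node into plain text:

def get_text(node):
    # Join every text fragment under the node and collapse whitespace.
    return ' '.join(''.join(node.findAll(text=True)).split())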
def scrape_bills(self, chamber, year):
    if year != "2009":
        raise NoDataForYear(year)

    if chamber == "upper":
        other_chamber = "lower"
        bill_id = "SB 1"
    else:
        other_chamber = "upper"
        bill_id = "HB 1"

    b1 = Bill("2009-2010", chamber, bill_id, "A super bill")
    b1.add_source("http://example.com")
    b1.add_version("As Introduced", "http://example.com/SB1.html")
    b1.add_document("Google", "http://google.com")
    b1.add_sponsor("primary", "Bob Smith")
    b1.add_sponsor("secondary", "Johnson, Sally")

    d1 = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
    v1 = Vote("upper", d1, "Final passage", True, 2, 0, 0)
    v1.yes("Bob Smith")
    v1.yes("Sally Johnson")

    d2 = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
    v2 = Vote("lower", d2, "Final passage", False, 0, 1, 1)
    v2.no("B. Smith")
    v2.other("Sally Johnson")

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, "introduced", d1)
    b1.add_action(chamber, "read first time", d1)
    b1.add_action(other_chamber, "introduced", d2)

    self.save_bill(b1)
def scrape_session(self, chamber, year):
    if chamber == "upper":
        bill_abbr = "SB|SCR|SJR"
    elif chamber == "lower":
        bill_abbr = "HB|HCR|HJR"

    # Sessions last 2 years, 1993-1994 was the 18th
    session = str(18 + ((int(year) - 1993) / 2))
    year2 = str(int(year) + 1)

    # Full calendar year
    date1 = "0101" + year[2:]
    date2 = "1231" + year2[2:]

    # Get bill list
    bill_list_url = ("http://www.legis.state.ak.us/"
                     "basis/range_multi.asp?session=%s&date1=%s&date2=%s" % (
                         session, date1, date2))
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = r"bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(" ", "")
        bill_name = link.parent.parent.findNext("td").find(
            "font").contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page and strip malformed tags
        info_url = "http://www.legis.state.ak.us/basis/%s" % link["href"]
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            r" (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})",
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(",")
            bill.add_sponsor("primary", sponsors[0].strip())
            for sponsor in sponsors[1:]:
                bill.add_sponsor("cosponsor", sponsor.strip())
        else:
            # Committee sponsorship
            bill.add_sponsor("committee", spons_str.strip())

        # Get actions
        act_rows = info_page.findAll("table", "myth")[1].findAll("tr")[1:]
        for row in act_rows:
            cols = row.findAll("td")
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, "%m/%d/%y")

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match(r"\w+ Y(\d+) N(\d+)", action):
                vote = self.parse_vote(bill, action, act_chamber,
                                       act_date, cols[1].a["href"])
                bill.add_vote(vote)

            bill.add_action(act_chamber, action, act_date)

        # Get subjects
        bill["subjects"] = []
        subject_link_re = re.compile(r".*subject=\w+$")
        for subject_link in info_page.findAll("a", href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill["subjects"].append(subject)

        # Get versions
        text_list_url = ("http://www.legis.state.ak.us/"
                         "basis/get_fulltext.asp?session=%s&bill=%s" % (
                             session, bill_id))
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile("^get_bill_text?")
        for text_link in text_list.findAll("a", href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()
            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link["href"])
            bill.add_version(text_name, text_url)

        self.add_bill(bill)
def scrape_session(self, chamber, year):
    if chamber == 'upper':
        bill_abbr = 'SB|SCR|SJR'
    elif chamber == 'lower':
        bill_abbr = 'HB|HCR|HJR'

    # Sessions last 2 years, 1993-1994 was the 18th
    session = str(18 + ((int(year) - 1993) / 2))
    year2 = str(int(year) + 1)

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year2[2:]

    # Get bill list
    bill_list_url = 'http://www.legis.state.ak.us/'\
        'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = r"bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(' ', '')
        bill_name = link.parent.parent.findNext('td').find(
            'font').contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page and strip malformed tags
        info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            r' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            bill.add_sponsor('primary', sponsors[0].strip())
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.strip())
        else:
            # Committee sponsorship
            bill.add_sponsor('committee', spons_str.strip())

        # Get actions
        act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
        for row in act_rows:
            cols = row.findAll('td')
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match(r"\w+ Y(\d+) N(\d+)", action):
                try:
                    vote = self.parse_vote(bill, action, act_chamber,
                                           act_date, cols[1].a['href'])
                    bill.add_vote(vote)
                except:
                    self.log("Failed parsing vote at %s" %
                             cols[1].a['href'])

            bill.add_action(act_chamber, action, act_date)

        # Get subjects
        bill['subjects'] = []
        subject_link_re = re.compile(r'.*subject=\w+$')
        for subject_link in info_page.findAll('a', href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill['subjects'].append(subject)

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile('^get_bill_text?')
        for text_link in text_list.findAll('a', href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()
            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link['href'])
            bill.add_version(text_name, text_url)

        self.save_bill(bill)
def scrape_bills(self, chamber, year):
    if int(year) % 2 == 0:
        raise NoDataForYear(year)
    # year = int(year)
    oyear = year  # save off the original of the session

    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'

    while True:
        (bill_page, year) = self.scrape_bill(year, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        if bill_page is None:
            return

        title = ''.join(self.flatten(
            bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill("Regular Session %s" % oyear, chamber, bill_id,
                        title)

        # sponsors
        first = 0
        for name in bill_page.findAll(
                id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first],
                                 name.string)
            first = 1

        # versions
        for doc in bill_page.findAll(
                id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # documents
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        the_bill.add_source(
            'http://legislature.mi.gov/doc.aspx?%s-%s-%04d' % (
                year, abbr, bill_no))
        self.parse_actions(the_bill, bill_page.findAll(
            id='frg_billstatus_HistoriesGridView')[0])
        self.add_bill(the_bill)

        bill_no = bill_no + 1
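# self.flatten and self.parse_doc are scraper helpers not shown here. A
# rough sketch under assumed markup: flatten collects the text fragments
# beneath a BeautifulSoup node, and parse_doc pulls a (name, url) pair out
# of a table row when one is present.

def flatten(self, node):
    # Recursively gather every text fragment beneath the node; callers
    # ''.join() the result.
    return node.findAll(text=True)

def parse_doc(self, bill, row):
    # Assumption: each useful row carries one link whose text names the
    # document. Returns None for header/spacer rows.
    link = row.find('a')
    if link is None or not link.get('href'):
        return None
    return (link.string or '').strip(), link['href']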
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    with self.urlopen_context(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)

        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        sponsor_links = bill_info.findAll(href=re.compile(
            r'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            action_date = dt.datetime.strptime(action_date, '%b %d')
            # strptime defaults the year to 1900; patch in the session year
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile(r'.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.add_bill(bill)
def scrape_bill(self, chamber, current_bill, session):
    other_chamber = 'upper' if chamber == 'lower' else 'lower'
    with self.soup_context(
            "http://alisondb.legislature.state.al.us/acas/"
            "SESSBillsStatusResultsMac.asp?BillNumber=%s"
            "&GetStatus=Get+Status&session=%s" % (
                current_bill, session[0])) as bill:
        if "Your ACAS Session has expired." in str(bill):
            raise Exception("Expired cookie - you'll have to run with -n "
                            "to skip caching")

        try:
            bill_id = int(re.findall(r'BTN([0-9]+)', str(bill))[0])
        except IndexError:
            raise Exception("No bill found. Hopefully that means it's "
                            "the end of the session")

        title = bill.find("td", {'colspan': '7'}).string
        self.log("Starting parse of %s" % current_bill)

        # create our bill!
        bill = Bill(session[1], chamber, current_bill, title.strip())

        # add sponsors and co-sponsors
        with self.soup_context(
                "http://alisondb.legislature.state.al.us/acas/"
                "ACTIONSponsorsResultsMac.asp?OID=%d" % bill_id) as sponsors:
            # This pains me.
            (primary, secondary) = sponsors.findAll(
                "table", text="Co-Sponsors")[0].parent.parent.parent.findAll(
                    'table')
            for p in primary.findAll('td'):
                bill.add_sponsor('primary', p.string)
            for s in secondary.findAll('td'):
                bill.add_sponsor('cosponsor', s.string)

        with self.soup_context(
                "http://alisondb.legislature.state.al.us/acas/"
                "ACTIONHistoryResultsMac.asp?OID=%d" % bill_id) as history:
            actions = history.findAll(
                'table', text="Committee")[0].parent.parent.parent.findAll(
                    'tr')

            # Date | Amend/Subst | Matter | Committee | Nay | Yea | Abs | Vote
            for event in actions:
                e = event.findAll('td')
                if len(e) == 0:
                    continue

                date = e[0].string
                amend = e[1].find('input')
                matter = e[2].string
                y_votes = e[5].string
                n_votes = e[4].string
                a_votes = e[6].string

                if not matter:
                    continue

                roll = e[7].find('input')
                # (date, amend, matter, committee, nays, yeas, abs,
                #  vote_thing) = map(lambda x: x.string, e)
                if date is not None:
                    act_date = dt.datetime.strptime(date, '%m/%d/%Y')

                if amend is not None:
                    splitter = re.findall(
                        r'documentSelected\(\'(\w*)\',\'([\w\d-]*)\','
                        r'\'([\w\.\-]*)\',\'([\w\d/]*)\',\'([\w\d]*)\','
                        r'\'([\w\s]*)\'', str(amend))[0]
                    amend = ("http://alisondb.legislature.state.al.us"
                             "/acas/%s/%s" % (splitter[3], splitter[2]))
                    bill.add_document(matter, amend)

                if roll is not None:
                    splitter = re.findall(
                        r'voteSelected\(\'(\d*)\',\'(\d*)\',\'(\d*)\','
                        r'\'(.*)\',\'(\d*)\'', str(roll))[0]
                    roll = ("http://alisondb.legislature.state.al.us/acas/"
                            "GetRollCallVoteResults.asp"
                            "?MOID=%s&VOTE=%s&BODY=%s&SESS=%s" % (
                                splitter[0], splitter[1], splitter[2],
                                splitter[4]))
                    with self.soup_context(roll) as votes:
                        vote_rows = votes.findAll(
                            'table',
                            text='Member')[0].parent.parent.parent.findAll(
                                'tr')

                        yea_votes = int(votes.findAll(
                            'tr', text='Total Yea:'
                        )[0].parent.parent.findAll('td')[2].string)
                        nay_votes = int(votes.findAll(
                            'tr', text='Total Nay:'
                        )[0].parent.parent.findAll('td')[2].string)
                        abs_votes = int(votes.findAll(
                            'tr', text='Total Abs:'
                        )[0].parent.parent.findAll('td')[2].string)
                        p_votes = len(votes.findAll('tr', text='P'))

                        # chamber, date, motion, passed, yes, no, other
                        vote = Vote(chamber, act_date, matter,
                                    (yea_votes > nay_votes), yea_votes,
                                    nay_votes, abs_votes + p_votes)
                        vote.add_source(roll)

                        for row in vote_rows:
                            skip = str(row)
                            if ("Total Yea" in skip or "Total Nay" in skip
                                    or "Total Abs" in skip):
                                continue
                            html_layouts_are_awesome = row.findAll('td')
                            if len(html_layouts_are_awesome) == 0:
                                continue

                            (name, t) = (html_layouts_are_awesome[0].string,
                                         html_layouts_are_awesome[2].string)
                            self.dumb_vote(vote, name, t)

                            if len(html_layouts_are_awesome) > 3:
                                (name, t) = (
                                    html_layouts_are_awesome[4].string,
                                    html_layouts_are_awesome[6].string)
                                self.dumb_vote(vote, name, t)

                        bill.add_vote(vote)

                if y_votes is not None:
                    yea_votes = self.dumber_vote(y_votes)
                    nay_votes = self.dumber_vote(n_votes)
                    abs_votes = self.dumber_vote(a_votes)
                    vote = Vote(chamber, act_date, matter,
                                (yea_votes > nay_votes), yea_votes,
                                nay_votes, abs_votes)
                    bill.add_vote(vote)

                bill.add_action(chamber, matter, act_date)

    self.add_bill(bill)
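# dumb_vote and dumber_vote are scraper helpers not shown here. A hedged
# sketch of what they plausibly do, based on how they're called above;
# the 'Y'/'N' markers are assumptions about ALISON's markup:

def dumb_vote(self, vote, name, t):
    # Route one member's row into the Vote object by vote type.
    if name is None:
        return
    name = name.strip()
    if t == 'Y':
        vote.yes(name)
    elif t == 'N':
        vote.no(name)
    else:
        vote.other(name)

def dumber_vote(self, s):
    # Inline tallies arrive as strings that may be blank; coerce to int.
    try:
        return int(s)
    except (TypeError, ValueError):
        return 0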
def scrape_bills(self, chamber, year):
    """Scrape the ND bills considered in a given chamber during a given
    year.
    """
    # Error checking
    if year not in self.metadata['session_details']:
        raise NoDataForYear(year)

    # URL building
    if chamber == 'upper':
        url_chamber_name = 'senate'
        norm_chamber_name = 'Senate'
    else:
        url_chamber_name = 'house'
        norm_chamber_name = 'House'

    assembly_url = '/assembly/%i-%s' % (
        self.metadata['session_details'][str(year)]['number'], year)
    chamber_url = '/bill-text/%s-bill.html' % (url_chamber_name)
    list_url = self.site_root + assembly_url + chamber_url

    # Parsing
    soup = self.parser.parse(self.urlopen(list_url))
    if not soup:
        raise ScrapeError('Failed to parse legislative list page.')

    table = soup.find('table', summary=norm_chamber_name + ' Bills')
    bill_links = table.findAll('a', href=re.compile('bill-actions'))
    indexed_bills = {}

    self.log('Scraping %s bills for %s.' % (norm_chamber_name, year))

    for link in bill_links:
        # Populate base attributes
        attributes = {
            'session': year,
            'chamber': chamber,
        }

        bill_number = link.contents[0]
        if not re.match('^[0-9]{4}$', bill_number):
            raise ScrapeError('Bill number not in expected format.')

        # ND bill prefixes are coded numerically
        prefix_map = {'1': 'HB', '2': 'SB', '3': 'HCR', '4': 'SCR',
                      '5': 'HR', '6': 'SR', '7': 'HMR', '8': 'SMR'}
        bill_prefix = prefix_map[bill_number[0]]

        attributes['bill_id'] = bill_prefix + ' ' + bill_number

        # Skip duplicates (bill is listed once for each version)
        if attributes['bill_id'] in indexed_bills:
            continue

        self.debug(attributes['bill_id'])

        # Parse details page
        attributes.update(
            self.scrape_bill_details(assembly_url, bill_number))

        # Create bill
        bill = Bill(**attributes)

        # Parse actions
        (actions, actions_url) = self.scrape_bill_actions(
            assembly_url, bill_number, year)
        bill.add_source(actions_url)
        for action in actions:
            bill.add_action(**action)

        # Parse versions
        (versions, versions_url) = self.scrape_bill_versions(
            assembly_url, bill_number)
        bill.add_source(versions_url)
        for version in versions:
            bill.add_version(**version)

        # Add bill to dictionary, indexed by its id
        indexed_bills[attributes['bill_id']] = bill

    # Parse sponsorship data
    if int(year) >= 2005:
        self.log('Scraping sponsorship data.')
        (sponsors, sponsors_url) = self.scrape_bill_sponsors(assembly_url)
        for bill_id, sponsor_list in sponsors.items():
            for sponsor in sponsor_list:
                # It's possible a bill was misnamed somewhere... but that's
                # not a good enough reason to error out
                if bill_id in indexed_bills:
                    bill = indexed_bills[bill_id]
                    bill.add_sponsor(**sponsor)
                    bill.add_source(sponsors_url)
    else:
        self.log('Sponsorship data not available for %s.' % year)

    self.log('Saving scraped bills.')
    # Save each bill
    for bill in indexed_bills.values():
        self.save_bill(bill)
def parse_bill_xml(self, chamber, session, txt):
    root = lxml.etree.fromstring(txt)
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
    bill_title = root.findtext("caption")

    if session[2] == 'R':
        session = session[0:2]

    bill = Bill(session, chamber, bill_id, bill_title)

    for action in root.findall('actions/action'):
        act_date = dt.datetime.strptime(action.findtext('date'),
                                        "%m/%d/%Y")

        extra = {}
        extra['action_number'] = action.find('actionNumber').text
        comment = action.find('comment')
        if comment is not None and comment.text:
            extra['comment'] = comment.text.strip()

        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[extra['action_number'][0]]

        bill.add_action(actor, action.findtext('description'),
                        act_date, **extra)

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsor('author', author)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsor('coauthor', coauthor)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsor('sponsor', sponsor)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsor('cosponsor', cosponsor)

    bill['subjects'] = []
    for subject in root.iterfind('subjects/subject'):
        bill['subjects'].append(subject.text.strip())

    return bill
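# parse_bill_xml returns the Bill instead of saving it, so a caller drives
# it. A minimal sketch of such a caller; the exact URL handed in is left
# to the bill-list scraping code and is not assumed here:

def scrape_bill(self, chamber, session, url):
    # Fetch the status XML for one bill and hand it to the parser.
    txt = self.urlopen(url)
    bill = self.parse_bill_xml(chamber, session, txt)
    bill.add_source(url)
    self.save_bill(bill)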
def scrape_session_new(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "H."
    else:
        bill_abbr = "S."

    bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
        session.split('-')[1], bill_abbr[0])
    bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
    bill_list = BeautifulSoup(self.urlopen(bill_list_url))

    bill_link_re = re.compile(r'.*?Bill=%s\.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.findNext('b').string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        text_links = info_page.findAll('blockquote')[1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        act_table = info_page.findAll('blockquote')[2].table
        for row in act_table.findAll('tr')[1:]:
            action = ""
            for s in row.findAll('td')[1].findAll(text=True):
                action += s + " "
            action = action.strip()

            match = re.search('Governor on (.*)$', action)
            if match:
                act_date = parse_exec_date(match.group(1).strip())
                actor = 'Governor'
            else:
                if row['bgcolor'] == 'Salmon':
                    actor = 'lower'
                else:
                    actor = 'upper'

                if row.td.a:
                    act_date = row.td.a.string
                else:
                    act_date = row.td.string

                act_date = re.search(r'\d{1,2}/\d{1,2}/\d{4}',
                                     act_date).group(0)
                act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

            bill.add_action(actor, action, act_date)

            vote_link = row.find('a', text='Details')
            if vote_link:
                vote_url = vote_link.parent['href']
                self.parse_vote_new(bill, actor, vote_url)

        sponsors = info_page.find(
            text='Sponsor(s):').parent.parent.findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        self.save_bill(bill)
def scrape_session_old(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "H."
        chamber_name = "House"
        other_chamber = "Senate"
    else:
        bill_abbr = "S."
        chamber_name = "Senate"
        other_chamber = "House"

    start_date = '1/1/%s' % session.split('-')[0]
    data = urllib.urlencode({'Date': start_date,
                             'Body': bill_abbr[0],
                             'Session': session.split('-')[1]})
    bill_list_url = "http://www.leg.state.vt.us/database/"\
        "rintro/results.cfm"
    bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

    bill_link_re = re.compile(r'.*?Bill=%s.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.parent.findAll('td')[1].string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        text_links = info_page.findAll('blockquote')[-1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        sponsors = info_page.find(
            text='Sponsor(s):').parent.findNext('td').findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        # Grab actions from the originating chamber
        act_table = info_page.find(
            text='%s Status:' % chamber_name).findNext('table')
        for row in act_table.findAll('tr')[3:]:
            action = row.td.string.replace('&nbsp;', '').strip(':')

            act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
            if act_date != "":
                detail = row.findAll('td')[2].b
                if detail and detail.string != "":
                    action += ": %s" % detail.string.replace('&nbsp;', '')
                bill.add_action(chamber, action, act_date)

        # Grab actions from the other chamber
        act_table = info_page.find(
            text='%s Status:' % other_chamber).findNext('table')
        if act_table:
            if chamber == 'upper':
                act_chamber = 'lower'
            else:
                act_chamber = 'upper'

            for row in act_table.findAll('tr')[3:]:
                action = row.td.string.replace('&nbsp;', '').strip(':')

                act_date = row.findAll('td')[1].b.string.replace(
                    '&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace(
                            '&nbsp;', '')
                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                    bill.add_action(act_chamber, action, act_date)

        self.save_bill(bill)
def scrape_old_session(self, chamber, session):
    """Scrape SD's bill data from 1997 through 2008."""
    if chamber == 'upper':
        bill_abbr = 'SB'
    else:
        bill_abbr = 'HB'

    # Get bill list page (and replace malformed tags that some versions of
    # BeautifulSoup choke on)
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'billlist.htm'
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Bill and text link formats
    bill_re = re.compile(r'%s (\d+)' % bill_abbr)
    text_re = re.compile(r'/sessions/%s/bills/%s.*\.htm' % (
        session, bill_abbr), re.IGNORECASE)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a', href=re.compile(r'\d\.htm$')):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        bill_match = bill_re.match(bill_link.contents[0])
        if not bill_match:
            # Not a bill link
            continue

        # Get the bill ID and name
        bill_id = bill_link.contents[0]
        bill_name = bill_link.findNext().contents[0]

        # Get history page (replacing malformed tag)
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        # Get URL of latest version of bill (should be listed last)
        bill_url = history.findAll('a', href=text_re)[-1]['href']
        bill_url = 'http://legis.state.sd.us%s' % bill_url

        # Add bill
        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get bill versions
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            #version_date = row.find('td').string
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us" + version_path
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text="Action"):
                continue

            # Get the date (if we can't find one then this isn't an action)
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Get the action string
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!
                        if node['href'][0] == '/':
                            vote_url = "http://legis.state.sd.us/%s" % (
                                node['href'])
                        else:
                            vote_url = "http://legis.state.sd.us/"\
                                "sessions/%s/%s" % (session, node['href'])

                        vote = self.scrape_old_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)
def scrape_session(self, chamber, year, prefix, session):
    def parse_sponsors(bill, line, chamber):
        sponsor_type = None
        if chamber == "upper":
            leg_chamber = {"primary": "upper", "cosponsor": "lower"}
        else:
            leg_chamber = {"primary": "lower", "cosponsor": "upper"}
        for r in re.split(r'\sand\s|\,|;', line):
            r = r.strip()
            if r.find("Introduced by") != -1:
                sponsor_type = "primary"
                r = re.split(r"Introduced by \w+", r)[1]
            if r.find("cosponsored by") != -1:
                sponsor_type = "cosponsor"
                r = re.split(r"cosponsored by \w+", r)[1]
            bill.add_sponsor(sponsor_type, r.strip(),
                             chamber=leg_chamber[sponsor_type])

    def parse_action(bill, line, actor, date):
        line = lxml.html.fromstring(line)
        sane = line.text_content()
        # Lines look like:
        # "06-18.  S.  Received from Assembly ................... 220"
        # The first 11 characters hold the date and house.
        sane = sane.strip()[11:]  # take out the date and house
        if sane.find('..') != -1:
            sane = sane[0:sane.find(' ..')]  # clear out bookkeeping
        bill.add_action(actor, sane, date)
        for doc in line.findall('a'):
            # have this treat amendments better, as they show up like
            # "1" or "3" now..
            bill.add_document(doc.text_content(), doc.get('href'))
        if sane.find('Ayes') != -1:
            self.add_vote(bill, actor, date, sane)

    house = "SB" if (chamber == "upper") else "AB"
    chambers = {"S": "upper", "A": "lower"}
    i = 1
    while True:
        try:
            url = "http://www.legis.state.wi.us/%s/data/%s%s%dhst.html" % (
                year, prefix, house, i)
            body = unicode(self.urlopen(url), "latin-1")
        except urllib2.HTTPError:
            # 404tastic: no more bills for this session
            return

        page = lxml.html.fromstring(body).cssselect("pre")[0]
        # split the history into each line, excluding all blank lines and
        # the title line
        history = filter(lambda x: len(x.strip()) > 0,
                         lxml.html.tostring(page).split("\n"))[2:-1]

        buffer = ""
        bill_id = page.find("a").text_content()
        bill_title = None
        bill_sponsors = False

        current_year = None
        action_date = None
        current_chamber = None

        for line in history:
            stop = False

            # the year changed
            if re.match(r"^(\d{4})[\s]{0,1}$", line):
                current_year = int(line.strip())
                continue

            # the action changed.
            if re.match(r"\s+(\d{2})-(\d{2}).\s\s([AS])\.\s", line):
                dm = re.findall(r"\s+(\d{2})-(\d{2}).\s\s([AS])\.\s",
                                line)[0]
                workdata = buffer
                buffer = ""
                stop = True

            buffer = buffer + " " + line.strip()

            if stop and not bill_title:
                bill_title = workdata
                bill = Bill(session, chamber, bill_id, bill_title)
                continue

            if stop and not bill_sponsors:
                parse_sponsors(bill, workdata, chamber)
                bill_sponsors = True
                current_chamber = chambers[dm[2]]
                action_date = dt.datetime(current_year, int(dm[0]),
                                          int(dm[1]))
                continue

            if stop:
                parse_action(bill, workdata, current_chamber, action_date)
                # now update the date
                current_chamber = chambers[dm[2]]
                action_date = dt.datetime(current_year, int(dm[0]),
                                          int(dm[1]))

        # flush the final buffered action
        current_chamber = chambers[dm[2]]
        action_date = dt.datetime(current_year, int(dm[0]), int(dm[1]))
        parse_action(bill, buffer, current_chamber, action_date)

        bill.add_source(url)
        self.save_bill(bill)
        i = i + 1
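# self.add_vote isn't shown here. A hedged sketch of what it plausibly
# does: pull the tallies out of action text like "Read a third time and
# concurred in, Ayes 30, Noes 3" and record a Vote. The regex is an
# assumption; the Vote signature matches its use elsewhere in this file
# (chamber, date, motion, passed, yes_count, no_count, other_count):

def add_vote(self, bill, chamber, date, text):
    match = re.search(r'Ayes\s+(\d+),?\s+Noes\s+(\d+)', text)
    if not match:
        return
    yes, no = int(match.group(1)), int(match.group(2))
    vote = Vote(chamber, date, text, yes > no, yes, no, 0)
    bill.add_vote(vote)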
def scrape_new_session(self, chamber, session):
    """Scrape SD's bill data from 2009 on."""
    if chamber == 'upper':
        bill_abbr = 'SB'
    elif chamber == 'lower':
        bill_abbr = 'HB'

    # Get bill list page
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'BillList.aspx'
    self.log('Getting bill list for %s %s' % (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Format of bill link contents
    bill_re = re.compile(u'%s\xa0(\\d+)' % bill_abbr)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a'):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        bill_match = bill_re.search(bill_link.contents[0])
        if not bill_match:
            continue

        # Parse bill ID and name
        bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
        bill_name = bill_link.findNext().contents[0]

        # Download history page
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get all bill versions
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            #version_date = row.find('td').string
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                session, version_path)
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text='Action'):
                continue

            # Get the date (if we can't find one then this isn't an action)
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Get the action string
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!
                        vote_url = "http://legis.state.sd.us/sessions/"\
                            "%s/%s" % (session, node['href'])
                        vote = self.scrape_new_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)