def scrape_assem_bills(self, chamber, insert, session, year):
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count += 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

            with self.urlopen(page_path) as page:
                page = page.decode("utf8").replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                # special sessions are keyed by the insert string
                if insert.find('Special') != -1:
                    session = insert

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)

                bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url)

                primary, secondary = self.scrape_sponsors(page)

                if primary[0] == 'By:':
                    primary.pop(0)

                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count += 1

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath("//br")[8].tail
        if not title:
            return
        title = title.strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill',
                             'Senate Bill', 'Engrossed Bill',
                             'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        bill['subjects'] = self.subjects[bill_id]

        self.save_bill(bill)
def parse_bill(self, chamber, session, special, link):
    bill_num = link.text.strip()
    bill_type = re.search('type=(B|R|)', link.attrib['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), bill_type, bill_num)

    url = info_url(chamber, session, special, bill_type, bill_num)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath(
            "//td[text() = 'Short Title:']/following-sibling::td")[0]
        title = title.text.strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(bill, history_url(chamber, session, special,
                                             bill_type, bill_num))

        self.parse_votes(bill, vote_url(chamber, session, special,
                                        bill_type, bill_num))

        self.save_bill(bill)
def scrape_session_2009(self, chamber, session):
    url, type = bills_url(chamber)

    with self.urlopen(url) as page_html:
        page = lxml.html.fromstring(page_html)

        for element, attribute, link, pos in page.iterlinks():
            if re.search("billtype=" + type + "&billnumber=[0-9]+", link) != None:
                bill_page_url = bill_url(link)

                with self.urlopen(bill_page_url) as bill_page_str:
                    bill_page = lxml.html.fromstring(bill_page_str)
                    splitted_link = link.split("=")
                    bill_number = splitted_link[-1]

                    bill_id = bill_page.cssselect('a[class="headerlink"]')[0].text_content()
                    bill_title = bill_page.cssselect('td[style="color:Black"]')[0].text_content()

                    bill = Bill(session, chamber, bill_id, bill_title)
                    bill.add_source(bill_page_url)

                    actions_table = bill_page.cssselect('table[rules="all"]')[0]
                    action_elements = actions_table.cssselect('tr')
                    # first row is a header, not an action
                    action_elements.pop(0)

                    for ae in action_elements:
                        action_element_parts = ae.cssselect('td')
                        # extract text up front; comparing the raw td
                        # elements to 'H'/'S' would never match
                        action_date = action_element_parts[0]
                        actor_house = action_element_parts[1].text_content().strip()
                        action_text = action_element_parts[2].text_content()

                        # look for acting committees
                        match = re.search(
                            "(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})",
                            action_text)
                        if match != None:
                            actor = match.group(0)
                        elif actor_house == 'H':
                            actor = "lower"
                        elif actor_house == 'S':
                            actor = "upper"
                        else:
                            actor = chamber

                        action_date = dt.datetime.strptime(
                            action_date.text_content(), '%m/%d/%Y')

                        if re.search("The votes were as follows", action_text) != None:
                            self.scrape_votes(action_text, bill_page_url,
                                              actor_house, action_date, bill)

                        bill.add_action(actor, action_text, action_date)

                    with self.urlopen(versions_page_url(type, bill_number)) as versions_page_html:
                        versions_page = lxml.html.fromstring(versions_page_html)
                        versions_elements = versions_page.cssselect('span[class="searchtitle"]')
                        for ve in versions_elements:
                            element_text = ve.text_content()
                            version_name = element_text.rstrip("_.HTM")
                            bill.add_version(version_name,
                                             bill_version_url(element_text))

                    # save the fully assembled bill
                    self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == 'upper':
        other_chamber = 'lower'
        bill_id = 'SB 1'
    else:
        other_chamber = 'upper'
        bill_id = 'HB 1'

    b1 = Bill(session, chamber, bill_id, 'A super bill')
    b1.add_source('http://example.com/')
    b1.add_version('As Introduced', 'http://example.com/SB1.html')
    b1.add_document('Google', 'http://google.com')
    b1.add_sponsor('primary', 'Bob Smith')
    b1.add_sponsor('secondary', 'Johnson, Sally')

    d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
    v1.yes('Smith')
    v1.yes('Johnson')

    d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
    v2.no('Bob Smith')
    v2.other('S. Johnson')

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, 'introduced', d1)
    b1.add_action(chamber, 'read first time', d2)
    b1.add_action(other_chamber, 'introduced', d2)

    self.save_bill(b1)
def scrape_bill(self, chamber, session, bill_id, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # replace non-breaking spaces before splitting on ' -- '
        header = page.xpath('//h3/br')[0].tail.replace(u'\xa0', ' ')
        title, primary_sponsor = header.split(' -- ')

        if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
            bill_type = ['bill']
        elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
            bill_type = ['concurrent resolution']
        elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
            bill_type = ['joint resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_sponsor('primary', primary_sponsor)
        bill.add_source(url)

        status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
        self.parse_status(bill, status_link.attrib['href'])

        for link in page.xpath(
                '//a[contains(@href, "bills/") and text() = "HTML"]'):
            name = link.getprevious().tail.strip()
            bill.add_version(name, link.attrib['href'])

        self.save_bill(bill)
def scrape(self, chamber, session):
    year = year_from_session(session)
    url = bills_url(year)
    with self.urlopen(url) as bills_page_html:
        bills_page = lxml.html.fromstring(bills_page_html)
        table_rows = bills_page.cssselect('tr')
        # Eliminate empty rows (every other row)
        table_rows = table_rows[0:len(table_rows):2]

        for row in table_rows:
            row_elements = row.cssselect('td')

            bill_document = row_elements[0]
            bill_document.make_links_absolute(base_url())

            element, attribute, link, pos = bill_document.iterlinks().next()
            bill_id = element.text_content().rstrip('.pdf')
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                    title_and_sponsors.text_content())
            sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split('--')

            bill = Bill(session, chamber, bill_id, title)
            bill.add_version('current', bill_document_link)

            if separated_sponsors[1] == '(NONE)':
                bill.add_sponsor('primary', separated_sponsors[0])
            else:
                bill.add_sponsor('cosponsor', separated_sponsors[0])
                bill.add_sponsor('cosponsor', separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute(base_url())
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = base_url() + link.split('?Open&target=')[1]
            self.scrape_votes(frame_link, chamber, bill)

            # save the fully assembled bill
            self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_type, number):
    """ Creates a bill object """
    if len(session) == 4:
        session_url = session + 'rs'
    else:
        session_url = session
    url = BILL_URL % (session_url, bill_type, number)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # title
        # find <a name="Title">, get parent dt, get parent dl, then get dd within dl
        title = doc.cssselect('a[name=Title]')[0] \
            .getparent().getparent().cssselect('dd')[0].text.strip()

        # create the bill object now that we have the title
        print "%s %d %s" % (bill_type, number, title)
        bill = Bill(session, chamber, "%s %d" % (bill_type, number), title)
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)   # sponsors
        self.parse_bill_actions(doc, bill)    # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        self.parse_bill_votes(doc, bill)      # votes

        # add bill to collection
        self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    with self.urlopen(bill_url) as bill_page:
        bill_page = BeautifulSoup(bill_page)

        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor (use the tag's href attribute, not
        # the nonexistent .href property)
        bill_sponsor = bill_page.find(id="hlSponsor").i.font.contents[0]
        bill_sponsor_link = bill_page.find(id="hlSponsor").get('href')
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and cosponsor_tag.get('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
def scrape_bill(self, bill_url, chamber, session):
    with self.urlopen(bill_url) as text:
        if "Specified Bill could not be found" in text:
            return False
        page = lxml.html.fromstring(text)
        page.make_links_absolute(bill_url)

        bill_id = page.xpath("string(//h2)").split()[0]

        summary = page.xpath(
            "string(//*[starts-with(text(), 'Summary: ')])")
        summary = summary.replace('Summary: ', '')

        match = re.match(r"^([^:]+): ([^(]+)", summary)

        if match:
            subjects = [match.group(1).strip()]
            title = match.group(2).strip()
        else:
            raise ScrapeError("Bad title")

        bill = Bill(session, chamber, bill_id, title,
                    subjects=subjects)
        bill.add_source(bill_url)

        history_link = page.xpath("//a[text() = 'History']")[0]
        history_url = history_link.attrib['href']
        self.scrape_history(bill, history_url)

        authors_link = page.xpath("//a[text() = 'Authors']")[0]
        authors_url = authors_link.attrib['href']
        self.scrape_authors(bill, authors_url)

        try:
            versions_link = page.xpath(
                "//a[text() = 'Text - All Versions']")[0]
            versions_url = versions_link.attrib['href']
            self.scrape_versions(bill, versions_url)
        except IndexError:
            # Only current version
            try:
                version_link = page.xpath(
                    "//a[text() = 'Text - Current']")[0]
                version_url = version_link.attrib['href']
                bill.add_version("%s Current" % bill_id, version_url)
            except IndexError:
                # Some bills don't have any versions :(
                pass

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass

        self.save_bill(bill)

        return True
def parse_special_session_bill_status_page(self, bill_id, status_page,
                                           bill_table, session, chamber,
                                           sources):
    title = bill_table.xpath('//tr[3]/td[2]')[0].text_content()

    bill = Bill(session, chamber, bill_id, title)
    for source in sources:
        bill.add_source(source)

    self.add_sponsors(bill, self.get_sponsor_table(status_page))
    self.add_actions(bill, self.get_action_table(status_page))

    return bill
def scrape(self, chamber, year):
    # Data prior to 1997 is contained in pdfs
    if year < "1997":
        raise NoDataForYear(year)

    bills_url = ("http://www.leg.state.co.us/CLICS/CLICS" + year +
                 "A/csl.nsf/%28bf-1%29?OpenView&Count=2000")
    with self.lxml_context(bills_url) as bills_page:
        table_rows = bills_page.cssselect("tr")
        # Eliminate empty rows (every other row)
        table_rows = table_rows[0:len(table_rows):2]

        for row in table_rows:
            row_elements = row.cssselect("td")

            bill_document = row_elements[0]
            bill_document.make_links_absolute("http://www.leg.state.co.us")

            element, attribute, link, pos = bill_document.iterlinks().next()
            bill_id = element.text_content().rstrip(".pdf")
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            title_match = re.search("([A-Z][a-z]+.+[a-z])[A-Z]",
                                    title_and_sponsors.text_content())
            sponsors_match = re.search("[a-z]([A-Z]+.+)",
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split("--")

            bill = Bill(year, chamber, bill_id, title)
            bill.add_version("current", bill_document_link)

            if separated_sponsors[1] == "(NONE)":
                bill.add_sponsor("primary", separated_sponsors[0])
            else:
                bill.add_sponsor("cosponsor", separated_sponsors[0])
                bill.add_sponsor("cosponsor", separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute("http://www.leg.state.co.us")
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = "http://www.leg.state.co.us" + link.split("?Open&target=")[1]
            # scrape votes from the frame, not the outer link
            self.scrape_votes(frame_link, chamber, bill)

            # save the fully assembled bill
            self.save_bill(bill)
def scrape_year(self, year, chamber):
    sep = '<h1>House</h1>'
    if chamber == 'upper':
        after = False
        reg = '[5-9]'
    else:
        after = True
        reg = '[1-4]'

    with self.lxml_context("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + str(year), sep, after) as page:
        for element, attribute, link, pos in page.iterlinks():
            if re.search("bill=" + reg + "[0-9]{3}", link) != None:
                bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link

                with self.lxml_context(bill_page_url) as bill_page:
                    raw_title = bill_page.cssselect('title')
                    split_title = string.split(raw_title[0].text_content(), ' ')
                    bill_id = (split_title[0] + ' ' + split_title[1]).strip()
                    session = split_title[3].strip()

                    title_element = bill_page.get_element_by_id(
                        "ctl00_ContentPlaceHolder1_lblSubTitle")
                    title = title_element.text_content()

                    bill = Bill(session, chamber, bill_id, title)
                    bill.add_source(bill_page_url)

                    self.scrape_actions(bill_page, bill)

                    for element, attribute, link, pos in bill_page.iterlinks():
                        if re.search("billdocs", link) != None:
                            if re.search("Amendments", link) != None:
                                bill.add_document("Amendment: " + element.text_content(), link)
                            elif re.search("Bills", link) != None:
                                bill.add_version(element.text_content(), link)
                            else:
                                bill.add_document(element.text_content(), link)
                        elif re.search("senators|representatives", link) != None:
                            with self.lxml_context(link) as senator_page:
                                try:
                                    name_tuple = self.scrape_legislator_name(senator_page)
                                    bill.add_sponsor('primary', name_tuple[0])
                                except:
                                    pass
                        elif re.search("ShowRollCall", link) != None:
                            match = re.search("([0-9]+,[0-9]+)", link)
                            id1, id2 = match.group(0).split(',')
                            url = ("http://flooractivityext.leg.wa.gov/"
                                   "rollcall.aspx?id=" + id1 + "&bienId=" + id2)
                            with self.lxml_context(url) as vote_page:
                                self.scrape_votes(vote_page, bill, url)

                    self.save_bill(bill)
def scrape(self, chamber, session):
    self.validate_session(session)

    if chamber == "lower":
        bill_abbr = "HB"
    else:
        bill_abbr = "SB"

    bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
        session.replace(' ', ''))
    self.log("Getting bill list for %s, %s" % (session, chamber))

    try:
        base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
    except:
        # this session doesn't exist for this year
        return

    bill_list_link_re = re.compile('.*%s\d+ht.htm$' % bill_abbr)

    for link in base_bill_list.findAll('a', href=bill_list_link_re):
        bill_list = self.soup_parser(self.urlopen(link['href']))
        bill_link_re = re.compile('.*billhtm/%s.*.htm' % bill_abbr)

        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.find(text=True).strip()

            bill_info_url = bill_link['href']
            bill_info = self.soup_parser(self.urlopen(bill_info_url))

            # strip non-breaking space entities before splitting
            bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                '&nbsp;', ' ').strip().split(' -- ')

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)
            bill.add_sponsor('primary', primary_sponsor)

            status_re = re.compile('.*billsta/%s.*.htm' % bill_abbr.lower())
            status_link = bill_info.find('a', href=status_re)
            if status_link:
                self.parse_status(bill, status_link['href'])

            text_find = bill_info.find(
                text="Bill Text (If you are having trouble viewing")
            if text_find:
                text_link_re = re.compile('.*\.htm')
                for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                    version_name = text_link.previous.strip()
                    bill.add_version(version_name, text_link['href'])

            self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    with self.urlopen(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)
        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)

        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        sponsor_links = bill_info.findAll(href=re.compile(
            'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            action_date = dt.datetime.strptime(action_date, '%b %d')
            # strptime with no year defaults to 1900, so set the year
            # from the session
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.save_bill(bill)
def parse_standard_bill_status_page(self, bill_id, status_page, session,
                                    chamber, sources):
    try:
        title = status_page.xpath("/div/form[1]/table[2]/tr[3]/td[2]")[0].text_content()
    except IndexError:
        if len(status_page.xpath("/html/html")) == 2:
            title = status_page.xpath('/html/html[2]/tr[1]/td[2]')[0].text_content()
        else:
            title = status_page.xpath('/html/html[3]/tr[1]/td[2]')[0].text_content()

    bill = Bill(session, chamber, bill_id, title)
    for source in sources:
        bill.add_source(source)

    self.add_sponsors(bill, self.get_sponsor_table(status_page))
    self.add_actions(bill, self.get_action_table(status_page))

    return bill
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill.add_source(bill_detail_url)

        # grab sponsors
        sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip())

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'])

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        version_doc = lxml.html.fromstring(version_html)
        for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
            version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
            bill.add_version(v.text.strip(), version_url)

    self.save_bill(bill)
def scrape(self, chamber, session):
    # internal id for the session, store on self so all methods have access
    self.site_id = self.metadata['session_details'][session]['site_id']
    self.build_subject_map()

    # used for skipping bills from opposite chamber
    start_letter = 'H' if chamber == 'lower' else 'S'

    url = 'http://leg6.state.va.us/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

    while url:
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            url = None  # no more unless we encounter 'More...'

            bills = doc.xpath('//ul[@class="linkSect"]/li')
            for bill in bills:
                link = bill.getchildren()[0]
                bill_id = link.text_content()

                # check if this is the 'More...' link
                if bill_id == 'More...':
                    url = BASE_URL + link.get('href')

                # skip bills from the other chamber
                elif not bill_id.startswith(start_letter):
                    continue

                else:
                    # create a bill
                    desc = bill.xpath('text()')[0].strip()
                    bill_type = {'B': 'bill',
                                 'J': 'joint resolution',
                                 'R': 'resolution'}[bill_id[1]]
                    bill = Bill(session, chamber, bill_id, desc,
                                type=bill_type)

                    bill_url = BASE_URL + link.get('href')
                    self.fetch_sponsors(bill)
                    self.scrape_bill_details(bill_url, bill)
                    bill['subjects'] = self.subject_map[bill_id]
                    bill.add_source(bill_url)
                    self.save_bill(bill)
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    status_page = ElementTree(lxml.html.fromstring(self.urlopen(status_url)))

    # see 2007 HB 2... weird.
    try:
        bill_id = status_page.xpath("/div/form[1]/table[2]/tr[2]/td[2]")[0].text_content()
    except IndexError:
        bill_id = status_page.xpath("/html/html[2]/tr[1]/td[2]")[0].text_content()

    try:
        title = status_page.xpath("/div/form[1]/table[2]/tr[3]/td[2]")[0].text_content()
    except IndexError:
        title = status_page.xpath("/html/html[3]/tr[1]/td[2]")[0].text_content()

    bill = Bill(session, chamber, bill_id, title)
    bill.add_source(bill_url)

    self.add_sponsors(bill, status_page)
    self.add_actions(bill, status_page)

    return bill
def scrape_bill(self, chamber, session, bill_type, number):
    """ Creates a bill object """
    if len(session) == 4:
        session_url = session + 'rs'
    else:
        session_url = session
    url = BILL_URL % (session_url, bill_type, number)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # find <a name="Title">, get parent dt, get parent dl, then dd in dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
        synopsis = doc.xpath('//font[@size="3"]/p/text()')[0].strip()
        #print "%s %d %s" % (bill_type, number, title)

        if 'B' in bill_type:
            _type = ['bill']
        elif 'J' in bill_type:
            _type = ['joint resolution']

        bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                    type=_type, synopsis=synopsis)
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)   # sponsors
        self.parse_bill_actions(doc, bill)    # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        self.parse_bill_votes(doc, bill)      # votes

        # subjects
        subjects = []
        for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
            subjects.append(subj.text.split('-see also-')[0])
        bill['subjects'] = subjects

        # add bill to collection
        self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session):
    doc_type = [1, 3, 5, 6]
    for doc in doc_type:
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, doc)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count += 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

            with self.urlopen(page_path) as page:
                root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')
                bill = Bill(session, chamber, bill_id, title)

                primary, secondary = self.scrape_sponsors(page_path)

                if primary[0] == 'By:':
                    primary.pop(0)

                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                self.scrape_actions(page_path, bill, "Assembly")
                self.scrape_votes(page_path, bill, "Assembly", insert, title)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_type, number):
    """ Creates a bill object """
    if len(session) == 4:
        session_url = session + "rs"
    else:
        session_url = session
    url = BILL_URL % (session_url, bill_type, number)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # find <a name="Title">, get parent dt, get parent dl, then dd in dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        # create the bill object now that we have the title
        print "%s %d %s" % (bill_type, number, title)

        if "B" in bill_type:
            _type = ["bill"]
        elif "J" in bill_type:
            _type = ["joint resolution"]

        bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                    type=_type)
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)   # sponsors
        self.parse_bill_actions(doc, bill)    # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        self.parse_bill_votes(doc, bill)      # votes

        # subjects
        subjects = []
        for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
            subjects.append(subj.text.split("-see also-")[0])
        bill["subjects"] = subjects

        # add bill to collection
        self.save_bill(bill)
def parse_bill(self, chamber, session, special, link):
    bill_number = link.contents[0]
    type = re.search('type=(B|R|)', link['href']).group(1)
    bill_id = "%s%s %s" % (bill_abbr(chamber), type, bill_number)

    bill_info_url = info_url(chamber, session, special, type, bill_number)
    with self.urlopen(bill_info_url) as info_page:
        info_page = BeautifulSoup(info_page)
        title_label = info_page.find(text='Short Title:')
        title = title_label.findNext().contents[0]

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(bill_info_url)

        self.parse_bill_versions(bill, info_page)

        self.parse_history(bill, history_url(chamber, session, special,
                                             type, bill_number))

        self.parse_votes(bill, vote_url(chamber, session, special,
                                        type, bill_number))

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_number, ga_num):
    bill_url = self.urls['info'] % (bill_number, ga_num)

    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)
        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(session, chamber, bill_number, title)
        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        bill.add_sponsor('primary', sponsor)

        # Co-sponsors unavailable for scraping (loaded into page via AJAX)

        # Full summary doc
        summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary.get('href'))

        # Actions
        tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
        actions_table = tables[0]
        action_rows = actions_table.xpath("tr[position()>1]")
        for ar in action_rows:
            action_taken = ar.xpath("td")[0].text
            action_date = datetime.datetime.strptime(
                ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
            bill.add_action(chamber, action_taken, action_date)

        votes_link = page.xpath("//span[@id='lblBillVotes']/a")
        if len(votes_link) > 0:
            votes_link = votes_link[0].get('href')
            bill = self.scrape_votes(
                bill, sponsor,
                'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

        self.save_bill(bill)
def scrape_session_new(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "H."
    else:
        bill_abbr = "S."

    bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
        session.split('-')[1], bill_abbr[0])
    bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
    bill_list = BeautifulSoup(self.urlopen(bill_list_url))

    bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.findNext('b').string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        text_links = info_page.findAll('blockquote')[1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        act_table = info_page.findAll('blockquote')[2].table
        for row in act_table.findAll('tr')[1:]:
            action = ""
            for s in row.findAll('td')[1].findAll(text=True):
                action += s + " "
            action = clean_action(action)

            match = re.search('Governor on (.*)$', action)
            if match:
                act_date = parse_exec_date(match.group(1).strip())
                actor = 'Governor'
            else:
                if row['bgcolor'] == 'Salmon':
                    actor = 'lower'
                else:
                    actor = 'upper'

                if row.td.a:
                    act_date = row.td.a.string
                else:
                    act_date = row.td.string

                try:
                    act_date = re.search(
                        '\d{1,2}/\d{1,2}/\d{4,4}', act_date).group(0)
                except AttributeError:
                    # No date, skip
                    continue

                act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

            bill.add_action(actor, action, act_date,
                            type=action_type(action))

            vote_link = row.find('a', text='Details')
            if vote_link:
                vote_url = vote_link.parent['href']
                self.parse_vote_new(bill, actor, vote_url)

        sponsors = info_page.find(
            text='Sponsor(s):').parent.parent.findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        self.save_bill(bill)
def scrape_session_old(self, chamber, session):
    if chamber == "lower":
        bill_abbr = "H."
        chamber_name = "House"
        other_chamber = "Senate"
    else:
        bill_abbr = "S."
        chamber_name = "Senate"
        other_chamber = "House"

    start_date = '1/1/%s' % session.split('-')[0]
    data = urllib.urlencode({'Date': start_date,
                             'Body': bill_abbr[0],
                             'Session': session.split('-')[1]})
    bill_list_url = "http://www.leg.state.vt.us/database/"\
        "rintro/results.cfm"
    bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

    bill_link_re = re.compile('.*?Bill=%s.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.parent.findAll('td')[1].string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        text_links = info_page.findAll('blockquote')[-1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        sponsors = info_page.find(
            text='Sponsor(s):').parent.findNext('td').findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        # Grab actions from the originating chamber
        act_table = info_page.find(
            text='%s Status:' % chamber_name).findNext('table')
        for row in act_table.findAll('tr')[3:]:
            action = clean_action(row.td.string.replace(
                '&nbsp;', '').strip(':'))

            act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
            if act_date != "":
                detail = row.findAll('td')[2].b
                if detail and detail.string != "":
                    action += ": %s" % detail.string.replace('&nbsp;', '')
                bill.add_action(chamber, action, act_date,
                                type=action_type(action))

        # Grab actions from the other chamber
        act_table = info_page.find(
            text='%s Status:' % other_chamber).findNext('table')
        if act_table:
            if chamber == 'upper':
                act_chamber = 'lower'
            else:
                act_chamber = 'upper'
            for row in act_table.findAll('tr')[3:]:
                action = clean_action(row.td.string.replace(
                    '&nbsp;', '').strip(':'))

                act_date = row.findAll('td')[1].b.string.replace(
                    '&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace(
                            '&nbsp;', '')
                    # use the parsed date, not the raw string
                    date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                    bill.add_action(act_chamber, action, date,
                                    type=action_type(action))

        self.save_bill(bill)
def scrape_session(self, chamber, year):
    if chamber == 'upper':
        bill_abbr = 'SB|SCR|SJR'
    elif chamber == 'lower':
        bill_abbr = 'HB|HCR|HJR'

    # Sessions last 2 years, 1993-1994 was the 18th
    session = str(18 + ((int(year) - 1993) / 2))
    year2 = str(int(year) + 1)

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year2[2:]

    # Get bill list
    bill_list_url = 'http://www.legis.state.ak.us/'\
        'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
        session, date1, date2)
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = "bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(' ', '')
        bill_name = link.parent.parent.findNext('td').find(
            'font').contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page and strip malformed tags
        info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')

            bill.add_sponsor('primary', sponsors[0].strip())

            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.strip())
        else:
            # Committee sponsorship
            bill.add_sponsor('committee', spons_str.strip())

        # Get actions
        act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
        for row in act_rows:
            cols = row.findAll('td')
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match("\w+ Y(\d+) N(\d+)", action):
                try:
                    vote = self.parse_vote(bill, action, act_chamber,
                                           act_date, cols[1].a['href'])
                    bill.add_vote(vote)
                except:
                    self.log("Failed parsing vote at %s" %
                             cols[1].a['href'])

            bill.add_action(act_chamber, action, act_date)

        # Get subjects
        bill['subjects'] = []
        subject_link_re = re.compile('.*subject=\w+$')
        for subject_link in info_page.findAll('a', href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill['subjects'].append(subject)

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
            session, bill_id)
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile('^get_bill_text?')
        for text_link in text_list.findAll('a', href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()

            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link['href'])

            bill.add_version(text_name, text_url)

        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files """

    # Main bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"
        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based on its doctype
        doc_name = self._doctypes[rec['doctype']]
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes (Assembly 'A' files and Senate 'S' files)
    file1 = 'A' + str(year_abr)
    file2 = 'A' + str(year_abr + 1)
    file3 = 'S' + str(year_abr)
    file4 = 'S' + str(year_abr + 1)
    if str(year_abr) != '2010':
        vote_info_list = [file1, file2, file3, file4]
    else:
        vote_info_list = [file1, file3]

    for bill_vote_file in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % bill_vote_file
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        if bill_vote_file[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"

        for rec in vdict_file:
            bill_id = rec["Bill"].strip()
            leg = rec["Full_Name"]
            date = rec["Session_Date"]
            date = datetime.strptime(date, "%m/%d/%Y")
            action = rec["Action"]
            leg_vote = rec["Legislator_Vote"]

            vote_id = bill_id + "_" + action
            vote_id = vote_id.replace(" ", "_")
            passed = None

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, passed, None,
                                      None, None, bill_id=bill_id)
            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # Count yes/no/other votes and save overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape_year(self, chamber, year, session):
    # NH bill ids are prefixed 'S' for Senate (upper) and 'H' for
    # House (lower)
    if chamber == 'upper':
        chamber_abbr = 'S'
    elif chamber == 'lower':
        chamber_abbr = 'H'

    # set up POST data
    values = [('txtsessionyear', year),
              ('txttitle', ''),
              ('txtlsrnumber', ''),
              ('Submit1', 'Submit')]
    params = urllib.urlencode(values)

    search_url = 'http://www.gencourt.state.nh.us/bill_status/Results.aspx'

    # request page with list of all bills in year
    with self.urlopen(search_url + '?' + params) as doc:
        soup = BeautifulSoup(doc)

        # parse results
        bills = soup.find("table", {"class": "ptable"})
        trs = soup.findAll("tr")

        # go through all of the table rows with relevant data
        tr_start = 8
        tr_hop = 11
        i = 0
        while (tr_start + (tr_hop * i)) < len(trs):
            tr = trs[tr_start + (tr_hop * i)]
            i = i + 1

            # strip off extra white space from name
            id = tr.find("big").string.strip()
            exp = re.compile("^(\w*)")
            bill_id = exp.search(id).group(1)

            # check to see if it's in the proper chamber
            exp = re.compile("^" + chamber_abbr)
            if exp.search(bill_id) == None:
                continue  # in wrong house

            # check to see it is a bill and not a resolution
            exp = re.compile("B")
            if exp.search(bill_id) == None:
                continue  # not a bill

            # get bill_id suffix if it exists
            exp = re.compile("(-\w*)$")
            res = exp.search(id)
            if res != None:
                bill_id = bill_id + res.group(1)

            # get bill title
            title = tr.findAll("b")[0]
            bill_title = title.nextSibling.string
            bill_title = bill_title.strip()
            bill_title = bill_title.encode('ascii', 'xmlcharrefreplace')

            # grab url of bill text
            urls = tr.findAll("a")
            textexp = re.compile("Bill Text")
            textdoc = re.compile("Bill Docket")
            textstat = re.compile("Bill Status")
            textcall = re.compile("Roll Calls")
            textaudio = re.compile("Audio Files")
            for url in urls:
                if textexp.search(str(url.string)) != None:
                    bill_url = self.get_bill_text(url)
                if textdoc.search(str(url.string)) != None:
                    pass
                if textstat.search(str(url.string)) != None:
                    self.add_bill_sponsors()
                if textcall.search(str(url.string)) != None:
                    pass
                if textaudio.search(str(url.string)) != None:
                    pass

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Bill text", bill_url)
            bill.add_source(search_url)
            self.save_bill(bill)
def scrape_bill(self, chamber, session, url):
    url = url + "&Year=%s" % session
    with self.urlopen(url) as page:
        # normalize entities before parsing
        page = page.replace('&nbsp;', ' ').replace('<br>', '\n')
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath('//h3')[0].text.strip()
        title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

        bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
        bill_id = bill_id.split()[0].strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        hist = page.xpath("string(//pre[@class='billhistory'])").strip()

        act_re = re.compile(r'^ (\d\d/\d\d/\d\d) (SENATE|HOUSE)'
                            r'(.*\n(\s{16,16}.*\n){0,})',
                            re.MULTILINE)

        # Actions
        for match in act_re.finditer(hist):
            action = match.group(3).replace('\n', ' ')
            action = re.sub(r'\s+', ' ', action).strip()

            if match.group(2) == 'SENATE':
                actor = 'upper'
            else:
                actor = 'lower'

            date = match.group(1)
            date = datetime.datetime.strptime(date, "%m/%d/%y")

            for act_text in re.split(' -[HS]J \d+;? ?', action):
                act_text = act_text.strip()
                if not act_text:
                    continue

                types = []
                act_lower = act_text.lower()
                if act_lower.startswith('introduced'):
                    types.append('bill:introduced')
                if 'referred to' in act_lower:
                    types.append('committee:referred')
                if 'died in committee' in act_lower:
                    types.append('committee:failed')
                if 'favorable by' in act_lower:
                    types.append('committee:passed:favorable')
                if 'amendment(s) adopted' in act_lower:
                    types.append('amendment:passed')

                bill.add_action(actor, act_text, date, type=types)

        # Sponsors
        primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                    hist).group(1).strip('; ')
        bill.add_sponsor('primary', primary_sponsor)

        cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                               '([\w .]+(;[\w .\n]+){0,})',
                               re.MULTILINE)
        match = cospon_re.search(hist)
        if match:
            for cosponsor in match.group(2).split(';'):
                cosponsor = cosponsor.replace('\n', '').strip()
                bill.add_sponsor('cosponsor', cosponsor)

        # Versions
        for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
            version = link.xpath('string(../../td[1])').strip()
            bill.add_version(version, link.attrib['href'])

        # House Votes
        for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
            bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

        # Senate Votes
        for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
            bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

        self.save_bill(bill)