def scrape_assem_bills(self, chamber, insert, session, year):
    """Scrape Nevada Assembly bills of each document type for one session.

    `insert` is the session path fragment used in leg.state.nv.us URLs
    (regular or special session identifier); `year` is passed through to
    the vote scraper.
    """
    # DoctypeID -> bill type as used by the HistListBills report.
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            with self.urlopen(page_path) as page:
                # Normalize non-breaking spaces before handing to lxml.
                page = page.decode("utf8").replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
                title = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')

                # Special sessions use the insert string itself as the
                # session name.
                if insert.find('Special') != -1:
                    session = insert

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)

                bill_text = root.xpath("string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url)

                primary, secondary = self.scrape_sponsors(page)

                if primary[0] == 'By:':
                    # Sponsor is a committee: drop the 'By:' token and
                    # join the remaining words into one committee name.
                    primary.pop(0)

                    # Fix one known run-together committee name.
                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                # Committee minutes: the row index into the history table
                # is tracked manually alongside the <a> iteration.
                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    # NOTE(review): date words are concatenated without
                    # separators (e.g. "May15,2011 Minutes") - presumably
                    # intentional; verify the rendered document names.
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "lower")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, url):
    """Scrape one bill page: title, actions, versions, fiscal impact
    documents and subjects, then save the bill.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        # Title is the text trailing the 9th <br>; an empty value means
        # the page carries no usable bill data, so bail out.
        title = page.xpath("//br")[8].tail
        if not title:
            return
        title = title.strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        action_link = page.xpath("//a[contains(@href, 'getActions')]")[0]
        self.scrape_actions(bill, action_link.attrib['href'])

        # Known version stages, matched against the link text.
        version_path = "//a[contains(., '%s')]"
        for version_type in ('Introduced Bill', 'House Bill',
                             'Senate Bill', 'Engrossed Bill',
                             'Enrolled Act'):
            path = version_path % version_type
            links = page.xpath(path)
            if links:
                bill.add_version(version_type, links[0].attrib['href'])

        for doc_link in page.xpath("//a[contains(@href, 'FISCAL')]"):
            # Link text looks like "<num>(...)"; keep the number part.
            num = doc_link.text.strip().split("(")[0]
            bill.add_document("Fiscal Impact Statement #%s" % num,
                              doc_link.attrib['href'])

        # NOTE(review): self.subjects is presumably prebuilt elsewhere;
        # a missing bill_id would raise KeyError here - confirm.
        bill['subjects'] = self.subjects[bill_id]

        self.save_bill(bill)
def scrape(self, chamber, session):
    """Emit one hard-coded example bill, complete with a version, a
    document, sponsors, two roll-call votes and three actions.
    """
    self.validate_session(session)

    # The example bill belongs to `chamber`; a couple of actions are
    # also recorded in the opposite chamber.
    if chamber == 'upper':
        other_chamber, bill_id = 'lower', 'SB 1'
    else:
        other_chamber, bill_id = 'upper', 'HB 1'

    bill = Bill(session, chamber, bill_id, 'A super bill')
    bill.add_source('http://example.com/')
    bill.add_version('As Introduced', 'http://example.com/SB1.html')
    bill.add_document('Google', 'http://google.com')
    bill.add_sponsor('primary', 'Bob Smith')
    bill.add_sponsor('secondary', 'Johnson, Sally')

    # A passing Senate vote on the first date...
    first_date = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    passing_vote = Vote('upper', first_date, 'Final passage', True, 2, 0, 0)
    passing_vote.yes('Smith')
    passing_vote.yes('Johnson')

    # ...and a failing House vote on the second.
    second_date = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    failing_vote = Vote('lower', second_date, 'Final passage', False, 0, 1, 1)
    failing_vote.no('Bob Smith')
    failing_vote.other('S. Johnson')

    bill.add_vote(passing_vote)
    bill.add_vote(failing_vote)

    bill.add_action(chamber, 'introduced', first_date)
    bill.add_action(chamber, 'read first time', second_date)
    bill.add_action(other_chamber, 'introduced', second_date)

    self.save_bill(bill)
def scrape_year(self, year, chamber):
    """Scrape every bill for one chamber/year from the WA daily status
    page, collecting documents, versions, sponsors and roll-call votes.

    Fixes over the previous revision:
    - the inner ``iterlinks()`` loop no longer shadows the outer loop
      variables (``element``/``attribute``/``link``/``pos``);
    - ``string.split(x, ' ')`` (Python-2-only) replaced by the str method;
    - the bare ``except:`` around sponsor scraping is narrowed to
      ``except Exception`` so SystemExit/KeyboardInterrupt propagate;
    - ``!= None`` / ``== None`` replaced with identity comparisons.
    """
    sep = '<h1>House</h1>'
    # Senate bills are numbered 5xxx-9xxx, House bills 1xxx-4xxx; the
    # separator/after flags select the matching half of the page.
    if chamber == 'upper':
        after = False
        reg = '[5-9]'
    else:
        after = True
        reg = '[1-4]'
    with self.lxml_context("http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + str(year), sep, after) as page:
        for element, attribute, link, pos in page.iterlinks():
            if re.search("bill=" + reg + "[0-9]{3}", link) is None:
                continue
            bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
            with self.lxml_context(bill_page_url) as bill_page:
                # Page <title> begins "<prefix> <number> - <session> ...".
                raw_title = bill_page.cssselect('title')
                split_title = raw_title[0].text_content().split(' ')
                bill_id = split_title[0] + ' ' + split_title[1]
                bill_id = bill_id.strip()
                session = split_title[3].strip()

                title_element = bill_page.get_element_by_id("ctl00_ContentPlaceHolder1_lblSubTitle")
                title = title_element.text_content()

                bill = Bill(session, chamber, bill_id, title)
                bill.add_source(bill_page_url)
                self.scrape_actions(bill_page, bill)

                # Distinct loop variables: the previous revision reused
                # element/attribute/link/pos and clobbered the outer loop.
                for doc_el, doc_attr, doc_link, doc_pos in bill_page.iterlinks():
                    if re.search("billdocs", doc_link) is not None:
                        # Bill documents: amendments, bill texts, misc.
                        if re.search("Amendments", doc_link) is not None:
                            bill.add_document("Amendment: " + doc_el.text_content(), doc_link)
                        elif re.search("Bills", doc_link) is not None:
                            bill.add_version(doc_el.text_content(), doc_link)
                        else:
                            bill.add_document(doc_el.text_content(), doc_link)
                    elif re.search("senators|representatives", doc_link) is not None:
                        # Best-effort sponsor name extraction.
                        with self.lxml_context(doc_link) as senator_page:
                            try:
                                name_tuple = self.scrape_legislator_name(senator_page)
                                bill.add_sponsor('primary', name_tuple[0])
                            except Exception:
                                pass
                    elif re.search("ShowRollCall", doc_link) is not None:
                        # Roll-call links embed "<id>,<bienId>".
                        match = re.search("([0-9]+,[0-9]+)", doc_link)
                        id1, id2 = match.group(0).split(',')
                        url = "http://flooractivityext.leg.wa.gov/rollcall.aspx?id=" + id1 + "&bienId=" + id2
                        with self.lxml_context(url) as vote_page:
                            self.scrape_votes(vote_page, bill, url)

                self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    """Parse a Kentucky bill info page: version link, title, sponsors,
    the action-history paragraph and a vote-history PDF when present.
    """
    with self.urlopen(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)
        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn
            return

        # The title is the first chunk of the paragraph following the
        # version link.
        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        sponsor_links = bill_info.findAll(href=re.compile(
            'legislator/[SH]\d+\.htm'))

        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        # Actions are bare text nodes in the last paragraph after the
        # version link, formatted "Mon DD-<description>".
        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            action_date = dt.datetime.strptime(action_date, '%b %d')
            # Fix: the source omits the year, so derive it from the
            # session string (assumes session starts with a four-digit
            # year, e.g. '2008...' -> '08' -> 2008; TODO confirm).
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            # Map the trailing chamber marker onto an actor.
            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Michigan bills sequentially by number until a bill page
    can no longer be fetched.

    Fix: the previous revision tested ``bill_page == None`` only AFTER
    feeding the page to BeautifulSoup, so the "no more bills" sentinel
    from scrape_bill could never be detected (a soup object never
    equals None). The check now runs on the raw page before parsing;
    the unreachable trailing ``pass`` was removed.
    """
    self.validate_session(session)

    # Senate bills start at 1, House bills at 4001.
    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'
    while True:
        bill_page = self.scrape_bill(session, abbr, bill_no)
        # if we can't find a page, we must be done. This is a healthy thing.
        if bill_page is None:
            return
        bill_page = BeautifulSoup(bill_page)

        title = ''.join(self.flatten(bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill(session, chamber, bill_id, title)

        # sponsors: the first listed legislator is primary, the rest
        # are cosponsors.
        first = 0
        for name in bill_page.findAll(id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
            first = 1

        # versions
        for doc in bill_page.findAll(id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # documents (analysis tables, present only on some bills)
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        self.parse_actions(the_bill, bill_page.findAll(id='frg_billstatus_HistoriesGridView')[0])
        self.save_bill(the_bill)

        bill_no = bill_no + 1
def scrape_bill(self, chamber, session, bill_number, ga_num):
    """Scrape one Tennessee bill: title, primary sponsor, summary
    document, action history and (when linked) votes.
    """
    bill_url = self.urls['info'] % (bill_number, ga_num)
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)
        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(session, chamber, bill_number, title)
        bill.add_source(bill_url)

        # Primary Sponsor: text looks like "...by <name>"; strip the
        # '*' marker if present.
        sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*','').strip()
        bill.add_sponsor('primary',sponsor)

        # Co-sponsors unavailable for scraping (loaded into page via AJAX)

        # Full summary doc
        summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary.get('href'))

        # Actions: skip the header row of the history table.
        tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
        actions_table = tables[0]
        action_rows = actions_table.xpath("tr[position()>1]")
        for ar in action_rows:
            action_taken = ar.xpath("td")[0].text
            action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
            bill.add_action(chamber, action_taken, action_date)

        # Votes page, when linked; scrape_votes returns the bill with
        # votes attached.
        votes_link = page.xpath("//span[@id='lblBillVotes']/a")
        if(len(votes_link) > 0):
            votes_link = votes_link[0].get('href')
            bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """Assemble information on a bill from a number of DBF files.

    Bills are created from MAINBILL, then enriched with sponsors
    (BILLSPON), documents/versions (BILLWP), chamber vote zip files,
    actions (BILLHIST) and subjects (BILLSUBJ) before being saved.
    """

    # Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A...' prefixes are Assembly (lower chamber) bills.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"
        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Stored as a Windows path; keep last directory + filename.
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        # NOTE(review): 'year' is computed but unused here (the HTM URL
        # below uses year_abr directly).
        year = str(year_abr) + str((year_abr + 1))
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
            document.replace('.DOC', '.HTM'))

        # name document based _doctype
        doc_name = self._doctypes[rec['doctype']]
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Senate Votes: one zip per chamber per year of the two-year term;
    # 2010 only has the first-year files.
    file1 = 'A' + str(year_abr)
    file2 = 'A' + str(year_abr + 1)
    file3 = 'S' + str(year_abr)
    file4 = 'S' + str(year_abr + 1)
    if str(year_abr) != '2010':
        vote_info_list = [file1, file2, file3, file4]
    else:
        vote_info_list = [file1, file3]
    for bill_vote_file in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % bill_vote_file
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        # One Vote object per (bill, action) pair, filled roll by roll.
        votes = {}
        if bill_vote_file[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"
        for rec in vdict_file:
            bill_id = rec["Bill"]
            bill_id = bill_id.strip()
            leg = rec["Full_Name"]

            date = rec["Session_Date"]
            date = datetime.strptime(date, "%m/%d/%Y")
            action = rec["Action"]
            leg_vote = rec["Legislator_Vote"]
            vote_id = bill_id + "_" + action
            vote_id = vote_id.replace(" ", "_")
            # passed is decided below once all rolls are tallied.
            passed = None

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, passed, None,
                                      None, None, bill_id=bill_id)
            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """Assemble NJ bills for one two-year term from the legislature's
    DBF exports and per-chamber vote zip files.

    Fix over the previous revision: the shared sources (sponsor,
    document and action DBF URLs) were attached only to whichever bill
    happened to be bound to ``bill`` at that point, and
    ``self.save_bill`` ran exactly once - so only a single bill was
    ever saved. Sources are now added to, and save_bill() called on,
    every bill in bill_dict.
    """
    # Main Bill information
    main_bill_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/MAINBILL.DBF' % (year_abr)
    MAINBILL_dbf, resp = self.urlretrieve(main_bill_url)
    main_bill_db = dbf.Dbf(MAINBILL_dbf)

    # bill_id -> Bill; built from MAINBILL, enriched below.
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A...' prefixes are Assembly (lower chamber) bills.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"
        bill = Bill(str(session), chamber, bill_id, title)
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLSPON.DBF' % (year_abr)
    SPONSORS_dbf, resp = self.urlretrieve(bill_sponsors_url)
    bill_sponsors_db = dbf.Dbf(SPONSORS_dbf)

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLWP.DBF' % (year_abr)
    DOC_dbf, resp = self.urlretrieve(bill_document_url)
    bill_document_db = dbf.Dbf(DOC_dbf)

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Stored as a Windows path; keep last directory + filename.
        document = document.split('\\')
        doc_name = document[-1]
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        doc_url = "ftp://www.njleg.state.nj.us/%s" % year
        doc_url = doc_url + "/" + document
        bill.add_document(doc_name, doc_url)

    # Senate Votes: one zip per chamber per year of the term; 2010
    # only has the first-year files.
    file1 = 'A' + str(year_abr)
    file2 = 'A' + str(year_abr + 1)
    file3 = 'S' + str(year_abr)
    file4 = 'S' + str(year_abr + 1)
    if str(year_abr) != '2010':
        vote_info_list = [file1, file2, file3, file4]
    else:
        vote_info_list = [file1, file3]
    for bill_vote_file in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % bill_vote_file
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        # One Vote object per (bill, action) pair, filled roll by roll.
        votes = {}
        if bill_vote_file[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"
        for rec in vdict_file:
            bill_id = rec["Bill"].strip()
            leg = rec["Full_Name"]
            date = datetime.strptime(rec["Session_Date"], "%m/%d/%Y")
            action = rec["Action"]
            leg_vote = rec["Legislator_Vote"]
            vote_id = (bill_id + "_" + action).replace(" ", "_")
            # passed is decided once all rolls are tallied.
            passed = None

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, passed,
                                      None, None, None, bill_id=bill_id)
            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # Counts yes/no/other votes and attaches each vote to its bill.
        for vote in votes.itervalues():
            vote["yes_count"] = len(vote["yes_votes"])
            vote["no_count"] = len(vote["no_votes"])
            vote["other_count"] = len(vote["other_votes"])
            if vote["yes_count"] > vote["no_count"]:
                vote["passed"] = True
            else:
                vote["passed"] = False
            bill_dict[vote["bill_id"]].add_vote(vote)

    # Actions
    bill_action_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLHIST.DBF' % (year_abr)
    ACTION_dbf, resp = self.urlretrieve(bill_action_url)
    bill_action_db = dbf.Dbf(ACTION_dbf)

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        bill.add_action(actor, action, date, comment=comment)

    # Attach the shared sources and save EVERY bill (previously only
    # the last bill touched was saved).
    for bill in bill_dict.itervalues():
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    """Scrapes documents, actions, vote counts and votes for a given
    bill from the DocumentsForBill listing page.
    """
    session_id = self.get_session_id(session)
    url = BASE_URL + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
        bill_id.replace(' ', ''), session_id)
    with self.urlopen(url) as docs_for_bill:
        root = html.fromstring(docs_for_bill)
        bill_title = root.xpath(
            '//div[@class="ContentPageTitle"]')[1].text.strip()
        b_type = utils.get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, bill_title, type=b_type)
        bill.add_source(url)
        # Row-matching templates: by cell text, by direct link href, and
        # by link href nested in a <font>.
        path = '//tr[contains(td/font/text(), "%s")]'
        link_path = '//tr[contains(td/a/@href, "%s")]'
        link_path2 = '//tr[contains(td/font/a/@href, "%s")]'

        # versions
        rows = root.xpath(path % 'd Version')
        for row in rows:
            tds = row.cssselect('td')
            bill_version = tds[1].text_content().strip()
            bill_html = tds[2].xpath('string(font/a/@href)')
            bill.add_version(bill_version, bill_html)

        # fact sheets and summary
        rows = root.xpath(link_path2 % '/summary/')
        for row in rows:
            tds = row.cssselect('td')
            fact_sheet = tds[1].text_content().strip()
            fact_sheet_url = tds[1].xpath('string(font/a/@href)')
            bill.add_document(fact_sheet, fact_sheet_url, type="summary")

        # agendas
        # skipping revised, cancelled, date, time and room from agendas
        # but how to get the agenda type cleanly? meaning whether it is
        # house or senate?
        rows = root.xpath(link_path % '/agendas')
        for row in rows:
            tds = row.cssselect('td')
            agenda_committee = tds[0].text_content().strip()
            # Prefer the HTML link in column 8; fall back to column 7.
            agenda_html = tds[7].xpath('string(a/@href)').strip()
            if agenda_html == '':
                agenda_html = tds[6].xpath('string(a/@href)').strip()
            bill.add_document(agenda_committee, agenda_html)

        # House Calendars
        # skipping calendar number, modified, date
        rows = root.xpath(link_path % '/calendar/h')
        for row in rows:
            tds = row.cssselect('td')
            calendar_name = tds[0].text_content().strip()
            calendar_html = tds[5].xpath('string(a/@href)')
            bill.add_document(calendar_name, calendar_html,
                              type='house calendar')

        # Senate Calendars
        # skipping calendar number, modified, date
        rows = root.xpath(link_path % '/calendar/s')
        for row in rows:
            tds = row.cssselect('td')
            calendar_name = tds[0].text_content().strip()
            calendar_html = tds[5].xpath('string(a/@href)')
            bill.add_document(calendar_name, calendar_html,
                              type='senate calendar')

        # amendments
        rows = root.xpath(path % 'AMENDMENT:')
        for row in rows:
            tds = row.cssselect('td')
            amendment_title = tds[1].text_content().strip()
            amendment_link = tds[2].xpath('string(font/a/@href)')
            bill.add_document(amendment_title, amendment_link,
                              type='amendment')

        # videos
        # http://azleg.granicus.com/MediaPlayer.php?view_id=13&clip_id=7684
        rows = root.xpath(link_path % '&clip_id')
        for row in rows:
            tds = row.cssselect('td')
            video_title = tds[1].text_content().strip()
            video_link = tds[2].xpath('string(a/@href)')
            video_date = tds[0].text_content().strip()
            bill.add_document(video_title, video_link, date=video_date,
                              type='video')

        # Actions (and the eventual save) are handled by scrape_actions.
        self.scrape_actions(chamber, session, bill)
def scrape(self, chamber, year):
    """Scrape New Mexico bills for one chamber and year via the Bill
    Locator, following each session and bill link found there.
    """
    if year not in metadata['sessions']:
        raise NoDataForYear(year)

    # Senate bill ids start with 'S', House ids with 'H'.
    start_char = 'S' if chamber == 'upper' else 'H'

    nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with self.urlopen(nm_locator_url) as page:
        page = BeautifulSoup(page)
        #The first `tr` is simply 'Bill Locator`. Ignoring that
        data_table = page.find('table', id = 'ctl00_mainCopy_Locators')('tr')[1:]
        for session in data_table:
            session_tag = session.find('a')
            session_name = ' '.join([tag.string.strip() for tag in session_tag('span')]).strip()
            # Only sessions whose name mentions the requested year.
            if year not in session_name:
                continue

            session_url = get_abs_url(nm_locator_url, session_tag['href'])
            with self.urlopen(session_url) as session_page:
                session_page = BeautifulSoup(session_page)
                # Skip the header row of the bill grid.
                bills_data_table = session_page.find('table', id = 'ctl00_mainCopy_LocatorGrid')('tr')[1:]
                # NOTE(review): `bill` is first the soup row and later
                # rebound to the Bill object below.
                for bill in bills_data_table:
                    data = bill('td')
                    bill_num_link = data[0].find('a')
                    bill_num = ''.join([tag.string.strip() if tag.string else '' for tag in bill_num_link('span')]).strip()
                    #For now, removing the '*' in front of the bill
                    # (* means emergency)
                    bill_num = bill_num[1:] if bill_num.startswith('*') else bill_num
                    if not bill_num.startswith(start_char):
                        self.log('Skipping %s. This bill is not for the relevant chamber %s.' % (bill_num, chamber))
                        continue
                    bill_title = data[1].string.strip()
                    bill_url = get_abs_url(session_url, bill_num_link['href'].replace(' ', ''))
                    bill = Bill(session = session_name, chamber = chamber, bill_id = bill_num, title = bill_title)
                    bill.add_source(bill_url)

                    with self.urlopen(bill_url) as bill_page:
                        bill_page = BeautifulSoup(bill_page)
                        sponsor_data = bill_page.find('table', id = 'ctl00_mainCopy__SessionFormView')
                        #The last link in this block will be the link to 'Key to Abbreviations'. Ignoring it.
                        for sponsor_link in sponsor_data('a')[:-1]:
                            #We will always have one extra 'a' tag than required - and it's 'span' strings will be empty.
                            #need to check for that condition.
                            sponsor_name = ' '.join([tag.string.strip() if tag.string else '' for tag in sponsor_link('span')]).strip()
                            if sponsor_name != '':
                                bill.add_sponsor(type = 'primary', name = sponsor_name)

                        # Introduced version is always present.
                        bill.add_version(**self.get_doc_data(bill_url, bill_page.find('table', id = 'ctl00_mainCopy_Introduced')))

                        # Committee reports, fiscal impact reports and a
                        # final version are all optional.
                        committee_data = bill_page.find('table', id = 'ctl00_mainCopy_CommReportsList')
                        if committee_data:
                            for comms_data in committee_data('tr'):
                                bill.add_document(**self.get_doc_data(bill_url, comms_data))

                        fir_data = bill_page.find('table', id = 'ctl00_mainCopy_FIRs')
                        if fir_data:
                            bill.add_document(**self.get_doc_data(bill_url, fir_data))

                        fin_ver_data = bill_page.find('table', id = 'ctl00_mainCopy_FinalVersion')
                        if fin_ver_data:
                            bill.add_version(**self.get_doc_data(bill_url, fin_ver_data))

                    self.save_bill(bill)
def scrape_bills(self, chamber_to_scrape, session):
    """Scrape all Mississippi measures for one chamber of a session
    from the all_measures XML index, then each bill's detail XML.
    """
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

    with self.urlopen(url) as bill_dir_page:
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())

        for mr in root.xpath('//lastaction/msrgroup'):
            bill_id = mr.xpath('string(measure)').replace(" ", "")
            # First id letter gives the chamber, second the measure type.
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {'B':'bill', 'C': 'concurrent resolution',
                         'R': 'resolution',
                         'N': 'nomination'}[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath('string(actionlink)').replace("..", "")
            main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
            with self.urlopen(bill_details_url) as details_page:
                # Detail XML is latin-1; re-encode before parsing.
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                title = details_root.xpath('string(//shorttitle)')
                longtitle = details_root.xpath('string(//longtitle)')

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type, longtitle=longtitle)

                # sponsors
                # NOTE(review): 'type' below shadows the builtin.
                main_sponsor = details_root.xpath('string(//p_name)').split()
                if main_sponsor:
                    main_sponsor = main_sponsor[0]
                    main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                    main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                    type = "primary"
                    bill.add_sponsor(type, main_sponsor, main_sponsor_url = main_sponsor_url)
                for author in details_root.xpath('//authors/additional'):
                    leg = author.xpath('string(co_name)').replace(" ", "_")
                    leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                    type = "cosponsor"
                    bill.add_sponsor(type, leg, leg_url=leg_url)

                # Versions: current and introduced are always added;
                # the rest only when their path points at a document.
                curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version("Current version", curr_version_url)

                intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version("As Introduced", intro_version_url)

                comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                if comm_version.find("documents") != -1:
                    comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                    bill.add_version("Committee Substitute", comm_version_url)

                passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                if passed_version.find("documents") != -1:
                    passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                    # NOTE(review): chamber is 'upper'/'lower' here, so
                    # this reads "As Passed the upper" - confirm intent.
                    title = "As Passed the " + chamber
                    bill.add_version(title, passed_version_url)

                asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                if asg_version.find("documents") != -1:
                    asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                    bill.add_version("Approved by the Governor", asg_version_url)

                # Actions: "<mm/dd> (H|S) description"; year comes from
                # the first four characters of the session string.
                for action in details_root.xpath('//history/action'):
                    action_num = action.xpath('string(act_number)').strip()
                    action_num = int(action_num)
                    act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                    action_desc = action.xpath('string(act_desc)')
                    date, action_desc = action_desc.split(" ", 1)
                    date = date + "/" + session[0:4]
                    date = datetime.strptime(date, "%m/%d/%Y")
                    if action_desc.startswith("(H)"):
                        actor = "lower"
                        action = action_desc[4:]
                    elif action_desc.startswith("(S)"):
                        actor = "upper"
                        action = action_desc[4:]
                    else:
                        actor = "executive"
                        action = action_desc

                    # Veto actions carry a link to the vetoed document.
                    if action.find("Veto") != -1:
                        version_path = details_root.xpath("string(//veto_other)")
                        version_path = version_path.replace("../../../../", "")
                        version_url = "http://billstatus.ls.state.ms.us/" + version_path
                        bill.add_document("Veto", version_url)

                    # Categorize via the first matching prefix.
                    atype = 'other'
                    for prefix, prefix_type in self._action_types:
                        if action.startswith(prefix):
                            atype = prefix_type
                            break

                    bill.add_action(actor, action, date,
                                    type=atype,
                                    action_num=action_num)

                    if act_vote:
                        vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                        vote = self.scrape_votes(vote_url, action, date, actor)
                        bill.add_vote(vote)
                        bill.add_source(vote_url)

                bill.add_source(bill_details_url)
                self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a single Arizona bill.

    Collects versions, fact sheets, agendas, calendars, amendments and
    videos from the DocumentsForBill page, then sponsors, actions and
    votes from the FormatDocument actions page, and saves the bill.

    chamber: 'upper' or 'lower'; bill_id: e.g. 'HB2001'.
    """
    session_id = self.get_session_id(session)
    url = base_url + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
        bill_id, session_id)
    with self.urlopen(url) as docs_for_bill:
        root = html.fromstring(docs_for_bill)
        bill_title = root.xpath(
            '//div[@class="ContentPageTitle"]')[1].text.strip()
        # Depending on the progress the bill has made through the house
        # some tables might not exist; the links that have javascript:Show****
        # toggle a table with related documents/calendars/agendas/versions.
        # The sponsors link is skipped because that information is on the
        # bill overview page where all of the actions are found.
        doc_section_links = root.xpath(
            '//a[contains(@href, "javascript:Show")]')
        bill = Bill(session, chamber, bill_id, bill_title)
        bill.type = self.get_bill_type(bill_id[:-4])
        bill.add_source(url)
        for link in doc_section_links:
            link_id = utils.parse_link_id(link)
            link_text = link.text_content().strip()
            div_path = '//div[@id="%s"]/table//tr' % link_id
            if link_text == 'Show Versions':
                # the first row holds only a comment
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 4:
                        bill_version = tds[1].text_content().strip()
                        bill_html = tds[2].xpath('string(font/a/@href)')
                        bill_pdf = tds[3].xpath('string(font/a/@href)')
                        bill.add_version(bill_version, bill_html,
                                         pdf_url=bill_pdf)
            elif link_text == 'Show Summaries/Fact Sheets':
                # the first row holds only a comment
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    if len(tds) > 1:
                        fact_sheet = tds[1].text_content().strip()
                        fact_sheet_url = tds[1].xpath(
                            'string(font/a/@href)')
                        bill.add_document(fact_sheet, fact_sheet_url,
                                          type="fact sheet")
            elif link_text in ('Show Senate Agendas', 'Show House Agendas'):
                # FIX: link_text begins with "Show", so the previous
                # re.match('House', ...) (anchored at the start) never
                # matched and every agenda was labeled 'Senate Agenda';
                # re.search finds 'House' anywhere in the string.
                agenda_type = ('House Agenda'
                               if re.search('House', link_text)
                               else 'Senate Agenda')
                # first row is a comment, second row is the table header
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 8:
                        agenda_committee = tds[0].text_content().strip()
                        agenda_revised = tds[1].text.strip()
                        agenda_cancelled = tds[2].text.strip()
                        agenda_date = tds[3].text_content().strip()
                        agenda_time = tds[4].text_content().strip()
                        agenda_room = tds[5].text_content().strip()
                        agenda_pdf = tds[6].xpath('string(a/@href)').strip()
                        agenda_html = tds[7].xpath('string(a/@href)').strip()
                        bill.add_document(agenda_committee, agenda_html,
                                          type=agenda_type)
            elif link_text in ('Show Senate Calendars', 'Show House Calendar'):
                # FIX: same re.match -> re.search anchoring bug as the
                # agendas branch above.
                cal_type = ('house calendar'
                            if re.search('House', link_text)
                            else 'senate calendar')
                # first row is a comment, second row is the table header
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 6:
                        calendar_name = tds[0].text_content().strip()
                        calendar_number = tds[1].text_content().strip()
                        calendar_modified = True if tds[2].xpath('img') else False
                        calendar_date = tds[3].text_content().strip()
                        calendar_html = tds[5].xpath('string(a/@href)')
                        bill.add_document(calendar_name, calendar_html,
                                          type="calendar")
            elif link_text == 'Show Adopted Amendments':
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    # guard against short rows, matching the
                    # proposed-amendments branch below
                    if len(tds) >= 3:
                        amendment_title = tds[1].text_content().strip()
                        amendment_link = tds[2].xpath('string(font/a/@href)')
                        bill.add_document(amendment_title, amendment_link,
                                          type='amendment')
            elif link_text == 'Show Proposed Amendments':
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 3:
                        amendment_title = tds[1].text_content().strip()
                        amendment_link = tds[2].xpath('string(font/a/@href)')
                        bill.add_document(amendment_title, amendment_link,
                                          type='amendment')
            elif link_text == 'Show Bill Videos':
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 3:
                        video_title = tds[1].text_content().strip()
                        video_link = tds[2].xpath('string(a/@href)')
                        video_date = tds[0].text_content().strip()
                        bill.add_document(video_title, video_link,
                                          date=video_date, type='video')

    # example action_url:
    # http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49leg/2r/bills/hb2001o.asp
    # again the actions page may or may not have a given table and the order
    # of the actions depends on the chamber the bill originated in.
    ses_num = utils.legislature_to_number(session)
    action_url = base_url + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (
        ses_num, bill_id.lower())
    with self.urlopen(action_url) as action_page:
        bill.add_source(action_url)
        root = html.fromstring(action_page)
        action_tables = root.xpath(
            '/html/body/div/table/tr[3]/td[4]/table/tr/td/table/tr/td/table')
        for table in action_tables:
            rows = table.cssselect('tr')
            # `house` tracks which chamber the bill is currently in
            house = False if chamber == 'upper' else True
            # table header cell, minus its trailing colon
            action = table.cssselect('td')[0].text_content().strip()[:-1]
            if action == 'SPONSORS':
                if len(rows[0]) == 4:
                    for row in rows:
                        tds = row.cssselect('td')
                        sponsors = [tds[i:i + 2:] for i in range(1, len(tds), 2)]
                        # NOTE(review): only the first (link, type) pair per
                        # row is recorded — verify rows never carry a second
                        # sponsor that should also be added.
                        bill.add_sponsor(
                            sponsors[0][1].text_content().strip(),
                            sponsors[0][0].text_content().strip(),
                            sponsor_link=sponsors[0][0].xpath('string(a/@href)'))
            elif action == 'COMMITTEES':
                # the html for this table has meta tags that give the chamber
                # and the committee abbreviation:
                #   <meta name="HCOMMITTEE" content="RULES">
                for row in rows[1:]:
                    tds = row.cssselect('td')
                    meta_tag = row.cssselect('meta')[0]
                    actor = "%s:%s" % (meta_tag.get('name'),
                                       meta_tag.get('content'))
                    committee = meta_tag.get('content')
                    # FIX: action text was misspelled 'committee:reffered'
                    act = 'committee:referred'
                    date = datetime.datetime.strptime(
                        tds[1].text_content().strip(), '%m/%d/%y')
                    bill.add_action(actor, act, date,
                                    type='committee:referred')
                    if len(tds) == 5:
                        # five cells: a committee action, possibly with its
                        # own date in the fourth cell
                        if re.match(r'\d{2}/\d{2}/\d{2}',
                                    tds[3].text_content().strip()):
                            date = datetime.datetime.strptime(
                                tds[3].text_content().strip(), '%m/%d/%y')
                        else:
                            date = datetime.datetime.strptime(
                                tds[1].text_content().strip(), '%m/%d/%y')
                        act = tds[4].text_content().strip()
                        status = 'other'
                        bill.add_action(actor, act, date, type=status,
                                        status=status)
                    elif len(tds) == 6:
                        # six cells: a committee vote with a tally like
                        # (6-0-1-0) or (6-0-0-1-0)
                        where, committee = actor.split(':')
                        where = 'lower' if where == 'HCOMMITTEE' else 'upper'
                        date = datetime.datetime.strptime(
                            tds[3].text_content().strip(), '%m/%d/%y')
                        vote = tds[4].text_content().strip()[1:-1]
                        if len(vote.split('-')) == 4:
                            yes, no, nv, exc = vote.split('-')
                        else:
                            yes, no, excused, absent, nv = vote.split('-')
                        motion = tds[5].text_content().strip()
                        # FIX: compare the tallies numerically; the previous
                        # string comparison made e.g. '9' > '10' true.
                        passed = int(yes) > int(no)
                        vote = Vote(where, date, motion, passed, int(yes),
                                    int(no), int(nv), committee=committee)
                        vote.add_source(
                            tds[0].xpath('string(a/@href)').strip())
                        bill.add_vote(vote)
            elif action in ('HOUSE FIRST READ', 'HOUSE SECOND READ'):
                aType = 'other'
                # NOTE(review): the house branch marks FIRST read as the
                # referral while the senate branch below marks SECOND read —
                # confirm this asymmetry is intended.
                if re.search('HOUSE FIRST', action):
                    aType = 'committee:referred'
                bill.add_action('lower', action, utils.get_date(rows[0][1]),
                                type=aType)
            elif action in ('SENATE FIRST READ', 'SENATE SECOND READ'):
                aType = 'other'
                if re.search('SECOND', action):
                    aType = 'committee:referred'
                bill.add_action('upper', action, utils.get_date(rows[0][1]),
                                type=aType)
            elif action in ('TRANSMIT TO HOUSE', 'TRANSMIT TO SENATE'):
                # FIX: re.match('HOUSE', 'TRANSMIT TO HOUSE') never matched
                # (the pattern is anchored at the start of the string), so
                # the actor and the `house` flag were always set as if the
                # bill were transmitted to the Senate; re.search fixes both.
                actor = 'lower' if re.search('HOUSE', action) else 'upper'
                house = True if actor == 'lower' else False
                date = utils.get_date(rows[0][1])
                bill.add_action(actor, action, date)
            elif re.match(r'COW ACTION \d', action):
                # Committee of the Whole actions
                actor = 'lower' if house else 'upper'
                for row in rows[1:]:
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date,
                                    motion=row[2].text_content().strip())
            elif action in ('HOUSE FINAL READ', 'SENATE FINAL READ',
                            'THIRD READ'):
                actor = 'lower' if house else 'upper'
                for row in rows[1:]:
                    if row[0].text_content().strip() == 'Vote Detail':
                        # 10 cells: plain passage vote; 11 cells include an
                        # extra 'amended' column.
                        if len(row.getchildren()) == 10:
                            (detail, date, ayes, nays, nv, exc, emer, rfe,
                             two_thirds, result) = [
                                x.text_content().strip() for x in row]
                            # (dropped a leftover debug `print action_url`)
                            passed = True if result == 'PASSED' else False
                            motion = action
                            date = datetime.datetime.strptime(
                                date, '%m/%d/%y') if date else ''
                            vote = Vote(actor, date, motion, passed,
                                        int(ayes), int(nays), int(nv),
                                        excused=int(exc), emergency=emer,
                                        rfe=rfe, two_thirds_vote=two_thirds,
                                        type="passage")
                            vote.add_source(
                                row[0].xpath('string(a/@href)').strip())
                            bill.add_vote(vote)
                        elif len(row.getchildren()) == 11:
                            (detail, date, ayes, nays, nv, exc, emer, amend,
                             rfe, two_thirds, result) = [
                                x.text_content().strip() for x in row]
                            passed = True if result == 'PASSED' else False
                            motion = action
                            date = datetime.datetime.strptime(
                                date, '%m/%d/%y') if date else ''
                            vote = Vote(actor, date, motion, passed,
                                        int(ayes), int(nays), int(nv),
                                        excused=int(exc), emergency=emer,
                                        amended=amend, rfe=rfe,
                                        two_thirds_vote=two_thirds,
                                        type="passage")
                            vote.add_source(
                                row[0].xpath('string(a/@href)').strip())
                            bill.add_vote(vote)
            elif action == 'TRANSMITTED TO':
                # transmission to the governor
                actor = 'lower' if house else 'upper'
                act = action + ": " + rows[0][1].text_content().strip()
                date = rows[0][2].text_content().strip()
                date = datetime.datetime.strptime(date, '%m/%d/%y')
                bill.add_action(actor, act, date, type='governor:received')
                # collect the governor's action and the chapter/chaptered
                # version if they exist
                act, date, chapter, version = '', '', '', ''
                for row in rows[1:]:
                    if row[0].text_content().strip() == 'ACTION:':
                        act = row[1].text_content().strip()
                        date = datetime.datetime.strptime(
                            row[2].text_content().strip(), '%m/%d/%y')
                    elif row[0].text_content().strip() == 'CHAPTER':
                        chapter = row[1].text_content().strip()
                    elif row[0].text_content().strip() == 'CHAPTERED VERSION':
                        # FIX: text_content is a method; the old code accessed
                        # it without calling it, so .strip() raised
                        # AttributeError whenever this row appeared.
                        version = row[1].text_content().strip()
                if act:
                    action_type = ('governor:signed' if act == 'SIGNED'
                                   else 'governor:vetoed')
                    if chapter:
                        bill.add_action('governor', act, date,
                                        type=action_type, chapter=chapter,
                                        chaptered_version=version)
                    else:
                        bill.add_action('governor', act, date,
                                        type=action_type)
    self.save_bill(bill)
    self.log("saved: " + bill['bill_id'])
def scrape_bills(self, session):
    """Scrape all Mississippi bills for *session*.

    Reads the all-measures XML index, then for each measure fetches the
    per-bill detail XML and extracts title, sponsors, document versions,
    actions and votes before saving the bill.
    """
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session
    with self.urlopen(url) as bill_dir_page:
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
    for mr in root.xpath('//lastaction/msrgroup'):
        bill_id = mr.xpath('string(measure)').replace(" ", "")
        # Senate measure ids start with "S", House ids with "H".
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"
        link = mr.xpath('string(actionlink)').replace("..", "")
        main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
        main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
        bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (
            session, link)
        with self.urlopen(bill_details_url) as details_page:
            # Detail pages are latin-1 encoded; re-encode so the parser
            # sees clean utf-8.
            details_page = details_page.decode('latin1').encode('utf8', 'ignore')
            details_root = lxml.etree.fromstring(
                details_page, lxml.etree.HTMLParser())
            title = details_root.xpath('string(//shorttitle)')
            longtitle = details_root.xpath('string(//longtitle)')
            bill = Bill(session, chamber, bill_id, title,
                        longtitle=longtitle)

            # --- Sponsors ---
            main_sponsor = details_root.xpath('string(//p_name)').split()[0]
            main_sponsor_link = details_root.xpath(
                'string(//p_link)').replace(" ", "_")
            main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (
                session, main_sponsor_link)
            # renamed from `type` to stop shadowing the builtin
            sponsor_type = "Primary sponsor"
            bill.add_sponsor(sponsor_type, main_sponsor,
                             main_sponsor_url=main_sponsor_url)
            for author in details_root.xpath('//authors/additional'):
                leg = author.xpath('string(co_name)').replace(" ", "_")
                leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (
                    session, leg)
                sponsor_type = "additional sponsor"
                bill.add_sponsor(sponsor_type, leg, leg_url=leg_url)

            # --- Versions ---
            curr_version = details_root.xpath(
                'string(//current_other)').replace("../../../../", "")
            curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
            bill.add_version("Current version", curr_version_url)
            intro_version = details_root.xpath(
                'string(//intro_other)').replace("../../../../", "")
            intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
            bill.add_version("As Introduced", intro_version_url)
            # The remaining versions only exist once the bill has made
            # progress; a real link contains "documents".
            comm_version = details_root.xpath(
                'string(//cmtesub_other)').replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version("Committee Substitute", comm_version_url)
            passed_version = details_root.xpath(
                'string(//passed_other)').replace("../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                # dedicated name instead of clobbering `title`
                passed_title = "As Passed the " + chamber
                bill.add_version(passed_title, passed_version_url)
            asg_version = details_root.xpath(
                'string(//asg_other)').replace("../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version("Approved by the Governor", asg_version_url)

            # --- Actions ---
            for action in details_root.xpath('//history/action'):
                action_num = int(action.xpath('string(act_number)').strip())
                action_desc = action.xpath('string(act_desc)')
                act_vote = action.xpath(
                    'string(act_vote)').replace("../../../..", "")
                # act_desc starts with a month/day date; the year comes
                # from the session string.
                date = action_desc.split()[0] + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")
                try:
                    # chamber marker letter, e.g. the "H" in "(H)";
                    # descriptions without one are executive actions
                    actor = action_desc.split()[2][1]
                    if actor == "H":
                        actor = "lower"
                    else:
                        actor = "upper"
                except IndexError:
                    # was a bare except:; only a missing/short token is an
                    # expected failure here
                    actor = "Executive"
                action = action_desc[10:]
                if action.find("Veto") != -1:
                    version_path = details_root.xpath("string(//veto_other)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document("Veto", version_url)
                bill.add_action(actor, action, date, action_num=action_num)
                vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                # an empty act_vote leaves just the bare host -> no vote page
                if vote_url != "http://billstatus.ls.state.ms.us":
                    vote = self.scrape_votes(vote_url, action, date, actor)
                    bill.add_vote(vote)
                    bill.add_source(vote_url)
            # record where the data came from (was missing before save)
            bill.add_source(bill_details_url)
            self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Washington bills for one chamber of *session*.

    The daily-status page lists every bill; Senate bill numbers begin
    with 5-9 and House numbers with 1-4, so the chamber picks the
    number-range regex used to filter bill links.
    """
    sep = "<h1>House</h1>"
    if chamber == "upper":
        reg = "[5-9]"
    else:
        reg = "[1-4]"
    year = str(year_from_session(session))
    status_url = "http://apps.leg.wa.gov/billinfo/dailystatus.aspx?year=" + year
    with self.urlopen(status_url) as page_html:
        page = lxml.html.fromstring(separate_content(page_html, sep))
    for element, attribute, link, pos in page.iterlinks():
        if re.search("bill=" + reg + "[0-9]{3}", link) is not None:
            bill_page_url = "http://apps.leg.wa.gov/billinfo/" + link
            with self.urlopen(bill_page_url) as bill_page_html:
                bill_page = lxml.html.fromstring(bill_page_html)
                # The <title> starts with the bill id as two words,
                # e.g. "HB 1001 ...".
                raw_title = bill_page.cssselect("title")
                split_title = raw_title[0].text_content().split(" ")
                bill_id = split_title[0] + " " + split_title[1]
                bill_id = bill_id.strip()
                title_element = bill_page.get_element_by_id(
                    "ctl00_ContentPlaceHolder1_lblSubTitle")
                title = title_element.text_content()
                bill = Bill(session, chamber, bill_id, title)
                bill.add_source(bill_page_url)
                self.scrape_actions(bill_page, bill)
                # loop vars renamed so the outer iterlinks tuple is not
                # shadowed
                for doc_elem, _attr, doc_link, _pos in bill_page.iterlinks():
                    if re.search("billdocs", doc_link) is not None:
                        if re.search("Amendments", doc_link) is not None:
                            bill.add_document(
                                "Amendment: " + doc_elem.text_content(),
                                doc_link)
                        elif re.search("Bills", doc_link) is not None:
                            bill.add_version(doc_elem.text_content(),
                                             doc_link)
                        else:
                            bill.add_document(doc_elem.text_content(),
                                              doc_link)
                    elif re.search("senators|representatives",
                                   doc_link) is not None:
                        with self.urlopen(doc_link) as senator_page_html:
                            senator_page = lxml.html.fromstring(
                                senator_page_html)
                            try:
                                name_tuple = self.scrape_legislator_name(
                                    senator_page)
                                bill.add_sponsor("primary", name_tuple[0])
                            except Exception:
                                # best effort: an unparseable sponsor page
                                # must not abort the whole bill (narrowed
                                # from a bare except:)
                                pass
                    elif re.search("ShowRollCall", doc_link) is not None:
                        # roll-call links embed two comma-separated ids
                        ids = re.search("([0-9]+,[0-9]+)",
                                        doc_link).group(0).split(",")
                        id1 = ids[0]
                        id2 = ids[1]
                        url = votes_url(id1, id2)
                        with self.urlopen(url) as vote_page_html:
                            vote_page = lxml.html.fromstring(vote_page_html)
                            self.scrape_votes(vote_page, bill, url)
                self.save_bill(bill)