def scrape_bill(self, chamber, session):
    """Scrape Arkansas bills from the legislature's pipe-delimited FTP dump.

    Iterates every row of LegislativeMeasures.txt, keeping only rows for
    the requested chamber and the current session slug.
    """
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.urlopen(url)
    page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')

    for row in page:
        # column 0 is the chamber letter (H/S)
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue

        # columns 0-2 combine into e.g. "HB 1001"
        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        # the letters after H/S encode the measure type
        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial resolution',
            'CMR': 'concurrent memorial resolution'}[type_spec]

        # last column is the session slug; skip rows from other sessions
        if row[-1] != self.slug:
            continue

        bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
        bill.add_source(url)

        # assumes column 11 is the primary sponsor, with column 12 as a
        # fallback when it is blank -- TODO confirm against the data file
        primary = row[11]
        if not primary:
            primary = row[12]
        bill.add_sponsor('primary', primary)

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (
                           session, bill_id.replace(' ', '')))
        bill.add_version(bill_id, version_url, mimetype='application/pdf')

        self.scrape_bill_page(bill)
        self.bills[bill_id] = bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Maryland bill detail page into a Bill and save it."""
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    title = doc.xpath('//h3[@class="h3billright"]')[0].text
    # TODO: grab summary (none present at time of writing)

    # infer the measure type from the bill id prefix letters
    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(session, chamber, bill_id, title, type=_type)
    bill.add_source(url)

    # process sponsors: strip the title words, then split the name list;
    # the first name is the primary sponsor, the rest are cosponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'
    for sponsor in re.split(', (?:and )?', sponsors):
        #self.debug('sponsor: %s', sponsor)
        bill.add_sponsor(sponsor_type, sponsor)
        sponsor_type = 'cosponsor'

    # subjects; drop any "-see also-" cross references
    subjects = _get_td(doc, 'Narrow Subject(s):').xpath('a/text()')
    bill['subjects'] = [s.split(' -see also-')[0] for s in subjects if s]

    # documents live on the stab=02 tab of the same page
    self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions live on the stab=03 tab
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

    self.save_bill(bill)
def parse_bill(self, chamber, session, special, link):
    """Parse one Pennsylvania bill from its index link and save it.

    Fetches the bill info page, builds the Bill, then delegates to the
    version, history, and vote parsers.
    """
    bill_num = link.text.strip()
    type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

    if type_abbr == 'B':
        btype = ['bill']
    elif type_abbr == 'R':
        btype = ['resolution']

    bill_id = "%s%s %s" % (bill_abbr(chamber), type_abbr, bill_num)

    url = info_url(chamber, session, special, type_abbr, bill_num)
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    xpath = '//div[contains(@class, "BillInfo-ShortTitle")]/div[@class="BillInfo-Section-Data"]'
    title = page.xpath(xpath).pop().text_content().strip()
    if not title:
        # some index entries point at empty pages; skip them
        return
    bill = Bill(session, chamber, bill_id, title, type=btype)
    bill.add_source(url)

    self.parse_bill_versions(bill, page)

    self.parse_history(
        bill, history_url(chamber, session, special, type_abbr, bill_num))

    # only fetch votes if votes were seen in history
    # if vote_count:
    self.parse_votes(
        bill, vote_url(chamber, session, special, type_abbr, bill_num))

    # Dedupe sources while preserving order.  (The previous code removed
    # items from the list while iterating over it, which skips elements
    # and can leave duplicates behind.)
    sources = bill['sources']
    deduped = []
    for source in sources:
        if source not in deduped:
            deduped.append(source)
    sources[:] = deduped

    self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    """Parse a Missouri Senate bill page (BeautifulSoup-based) and save it."""
    with self.urlopen(bill_url) as bill_page:
        bill_page = BeautifulSoup(bill_page)
        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor
        bill_sponsor = bill_page.find(id="hlSponsor").i.font.contents[0]
        # NOTE(review): `.href` on a BeautifulSoup Tag looks up a child
        # *tag* named "href", not the attribute -- this likely yields None.
        # Confirm; the attribute form is tag['href'] / tag.get('href').
        bill_sponsor_link = bill_page.find(id="hlSponsor").href
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        # NOTE(review): `'href' in cosponsor_tag` tests Tag *contents*,
        # not attributes, so this branch may never fire -- verify.
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and 'href' in cosponsor_tag:
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    """Build a Bill from a Montana status page and return it (unsaved)."""
    status_page = ElementTree(
        lxml.html.fromstring(self.urlopen(status_url)))

    def cell_text(primary, fallback):
        # Some status pages lay the table out differently (see 2007 HB 2);
        # fall back to the alternate cell when the primary lookup is empty.
        try:
            return status_page.xpath(primary)[0].text_content()
        except IndexError:
            return status_page.xpath(fallback)[0].text_content()

    bill_id = cell_text("//tr[2]/td[2]", '//tr[1]/td[2]')
    title = cell_text("//form[1]/table[2]/tr[3]/td[2]", '//tr[1]/td[2]')

    bill = Bill(session, chamber, bill_id, title)
    bill.add_source(bill_url)
    self.add_sponsors(bill, status_page)
    self.add_actions(bill, status_page)

    return bill
def scrape(self, chamber, session):
    """Page through the open.nysenate.gov search API and save bills."""
    try:
        # walk result pages until the API 404s (caught below)
        for index in xrange(1, 1000):
            url = ("http://open.nysenate.gov/legislation/search/"
                   "?search=otype:bill&searchType=&format=xml"
                   "&pageIdx=%d" % index)

            with self.urlopen(url) as page:
                page = lxml.etree.fromstring(page)

                for result in page.xpath("//result[@type = 'bill']"):
                    # id looks like "S1234-2011"; keep the bill part only
                    id = result.attrib['id'].split('-')[0]

                    title = result.attrib['title'].strip()
                    if title == '(no title)':
                        continue

                    primary_sponsor = result.attrib['sponsor']

                    # S-prefixed ids are Senate bills
                    if id.startswith('S'):
                        bill_chamber = 'upper'
                    else:
                        bill_chamber = 'lower'

                    if chamber != bill_chamber:
                        continue

                    bill = Bill(session, chamber, id, title)
                    bill.add_source(url)
                    bill.add_sponsor('primary', primary_sponsor)

                    bill_url = ("http://open.nysenate.gov/legislation/"
                                "bill/%s" % result.attrib['id'])
                    self.scrape_bill(bill, bill_url)
                    bill.add_source(bill_url)
                    self.save_bill(bill)
    except scrapelib.HTTPError as e:
        # a 404 just means we ran past the last page of results
        if e.response.code != 404:
            raise
def parse_senate_billpage(self, bill_url, year):
    """Parse a Missouri Senate bill page (lxml-based) and save the bill."""
    with self.urlopen(bill_url) as bill_page:
        bill_page = lxml.html.fromstring(bill_page)

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
        bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor
        sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
        bill_sponsor = sponsor.text_content()
        bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        # ('in attrib' replaces the deprecated attrib.has_key(),
        # which was removed in Python 3)
        cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
        if cosponsor_tag and 'href' in cosponsor_tag[0].attrib:
            self.parse_senate_cosponsors(bill,
                                         cosponsor_tag[0].attrib['href'])

        # get the actions
        action_url = bill_page.xpath('//*[@id="hlAllActions"]')
        if action_url:
            action_url = action_url[0].attrib['href']
            self.parse_senate_actions(bill, action_url)

        # full bill text is stored on a separate page
        versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
        if versions_url and 'href' in versions_url[0].attrib:
            self.parse_senate_bill_versions(
                bill, versions_url[0].attrib['href'])

        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Wyoming bills for one chamber from the bill-reference grid."""
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    # skip the header row of the grid
    for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        # SJ/HJ prefixes are joint resolutions; everything else is a bill
        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        self.scrape_digest(bill)

        # versions live in columns 8, 11 and 12
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version(a.text, a.get('href'),
                             mimetype='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document('Summary', summary[0].get('href'))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_type, number):
    """ Creates a bill object """
    # four-digit sessions use an 'rs' suffix in URLs
    if len(session) == 4:
        session_url = session + 'rs'
    else:
        session_url = session
    url = BILL_URL % (session_url, bill_type, number)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        synopsis = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

        if 'B' in bill_type:
            _type = ['bill']
        elif 'J' in bill_type:
            _type = ['joint resolution']
        else:
            # previously fell through and crashed with a NameError below;
            # fail with a meaningful message instead
            raise ValueError('unknown bill type ' + bill_type)

        bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                    type=_type, synopsis=synopsis)
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)     # sponsors
        self.parse_bill_actions(doc, bill)      # actions
        self.parse_bill_documents(doc, bill)    # documents and versions
        self.parse_bill_votes(doc, bill)        # votes

        # subjects; drop "-see also-" cross references
        subjects = []
        for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
            subjects.append(subj.text.split('-see also-')[0])
        bill['subjects'] = subjects

        # add bill to collection
        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_url):
    """Scrape one Colorado bill page into a Bill and save it."""
    try:
        page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
    except scrapelib.HTTPError as e:
        # bill pages are intermittently unavailable; skip rather than die
        if e.response.status_code == 503:
            self.error('Skipping %s w/ 503', bill_url)
            return
        else:
            raise

    bill_number = page.xpath(
        '//div[contains(@class,"field-name-field-bill-number")]'
        '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])')
    bill_summary = bill_summary.strip()

    bill = Bill(session, chamber, bill_number, bill_title,
                summary=bill_summary)
    bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

    # each aspect of the bill is parsed off the same page
    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_votes(bill, page)
    self.scrape_amendments(bill, page)

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Washington bill via the legislature's web service.

    Returns the Bill (or None for gubernatorial appointments).
    """
    # e.g. session "2011-2012" -> biennium "2011-12"
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, biennium, bill_num))

    with self.urlopen(url) as page:
        page = lxml.etree.fromstring(page)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        # appointments are handled by a different scraper
        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(session, chamber, bill_id, title,
                    type=[bill_type])

        chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
        # use the computed biennium instead of a hard-coded "2011-12"
        # so the version URL works for any session
        version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                       "Htm/Bills/%s %ss/%s.htm" % (
                           biennium, chamber_name,
                           bill_type.title(), bill_num))
        bill.add_version(bill_id, version_url)

        # the web service has no user-facing page; link the summary page
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))
        bill.add_source(fake_source)

        self.scrape_sponsors(bill)
        self.scrape_actions(bill)
        self.scrape_votes(bill)

        return bill
def scrape(self, chamber, session):
    """Scrape Wyoming bills for one chamber from the cross-reference page."""
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billindex/"
           "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
    page = lxml.html.fromstring(self.urlopen(url))

    # skip the header row
    for tr in page.xpath("//tr[@valign='middle']")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        # SJ/HJ prefixes are joint resolutions; everything else is a bill
        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)

        self.scrape_digest(bill)

        # versions live in columns 6, 9 and 10
        for a in (tr.xpath('td[6]//a') + tr.xpath('td[9]//a') +
                  tr.xpath('td[10]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version(a.text, a.get('href'))

        # documents
        fnote = tr.xpath('td[7]//a')
        if fnote:
            bill.add_document('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[12]//a')
        if summary:
            bill.add_document('Summary', summary[0].get('href'))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_bill_info(self, session, chambers):
    """Scrape Connecticut bills from the CSV dump on the CGA FTP site."""
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    data = self.urlopen(info_url)
    page = open_csv(data)

    chamber_map = {'H': 'lower', 'S': 'upper'}

    for row in page:
        bill_id = row['bill_num']
        # first letter of the bill number encodes the chamber
        chamber = chamber_map[bill_id[0]]

        # idiomatic membership test (was: "not chamber in chambers")
        if chamber not in chambers:
            continue

        # assert that the bill data is from this session, CT is tricky
        assert row['sess_year'] == session

        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id,
                    row['bill_title'],
                    type=bill_type)
        bill.add_source(info_url)

        self.scrape_bill_page(bill)

        # introducers were collected earlier from a separate file
        for introducer in self._introducers[bill_id]:
            bill.add_sponsor('primary', introducer,
                             official_type='introducer')

        bill['subjects'] = self._subjects[bill_id]

        self.bills[bill_id] = bill
def scrape_bill_2012(self, chamber, session, bill_id, url):
    """ Creates a bill object """
    # NOTE: the previous version computed an unused session_url here;
    # the page is fetched directly from the url argument.
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd n dl
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        # previously fell through and crashed with a NameError below;
        # fail with a meaningful message instead
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(session, chamber, bill_id, title, type=_type,
                summary=summary)
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)     # sponsors
    self.parse_bill_actions(doc, bill)      # actions
    self.parse_bill_documents(doc, bill)    # documents and versions
    self.parse_bill_votes(doc, bill)        # votes

    # subjects; drop "-see also-" cross references
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill['subjects'] = subjects

    # add bill to collection
    self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape a Hawaii measure status page into a Bill and save it."""
    bill_html = self.urlopen(url)
    bill_page = lxml.html.fromstring(bill_html)
    scraped_bill_id = bill_page.xpath(
        "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
    bill_id = scraped_bill_id.split(' ')[0]
    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]

    # (removed an unused full-document "//table" query here)
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # subjects come from the semicolon-separated report title
    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(session, chamber, bill_id, title=meta['Measure Title'],
             summary=meta['Description'],
             referral=meta['Current Referral'],
             subjects=subs,
             type=bill_type)
    b.add_source(url)

    companion = meta['Companion'].strip()
    if companion:
        b['companion'] = companion

    for sponsor in meta['Introducer(s)']:
        b.add_sponsor(type='primary', name=sponsor)

    # these parsers mutate the bill in place; their return values
    # were previously bound to locals that were never used
    self.parse_bill_actions_table(b, action_table)
    self.parse_bill_versions_table(b, versions)

    self.save_bill(b)
def parse_bill(self, chamber, session, special, link):
    """Parse one Pennsylvania bill from its index link and save it."""
    bill_num = link.text.strip()
    type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

    # NOTE(review): if type_abbr matches the empty alternative, btype is
    # never bound and the Bill() call below raises NameError -- confirm
    # whether other types can appear in the index.
    if type_abbr == 'B':
        btype = ['bill']
    elif type_abbr == 'R':
        btype = ['resolution']

    bill_id = "%s%s %s" % (bill_abbr(chamber), type_abbr, bill_num)

    url = info_url(chamber, session, special, type_abbr, bill_num)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath(
            "//td[text() = 'Short Title:']/following-sibling::td")[0]
        title = title.text.strip()

        bill = Bill(session, chamber, bill_id, title, type=btype)
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        vote_count = self.parse_history(
            bill, history_url(chamber, session, special, type_abbr,
                              bill_num))

        # only fetch votes if votes were seen in history
        if vote_count:
            self.parse_votes(
                bill, vote_url(chamber, session, special, type_abbr,
                               bill_num))

        self.save_bill(bill)
def scrape_bill(self, url, kw,
                re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                re_digits=re.compile(r'\d{,5}'),
                actions_categorize=actions.categorize,
                actions_get_actor=actions.get_actor):
    """Scrape one Delaware bill page and save the resulting Bill.

    `kw` holds the Bill constructor kwargs; the regex and function
    defaults are bound once at definition time as an optimization.
    """
    bill = Bill(**kw)
    bill.add_source(url)

    #---------------------------------------------------------------------
    # A few helpers.
    _url_2_lxml = self._url_2_lxml
    _cleanup_sponsors = self._cleanup_sponsors

    # Shortcut function partial to get text at a particular xpath:
    doc = _url_2_lxml(url)
    _get_text = partial(get_text, doc, 0)

    # Get session number--needed for fetching related documents (see below).
    xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
    session_num = doc.xpath(xpath)[0].text_content()
    session_num = re_digits.match(session_num).group()

    #---------------------------------------------------------------------
    # Sponsors
    chamber = bill['chamber']

    sponsor_types = {
        'Additional Sponsor(s):': 'cosponsor',
        'CoSponsors:': 'cosponsor',
        'Primary Sponsor:': 'primary'}

    xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
    headings = doc.xpath(xpath + '/text()')
    sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()')

    # pair each sponsor heading with the names cell that follows it
    for h, s in zip(headings, sponsors):
        names = _cleanup_sponsors(s, chamber)
        type_ = sponsor_types[h.strip()]
        if names:
            for name, _chamber in names:
                bill.add_sponsor(type_, name, chamber=_chamber)

    #---------------------------------------------------------------------
    # Versions
    tmp = '/'.join([
        'http://www.legis.delaware.gov',
        'LIS/lis{session_num}.nsf/vwLegislation',
        '{moniker}/$file/(unknown){format_}?open'])

    documents = self.scrape_documents(source=url,
                                      docname="introduced",
                                      filename="Legis",
                                      tmp=tmp,
                                      session_num=session_num)
    for d in documents:
        bill.add_version(**d)

    # If bill is a substitution, add the original as a version.
    names = doc.xpath('//*[contains(text(), "Substituted '
                      'Legislation for Bill:")]/text()')
    urls = doc.xpath('//*[contains(text(), "Substituted '
                     'Legislation for Bill:")]'
                     '/following-sibling::a/@href')
    for name, url in zip(names, urls):
        name = re_substitution.match(name).group(1)
        bill.add_version(name, url, description='original bill')

    #---------------------------------------------------------------------
    # Actions
    actions = doc.xpath('//font[contains(., "Actions History")]'
                        '/../following-sibling::table/descendant::td[2]')
    actions = actions[0].text_content()
    actions = filter(None, actions.splitlines())

    # the page lists newest actions first; walk oldest-to-newest
    for a in reversed(actions):
        date, action = a.split(' - ', 1)
        # dates appear in both abbreviated and full month formats
        try:
            date = datetime.strptime(date, '%b %d, %Y')
        except ValueError:
            date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.
        actor = actions_get_actor(action, bill['chamber'])
        type_ = actions_categorize(action)
        bill.add_action(actor, action, date, type_)

    #---------------------------------------------------------------------
    # Votes
    vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

    # Sometimes vote strings are contained in weird, separate elements.
    # Probably hand edited.
    if not all(re.search('\d', string) for string in vote_strings):
        # Use the parent's text_content instead.
        vote_strings = []
        for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
            vote_strings.append(el.text_content())

    vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                          '/following-sibling::a/@href')
    for string, url in zip(vote_strings, vote_urls):
        vote_data = parse_votestring(string)
        vote = self.scrape_vote(url, **vote_data)
        if vote:
            bill.add_vote(vote)

    #---------------------------------------------------------------------
    # Amendments
    xpath = ("//font[contains(., 'Amendments')]/"
             "../../../td[2]/font/a")

    tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
           'vwLegislation/{id_}/$file/(unknown){format_}?open')

    for source, id_ in zip(doc.xpath(xpath + '/@href'),
                           doc.xpath(xpath + '/text()')):
        short_id = re_amendment.match(id_).group(1)

        documents = self.scrape_documents(
            source=source,
            docname='amendment (%s)' % short_id,
            filename='Legis',
            tmp=tmp,
            session_num=session_num,
            id_=id_)
        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Add any related "Engrossments".
    # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
    # an explanation of the engrossment process in DE.
    source = doc.xpath('//img[@alt="Engrossment"]/../@href')

    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
            '{moniker}/$file/(unknown){format_}?open'])

        documents = self.scrape_documents(
            source=source[0],
            docname="Engrossment",
            filename="Engross",
            tmp=tmp,
            session_num=session_num,
            id_=bill['bill_id'])
        for d in documents:
            bill.add_version(**d)

    # --------------------------------------------------------------------
    # Add any fiscal notes.
    source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/FiscalforLookup',
            '{docnum}/$file/(unknown){format_}?open'])

        documents = self.scrape_documents(
            source=source[0],
            docname="Fiscal Note",
            filename="Fiscal",
            tmp=tmp,
            session_num=session_num)
        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Extra fields
    # Helper to get the first td sibling of certain nodes.
    tmp = '//font[contains(., "%s")]/../../../td[2]'
    first_sibling_text = lambda heading: _get_text(tmp % heading)

    extra_fields = {
        # A long description of the legislation.
        "summary": "Synopsis",
        # Codification details for enacted legislation.
        "volume_chapter": "Volume Chapter",
        # Presumably the date of approval/veto.
        "date_governor_acted": "Date Governor Acted",
        "fiscal_notes": "Fiscal Notes",
    }

    for key, name in extra_fields.iteritems():
        try:
            bill[key] = first_sibling_text(name)
        except IndexError:
            # xpath lookup failed.
            pass

    self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files """

    #Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A' prefix means Assembly, anything else is Senate
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    #print bill_document_db[2]
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        document = rec["document"]
        # document field is a Windows path; keep the last two components
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))

        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['doctype']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['doctype'], bill_id))
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        # version doctypes become versions, everything else a document
        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes: one zip per chamber/year plus committee-vote archives
    next_year = int(year_abr) + 1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        # A*/CA* files are Assembly votes, S*/CS* are Senate votes
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"

        # C-prefixed archives hold committee votes with a different schema
        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'

        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    """Scrape one Alaska bill page plus its versions and documents."""
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//b[text()="TITLE:"]')
    if title:
        title = title[0].tail.strip().strip('"')
    else:
        self.warning("skipping bill %s, no information" % url)
        return

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)

    # Get sponsors
    spons_str = doc.xpath(
        '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
    sponsors_match = re.match(
        '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
        spons_str)
    if sponsors_match:
        # individual legislator sponsors: first is primary, rest cosponsors
        sponsors = sponsors_match.group(2).split(',')
        sponsor = sponsors[0].strip()

        if sponsor:
            bill.add_sponsor('primary', sponsors[0])

        for sponsor in sponsors[1:]:
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsor('cosponsor', sponsor)
    else:
        # Committee sponsorship
        spons_str = spons_str.strip()

        if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
            spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                               '', spons_str).title()
            spons_str = (spons_str +
                         " Committee (by request of the governor)")

        if spons_str:
            bill.add_sponsor('primary', spons_str)

    # Get actions from second myth table
    self._current_comm = None
    act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
    for row in act_rows:
        date, journal, raw_chamber, action = row.xpath('td')

        act_date = datetime.datetime.strptime(
            date.text_content().strip(), '%m/%d/%y')
        raw_chamber = raw_chamber.text_content().strip()
        action = action.text_content().strip()

        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"
        # NOTE(review): if raw_chamber is neither "(H)" nor "(S)",
        # act_chamber silently keeps its value from the previous row
        # (or raises NameError on the first row) -- confirm intended.

        # actions that look like "Something Y12" carry a linked vote
        if re.match("\w+ Y(\d+)", action):
            vote_href = journal.xpath('.//a/@href')
            if vote_href:
                self.parse_vote(bill, action, act_chamber, act_date,
                                vote_href[0])

        action, atype = self.clean_action(action)

        match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
        if match:
            # normalize prefile actions and use the embedded date
            action = 'Prefile released'
            act_date = datetime.datetime.strptime(match.group(1),
                                                  '%m/%d/%y')

        bill.add_action(act_chamber, action,
                        act_date, type=atype)

    # Get subjects
    bill['subjects'] = []
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill['subjects'].append(subj.strip())

    # Get versions
    text_list_url = "http://www.legis.state.ak.us/"\
        "basis/get_fulltext.asp?session=%s&bill=%s" % (
            session, bill_id)
    bill.add_source(text_list_url)

    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
        name = link.xpath('../preceding-sibling::td/text()')[0].strip()
        text_url = link.get('href')
        bill.add_version(name, text_url, mimetype="text/html")

    # Get documents
    doc_list_url = "http://www.legis.state.ak.us/"\
        "basis/get_documents.asp?session=%s&bill=%s" % (
            session, bill_id)
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    for href in doc_list.xpath(
            '//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib['href']
        if h_name.strip():
            bill.add_document(h_name, h_href)

    self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_id, title, url,
                strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
    """Scrape one West Virginia bill page.

    Collects subjects, versions, sponsors, house/senate votes and the
    action history, then saves the bill.  ``strip_sponsors`` is a
    default-argument binding of a compiled-regex ``sub`` used to strip
    parenthesized annotations from sponsor strings (bound once at
    definition time).
    """
    html = self.get(url).text

    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    bill_type = self.bill_types[bill_id.split()[0][1:]]

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)

    xpath = ('//strong[contains(., "SUBJECT")]/../'
             'following-sibling::td/a/text()')
    bill['subjects'] = page.xpath(xpath)

    for version in self.scrape_versions(session, chamber, page, bill_id):
        bill.add_version(**version)

    # Resolution pages have different html.
    values = {}
    trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
    for tr in trs:
        heading = tr.xpath('td/strong/text()')
        if heading:
            heading = heading[0]
        else:
            continue
        # Row value = the full row text minus its bold heading label.
        value = tr.text_content().replace(heading, '').strip()
        values[heading] = value

    # summary was always same as title
    #bill['summary'] = values['SUMMARY:']

    # Add primary sponsor.
    primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
    if primary:
        bill.add_sponsor('primary', primary)

    # Add cosponsors.
    if values.get('SPONSORS:'):
        sponsors = strip_sponsors('', values['SPONSORS:'])
        # Split on commas, except before an initial like "D." (part of
        # a single name).
        sponsors = re.split(', (?![A-Z]\.)', sponsors)
        for name in sponsors:
            name = name.strip(', \n\r')
            if name:
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search('(.+?), ([DM]\. Hall)', name)
                if match:
                    for name in match.groups():
                        bill.add_sponsor('cosponsor', name)
                else:
                    bill.add_sponsor('cosponsor', name)

    for link in page.xpath("//a[contains(@href, 'votes/house')]"):
        self.scrape_house_vote(bill, link.attrib['href'])

    # Action table rows, oldest first (page lists newest first).
    for tr in reversed(
            page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
        tds = tr.xpath('td')
        if len(tds) < 3:
            continue

        # NOTE(review): this rebinds the ``chamber`` parameter for the
        # rest of the loop body — intentional here, as the actor comes
        # from the row itself.
        chamber_letter = tds[0].text_content()
        chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

        # Index of date info no longer varies on resolutions.
        date = tds[2].text_content().strip()
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()

        action = tds[1].text_content().strip()
        if action.lower().startswith('passed senate'):
            for href in tds[1].xpath('a/@href'):
                self.scrape_senate_vote(bill, href, date)

        attrs = dict(actor=chamber, action=action, date=date)
        attrs.update(self.categorizer.categorize(action))
        bill.add_action(**attrs)

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape one Puerto Rico bill by id.

    Builds the detail-page URL from ``self.base_url``, extracts title,
    authors/co-authors and the action table (classifying actions,
    versions/documents and chamber votes), then saves the bill.

    Raises NoSuchBill when the page carries no title.
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(
            u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            bill.add_sponsor('primary', aname.strip())

        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            # First text node is the label; the second holds the names.
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor', co_author.strip())

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')

            # ignore row missing date
            if len(tds) != 2:
                continue

            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has - let's *shrug* assume it's a vote document

            #get url of action
            action_url = tds[1].xpath('a/@href')
            #check it has a url and is not just text
            if action_url:
                action_url = action_url[0]
                #check if it's a version of the bill or another type of document.
                #NOTE: not sure if new versions of the bill are only denoted with 'Entirillado' OR if that's the correct name but from what i gather it looks like it.
                if re.match('Entirillado', action):
                    bill.add_version(action, action_url)
                else:
                    bill.add_document(action, action_url)

            # Classify the action; for-else falls through to 'other'
            # when no pattern matched.
            for pattern, atype in _classifiers:
                if re.match(pattern, action):
                    break
            else:
                atype = 'other'
            bill.add_action(chamber, action, date, type=atype)

            if atype == 'bill:passed' and action_url:
                vote_chamber = None
                # NOTE(review): vote_chamber keeps the LAST tuple's
                # value when no pattern matches (for-else only warns) —
                # confirm that is intended.
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning('coudnt find voteChamber pattern')

                if vote_chamber == 'lower' and len(action_url) > 0:
                    vote = self.scrape_votes(action_url, action,
                                             date, vote_chamber)
                    if not vote[0] == None:
                        vote[0].add_source(action_url)
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s' %
                                     (vote[1], bill_id))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_bill(self, bill_url, chamber, session):
    """Scrape one Louisiana bill detail page.

    Derives the subject and title from the "Summary:" line, infers the
    bill type from the id prefix, and delegates history, authors,
    versions, documents and votes to helper scrapers.

    Returns False when the site reports the bill does not exist, True
    after a successful save.  Raises ScrapeError on an unparseable
    title or id.
    """
    with self.urlopen(bill_url) as text:
        if "Specified Bill could not be found" in text:
            return False
        page = lxml.html.fromstring(text)
        page.make_links_absolute(bill_url)

        bill_id = page.xpath("string(//h2)").split()[0]

        summary = page.xpath(
            "string(//*[starts-with(text(), 'Summary: ')])")
        summary = summary.replace('Summary: ', '')

        # "SUBJECT: title" — group 1 is the subject, group 2 the title
        # (optionally prefixed with "(Constitutional Amendment)").
        match = re.match(
            r"^([^:]+): "
            r"((\(Constitutional [aA]mendment\) )?[^(]+)",
            summary)
        if match:
            subjects = [match.group(1).strip()]
            title = match.group(2).strip()
        else:
            raise ScrapeError("Bad title")

        if bill_id.startswith('SB') or bill_id.startswith('HB'):
            bill_type = ['bill']
        elif bill_id.startswith('SR') or bill_id.startswith('HR'):
            bill_type = ['resolution']
        elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
            bill_type = ['concurrent resolution']
        else:
            raise ScrapeError("Invalid bill ID format: %s" % bill_id)

        if title.startswith("(Constitutional Amendment)"):
            bill_type.append('constitutional amendment')
            title = title.replace('(Constitutional Amendment) ', '')

        bill = Bill(session, chamber, bill_id, title,
                    subjects=subjects, type=bill_type)
        bill.add_source(bill_url)

        history_link = page.xpath("//a[text() = 'History']")[0]
        history_url = history_link.attrib['href']
        self.scrape_history(bill, history_url)

        authors_link = page.xpath("//a[text() = 'Authors']")[0]
        authors_url = authors_link.attrib['href']
        self.scrape_authors(bill, authors_url)

        # Prefer the all-versions page; fall back to the single current
        # version when it is missing (IndexError from the xpath lookup).
        try:
            versions_link = page.xpath(
                "//a[text() = 'Text - All Versions']")[0]
            versions_url = versions_link.attrib['href']
            self.scrape_versions(bill, versions_url)

            for doc in ["Notes", "Digest", "Amendments", "Misc"]:
                doc_link = page.xpath("//a[text() = '%s']" % doc)[0]
                doc_url = doc_link.attrib['href']
                self.scrape_documents(bill, doc_url)
        except IndexError:
            # Only current version
            try:
                version_link = page.xpath(
                    "//a[text() = 'Text - Current']")[0]
                version_url = version_link.attrib['href']
                bill.add_version("%s Current" % bill_id, version_url,
                                 on_duplicate="use_old")
            except IndexError:
                # Some bills don't have any versions :(
                pass

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            self.scrape_votes(bill, votes_link.attrib['href'])
        except IndexError:
            # Some bills don't have any votes
            pass

        self.save_bill(bill)

        return True
def process_bill(self, data):
    """Convert a pupa-format scraped bill dict into a billy Bill.

    ``data`` is the pupa JSON payload (actions, sources, sponsorships,
    versions, documents, titles, related bills, other identifiers).
    Chamber comes from the pseudo-id of ``from_organization``, with
    unicameral 'legislature' mapped onto 'upper'.
    """
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    if chamber == 'legislature':
        chamber = 'upper'

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        # billy has a single summary; keep the first abstract only.
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(
            action['organization_id'])['classification']
        legislators = []
        committees = []
        # Split related entities by kind for billy's action kwargs.
        for rel in action['related_entities']:
            if rel['entity_type'] == 'organization':
                committees.append(rel['name'])
            elif rel['entity_type'] == 'person':
                legislators.append(rel['name'])
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(
            sponsor['classification'],
            sponsor['name'],
        )

    # pupa versions/documents can have several links each; billy gets
    # one entry per link.
    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']),
                              **doc.get('extras', {}))

    for title in data['other_titles']:
        bill.add_title(title['title'])

    for related in data['related_bills']:
        bill.add_companion(related['identifier'],
                           related['legislative_session'],
                           chamber)

    bill['alternate_bill_ids'] = [
        oi['identifier'] for oi in data['other_identifiers']
    ]

    self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, url):
    """Scrape one Kentucky bill page.

    Finds the most-recent version link (also the anchor for locating
    the title and the action paragraph), infers bill type from the id,
    then parses the newline-separated action list into categorized
    actions.  Bills whose version link is gone are treated as
    withdrawn and skipped.
    """
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        try:
            # The site abbreviates SJR/SCR to SJ/SC in the doc path.
            short_bill_id = re.sub(r'S([JC])R', r'S\1', bill_id)
            version_link = page.xpath(
                "//a[contains(@href, '%s/bill.doc')]" % short_bill_id)[0]
        except IndexError:
            # Bill withdrawn
            return

        pars = version_link.xpath("following-sibling::p")
        if len(pars) == 2:
            title = pars[0].xpath("string()")
            action_p = pars[1]
        else:
            title = pars[0].getprevious().tail
            action_p = pars[0]

        # Collapse whitespace (including non-breaking spaces).
        title = re.sub(ur'[\s\xa0]+', ' ', title).strip()

        if 'CR' in bill_id:
            bill_type = 'concurrent resolution'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill['subjects'] = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version("Most Recent Version",
                         version_link.attrib['href'])

        for link in page.xpath("//a[contains(@href, 'legislator/')]"):
            bill.add_sponsor('primary', link.text.strip())

        for line in action_p.xpath("string()").split("\n"):
            action = line.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            # Lines look like "Mon DD-action text"; the year comes from
            # the session string.
            action_date = "%s %s" % (action.split('-')[0],
                                     session[0:4])
            action_date = datetime.datetime.strptime(
                action_date, '%b %d %Y')
            action = '-'.join(action.split('-')[1:])

            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            atype = []
            if action.startswith('introduced in'):
                atype.append('bill:introduced')
                if '; to ' in action:
                    atype.append('committee:referred')
            elif action.startswith('signed by Governor'):
                atype.append('governor:signed')
            elif re.match(r'^to [A-Z]', action):
                atype.append('committee:referred')
            elif action == 'adopted by voice vote':
                atype.append('bill:passed')

            if '1st reading' in action:
                atype.append('bill:reading:1')
            if '3rd reading' in action:
                atype.append('bill:reading:3')
            if '2nd reading' in action:
                atype.append('bill:reading:2')

            if 'R' in bill_id and 'adopted by voice vote' in action:
                atype.append('bill:passed')

            amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*'
                            r'( and \([a-z\d\-]+\))? filed')
            if re.search(amendment_re, action):
                atype.append('amendment:introduced')

            if not atype:
                atype = ['other']

            bill.add_action(actor, action, action_date, type=atype)

        try:
            votes_link = page.xpath(
                "//a[contains(@href, 'vote_history.pdf')]")[0]
            bill.add_document("Vote History",
                              votes_link.attrib['href'])
        except IndexError:
            # No votes
            pass

        self.save_bill(bill)
def scrape_bill_sheet(self, session, chamber):
    """
    Scrape the bill sheet (the page full of bills and other small bits of data)
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # Column positions of the bill-sheet table.
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    with self.urlopen(sheet_url) as sheet_html:
        sheet_page = lxml.html.fromstring(sheet_html)

        bills = sheet_page.xpath('//table/tr')

        for bill in bills:
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id == None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_loc]

            title_and_sponsor = bill[index["title_sponsor"]][0]

            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            # Whatever follows the title in the cell is the sponsor list.
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            # Map id prefix to openstates bill type.
            cats = {
                "SB": "bill",
                "HB": "bill",
                "HR": "resolution",
                "SR": "resolution",
                "SCR": "concurrent resolution",
                "HCR": "concurrent resolution",
                "SJR": "joint resolution",
                "HJR": "joint resolution",
                "SM": "memorial",
                "HM": "memorial"
            }

            bill_type = None

            for cat in cats:
                if bill_id[:len(cat)] == cat:
                    bill_type = cats[cat]

            b = Bill(session, bill_chamber, bill_id, bill_title,
                     type=bill_type)

            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            versions_url = CO_URL_BASE + versions_url
            versions = self.parse_versions(versions_url)

            for version in versions:
                b.add_version(version['name'], version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = CO_URL_BASE + \
                bill[index["history"]][0][0].attrib['href']
            # ^^^^^^^ We assume this is a full path to the target.
            # might want to consider some better rel-path support
            # XXX: Look at this ^

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            for action in history:
                self.add_action_to_bill(b, action)

            for sponsor in sponsors:
                if sponsor != None and sponsor != "(NONE)" and \
                        sponsor != "":
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab some
            # votes

            bill_vote_href = self.get_vote_url(bill_id, session)
            votes = self.parse_votes(bill_vote_href)

            # parse_votes echoes the bill id back; a mismatch means we
            # scraped the wrong vote page.
            if votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID: " + bill_id)
                assert votes['sanity-check'] == bill_id

            for vote in votes['votes']:
                filed_votes = vote['votes']
                passage = vote['meta']
                result = vote['result']

                composite_time = "%s %s" % (passage['x-parent-date'],
                                            passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")

                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']

                if hasHouse and hasSenate:
                    actor = "joint"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong.
                local_other = 0
                for voter in filed_votes:
                    l_vote = filed_votes[voter].lower().strip()
                    if l_vote != "yes" and l_vote != "no":
                        local_other = local_other + 1

                if local_other != other:
                    self.warning(
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" % (
                        other, local_other))
                    other = local_other

                v = Vote(actor, pydate, passage['MOTION'],
                         (result['FINAL_ACTION'] == "PASS"),
                         int(result['YES']), int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(vote['meta']['url'])
                # v.add_source( bill_vote_href )

                # XXX: Add more stuff to kwargs, we have a ton of data
                # NOTE(review): this inner loop rebinds ``vote`` (the
                # outer loop variable) — harmless only because nothing
                # after it reads the outer value.
                for voter in filed_votes:
                    who = voter
                    vote = filed_votes[who]
                    if vote.lower() == "yes":
                        v.yes(who)
                    elif vote.lower() == "no":
                        v.no(who)
                    else:
                        v.other(who)

                b.add_vote(v)

            self.save_bill(b)
def scrape(self, session, chambers):
    """Scrape all DC Council legislation for a session.

    Pages through the LIMS AdvancedSearch JSON endpoint, then fetches
    each bill's detail payload and translates it into a billy Bill:
    sponsors, mayor ("governor") and Congress review actions, committee
    referrals, documents and votes.  DC is unicameral, so everything is
    filed under 'upper'.
    """
    #get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  #seems like it gives me 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    # DataTables-style search request; iDisplayStart is advanced below
    # for pagination.
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true"
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false"
        }
    }
    param_json = json.dumps(params)
    response = self.post(url, headers=headers, data=param_json)
    #the response is a terrible string-of-nested-json-strings. Yuck.
    response = self.decode_json(response.json()["d"])
    data = response["aaData"]

    global bill_versions

    while len(data) > 0:
        for bill in data:
            bill_versions = []  #sometimes they're in there more than once, so we'll keep track

            bill_id = bill["Title"]
            if bill_id.startswith("AG"):
                #actually an agenda, skip
                continue
            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = self.decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith("R") or bill_id.startswith("CER"):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            #dc has no chambers. calling it all upper
            bill = Bill(session, "upper", bill_id, title, type=bill_type)

            #sponsors and cosponsors
            if "Introducer" in legislation_info:
                introducers = legislation_info["Introducer"]
                intro_date = self.date_format(
                    legislation_info["IntroductionDate"])
                bill.add_action("upper", "Introduced", intro_date,
                                type="bill:introduced")
            else:
                #sometimes there are introducers, sometimes not.
                # Set Introducers to empty array to avoid downstream breakage, but log bills without introducers
                self.logger.warning("No Introducer: {0} {1}: {2}".format(
                    bill['chamber'], bill['session'], bill['bill_id']))
                introducers = []

            try:
                #sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info["CoSponsor"]
            except KeyError:
                cosponsors = []

            for i in introducers:
                sponsor_name = i["Name"]
                #they messed up Phil Mendelson's name
                if sponsor_name == "Phil Pmendelson":
                    sponsor_name = "Phil Mendelson"
                bill.add_sponsor(name=sponsor_name, type="primary")

            for s in cosponsors:
                sponsor_name = s["Name"]
                if sponsor_name == "Phil Pmendelson":
                    sponsor_name = "Phil Mendelson"
                bill.add_sponsor(name=sponsor_name, type="cosponsor")

            #if it's become law, add the law number as an alternate title
            if "LawNumber" in legislation_info:
                law_num = legislation_info["LawNumber"]
                if law_num:
                    bill.add_title(law_num)

            #also sometimes it's got an act number
            if "ActNumber" in legislation_info:
                act_num = legislation_info["ActNumber"]
                if act_num:
                    bill.add_title(act_num)

            #sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace(
                        "previously", "").strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    bill["additional_information"] = add_info

            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(
                    legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0][
                    "Name"].strip()

                if withdrawn_by == "the Mayor":
                    bill.add_action("executive", "withdrawn",
                                    withdrawn_date, "bill:withdrawn")
                elif "committee" in withdrawn_by.lower():
                    bill.add_action("upper", "withdrawn", withdrawn_date,
                                    "bill:withdrawn",
                                    committees=withdrawn_by)
                else:
                    bill.add_action("upper", "withdrawn", withdrawn_date,
                                    "bill:withdrawn",
                                    legislators=withdrawn_by)

            #deal with actions involving the mayor
            mayor = bill_info["MayorReview"]
            if mayor != []:
                mayor = mayor[0]

                #in dc, mayor == governor because openstates schema
                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(
                        mayor["TransmittedDate"])
                    bill.add_action("executive", "transmitted to mayor",
                                    transmitted_date,
                                    type="governor:received")

                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("executive", "signed", signed_date,
                                    type="governor:signed")
                elif 'ReturnedDate' in mayor:
                    #if returned but not signed, it was vetoed
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("executive", "vetoed", veto_date,
                                    type="governor:vetoed")
                    if 'EnactedDate' in mayor:
                        #if it was returned and enacted but not signed, there was a veto override
                        override_date = self.date_format(
                            mayor["EnactedDate"])
                        bill.add_action("upper", "veto override",
                                        override_date,
                                        type="bill:veto_override:passed")

                if 'AttachmentPath' in mayor:
                    #documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if len(congress) > 0:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(
                        congress["TransmittedDate"])
                    bill.add_action(
                        "other", "Transmitted to Congress for review",
                        transmitted_date)

            #deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                # NOTE(review): this ``continue`` skips add_source and
                # save_bill for the whole bill, not just the committee
                # actions — confirm that is intended.
                self.logger.warning(
                    "Crap, we can't find anything that looks like an action date. Skipping")
                continue
            date = self.date_format(date)

            if "CommitteeReferral" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferral"]:
                    if committee["Name"].lower(
                            ) == "retained by the council":
                        committees = []
                        break
                    else:
                        committees.append(committee["Name"])
                if committees != []:
                    bill.add_action("upper", "referred to committee",
                                    date, committees=committees,
                                    type="committee:referred")

            if "CommitteeReferralComments" in legislation_info:
                committees = []
                for committee in legislation_info[
                        "CommitteeReferralComments"]:
                    committees.append(committee["Name"])
                bill.add_action("upper", "comments from committee", date,
                                committees=committees, type="other")

            #deal with random docs floating around
            docs = bill_info["OtherDocuments"]
            for d in docs:
                if "AttachmentPath" in d:
                    self.add_documents(d["AttachmentPath"], bill)
                else:
                    self.logger.warning(
                        "Document path missing from 'Other Documents'")

            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)

            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"],
                                   bill)

            #full council votes
            votes = bill_info["VotingSummary"]
            for vote in votes:
                self.process_vote(vote, bill, member_ids)

            #deal with committee votes
            if "CommitteeMarkup" in bill_info:
                committee_info = bill_info["CommitteeMarkup"]
                if len(committee_info) > 0:
                    for committee_action in committee_info:
                        self.process_committee_vote(committee_action,
                                                    bill)
                    # NOTE(review): ``committee_info`` is a list, so the
                    # membership test below can never be true for a dict
                    # key, and ``is_version`` is an undefined name here
                    # (would raise NameError if reached) — this branch
                    # looks dead/broken; confirm intent before fixing.
                    if "AttachmentPath" in committee_info:
                        self.add_documents(vote["AttachmentPath"], bill,
                                           is_version)

            bill.add_source(bill_source_url)
            self.save_bill(bill)

        #get next page
        start_record += per_page
        params["request"]["iDisplayStart"] = start_record
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        response = self.decode_json(response.json()["d"])
        data = response["aaData"]
def scrape(self, chamber, session):
    """Scrape Kansas bills for one chamber from the KSLIS JSON API.

    Fetches the full bill_status feed, filters to the requested
    chamber by bill-id prefix, and emits a Bill with titles, sponsors
    and (reversed, i.e. chronological) history actions; HTML details
    are added by scrape_html.  Requires the abiword binary for
    document conversion.
    """
    # check for abiword
    if os.system('which abiword') != 0:
        raise ScrapeError('abiword is required for KS scraping')

    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            # NOTE(review): btype stays unbound (NameError below) for an
            # id containing none of B/R — confirm ids always match.

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(session, chamber, bill_id, title, type=btype,
                        status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())
            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill['title']):
                bill.add_title(bill_data['LONGTITLE'])

            # A lone sponsor is primary; multiple sponsors are all
            # recorded as cosponsors.
            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):

                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning(
                        'unknown action code on %s: %s %s' %
                        (bill_id, event['action_code'],
                         event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError as e:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))
            self.save_bill(bill)
def scrape_details(self, bill_detail_url, session, chamber, bill_id):
    """Scrape one South Carolina bill detail page.

    Extracts the bill type, summary (used as the title), sponsors,
    versions, the action table and the vote-history link, then saves
    the bill.  Pages reporting "INVALID BILL NUMBER" are skipped with
    a warning.
    """
    page = self.urlopen(bill_detail_url)

    if 'INVALID BILL NUMBER' in page:
        self.warning('INVALID BILL %s' % bill_detail_url)
        return

    doc = lxml.html.fromstring(page)
    doc.make_links_absolute(bill_detail_url)

    bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

    bill_type = bill_div.xpath('span/text()')[0]

    if 'General Bill' in bill_type:
        bill_type = 'bill'
    elif 'Concurrent Resolution' in bill_type:
        bill_type = 'concurrent resolution'
    elif 'Joint Resolution' in bill_type:
        bill_type = 'joint resolution'
    elif 'Resolution' in bill_type:
        bill_type = 'resolution'
    else:
        raise ValueError('unknown bill type: %s' % bill_type)

    # this is fragile, but less fragile than it was
    b = bill_div.xpath('./b[text()="Summary:"]')[0]
    bill_summary = b.getnext().tail.strip()

    # The summary doubles as the bill title.
    bill = Bill(session, chamber, bill_id, bill_summary, type=bill_type)
    bill['subjects'] = list(self._subjects[bill_id])

    # sponsors
    for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
        bill.add_sponsor('sponsor', sponsor)

    # find versions
    version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
    version_html = self.urlopen(version_url)
    version_doc = lxml.html.fromstring(version_html)
    version_doc.make_links_absolute(version_url)
    for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
        # duplicate versions with same date, use first appearance
        bill.add_version(version.text, version.get('href'),
                         on_duplicate='use_old')

    # actions
    for row in bill_div.xpath('table/tr'):
        date_td, chamber_td, action_td = row.xpath('td')

        date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
        action_chamber = {'Senate': 'upper',
                          'House': 'lower',
                          None: 'other'}[chamber_td.text]

        action = action_td.text_content()
        # Strip trailing journal citations from the action text.
        action = action.split('(House Journal')[0]
        action = action.split('(Senate Journal')[0].strip()

        atype = action_type(action)
        bill.add_action(action_chamber, action, date, atype)

    # votes
    vurl = doc.xpath('//a[text()="View Vote History"]/@href')
    if vurl:
        vurl = vurl[0]
        self.scrape_vote_history(bill, vurl)

    bill.add_source(bill_detail_url)
    self.save_bill(bill)
def scrape_senate_bills(self, chamber, insert, session, year):
    """Scrape Nevada senate bills of every document type.

    ``insert`` is the session path fragment used to build URLs.  For
    each doc-type listing, follows every bill link and extracts id,
    title, text version, sponsors, committee-minutes documents,
    actions and votes.
    """
    # DoctypeID -> openstates bill type on the history-list report.
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (
            insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            with self.urlopen(page_path) as page:
                # Normalize non-breaking spaces before parsing.
                page = page.decode("utf8").replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)'
                )
                title = root.xpath(
                    'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)'
                )

                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type)
                bill_text = root.xpath(
                    "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)"
                )
                text_url = "http://www.leg.state.nv.us" + bill_text
                bill.add_version("Bill Text", text_url)

                primary, secondary = self.scrape_sponsors(page)

                # A "By:" prefix marks a single committee-style sponsor
                # whose name parts must be rejoined into one string.
                if primary and primary[0] == 'By:':
                    primary.pop(0)

                    if primary[0] == 'ElectionsProceduresEthicsand':
                        primary[0] = 'Elections Procedures Ethics and'

                    full_name = ''
                    for part_name in primary:
                        full_name = full_name + part_name + " "
                    bill.add_sponsor('primary', full_name)
                else:
                    for leg in primary:
                        bill.add_sponsor('primary', leg)
                for leg in secondary:
                    bill.add_sponsor('cosponsor', leg)

                # Committee minutes: dates live two rows below the top
                # of the table, hence the counter starting at 2.
                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[
                        1] + minutes_date[2] + " Minutes"
                    bill.add_document(minutes_date, minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                self.scrape_votes(page, bill, insert, year)
                bill.add_source(page_path)
                self.save_bill(bill)
def old_scrape(self, session):
    """Scrape Ohio bills for pre-API sessions from the legislature's
    Excel status-report spreadsheets.

    Each Excel report linked from the status-reports page yields one
    bill per row: id, title, sponsor/cosponsor, then one column per
    action whose cell holds the action date.  Votes and versions are
    fetched by the ``*_old`` helpers using an underscored bill id.
    """
    status_report_url = "http://www.legislature.ohio.gov/legislation/status-reports"

    #ssl verification off due Ohio not correctly implementing SSL
    doc = self.get(status_report_url, verify=False).text
    doc = lxml.html.fromstring(doc)
    doc.make_links_absolute(status_report_url)

    status_table = doc.xpath(
        "//div[contains(text(),'{}')]/following-sibling::table".format(
            session))[0]
    status_links = status_table.xpath(
        ".//a[contains(text(),'Excel')]/@href")

    for url in status_links:
        try:
            fname, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            # Fixed: previously referenced the undefined name
            # ``report`` here, so a missing report raised NameError
            # instead of being logged and skipped.
            self.logger.warning("Missing report {}".format(url))
            continue

        sh = xlrd.open_workbook(fname).sheet_by_index(0)

        # once workbook is open, we can remove tempfile
        os.remove(fname)

        for rownum in range(1, sh.nrows):
            bill_id = sh.cell(rownum, 0).value

            bill_type = "resolution" if "R" in bill_id else "bill"
            chamber = "lower" if "H" in bill_id else "upper"

            bill_title = str(sh.cell(rownum, 3).value)

            bill = Bill(session, chamber, bill_id, bill_title,
                        type=bill_type)
            bill.add_source(url)
            bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

            # add cosponsor
            if sh.cell(rownum, 2).value:
                bill.add_sponsor('cosponsor',
                                 str(sh.cell(rownum, 2).value))

            actor = ""

            # Actions start column after bill title
            for colnum in range(4, sh.ncols - 1):
                # Row 0 holds the action label; the bill's row holds
                # the date on which that action happened (if any).
                action = str(sh.cell(0, colnum).value)
                cell = sh.cell(rownum, colnum)
                date = cell.value

                # The acting body is inferred from the column label and
                # carries over to subsequent unlabeled columns.
                if len(action) != 0:
                    if action.split()[0] == 'House':
                        actor = "lower"
                    elif action.split()[0] == 'Senate':
                        actor = "upper"
                    elif action.split()[-1] == 'Governor':
                        actor = "executive"
                    elif action.split()[0] == 'Gov.':
                        actor = "executive"
                    elif action.split()[-1] == 'Gov.':
                        actor = "executive"

                if action in ('House Intro. Date', 'Senate Intro. Date'):
                    atype = ['bill:introduced']
                    action = action.replace('Intro. Date', 'Introduced')
                elif action == '3rd Consideration':
                    atype = ['bill:reading:3', 'bill:passed']
                elif action == 'Sent to Gov.':
                    atype = ['governor:received']
                elif action == 'Signed By Governor':
                    atype = ['governor:signed']
                else:
                    atype = ['other']

                # A populated date cell comes back from xlrd as a
                # float; only those columns become actions.
                if type(date) == float:
                    date = str(xlrd.xldate_as_tuple(date, 0))
                    date = datetime.datetime.strptime(
                        date, "(%Y, %m, %d, %H, %M, %S)")
                    bill.add_action(actor, action, date, type=atype)

            # Insert an underscore before the numeric part of the id
            # ("HB123" -> "HB_123") for the old votes/versions URLs.
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                break

            self.scrape_votes_old(bill, underscore_bill, session)
            self.scrape_versions_old(bill, underscore_bill, session)
            self.save_bill(bill)