def scrape_bill(self, chamber, bill):
    """Scrape one Delaware bill page: title, sponsors, actions and votes.

    ``bill`` is a dict with 'id', 'url', 'session' and 'chamber' keys.
    The resulting Bill is saved via self.save_bill().
    """
    # 'w/' in the raw id would break downstream use, so spell it out.
    bill_id = bill['id'].replace('w/', 'with ')

    page = lxml.html.fromstring(self.urlopen(bill['url']))
    page.make_links_absolute(bill['url'])

    title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
    # text_content() == make sure any tags in the title don't cause issues
    title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

    # now we can create a bill object
    b = Bill(bill['session'], bill['chamber'], bill_id, title)
    b.add_source(bill['url'])

    sponsors_row = page.xpath(
        '//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
    sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text
    if sponsor is not None:
        b.add_sponsor('primary', sponsor)

    # scraping these and co-sponsors, but not doing anything with them until
    # it's decided whether or not to attempt to split 'em up
    additional = sponsors_row.xpath('td[@width="48%"]/font')
    additional_sponsors = additional[0].text if len(additional) > 0 else ""
    additional_sponsors = additional_sponsors.replace('   ', '')

    cosponsors_row = page.xpath('//tr[td/b[contains(font,"CoSponsors")]]')[0]
    cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
    # the site renders an explicit placeholder when there are none
    cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

    introduced_row = page.xpath('//tr[td/b[contains(font,"Introduced On")]]')
    if len(introduced_row) > 0:
        # BUG FIX: was introduced_row[0].expath('/td[...]') -- 'expath' is
        # not an lxml method, and the leading '/' made the path absolute
        # (matching nothing); use a relative xpath() instead.
        introduced = introduced_row[0].xpath(
            'td[@width="31%"]/font')[0].text
        introduced = datetime.strptime(introduced, '%b %d, %Y')
        b.add_action(bill['chamber'], 'introduced', introduced,
                     'bill:introduced')

    actions = page.xpath(
        '//table[preceding-sibling::b[contains(font,"Actions History:")]]'
        '/tr/td[@width="79%"]/font')
    if len(actions) > 0:
        actions = actions[0].text_content().split('\n')
        for act in actions:
            # each line looks like "<date> - <action text>"
            act = act.partition(' - ')
            date = datetime.strptime(act[0], '%b %d, %Y')
            b.add_action(bill['chamber'], act[2], date)

    # save vote urls for scraping later
    vote_urls = []
    voting_reports = page.xpath(
        '//tr[td/b[contains(font, "Voting Reports")]]')
    if len(voting_reports) > 0:
        for report in voting_reports[0].xpath('td/font/a'):
            vote_urls.append(report.attrib['href'])

    # Scrape votes
    for url in vote_urls:
        vote = self.scrape_votes(chamber, title, bill_id, url)
        b.add_vote(vote)

    # Save bill
    self.save_bill(b)
def scrape(self, chamber, session):
    """Produce a single fabricated bill exercising the whole Bill/Vote
    pipeline (versions, documents, sponsors, votes and actions) for the
    requested chamber."""
    self.validate_session(session)

    # Number the demo bill for the requested chamber; the opposite
    # chamber is used for one cross-chamber action below.
    if chamber == 'upper':
        bill_id, other_chamber = 'SB 1', 'lower'
    else:
        bill_id, other_chamber = 'HB 1', 'upper'

    demo = Bill(session, chamber, bill_id, 'A super bill')
    demo.add_source('http://example.com/')
    demo.add_version('As Introduced', 'http://example.com/SB1.html')
    demo.add_document('Google', 'http://google.com')
    demo.add_sponsor('primary', 'Bob Smith')
    demo.add_sponsor('secondary', 'Johnson, Sally')

    first_day = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    second_day = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')

    passage = Vote('upper', first_day, 'Final passage', True, 2, 0, 0)
    passage.yes('Smith')
    passage.yes('Johnson')

    failure = Vote('lower', second_day, 'Final passage', False, 0, 1, 1)
    failure.no('Bob Smith')
    failure.other('S. Johnson')

    demo.add_vote(passage)
    demo.add_vote(failure)

    demo.add_action(chamber, 'introduced', first_day)
    demo.add_action(chamber, 'read first time', second_day)
    demo.add_action(other_chamber, 'introduced', second_day)

    self.save_bill(demo)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    # Scrape one Puerto Rico bill: title, sponsors, action history and
    # (for lower-chamber passage actions) votes.  Raises NoSuchBill when
    # the page has no title row.
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()'
        )
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(
            u'//td/b[contains(text(),"Autor")]/../text()')[0]
        # authors are a comma-separated list; every one is a primary sponsor
        for aname in author.split(','):
            bill.add_sponsor('primary', self.clean_name(aname).strip())
        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            # index 0 is a label text node; the names start at index 1
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor',
                                 self.clean_name(co_author).strip())
        # the last table on the page is the action history
        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has - let's *shrug* assume it's a vote document
            #get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)
            if atype == 'bill:passed' and action_url:
                vote_chamber = None
                # for/else: the warning fires only when no pattern matched
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning('coudnt find voteChamber pattern')
                if vote_chamber == 'lower' and len(action_url) > 0:
                    # scrape_votes returns a (vote, error) pair
                    vote = self.scrape_votes(action_url[0], action, date,
                                             vote_chamber)
                    if not vote[0] == None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s'
                                     % (vote[1], bill_id))
        bill.add_source(url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    # Variant of the PR bill scraper that skips pages returning an ASP
    # server error and tolerates action rows with an empty date cell.
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.urlopen(url)
    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return
    doc = lxml.html.fromstring(html)
    # search for Titulo, accent over i messes up lxml, so use 'tulo'
    title = doc.xpath(
        u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()
    bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsor('primary', aname)
    co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        # index 0 is a label text node; the names start at index 1
        for co_author in co_authors[1].split(','):
            bill.add_sponsor('cosponsor',
                             self.clean_name(co_author).strip());
    # the last table on the page is the action history
    action_table = doc.xpath('//table')[-1]
    for row in action_table[1:]:
        tds = row.xpath('td')
        # ignore row missing date
        if len(tds) != 2:
            continue
        if tds[0].text_content():
            # NOTE(review): when the date cell is empty, `date` keeps the
            # value from the previous row -- presumably deliberate
            # (continuation rows share the prior date); confirm.
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()
        #parse the text to see if it's a new version or a unrelated document
        #if has - let's *shrug* assume it's a vote document
        #get url of action
        action_url = tds[1].xpath('a/@href')
        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)
        if atype == 'bill:passed' and action_url:
            vote_chamber = None
            # for/else: the warning fires only when no pattern matched
            for pattern, vote_chamber in _voteChambers:
                if re.match(pattern, action):
                    break
            else:
                self.warning('coudnt find voteChamber pattern')
            if vote_chamber == 'lower' and len(action_url) > 0:
                # scrape_votes returns a (vote, error) pair
                vote = self.scrape_votes(action_url[0], action, date,
                                         vote_chamber)
                if not vote[0] == None:
                    vote[0].add_source(action_url[0])
                    bill.add_vote(vote[0])
                else:
                    self.warning('Problem Reading vote: %s,%s'
                                 % (vote[1], bill_id))
    bill.add_source(url)
    self.save_bill(bill)
def scrape_current(self, chamber, term):
    """Scrape current-session bills from the Kansas bill_status API.

    ``chamber`` is 'upper' or 'lower'.  Bills whose history never touches
    the requested chamber are skipped.
    """
    chamber_name = "Senate" if chamber == "upper" else "House"
    with self.urlopen(
        ksapi.url + "bill_status/"
    ) as bill_request:
        # perhaps we should save this data so we can make one request for
        # both chambers?
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:
            # filtering out other chambers
            # BUG FIX: the flag was initialized under a different name
            # (bill_equal_chamber), so bill_is_in_chamber was either
            # undefined or stale from a previous iteration.
            bill_is_in_chamber = False
            for history in bill_data["HISTORY"]:
                if history["chamber"] == chamber_name:
                    bill_is_in_chamber = True
            if not bill_is_in_chamber:
                continue

            # main
            bill = Bill(term, chamber, bill_data["BILLNO"],
                        bill_data["SHORTTITLE"])
            bill.add_source(ksapi.url + "bill_status/" +
                            bill_data["BILLNO"].lower())
            if bill_data["LONGTITLE"]:
                bill.add_title(bill_data["LONGTITLE"])
            bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
            bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

            # a lone sponsor is primary; otherwise everyone is a cosponsor
            for sponsor in bill_data["SPONSOR_NAMES"]:
                bill.add_sponsor(
                    "primary" if len(bill_data["SPONSOR_NAMES"]) == 1
                    else "cosponsor", sponsor)

            for event in bill_data["HISTORY"]:
                # BUG FIX: the original tested the stale loop variable
                # `history` and read the name lists from bill_data; the
                # committee/conferee names belong to each history event.
                if "committee_names" in event and "conferee_names" in event:
                    actor = " and ".join(event["committee_names"] +
                                         event["conferee_names"])
                elif "committee_names" in event:
                    actor = " and ".join(event["committee_names"])
                elif "conferee_names" in event:
                    actor = " and ".join(event["conferee_names"])
                else:
                    # BUG FIX: `chamber` is already 'upper'/'lower'; the
                    # old comparison to "Senate" always yielded 'lower'.
                    actor = chamber

                date = datetime.datetime.strptime(
                    event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                bill.add_action(actor, event["status"], date)

                if event["action_code"] in ksapi.voted:
                    votes = votes_re.match(event["status"])
                    if votes:
                        vote = Vote(
                            chamber,
                            date,
                            votes.group(1),
                            event["action_code"] in ksapi.passed,
                            int(votes.group(2)),
                            int(votes.group(3)),
                            0,
                        )
                        vote.add_source(ksapi.ksleg + "bill_status/" +
                                        bill_data["BILLNO"].lower())
                        bill.add_vote(vote)

            self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    # Scrape one Puerto Rico bill: title, sponsors, action history and
    # (for lower-chamber passage actions) votes.  Raises NoSuchBill when
    # the page has no title row.
    url = "%s?r=%s" % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        # authors are a comma-separated list; every one is a primary sponsor
        for aname in author.split(","):
            bill.add_sponsor("primary", self.clean_name(aname).strip())
        co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            # index 0 is a label text node; the names start at index 1
            for co_author in co_authors[1].split(","):
                bill.add_sponsor("cosponsor",
                                 self.clean_name(co_author).strip())
        # the last table on the page is the action history
        action_table = doc.xpath("//table")[-1]
        for row in action_table[1:]:
            tds = row.xpath("td")
            # ignore row missing date
            if len(tds) != 2:
                continue
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            action = tds[1].text_content().strip()
            # parse the text to see if it's a new version or a unrelated document
            # if has - let's *shrug* assume it's a vote document
            # get url of action
            action_url = tds[1].xpath("a/@href")
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)
            if atype == "bill:passed" and action_url:
                vote_chamber = None
                # for/else: the warning fires only when no pattern matched
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning("coudnt find voteChamber pattern")
                if vote_chamber == "lower" and len(action_url) > 0:
                    # scrape_votes returns a (vote, error) pair
                    vote = self.scrape_votes(action_url[0], action, date,
                                             vote_chamber)
                    if not vote[0] == None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning("Problem Reading vote: %s,%s"
                                     % (vote[1], bill_id))
        bill.add_source(url)
        self.save_bill(bill)
def scrape(self, chamber, session):
    # Scrape all Oregon (OLIS) bills for one chamber: walk the bill
    # directory, then for each bill pull versions, sponsors, title,
    # action history, votes and proposed amendments.
    self.all_bills = {}
    self.slug = self.metadata['session_details'][session]['slug']

    page = self.lxmlize(self.bill_directory_url.format(self.slug.upper()))
    page.make_links_absolute(self.base_url)

    ulid = 'senateBills' if chamber == 'upper' else 'houseBills'  # id of <ul>
    header = page.xpath("//ul[@id='{0}_search']".format(ulid))[0]
    # Every ul with a data-load-action and an id
    bill_list_pages = header.xpath(".//ul[boolean(@data-load-action)"
                                   " and boolean(@id)]/@data-load-action")

    # collect every anchor from every paginated bill list
    bill_anchors = []
    for bill_list_url in bill_list_pages:
        bill_list_page = self.lxmlize('{}{}'.format(self.base_url,
                                                    bill_list_url))
        bill_list_page.make_links_absolute(self.base_url)
        bill_anchors.extend(bill_list_page.xpath('//a') or [])

    ws = re.compile(r"\s+")

    def _clean_ws(txt):
        """Remove extra whitespace from text."""
        return ws.sub(' ', txt).strip()

    for a in bill_anchors:
        bid = ws.sub('', a.text_content())  # bill id
        bill_summary = _clean_ws(a.get('title'))
        # bill title is added below
        bill = Bill(session, chamber, bid, title='', summary=bill_summary)

        page = self.lxmlize(a.get('href'))
        versions = page.xpath('//ul[@class="dropdown-menu"]/li/span/' +
                              'a[contains(@title, "Get the Pdf")]/@href')

        # build a label -> cell-element map from the overview table
        measure_info = {}
        info = page.xpath("//table[@id='measureOverviewTable']/tr")
        for row in info:
            key, value = row.xpath("./*")
            key = key.text.replace(':', '').strip()
            measure_info[key] = value

        for sponsor in measure_info['Chief Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='primary',
                                 name=sponsor.text_content())

        for sponsor in measure_info['Regular Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='cosponsor',
                                 name=sponsor.text_content())

        title = _clean_ws(measure_info['Bill Title'].text_content())
        # some bill titles need to be added manually
        if self.slug == "2013R1" and bid == "HB2010":
            title = ("Relating to Water Resources Department contested"
                     "case proceedings.")
        bill['title'] = title

        for version in versions:
            name = version.split("/")[-1]
            bill.add_version(name=name, url=version,
                             mimetype='application/pdf')

        history_url = self.create_url('Measures/Overview/GetHistory/{bill}',
                                      bid)
        history = self.lxmlize(history_url).xpath("//table/tr")
        for entry in history:
            # each history row is "<when (chamber)>" + "<action text>"
            wwhere, action = [_clean_ws(x.text_content())
                              for x in entry.xpath("*")]
            # strip trailing "Ayes/Nays, ..." tallies off the action text
            vote_cleaning_re = r'(.*?)((Ayes)|(Nays),\s.*)'
            if re.match(vote_cleaning_re, action):
                action = re.search(vote_cleaning_re, action).groups()[0]
            wwhere = re.match(
                r"(?P<when>.*) \((?P<where>.*)\)", wwhere).groupdict()
            action_chamber = {"S": "upper", "H": "lower"}[wwhere['where']]
            # dates are month-day only; prepend the session's year
            when = "%s-%s" % (self.slug[:4], wwhere['when'])
            when = dt.datetime.strptime(when, "%Y-%m-%d")

            # classify the action; fall back to 'other'
            types = []
            for expr, types_ in self.action_classifiers:
                m = re.match(expr, action)
                if m:
                    types += types_
            if types == []:
                types = ['other']

            # actor, action, date, type, committees, legislators
            bill.add_action(action_chamber, action, when, type=types)

            # Parse and store Vote information
            vote_id = entry.xpath('./td/a[contains(@href, "otes-")]/@href')
            if not vote_id:
                continue
            elif "#measureVotes-" in vote_id[0]:
                # floor vote
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                           "{0}/Measures/MeasureVotes?id={1}". \
                           format(self.slug, vote_id)
            else:
                # committee vote (majority report)
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                           "{0}/CommitteeReports/MajorityReport/{1}". \
                           format(self.slug, vote_id)

            votes = self._get_votes(vote_url)
            if not any(len(x) for x in votes.values()):
                self.warning("The votes webpage was empty for " +
                             "action {0} on bill {1}.".format(action, bid))
                continue

            # simple majority of yes over yes+no
            passed = (
                float(len(votes["yes_votes"])) /
                (len(votes["yes_votes"]) + len(votes["no_votes"]))
                > 0.5
            )

            vote = Vote(
                chamber=chamber,
                date=when,
                motion=action,
                passed=passed,
                yes_count=len(votes["yes_votes"]),
                no_count=len(votes["no_votes"]),
                other_count=len(votes["other_votes"]),
                session=session,
                bill_id=bid,
                bill_chamber=action_chamber
            )
            vote.update(votes)
            bill_url = "https://olis.leg.state.or.us/liz/" + \
                       "{0}/Measures/Overview/{1}".format(self.slug, bid)
            vote.add_source(bill_url)
            bill.add_vote(vote)

        amendments_url = self.create_url(
            'Measures/ProposedAmendments/{bill}', bid)
        amendments = self.lxmlize(amendments_url).xpath(
            "//div[@id='amendments']/table//tr")

        for amendment in amendments:
            nodes = amendment.xpath("./td")
            if nodes == []:
                continue
            pdf_href, date, committee, adopted, when = nodes
            pdf_href, = pdf_href.xpath("./a")
            pdf_link = pdf_href.attrib['href']
            name = "Ammendment %s" % (pdf_href.text_content())
            adopted = adopted.text
            bill.add_document(name=name, url=pdf_link, adopted=adopted,
                              mimetype='application/pdf')

        bill.add_source(a.get('href'))
        self.save_bill(bill)
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary and assembly
    floor votes from the assembly page.
    '''
    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        # scraper: owning scraper (for url2lxml / categorizer)
        # doc: already-parsed lxml document for the bill page
        # bill_id_parts: (letter, number, version) tuple
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        # per-instance cache for the summary/actions <pre> chunks
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        # set True only when _build ran all the way through
        self.succeeded = False
        self._build()

    def _build(self):
        # Populate self.bill from the page; bail early if the page has no
        # <pre> payload at all.
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        # Fetch (and cache) the two <pre> text blobs -- summary and
        # actions -- from the combined Summary=Y&Actions=Y page.
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y') % self.bill_id
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre/text()')
            self.data['summary'], self.data['actions'] = summary, actions
            return summary, actions
        else:
            return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        # Every fetched page is also recorded as a bill source.
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        # Record the printable-bill page as the bill's version.
        url = ('http://assembly.state.ny.us/leg/?sh=printbill&bn='
               + self.bill_id)
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        # Companion ("SAME AS") bills are listed in the summary blob.
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS ', '')
                if companions != 'No same as':
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        # strip "Same as "/"Uni" prefixes and any
                        # "-<suffix>" amendment marker
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        # Only Assembly (lower) bills have a sponsor's memorandum page.
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=&Memo=Y') % self.bill_id
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        # The last blank-line-separated chunk of the summary blob is the
        # bill summary text.
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        # Strip titles/annotations from a legislator name.
        junk = [
            r'^Rules\s+',
            '\(2nd Vice Chairperson\)',
            '\(MS\)',
            'Assemblyman',
            'Assemblywoman',
            'Senator']
        for rgx in junk:
            # NOTE(review): re.sub's 4th positional arg is *count*, not
            # flags -- re.I (== 2) caps replacements at 2 and does NOT
            # make the match case-insensitive; confirm intent.
            name = re.sub(rgx, '', name, re.I)
        return name.strip('(), ')

    def get_sponsors(self):
        # SPONSOR / COSPNSR / MLTSPNSR lines in the summary blob carry
        # comma-separated sponsor names.
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(' ', 1)
                    for sponsor in re.split(r',\s+', data.strip()):
                        if not sponsor:
                            continue
                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith('Rules'):
                            self.bill.add_sponsor('primary',
                                                  'Rules Committee',
                                                  chamber='lower')
                        sponsor = self._scrub_name(sponsor)
                        # Figure out sponsor type.
                        spons_swap = {'SPONSOR': 'primary'}
                        _sponsor_type = spons_swap.get(
                            sponsor_type, 'cosponsor')
                        self.bill.add_sponsor(
                            _sponsor_type, sponsor.strip(),
                            official_type=sponsor_type)

    def get_actions(self):
        # Parse "<mm/dd/yyyy> <action>" lines out of the actions blob;
        # all-caps actions are Senate (upper) actions.
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date,
                                 type=types, **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        # Scrape Assembly floor votes from the Votes=Y page; each <table>
        # is one roll call.
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=&Votes=Y')
        doc = self.url2lxml(url % self.bill_id)
        if doc is None:
            return
        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return
        actual_vote = collections.defaultdict(list)
        for table in doc.xpath('//table'):
            # caption labels carry the date and the "YEA/NAY" totals;
            # .next() on itersiblings() is Python 2 iterator protocol
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = date[0].itersiblings().next().text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath(
                'caption/span/label[contains(., "YEA/NAY:")]')
            votes = votes[0].itersiblings().next().text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # cells alternate: name, vote value, name, vote value, ...
            tds = table.xpath('tr/td/text()')
            votes = iter(tds)
            while True:
                try:
                    data = list(islice(votes, 2))
                    name, vote_val = data
                except (StopIteration, ValueError):
                    # End of data. Stop.
                    break
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
def _scrape_bill(self, session, bill_data):
    """Build and return a Bill from an Open Legislation API payload.

    ``bill_data`` is the decoded JSON for one bill.  Sponsors, a
    companion bill, actions, votes, versions and sources are attached.
    """
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        session, bill_chamber, bill_id, title,
        type=bill_type, summary=bill_data['summary'])

    # Fall back to the summary when the API supplies no title.
    if bill_data['title'] is None:
        bill['title'] = bill_data['summary']

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor']['rules'] == True:
        bill.add_sponsor('primary', 'Rules Committee',
                         chamber=bill_chamber)
    elif not bill_data['sponsor']['budget']:
        primary_sponsor = bill_data['sponsor']['member']
        bill.add_sponsor('primary', primary_sponsor['shortName'])

    # There *shouldn't* be cosponsors if there is no sponsor.
    cosponsors = bill_active_version['coSponsors']['items']
    for cosponsor in cosponsors:
        bill.add_sponsor('cosponsor', cosponsor['shortName'])

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether "sameAs" property is populated with at least one bill.
    # BUG FIX: was same_as['items'], which raises KeyError whenever the
    # 'sameAs' key is absent and the {} default above kicks in.
    if same_as.get('items'):
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Determine companion bill chamber.
        companion_bill_prefix = self._parse_bill_number(
            same_as['items'][0]['basePrintNo'])[0]
        companion_bill_chamber = self._parse_bill_prefix(
            companion_bill_prefix)[0]

        # Attach companion bill data.
        bill.add_companion(
            companion_bill_id,
            companion_bill_session,
            companion_bill_chamber,
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }
    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(
            action['date'], '%Y-%m-%d')
        action_date = action_datetime.date()
        types, attrs = NYBillScraper.categorizer.categorize(action['text'])
        bill.add_action(chamber, action['text'], action_date,
                        type=types, **attrs)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            vote = self._parse_senate_votes(vote_data)
            bill.add_vote(vote)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()
        assembly_bill_data = assembly.bill

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.iteritems():
        version = amendment['printNo']

        # BUG FIX: html_url/pdf_url were computed but never used -- the
        # version *name* was passed as the url argument of add_version.
        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        bill.add_version(html_version, html_url, mimetype='text/html')

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        bill.add_version(pdf_version, pdf_url, mimetype='application/pdf')

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    bill.add_source(self.api_client.root + self.api_client.
                    resources['bill'].format(
                        session_year=session,
                        bill_id=bill_id,
                        summary='',
                        detail=''))
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    return bill
def bill_info(self, bill_link, session, main_url, bill_page):
    # Scrape one Nebraska bill page: title, sponsor, actions (with
    # embedded vote tallies), versions, related documents, amendments
    # and transcripts.  Returns None if the page has no title header.
    bill_page = lxml.html.fromstring(bill_page)

    #basic info
    try:
        long_title = bill_page.xpath(
            '//div[@id="content_text"]/h2')[0].text.split()
    except IndexError:
        return None
    # first token is the bill id; the rest (from index 2) is the title
    bill_id = long_title[0]
    title = ''
    for x in range(2, len(long_title)):
        title += long_title[x] + ' '
    title = title[0:-1]

    #bill_type
    bill_type = 'resolution' if 'LR' in bill_id else 'bill'

    bill = Bill(session, 'upper', bill_id, title, type = bill_type)

    #sources
    bill.add_source(main_url)
    bill.add_source(bill_link)

    #Sponsor
    introduced_by = bill_page.xpath(
        '//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
    bill.add_sponsor('primary', introduced_by)

    #actions
    for actions in bill_page.xpath(
            '//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
        date = actions[0].text
        # skip the header row (its first cell reads "Date")
        if 'Date' not in date:
            date = datetime.strptime(date, '%b %d, %Y')
            action = actions[1].text

            # a trailing "Y-N-A" tally in the action text means a vote
            if '-' in action:
                vote_info = action.split()[-1].split('-')
                yes_count = int(vote_info[0])
                no_count = int(vote_info[1])
                abstention_count = int(vote_info[2])
                passed = True if ( yes_count > no_count) else False
                vote = Vote('upper', date, action, passed,
                            yes_count, no_count, abstention_count)
                vote.add_source(bill_link)
                bill.add_vote(vote)

            if 'Governor' in action:
                actor = 'Governor'
            elif 'Speaker' in action:
                actor = 'Speaker'
            else:
                actor = 'upper'

            action_type = self.action_types(action)
            bill.add_action(actor, action, date, action_type)

    #versions
    for versions in bill_page.xpath(
            '//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
        version_url = versions.attrib['href']
        # links are site-relative ("../..."); rebuild as absolute
        version_url = ('http://nebraskalegislature.gov/' +
                       version_url[3:len(version_url)])
        version_name = versions.text
        bill.add_version(version_name, version_url)

    #documents
    #additional_info
    for additional_info in bill_page.xpath(
            '//div[@id="content_text"]/div[2]/table/tr[2]/td/a'):
        document_name = additional_info.text
        document_url = additional_info.attrib['href']
        document_url = ('http://nebraskalegislature.gov/' +
                        document_url[3:len(document_url)])
        # only PDFs here are documents; other links were versions above
        if '.pdf' in document_url:
            bill.add_document(document_name, document_url)

    #amendments
    for admendments in bill_page.xpath(
            '//div[@id="content_text"]/div[3]/table/tr[1]/td[2]'
            '/table/tr/td/a'):
        admendment_name = admendments.text
        admendment_url = admendments.attrib['href']
        admendment_url = ('http://nebraskalegislature.gov/' +
                          admendment_url[3:len(admendment_url)])
        bill.add_document(admendment_name, admendment_url)

    #related transcripts
    for transcripts in bill_page.xpath(
            '//div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
        transcript_name = transcripts.text
        transcript_url = transcripts.attrib['href']
        bill.add_document(transcript_name, transcript_url)

    self.save_bill(bill)
def scrape_bills(self, session, year_abr):
    # Scrape all New Jersey bills for one session from the legislature's
    # Access-database CSV dumps: main bill info, sponsors, documents,
    # zipped vote files (FTP), actions and subjects; saves every
    # non-phony bill at the end.

    #Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        # 'A...' prefixes are Assembly bills; everything else is Senate
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec['IdenticalBillNumber'].strip():
            bill.add_companion(rec['IdenticalBillNumber'].split()[0])

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')

    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_csv = self.access_to_csv('BillWP')

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        # the DB stores Windows paths; keep only the last two components
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))

        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        if rec['DocType'] in self._version_types:
            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            # NOTE(review): a url ending in neither HTM nor wpd leaves
            # `mimetype` stale from a previous row (or unbound on the
            # first) -- confirm inputs can't hit this case.
            bill.add_version(doc_name, htm_url, mimetype=mimetype)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                # 'U' == universal-newlines mode (Python 2)
                vote_file = zipedfile.open(vfile, 'U')
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            # NOTE(review): votes is re-initialized per vfile, but the
            # tally loop below runs once per zip -- only the last
            # vfile's votes appear to be tallied; confirm intent.
            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            # 'C...' dumps hold committee votes, others floor votes
            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:

                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'],
                                        rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                # counts are filled in after all rows are read
                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, None,
                                          None, None, None,
                                          bill_id=bill_id)
                if vote_file_type == 'committee':
                    votes[vote_id]['committee'] = self._committees[
                        rec['Committee_House']]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count

            # Veto override.
            if vote['motion'] == 'OVERRIDE':
                # Per the NJ leg's glossary, a veto override requires
                # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                vote['passed'] = False
                if vote['chamber'] == 'lower':
                    if vote_yes_count >= 54:
                        vote['passed'] = True
                elif vote['chamber'] == 'upper':
                    if vote_yes_count >= 27:
                        vote['passed'] = True

            # Regular vote.
            elif vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        # a bill with no actions and no versions is a database artifact
        if not bill['actions'] and not bill['versions']:
            self.warning('probable phony bill detected %s',
                         bill['bill_id'])
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_bill(bill)

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.

    :param chamber: 'upper' or 'lower'; also the initial actor for actions.
    :param session: session name; its first 4 chars are used as the year
        when parsing action dates.
    :param bill_id: e.g. 'H 123'; spaces are stripped to build the URL.
    :param short_title: optional alternate title added to the bill when it
        differs from the scraped long title.
    """
    url = BILL_URL % (session, bill_id.replace(' ', ''))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
    # page layout: table 0 = sponsors, table 1 = title, table 2 = actions
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

    if short_title and bill['title'].lower() != short_title.lower():
        bill.add_title(short_title)

    # documents
    doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get('href')
        # engrossments and bill text are versions; everything else is a document
        if 'Engrossment' in name or 'Bill Text' in name:
            bill.add_version(name, href, mimetype='application/pdf')
        else:
            bill.add_document(name, href)

    def _split(string):
        # NOTE(review): '[,|AND]' is a character class (comma, pipe, A, N, D),
        # not the alternation it appears to intend — confirm against real
        # sponsor strings before changing.
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split('by')
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if 'COMMITTEE' in sponsors.upper():
                bill.add_sponsor('primary', sponsors.strip())
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsor('primary', person)

    actor = chamber
    last_date = None
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]

        # a blank date cell means "same day as the previous action"
        if date:
            last_date = date
        else:
            date = last_date
        date = datetime.datetime.strptime(date + '/' + session[0:4],
                                          "%m/%d/%Y")
        if action.startswith('House'):
            actor = 'lower'
        elif action.startswith('Senate'):
            actor = 'upper'

        # votes
        if 'AYES' in action or 'NAYS' in action:
            vote = self.parse_vote(actor, date, row[2])
            vote.add_source(url)
            bill.add_vote(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
        action = action.replace(u'\xa0', ' ').strip()
        atype = get_action(actor, action)
        bill.add_action(actor, action, date, type=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if 'to House' in action:
            actor = 'lower'
        elif 'to Senate' in action:
            actor = 'upper'
    self.save_bill(bill)
def scrape_bills(self, session, year_abr):
    """
    Scrape NJ bills for one session from the legislature's Access-database
    CSV dumps: main bill info, sponsors, documents/versions, votes (from
    zipped CSVs on the FTP site), actions, and subjects. Bills are collected
    in ``bill_dict`` and saved at the end.

    :param session: session identifier (stringified for the Bill).
    :param year_abr: first year of the two-year session, used to build URLs
        and vote-file names.
    """
    #Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        # 'A*' prefixes are Assembly (lower); everything else Senate (upper)
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec['IdenticalBillNumber'].strip():
            bill.add_companion(rec['IdenticalBillNumber'].split()[0])

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        # 'P' = primary sponsor; anything else is treated as a cosponsor
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_csv = self.access_to_csv('BillWP')

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        # the CSV stores a Windows path; keep only the last two components
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))

        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            # NOTE(review): if htm_url ends with neither 'HTM' nor 'wpd',
            # mimetype is never assigned and add_version raises NameError
            # (or reuses the previous record's value) — confirm the data
            # can't produce other extensions.
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            except ValueError:
                self.warning("Couldn't find a document for bill {}".format(bill_id))
                pass
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr)+1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = zipedfile.open(vfile, 'U')
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            # file name prefix encodes the chamber ('CA'/'A' = Assembly)
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            # 'C*' files are committee votes with a different column layout
            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]
                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]
                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, None, None,
                                          None, None, bill_id=bill_id)
                if vote_file_type == 'committee':
                    votes[vote_id]['committee'] = self._committees[
                        rec['Committee_House']]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        #Counts yes/no/other votes and saves overall vote
        # NOTE(review): 'votes' is re-initialized per inner vote file, so
        # only the last file's votes reach this loop — confirm this is the
        # intended behavior for the 'End' files.
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count

            # Veto override.
            if vote['motion'] == 'OVERRIDE':
                # Per the NJ leg's glossary, a veto override requires
                # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                vote['passed'] = False
                if vote['chamber'] == 'lower':
                    if vote_yes_count >= 54:
                        vote['passed'] = True
                elif vote['chamber'] == 'upper':
                    if vote_yes_count >= 27:
                        vote['passed'] = True

            # Regular vote.
            elif vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        # a bill with no actions and no versions is assumed to be a
        # database artifact, not a real bill, and is not saved
        if not bill['actions'] and not bill['versions']:
            self.warning('probable phony bill detected %s',
                         bill['bill_id'])
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_bill(bill)

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
def scrape(self, chamber, term):
    """
    Scrape Nebraska bills for a term from the legislature's search-by-date
    index page, following each bill link to collect basic info, sponsor,
    actions (with inline vote tallies), versions, documents, amendments and
    transcripts.

    :param chamber: chamber being scraped; used as the default action actor.
    :param term: term key; mapped to a calendar year via the 'years' dict.
    """
    # only the 102nd legislature (2011) is mapped at the moment
    years = {'102': 2011}
    main_url = 'http://nebraskalegislature.gov/bills/search_by_date.php?SessionDay=%s' % (
        years[term])
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)

        for docs in page.xpath(
                '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="cal_content_full"]/table[@id="bill_results"]/tr/td[1]/a'):
            bill_link = docs.attrib['href']
            bill_link = 'http://nebraskalegislature.gov/' + bill_link

            with self.urlopen(bill_link) as bill_page:
                bill_page = lxml.html.fromstring(bill_page)

                #basic info
                # h2 text is like 'LB123 - <title words...>'; word 0 is the
                # bill id, word 1 the dash, the rest the title
                long_title = bill_page.xpath(
                    '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/h2')[0].text.split()
                bill_id = long_title[0]
                title = ''
                for x in range(2, len(long_title)):
                    title += long_title[x] + ' '
                title = title[0:-1]

                #bill = Bill(term, chamber, bill_id, title)

                #bill_type
                if 'LR' in bill_id:
                    bill_type = 'resolution'
                else:
                    bill_type = 'bill'

                bill = Bill(term, chamber, bill_id, title, type=bill_type)

                #sources
                bill.add_source(main_url)
                bill.add_source(bill_link)

                #Sponsor
                introduced_by = bill_page.xpath(
                    '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
                bill.add_sponsor('primary', introduced_by)

                #actions
                for actions in bill_page.xpath(
                        '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
                    date = actions[0].text
                    # skip the header row, whose first cell contains 'Date'
                    if 'Date' not in date:
                        date = datetime.strptime(date, '%b %d, %Y')
                        action = actions[1].text

                        # an action ending in 'Y-N-A' tallies is a vote
                        if '-' in action:
                            vote_info = action.split()[-1].split('-')
                            yes_count = int(vote_info[0])
                            no_count = int(vote_info[1])
                            abstention_count = int(vote_info[2])
                            if yes_count > no_count:
                                passed = True
                            else:
                                passed = False
                            vote = Vote(chamber, date, action, passed,
                                        yes_count, no_count,
                                        abstention_count)
                            vote.add_source(bill_link)
                            bill.add_vote(vote)

                        if 'Governor' in action:
                            actor = 'Governor'
                        elif 'Speaker' in action:
                            actor = 'Speaker'
                        else:
                            actor = chamber

                        action_type = self.action_types(action)
                        bill.add_action(actor, action, date, action_type)

                #versions
                for versions in bill_page.xpath(
                        '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
                    version_url = versions.attrib['href']
                    # hrefs are relative with a leading '../'; drop it
                    version_url = 'http://nebraskalegislature.gov/' + version_url[
                        3:len(version_url)]
                    version_name = versions.text
                    bill.add_version(version_name, version_url)

                #documents
                #additional_info
                for additional_info in bill_page.xpath(
                        '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[2]/table/tr[2]/td/a'):
                    document_name = additional_info.text
                    document_url = additional_info.attrib['href']
                    document_url = 'http://nebraskalegislature.gov/' + document_url[
                        3:len(document_url)]
                    # only keep PDF links; other anchors are navigation
                    if '.pdf' in document_url:
                        bill.add_document(document_name, document_url)

                #amendments
                for admendments in bill_page.xpath(
                        '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
                    admendment_name = admendments.text
                    admendment_url = admendments.attrib['href']
                    admendment_url = 'http://nebraskalegislature.gov/' + admendment_url[
                        3:len(admendment_url)]
                    bill.add_document(admendment_name, admendment_url)

                #related transcripts
                for transcripts in bill_page.xpath(
                        '/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
                    transcript_name = transcripts.text
                    transcript_url = transcripts.attrib['href']
                    bill.add_document(transcript_name, transcript_url)

                self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """
    Scrape a single Puerto Rico bill: title, sponsors, actions, and vote
    counts parsed from action text of the form 'Motion, YY-NN-OO-AA'.

    :param chamber: originating chamber; default chamber for votes.
    :param session: session name for the Bill.
    :param bill_id: bill identifier appended to the base URL's ?r= query.
    :param bill_type: Open States bill type string.
    :raises NoSuchBill: when the page has no title row.
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.get(url).text
    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return
    doc = lxml.html.fromstring(html)
    # search for Titulo, accent over i messes up lxml, so use 'tulo'
    title = doc.xpath(
        u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()
    bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsor('primary', aname)
    co_authors = doc.xpath(
        u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        for co_author in co_authors[1].split(','):
            bill.add_sponsor('cosponsor',
                             self.clean_name(co_author).strip())

    # the last table on the page is the action history
    action_table = doc.xpath('//table')[-1]
    for row in action_table[1:]:
        tds = row.xpath('td')

        # ignore row missing date
        if len(tds) != 2:
            continue

        # NOTE(review): when the first cell is empty, 'date' keeps the
        # previous row's value; if the very first row has no date this
        # raises NameError — confirm the pages always date the first row.
        if tds[0].text_content():
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()
        #parse the text to see if it's a new version or a unrelated document
        #if has a hyphen let's assume it's a vote document

        #get url of action
        action_url = tds[1].xpath('a/@href')
        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)

        # Some lower-house roll calls could be parsed, but finnicky
        # Most roll lists are just images embedded within a document,
        # and offer no alt text to scrape
        # Instead, just scrape the vote counts
        vote_info = re.search(
            r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$',
            action)
        if vote_info and re.search(r'\d{1,2}', action):
            vote_name = vote_info.group(1)

            # figure out which chamber the vote happened in from the motion
            if u"Votación Final" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^\w+ por (.*?) en (.*)$', vote_name).groups()
                if "Senado" in vote_chamber:
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'

            elif "Cuerpo de Origen" in vote_name:
                vote_name = re.search(
                    r'(?u)^Cuerpo de Origen (.*)$', vote_name).group(1)
                vote_chamber = chamber

            elif u"informe de Comisión de Conferencia" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                    vote_name).groups()
                if vote_chamber == "Senado":
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'

            elif u"Se reconsideró" in vote_name:
                # reconsideration happens in the same chamber as the
                # previous vote, when there is one
                if bill['votes']:
                    vote_chamber = bill['votes'][-1]['chamber']
                else:
                    vote_chamber = chamber

            else:
                raise AssertionError(
                    u"Unknown vote text found: {}".format(vote_name))

            vote_name = vote_name.title()

            yes = int(vote_info.group(2))
            no = int(vote_info.group(3))
            other = 0
            # groups 4 and 5 (abstentions/absences) may be blank
            if vote_info.group(4).strip():
                other += int(vote_info.group(4))
            if vote_info.group(5).strip():
                other += int(vote_info.group(5))

            vote = Vote(chamber=vote_chamber,
                        date=date,
                        motion=vote_name,
                        passed=(yes > no),
                        yes_count=yes,
                        no_count=no,
                        other_count=other)
            vote.add_source(url)
            bill.add_vote(vote)

    bill.add_source(url)
    self.save_bill(bill)
class SenateBillPage(object):
    '''Used for categories, senate votes, events.

    Wraps one open.nysenate.gov bill page: builds a Bill from the parsed
    document, collecting subjects, sponsor memo, senate floor/committee
    votes, and version links. ``succeeded`` is True once ``_build``
    completes.
    '''
    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        # bill_id_parts is a (letter, number, version) triple
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False
        self._build()
        self.bill.add_source(self.url)

    def _build(self):
        # run all of the extraction steps; any exception leaves
        # self.succeeded False
        self.get_senate_votes()
        self.get_sponsors_memo()
        self.get_subjects()
        self.get_versions()
        self.succeeded = True

    def url2lxml(self, url):
        # every fetched page is also recorded as a source on the bill
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_subjects(self):
        # law-section links double as subject labels
        subjects = []
        for link in self.doc.xpath("//a[contains(@href, 'lawsection')]"):
            subjects.append(link.text.strip())
        self.bill['subjects'] = subjects

    def get_sponsors_memo(self):
        # senate bills carry the sponsor memo on this same page
        if self.chamber == 'upper':
            self.bill.add_document("Sponsor's Memorandum", self.url)

    def get_senate_votes(self):
        """Parse floor and committee vote blocks from the page."""
        # floor votes: header is like 'VOTE: FLOOR VOTE: - Jan 1, 2011'
        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
            date = b.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0

            actual_vote = collections.defaultdict(list)
            # vtype tracks which bucket the following <a> names belong to
            vtype = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count = int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count = int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif (text.startswith('Excused') or
                          text.startswith('Abstain') or
                          text.startswith('Absent')
                          ):
                        vtype = 'other'
                        other_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        # keep the raw text too, for 'actual_vote' below
                        other_votes.append((name, tag.text))

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name, vote_val in other_votes:
                vote.other(name)
                actual_vote[vote_val].append(name)
            vote['actual_vote'] = actual_vote
            vote.add_source(self.url)
            self.bill.add_vote(vote)

        # committee votes: header is 'VOTE: COMMITTEE VOTE: <ctty> - <date>'
        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"):
            _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
            date = date.strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0

            vtype = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif (text.startswith('Excused') or
                          text.startswith('Abstain') or
                          text.startswith('Absent')
                          ):
                        vtype = 'other'
                        other_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append(name)

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, '%s Committee Vote' % committee,
                        passed, yes_count, no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name in other_votes:
                vote.other(name)

            vote.add_source(self.url)
            self.bill.add_vote(vote)

    def get_versions(self):
        # the last element mentioning 'Versions' lists ids like 'S1234-2011'
        text = self.doc.xpath('//*[contains(., "Versions")]')[-1].text_content()
        version_text = re.sub('Versions:?\s*', '', text)
        url_tmpl = 'http://open.nysenate.gov/legislation/bill/'
        for version_bill_id in re.findall('\S+', version_text):
            # NOTE(review): 2-tuple unpack assumes exactly one '-' in the
            # id (e.g. 'S1234-2011') — confirm against real version lists.
            version_bill_id_noyear, _ = version_bill_id.rsplit('-')
            version_url = url_tmpl + version_bill_id
            self.bill.add_version(version_bill_id_noyear, version_url,
                                  mimetype='text/html')
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary
    and assembly floor votes from the assembly page.

    Wraps one assembly.state.ny.us bill page: builds a Bill from the parsed
    document. ``succeeded`` is True once ``_build`` completes (it bails out
    early when the page has no <pre> content).
    '''
    metadata = metadata('ny')

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.session = session
        self.term = term_for_session('ny', session)
        # find the term record containing this session to get its start year
        for data in self.metadata['terms']:
            if session in data['sessions']:
                self.termdata = data
                self.term_start_year = data['start_year']

        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        # bill_id_parts is a (letter, number, version) triple
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False
        self._build()

    def _build(self):
        # no <pre> content means the page is empty/invalid; leave
        # succeeded False
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        """Fetch and cache the summary and actions <pre> blocks."""
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y&term=%s')
            url = url % (self.bill_id, self.term_start_year)
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre')[:2]
            summary = summary.text_content()
            actions = actions.text_content()
            self.data['summary'] = summary
            self.data['actions'] = actions
            return summary, actions
        else:
            return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        # every fetched page is also recorded as a source on the bill
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=%s&term=%s'
        url = url % (self.bill_id, self.term_start_year)
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        # companions are listed in a 'SAME AS ...' chunk of the summary
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS ', '')
                if companions != 'No same as':
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        # strip boilerplate prefixes and version suffix
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        # assembly bills have the sponsor memo on a separate page
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=%s&Memo=Y')
            url = url % (self.bill_id, self.term_start_year)
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        # the last double-newline-separated chunk is the summary text
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        """Strip titles and annotations from a legislator name."""
        junk = [
            r'^Rules\s+',
            '\(2nd Vice Chairperson\)',
            '\(MS\)',
            'Assemblyman',
            'Assemblywoman',
            'Senator']
        for rgx in junk:
            # BUGFIX: re.I was being passed positionally as the 'count'
            # argument of re.sub, so matching was case-sensitive and capped
            # at 2 substitutions; it must be passed as flags.
            name = re.sub(rgx, '', name, flags=re.I)

        # Collabpse whitespace.
        name = re.sub('\s+', ' ', name)
        return name.strip('(), ')

    def get_sponsors(self):
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(' ', 1)
                    for sponsor in re.split(r',\s+', data.strip()):
                        if not sponsor:
                            continue

                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith('Rules'):
                            self.bill.add_sponsor('primary',
                                                  'Rules Committee',
                                                  chamber='lower')

                        sponsor = self._scrub_name(sponsor)

                        # Figure out sponsor type.
                        spons_swap = {'SPONSOR': 'primary'}
                        _sponsor_type = spons_swap.get(
                            sponsor_type, 'cosponsor')
                        self.bill.add_sponsor(
                            _sponsor_type, sponsor.strip(),
                            official_type=sponsor_type)

    def get_actions(self):
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            # senate actions are printed in all caps, assembly in mixed case
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types,
                                 **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=%s&Votes=Y')
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return
        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return
        actual_vote = collections.defaultdict(list)
        for table in doc.xpath('//table'):
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = date[0].itersiblings().next().text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = votes[0].itersiblings().next().text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # cells alternate name, vote-value; consume them in pairs
            tds = table.xpath('tr/td/text()')
            votes = iter(tds)
            while True:
                try:
                    data = list(islice(votes, 2))
                    name, vote_val = data
                except (StopIteration, ValueError):
                    # End of data. Stop.
                    break
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
def scrape_bill_sheet(self, session, chamber): sheet_url = self.get_bill_folder(session, chamber) bill_chamber = {"Senate": "upper", "House": "lower"}[chamber] index = { "id": 0, "title_sponsor": 1, "version": 2, "history": 3, "votes": 7 } with self.urlopen(sheet_url) as sheet_html: sheet_page = lxml.html.fromstring(sheet_html) bills = sheet_page.xpath('//table/tr') for bill in bills: bill_id = self.read_td(bill[index["id"]][0]) if bill_id == None: # Every other entry is null for some reason continue bill_id = bill_id[:bill_id.find(".")] title_and_sponsor = bill[index["title_sponsor"]][0] bill_title = title_and_sponsor.text bill_title_and_sponsor = title_and_sponsor.text_content() sponsors = bill_title_and_sponsor.replace(bill_title, "").\ replace(" & ...", "").split("--") bill_history_href = CO_URL_BASE + \ bill[index["history"]][0][0].attrib['href'] # ^^^^^^^ We assume this is a full path to the target. # might want to consider some better rel-path support # XXX: Look at this ^ history = self.parse_history(bill_history_href) b = Bill(session, bill_chamber, bill_id, bill_title) for action in history: self.add_action_to_bill(b, action) for sponsor in sponsors: b.add_sponsor("primary", sponsor) # Now that we have history, let's see if we can't grab some # votes bill_vote_href = self.get_vote_url(bill_id, session) votes = self.parse_votes(bill_vote_href) if votes['sanity-check'] != bill_id: print "XXX: READ ME!" 
print " -> Scraped ID: " + votes['sanity-check'] print " -> 'Real' ID: " + bill_id assert votes['sanity-check'] == bill_id for vote in votes['votes']: print vote filed_votes = vote['votes'] passage = vote['meta'] result = vote['result'] composite_time = "%s %s" % (passage['x-parent-date'], passage['TIME']) # It's now like: 04/01/2011 02:10:14 PM pydate = dt.datetime.strptime(composite_time, "%m/%d/%Y %I:%M:%S %p") hasHouse = "House" in passage['x-parent-ctty'] hasSenate = "Senate" in passage['x-parent-ctty'] if hasHouse and hasSenate: actor = "legislature" elif hasHouse: actor = "lower" else: actor = "upper" v = Vote(actor, pydate, passage['MOTION'], (result['FINAL_ACTION'] == "YES"), int(result['YES']), int(result['NO']), int(result['EXC'] + result['ABS']), moved=passage['MOVED'], seconded=passage['SECONDED']) # XXX: Add more stuff to kwargs, we have a ton of data for voter in filed_votes: who = voter vote = filed_votes[who] if vote.lower() == "yes": v.yes(who) elif vote.lower() == "no": v.no(who) else: v.other(who) v.add_source(bill_vote_href) b.add_vote(v) self.save_bill(b)
def scrape(self, session, chambers):
    """
    Scrape Georgia bills for a session from the legislature's SOAP
    services (``self.lservice`` / ``self.vservice``), collecting votes,
    actions (mapped through ``action_code_map``), sponsors and versions.

    :param session: session key; looked up in metadata for its GUID.
    :param chambers: unused here; kept for the scraper interface.
    """
    # first letter of a DocumentType / action code -> Open States chamber
    bill_type_map = {
        'B': 'bill',
        'R': 'resolution',
        'JR': 'joint resolution',
        'CR': 'concurrent resolution',
    }

    chamber_map = {
        'H': 'lower',
        'S': 'upper',
        'J': 'joint',
        'E': 'other',  # Effective date
    }

    # raw GA action codes -> Open States action types
    action_code_map = {
        'HI': ['other'],
        'SI': ['other'],
        'HH': ['other'],
        'SH': ['other'],
        'HPF': ['bill:introduced'],
        'HDSAS': ['other'],
        'SPF': ['bill:introduced'],
        'HSR': ['bill:reading:2'],
        'SSR': ['bill:reading:2'],
        'HFR': ['bill:reading:1'],
        'SFR': ['bill:reading:1'],
        'HRECM': ['bill:withdrawn', 'committee:referred'],
        'SRECM': ['bill:withdrawn', 'committee:referred'],
        'SW&C': ['bill:withdrawn', 'committee:referred'],
        'HW&C': ['bill:withdrawn', 'committee:referred'],
        'HRA': ['bill:passed'],
        'SRA': ['bill:passed'],
        'HPA': ['bill:passed'],
        'HRECO': ['other'],
        'SPA': ['bill:passed'],
        'HTABL': ['other'],  # 'House Tabled' - what is this?
        'SDHAS': ['other'],
        'HCFR': ['committee:passed:favorable'],
        'SCFR': ['committee:passed:favorable'],
        'HRAR': ['committee:referred'],
        'SRAR': ['committee:referred'],
        'STR': ['bill:reading:3'],
        'SAHAS': ['other'],
        'SE': ['bill:passed'],
        'SR': ['committee:referred'],
        'HTRL': ['bill:reading:3', 'bill:failed'],
        'HTR': ['bill:reading:3'],
        'S3RLT': ['bill:reading:3', 'bill:failed'],
        'HASAS': ['other'],
        'S3RPP': ['other'],
        'STAB': ['other'],
        'SRECO': ['other'],
        'SAPPT': ['other'],
        'HCA': ['other'],
        'HNOM': ['other'],
        'HTT': ['other'],
        'STT': ['other'],
        'SRECP': ['other'],
        'SCRA': ['other'],
        'SNOM': ['other'],
        'S2R': ['bill:reading:2'],
        'H2R': ['bill:reading:2'],
        'SENG': ['bill:passed'],
        'HENG': ['bill:passed'],
        'HPOST': ['other'],
        'HCAP': ['other'],
        'SDSG': ['governor:signed'],
        'SSG': ['governor:received'],
        'Signed Gov': ['governor:signed'],
        'HDSG': ['governor:signed'],
        'HSG': ['governor:received'],
        'EFF': ['other'],
        'HRP': ['other'],
        'STH': ['other'],
        'HTS': ['other'],
    }

    sid = self.metadata['session_details'][session]['_guid']
    legislation = backoff(
        self.lservice.GetLegislationForSession,
        sid
    )['LegislationIndex']

    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument['StatusHistory'][0]]

        # status history arrives newest-first; reverse to chronological
        actions = reversed([{
            'code': x['Code'],
            'action': x['Description'],
            '_guid': x['Id'],
            'date': x['Date']
        } for x in history])

        guid = instrument['Id']

        # A little bit hacky.
        bill_prefix = instrument['DocumentType']
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]

        bill_id = '%s %s' % (
            bill_prefix,
            instrument['Number'],
        )
        if instrument['Suffix']:
            bill_id += instrument['Suffix']

        title = instrument['Caption']
        description = instrument['Summary']

        if title is None:
            continue

        bill = Bill(
            session, bill_chamber, bill_id, title,
            type=bill_type, description=description,
            _guid=guid)

        if instrument['Votes']:
            for vote_ in instrument['Votes']:
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                vote = Vote(
                    {'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
                    vote_['Date'],
                    vote_['Caption'] or 'Vote on Bill',
                    (vote_['Yeas'] > vote_['Nays']),
                    vote_['Yeas'],
                    vote_['Nays'],
                    (vote_['Excused'] + vote_['NotVoting']),
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=bill_chamber)
                vote.add_source(self.vsource)

                methods = {'Yea': vote.yes, 'Nay': vote.no,}

                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    try:
                        m = methods[how]
                    except KeyError:
                        m = vote.other
                    m(whom['Name'])

                bill.add_vote(vote)

        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    'House': 'lower',
                    'Senate': 'upper',
                }[committee['Type']]].append(committee['Name'])

        for action in actions:
            action_chamber = chamber_map[action['code'][0]]

            try:
                action_types = action_code_map[action['code']]
            except KeyError:
                error_msg = ('Code {code} for action {action} not '
                             'recognized.'.format(
                                 code=action['code'],
                                 action=action['action']))
                self.logger.warning(error_msg)
                action_types = ['other']

            committees = []
            if any(('committee' in x for x in action_types)):
                committees = [str(x) for x in ccommittees.get(
                    action_chamber, [])]

            bill.add_action(action_chamber, action['action'],
                            action['date'], action_types,
                            committees=committees,
                            _code=action['code'],
                            _code_id=action['_guid'])

        sponsors = []
        if instrument['Authors']:
            sponsors = instrument['Authors']['Sponsorship']
            if 'Sponsors' in instrument and instrument['Sponsors']:
                sponsors += instrument['Sponsors']['Sponsorship']

        sponsors = [
            (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
        ]

        for typ, sponsor in sponsors:
            name = '{First} {Last}'.format(**dict(sponsor['Name']))
            # BUGFIX: the sponsor type string was misspelled 'seconday'.
            bill.add_sponsor(
                'primary' if 'Author' in typ else 'secondary',
                name
            )

        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in [
                    'Description',
                    'Url',
                    'Id',
                    'Version'
                ]
            ]
            bill.add_version(
                name, url, mimetype='application/pdf',
                _internal_document_id=doc_id,
                _version_id=version_id)

        versions = sorted(
            bill['versions'],
            key=lambda x: x['_internal_document_id']
        )
        bill['versions'] = versions

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(**{
            'session': session,
            'bid': guid,
        }))

        self.save_bill(bill)
def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
    """bills from 2008 and below are in a 'pre' element and is simpler to
    parse them as text

    Parses title, sponsors, actions and roll-call votes line-by-line from
    the <pre> text, carrying vote state on ``self`` (``self.vote``,
    ``self.in_vote``, ``self.flag``, ``self.last_date``) between lines.
    ``short_title`` is accepted for interface compatibility but unused here.
    """
    url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
        session, bill_id.replace(' ', ''))
    with self.urlopen(url) as bill_page:
        html = lxml.html.fromstring(bill_page)
        # the whole bill history is plain text inside the first <pre>
        text = html.xpath('//pre')[0].text.split('\r\n')
        # title: keep only the ALL-CAPS dash-separated pieces of line 2
        title = " - ".join(
            [x.strip() for x in text[1].split('-') if x.isupper()])
        # bill type
        bill_type = get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        # sponsors: everything after the final 'by' on the first line
        sponsors = text[0].split('by')[-1]
        for sponsor in sponsors.split(','):
            bill.add_sponsor('primary', sponsor)
        actor = chamber
        self.flag()  # clear last bills vote flags
        self.vote = None
        for line in text:
            # lines starting MM/DD are actions; year is taken from session
            if re.match(r'^\d\d/\d\d', line):
                date = date = datetime.datetime.strptime(
                    line[0:5] + '/' + session[0:4], "%m/%d/%Y")
                # remember the date for continuation (non-dated) lines
                self.last_date = date
                action_text = line[5:].strip()
                # actor: chamber named explicitly at the start of the action
                if action_text.lower().startswith('house') or \
                        action_text.lower().startswith('senate'):
                    actor = {'H': 'lower', 'S': 'upper'}[action_text[0]]
                action = get_action(actor, action_text)
                bill.add_action(actor, action_text, date, type=action)
                # a passage/failure action with an N-N-N tally opens a vote
                if "bill:passed" in action or "bill:failed" in action:
                    passed = False if 'FAILED' in action_text else True
                    votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                    if votes:
                        yes, no, other = votes.groups()
                        self.in_vote = True
                        self.vote = Vote(chamber, date, action_text, passed,
                                         int(yes), int(no), int(other))
            else:
                date = self.last_date
                # nothing to do if its not a vote
                if "Floor Sponsor" in line:
                    # 'Floor Sponsor' marks the end of a roll-call block
                    self.in_vote = False
                    if self.vote:
                        bill.add_vote(self.vote)
                        self.vote = None
                if not self.in_vote:
                    continue
                # flip which list (ayes/nays/other) subsequent names go into
                if 'AYES --' in line:
                    self.flag(ayes=True)
                elif 'NAYS --' in line:
                    self.flag(nays=True)
                elif 'Absent and excused' in line:
                    self.flag(other=True)
                if self.ayes:
                    for name in line.replace('AYES --', '').split(','):
                        name = name.strip()
                        if name:
                            self.vote.yes(name)
                if self.nays:
                    for name in line.replace('NAYS --', '').split(','):
                        name = name.strip()
                        if name:
                            self.vote.no(name)
                if self.other:
                    for name in line.replace(
                            'Absent and excused --', '').split(','):
                        name = name.strip()
                        if name:
                            self.vote.other(name)
        self.save_bill(bill)
def scrape_bills(self, chamber_to_scrape, session):
    """Scrape Mississippi bills for one chamber of a session.

    Reads the all-measures XML index, then each bill's detail XML for
    title, sponsors, versions, actions and votes; saves each bill.
    """
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session
    with self.urlopen(url) as bill_dir_page:
        # the XML is parsed leniently through lxml's HTML parser
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
        for mr in root.xpath('//lastaction/msrgroup'):
            bill_id = mr.xpath('string(measure)').replace(" ", "")
            # first letter of the measure id encodes the chamber
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"
            # second letter encodes the measure type
            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]
            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue
            link = mr.xpath('string(actionlink)').replace("..", "")
            main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
            with self.urlopen(bill_details_url) as details_page:
                # pages are latin-1; re-encode as utf-8, dropping unmappables
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                title = details_root.xpath('string(//shorttitle)')
                longtitle = details_root.xpath('string(//longtitle)')
                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type, longtitle=longtitle)
                # sponsors
                main_sponsor = details_root.xpath('string(//p_name)').split()
                if main_sponsor:
                    main_sponsor = main_sponsor[0]
                    main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                    main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                    type = "primary"
                    bill.add_sponsor(type, main_sponsor,
                                     main_sponsor_url=main_sponsor_url)
                for author in details_root.xpath('//authors/additional'):
                    leg = author.xpath('string(co_name)').replace(" ", "_")
                    if leg:
                        leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                        type = "cosponsor"
                        bill.add_sponsor(type, leg, leg_url=leg_url)
                # versions: relative paths in the XML become site-root URLs
                curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version("Current version", curr_version_url)
                intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version("As Introduced", intro_version_url)
                comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                # a path containing "documents" signals that the version exists
                if comm_version.find("documents") != -1:
                    comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                    bill.add_version("Committee Substitute", comm_version_url)
                passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                if passed_version.find("documents") != -1:
                    passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                    title = "As Passed the " + chamber
                    bill.add_version(title, passed_version_url)
                asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                if asg_version.find("documents") != -1:
                    asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                    bill.add_version("Approved by the Governor", asg_version_url)
                # avoid duplicate votes
                seen_votes = set()
                # actions
                for action in details_root.xpath('//history/action'):
                    action_num = action.xpath('string(act_number)').strip()
                    action_num = int(action_num)
                    act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                    action_desc = action.xpath('string(act_desc)')
                    # first token is MM/DD; the year comes from the session
                    date, action_desc = action_desc.split(" ", 1)
                    date = date + "/" + session[0:4]
                    date = datetime.strptime(date, "%m/%d/%Y")
                    # (H)/(S) prefix names the acting chamber
                    if action_desc.startswith("(H)"):
                        actor = "lower"
                        action = action_desc[4:]
                    elif action_desc.startswith("(S)"):
                        actor = "upper"
                        action = action_desc[4:]
                    else:
                        actor = "executive"
                        action = action_desc
                    if action.find("Veto") != -1:
                        version_path = details_root.xpath("string(//veto_other)")
                        version_path = version_path.replace("../../../../", "")
                        version_url = "http://billstatus.ls.state.ms.us/" + version_path
                        bill.add_document("Veto", version_url)
                    # classify the action via prefix lookup table
                    atype = 'other'
                    for prefix, prefix_type in self._action_types:
                        if action.startswith(prefix):
                            atype = prefix_type
                            break
                    bill.add_action(actor, action, date,
                                    type=atype, action_num=action_num)
                    # use committee names as scraped subjects
                    subjects = details_root.xpath('//h_name/text()')
                    subjects += details_root.xpath('//s_name/text()')
                    bill['subjects'] = subjects
                    if act_vote:
                        vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                        if vote_url not in seen_votes:
                            seen_votes.add(vote_url)
                            vote = self.scrape_votes(vote_url, action, date, actor)
                            vote.add_source(vote_url)
                            bill.add_vote(vote)
                bill.add_source(bill_details_url)
                self.save_bill(bill)
def bill_info(self, bill_link, session, chamber, main_url, bill_page):
    """Parse one Nebraska bill page (already-fetched HTML string) and save it.

    Returns None (early, without saving) when the page has no <h2> header.
    Extracts title, sponsor, actions (with embedded Y-N-A vote tallies),
    versions, related documents, amendments and transcripts.
    """
    bill_page = lxml.html.fromstring(bill_page)
    # basic info
    try:
        long_title = bill_page.xpath('//div[@id="content_text"]/h2')[0].text.split()
    except IndexError:
        return None
    # header format: "<bill_id> <separator> <title words...>"; word at
    # index 1 is skipped
    bill_id = long_title[0]
    title = ''
    for x in range(2, len(long_title)):
        title += long_title[x] + ' '
    title = title[0:-1]
    # bill_type
    bill_type = 'resolution' if 'LR' in bill_id else 'bill'
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    # sources
    bill.add_source(main_url)
    bill.add_source(bill_link)
    # sponsor
    introduced_by = bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
    bill.add_sponsor('primary', introduced_by)
    # actions
    for actions in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
        date = actions[0].text
        # skip the header row (its first cell reads 'Date')
        if 'Date' not in date:
            date = datetime.strptime(date, '%b %d, %Y')
            action = actions[1].text
            # an action ending in "Y-N-A" carries a vote tally
            if '-' in action:
                vote_info = action.split()[-1].split('-')
                yes_count = int(vote_info[0])
                no_count = int(vote_info[1])
                abstention_count = int(vote_info[2])
                passed = True if (yes_count > no_count) else False
                vote = Vote(chamber, date, action, passed,
                            yes_count, no_count, abstention_count)
                vote.add_source(bill_link)
                bill.add_vote(vote)
            if 'Governor' in action:
                actor = 'Governor'
            elif 'Speaker' in action:
                actor = 'Speaker'
            else:
                actor = chamber
            action_type = self.action_types(action)
            bill.add_action(actor, action, date, action_type)
    # versions
    for versions in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
        version_url = versions.attrib['href']
        # hrefs are relative ('../...'); strip the leading '../'
        version_url = 'http://nebraskalegislature.gov/' + version_url[3:len(version_url)]
        version_name = versions.text
        bill.add_version(version_name, version_url)
    # documents / additional_info
    for additional_info in bill_page.xpath('//div[@id="content_text"]/div[2]/table/tr[2]/td/a'):
        document_name = additional_info.text
        document_url = additional_info.attrib['href']
        document_url = 'http://nebraskalegislature.gov/' + document_url[3:len(document_url)]
        # only PDFs are kept as documents
        if '.pdf' in document_url:
            bill.add_document(document_name, document_url)
    # amendments
    for admendments in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
        admendment_name = admendments.text
        admendment_url = admendments.attrib['href']
        admendment_url = 'http://nebraskalegislature.gov/' + admendment_url[3:len(admendment_url)]
        bill.add_document(admendment_name, admendment_url)
    # related transcripts
    for transcripts in bill_page.xpath('//div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
        transcript_name = transcripts.text
        transcript_url = transcripts.attrib['href']
        bill.add_document(transcript_name, transcript_url)
    self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape Georgia legislation for a session via the SOAP services.

    Fetches the session's legislation index, then each instrument's
    detail for votes, actions, committees, sponsors and versions, and
    saves each bill.

    Fixes vs. previous revision:
    - sponsor type typo 'seconday' corrected to 'secondary'
    - constant lookup tables (action-code map, chamber maps) hoisted out
      of the per-bill loop instead of being rebuilt every iteration
    """
    # action-code -> action type(s) classification table (loop-invariant)
    types = {
        "HI": ["other"],
        "SI": ["other"],
        "HH": ["other"],
        "SH": ["other"],
        "HPF": ["bill:introduced"],
        "HDSAS": ["other"],
        "SPF": ["bill:introduced"],
        "HSR": ["bill:reading:2"],
        "SSR": ["bill:reading:2"],
        "HFR": ["bill:reading:1"],
        "SFR": ["bill:reading:1"],
        "HRECM": ["bill:withdrawn", "committee:referred"],
        "SRECM": ["bill:withdrawn", "committee:referred"],
        "SW&C": ["bill:withdrawn", "committee:referred"],
        "HW&C": ["bill:withdrawn", "committee:referred"],
        "HRA": ["bill:passed"],
        "SRA": ["bill:passed"],
        "HPA": ["bill:passed"],
        "HRECO": ["other"],
        "SPA": ["bill:passed"],
        "HTABL": ["other"],  # "House Tabled" - what is this?
        "SDHAS": ["other"],
        "HCFR": ["committee:passed:favorable"],
        "SCFR": ["committee:passed:favorable"],
        "HRAR": ["committee:referred"],
        "SRAR": ["committee:referred"],
        "STR": ["bill:reading:3"],
        "SAHAS": ["other"],
        "SE": ["bill:passed"],
        "SR": ["committee:referred"],
        "HTRL": ["bill:reading:3", "bill:failed"],
        "HTR": ["bill:reading:3"],
        "S3RLT": ["bill:reading:3", "bill:failed"],
        "HASAS": ["other"],
        "S3RPP": ["other"],
        "STAB": ["other"],
        "SRECO": ["other"],
        "SAPPT": ["other"],
        "HCA": ["other"],
        "HNOM": ["other"],
        "HTT": ["other"],
        "STT": ["other"],
        "SRECP": ["other"],
        "SCRA": ["other"],
        "SNOM": ["other"],
        "S2R": ["bill:reading:2"],
        "H2R": ["bill:reading:2"],
        "SENG": ["bill:passed"],
        "HENG": ["bill:passed"],
        "HPOST": ["other"],
        "HCAP": ["other"],
        "SDSG": ["governor:signed"],
        "SSG": ["governor:received"],
        "Signed Gov": ["governor:signed"],
        "HDSG": ["governor:signed"],
        "HSG": ["governor:received"],
        "EFF": ["other"],
        "HRP": ["other"],
        "STH": ["other"],
        "HTS": ["other"],
    }
    # chamber-letter and branch-name maps (loop-invariant)
    doc_chamber_map = {"H": "lower", "S": "upper", "J": "joint"}
    action_chamber_map = {"H": "lower", "S": "upper", "E": "other"}  # E = Effective Date
    branch_map = {"House": "lower", "Senate": "upper"}

    sid = self.metadata['session_details'][session]['_guid']
    legislation = backoff(
        self.lservice.GetLegislationForSession, sid)['LegislationIndex']
    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument['StatusHistory'][0]]
        actions = [{"code": x['Code'],
                    "action": x['Description'],
                    "_guid": x['Id'],
                    "date": x['Date']} for x in history]
        guid = instrument['Id']
        bill_type = instrument['DocumentType']
        chamber = doc_chamber_map[bill_type[0]]
        # XXX: This is a bit of a hack: the display id is doc type + number.
        bill_id = "%s %s" % (bill_type, instrument['Number'])
        if instrument['Suffix']:
            bill_id += instrument['Suffix']
        title = instrument['Caption']
        description = instrument['Summary']
        # untitled instruments can't be saved; skip them
        if title is None:
            continue
        bill = Bill(session, chamber, bill_id, title,
                    description=description, _guid=guid)
        # votes
        if instrument['Votes']:
            for vote_ in instrument['Votes']:
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])
                vote = Vote(branch_map[vote_['Branch']],
                            vote_['Date'],
                            vote_['Caption'] or "Vote on Bill",
                            (vote_['Yeas'] > vote_['Nays']),
                            vote_['Yeas'],
                            vote_['Nays'],
                            (vote_['Excused'] + vote_['NotVoting']),
                            session=session,
                            bill_id=bill_id,
                            bill_chamber=chamber)
                vote.add_source(self.vsource)
                # anything that isn't an explicit Yea/Nay counts as 'other'
                methods = {"Yea": vote.yes, "Nay": vote.no}
                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    try:
                        m = methods[how]
                    except KeyError:
                        m = vote.other
                    m(whom['Name'])
                bill.add_vote(vote)
        # committees, grouped by chamber, for tagging committee actions
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[
                    branch_map[committee['Type']]].append(committee['Name'])
        # actions
        for action in actions:
            chamber = action_chamber_map[action['code'][0]]
            try:
                _types = types[action['code']]
            except KeyError:
                self.debug(action)
                _types = ["other"]
            committees = []
            if any(('committee' in x for x in _types)):
                committees = [str(x) for x in ccommittees.get(chamber, [])]
            bill.add_action(chamber, action['action'], action['date'],
                            _types,
                            committees=committees,
                            _code=action['code'],
                            _code_id=action['_guid'])
        # sponsors: Authors are primary, Sponsors are secondary
        sponsors = []
        if instrument['Authors']:
            sponsors = instrument['Authors']['Sponsorship']
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors += instrument['Sponsors']['Sponsorship']
        sponsors = [(x['Type'], self.get_member(x['MemberId']))
                    for x in sponsors]
        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor['Name']))
            # BUGFIX: was misspelled 'seconday'
            bill.add_sponsor('primary' if 'Author' in typ else 'secondary',
                             name)
        # versions
        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in ['Description', 'Url', 'Id', 'Version']
            ]
            bill.add_version(name, url,
                             mimetype='application/pdf',
                             _internal_document_id=doc_id,
                             _version_id=version_id)
        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(**{
            "session": session,
            "bid": guid,
        }))
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape a single Puerto Rico bill by id.

    Raises NoSuchBill when the page has no title. Parses sponsors, the
    action table (classifying each action, collecting attached documents
    or 'Entirillado' versions), and lower-chamber vote documents.

    Fixes vs. previous revision: identity comparisons use ``is None``
    instead of ``== None``; stray semicolon removed; warning message
    spelling corrected.
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        bill.add_sponsor('primary', author.strip())
        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            # co-authors are a comma-separated list in the second text node
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor', co_author.strip())
        # the last table on the page is the action history
        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue
            date = datetime.datetime.strptime(
                tds[0].text_content(), "%m/%d/%Y")
            action = tds[1].text_content().strip()
            # get url of action; check it has a url and is not just text
            action_url = tds[1].xpath('a/@href')
            if action_url:
                action_url = action_url[0]
                # check if it's a version of the bill or another document.
                # NOTE: not sure if new versions of the bill are only denoted
                # with 'Entirillado' OR if that's the correct name but from
                # what i gather it looks like it.
                if re.match('Entirillado', action):
                    bill.add_version(action, action_url)
                else:
                    bill.add_document(action, action_url)
            # classify the action; fall back to 'other' if nothing matches
            for pattern, atype in _classifiers:
                if re.match(pattern, action):
                    break
            else:
                atype = 'other'
            bill.add_action(chamber, action, date, type=atype)
            # passage actions with an attached document may carry a vote
            if atype == 'bill:passed' and action_url:
                vote_chamber = None
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning("couldn't find voteChamber pattern")
                # only lower-chamber vote documents are parsed
                if vote_chamber == 'lower' and len(action_url) > 0:
                    vote = self.scrape_votes(action_url, action, date,
                                             vote_chamber)
                    if vote[0] is not None:
                        vote[0].add_source(action_url)
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s' %
                                     (vote[1], bill_id))
        bill.add_source(url)
        self.save_bill(bill)
def scrape_bills(self, chamber_to_scrape, session):
    """Scrape Mississippi bills for one chamber of a session.

    Reads the all-measures XML index, then each bill's detail XML for
    title, sponsors, versions, actions and votes; saves each bill.
    NOTE(review): this is a near-duplicate of another scrape_bills in this
    file (which additionally sets bill['subjects']); candidates for merging.
    """
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session
    with self.urlopen(url) as bill_dir_page:
        # the XML is parsed leniently through lxml's HTML parser
        root = lxml.etree.fromstring(bill_dir_page, lxml.etree.HTMLParser())
        for mr in root.xpath('//lastaction/msrgroup'):
            bill_id = mr.xpath('string(measure)').replace(" ", "")
            # first letter of the measure id encodes the chamber
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"
            # second letter encodes the measure type
            bill_type = {'B': 'bill', 'C': 'concurrent resolution',
                         'R': 'resolution', 'N': 'nomination'}[bill_id[1]]
            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue
            link = mr.xpath('string(actionlink)').replace("..", "")
            main_doc = mr.xpath('string(measurelink)').replace("../../../", "")
            main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
            bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
            with self.urlopen(bill_details_url) as details_page:
                # pages are latin-1; re-encode as utf-8, dropping unmappables
                details_page = details_page.decode('latin1').encode('utf8', 'ignore')
                details_root = lxml.etree.fromstring(details_page, lxml.etree.HTMLParser())
                title = details_root.xpath('string(//shorttitle)')
                longtitle = details_root.xpath('string(//longtitle)')
                bill = Bill(session, chamber, bill_id, title,
                            type=bill_type, longtitle=longtitle)
                # sponsors
                main_sponsor = details_root.xpath('string(//p_name)').split()
                if main_sponsor:
                    main_sponsor = main_sponsor[0]
                    main_sponsor_link = details_root.xpath('string(//p_link)').replace(" ", "_")
                    main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
                    type = "primary"
                    bill.add_sponsor(type, main_sponsor,
                                     main_sponsor_url=main_sponsor_url)
                for author in details_root.xpath('//authors/additional'):
                    leg = author.xpath('string(co_name)').replace(" ", "_")
                    if leg:
                        leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                        type = "cosponsor"
                        bill.add_sponsor(type, leg, leg_url=leg_url)
                # versions: relative paths in the XML become site-root URLs
                curr_version = details_root.xpath('string(//current_other)').replace("../../../../", "")
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version("Current version", curr_version_url)
                intro_version = details_root.xpath('string(//intro_other)').replace("../../../../", "")
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version("As Introduced", intro_version_url)
                comm_version = details_root.xpath('string(//cmtesub_other)').replace("../../../../", "")
                # a path containing "documents" signals that the version exists
                if comm_version.find("documents") != -1:
                    comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                    bill.add_version("Committee Substitute", comm_version_url)
                passed_version = details_root.xpath('string(//passed_other)').replace("../../../../", "")
                if passed_version.find("documents") != -1:
                    passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
                    title = "As Passed the " + chamber
                    bill.add_version(title, passed_version_url)
                asg_version = details_root.xpath('string(//asg_other)').replace("../../../../", "")
                if asg_version.find("documents") != -1:
                    asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                    bill.add_version("Approved by the Governor", asg_version_url)
                # avoid duplicate votes
                seen_votes = set()
                # actions
                for action in details_root.xpath('//history/action'):
                    action_num = action.xpath('string(act_number)').strip()
                    action_num = int(action_num)
                    act_vote = action.xpath('string(act_vote)').replace("../../../..", "")
                    action_desc = action.xpath('string(act_desc)')
                    # first token is MM/DD; the year comes from the session
                    date, action_desc = action_desc.split(" ", 1)
                    date = date + "/" + session[0:4]
                    date = datetime.strptime(date, "%m/%d/%Y")
                    # (H)/(S) prefix names the acting chamber
                    if action_desc.startswith("(H)"):
                        actor = "lower"
                        action = action_desc[4:]
                    elif action_desc.startswith("(S)"):
                        actor = "upper"
                        action = action_desc[4:]
                    else:
                        actor = "executive"
                        action = action_desc
                    if action.find("Veto") != -1:
                        version_path = details_root.xpath("string(//veto_other)")
                        version_path = version_path.replace("../../../../", "")
                        version_url = "http://billstatus.ls.state.ms.us/" + version_path
                        bill.add_document("Veto", version_url)
                    # classify the action via prefix lookup table
                    atype = 'other'
                    for prefix, prefix_type in self._action_types:
                        if action.startswith(prefix):
                            atype = prefix_type
                            break
                    bill.add_action(actor, action, date,
                                    type=atype, action_num=action_num)
                    if act_vote:
                        vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                        if vote_url not in seen_votes:
                            seen_votes.add(vote_url)
                            vote = self.scrape_votes(vote_url, action, date, actor)
                            vote.add_source(vote_url)
                            bill.add_vote(vote)
                bill.add_source(bill_details_url)
                self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Nebraska bills for a session from the search-by-date listing.

    Follows each bill link from the results table, then parses title,
    sponsor, actions (with embedded Y-N-A vote tallies), versions,
    documents, amendments and transcripts, saving each bill.
    """
    year = self.metadata['session_details'][session]['start_date'].year
    main_url = 'http://nebraskalegislature.gov/bills/search_by_date.php?SessionDay=%s' % year
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)
        for docs in page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="cal_content_full"]/table[@id="bill_results"]/tr/td[1]/a'):
            bill_link = docs.attrib['href']
            bill_link = 'http://nebraskalegislature.gov/' + bill_link
            with self.urlopen(bill_link) as bill_page:
                bill_page = lxml.html.fromstring(bill_page)
                # basic info: "<bill_id> <separator> <title words...>";
                # word at index 1 is skipped
                long_title = bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/h2')[0].text.split()
                bill_id = long_title[0]
                title = ''
                for x in range(2, len(long_title)):
                    title += long_title[x] + ' '
                title = title[0:-1]
                # bill_type
                if 'LR' in bill_id:
                    bill_type = 'resolution'
                else:
                    bill_type = 'bill'
                bill = Bill(session, chamber, bill_id, title, type=bill_type)
                # sources
                bill.add_source(main_url)
                bill.add_source(bill_link)
                # sponsor
                introduced_by = bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[2]/table/tr[2]/td[1]/a[1]')[0].text
                bill.add_sponsor('primary', introduced_by)
                # actions
                for actions in bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[3]/table/tr[1]/td[1]/table/tr'):
                    date = actions[0].text
                    # skip the header row (its first cell reads 'Date')
                    if 'Date' not in date:
                        date = datetime.strptime(date, '%b %d, %Y')
                        action = actions[1].text
                        # an action ending in "Y-N-A" carries a vote tally
                        if '-' in action:
                            vote_info = action.split()[-1].split('-')
                            yes_count = int(vote_info[0])
                            no_count = int(vote_info[1])
                            abstention_count = int(vote_info[2])
                            if yes_count > no_count:
                                passed = True
                            else:
                                passed = False
                            vote = Vote(chamber, date, action, passed,
                                        yes_count, no_count, abstention_count)
                            vote.add_source(bill_link)
                            bill.add_vote(vote)
                        if 'Governor' in action:
                            actor = 'Governor'
                        elif 'Speaker' in action:
                            actor = 'Speaker'
                        else:
                            actor = chamber
                        action_type = self.action_types(action)
                        bill.add_action(actor, action, date, action_type)
                # versions
                for versions in bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[2]/table/tr[2]/td[2]/a'):
                    version_url = versions.attrib['href']
                    # hrefs are relative ('../...'); strip the leading '../'
                    version_url = 'http://nebraskalegislature.gov/' + version_url[3:len(version_url)]
                    version_name = versions.text
                    bill.add_version(version_name, version_url)
                # documents / additional_info
                for additional_info in bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[2]/table/tr[2]/td/a'):
                    document_name = additional_info.text
                    document_url = additional_info.attrib['href']
                    document_url = 'http://nebraskalegislature.gov/' + document_url[3:len(document_url)]
                    # only PDFs are kept as documents
                    if '.pdf' in document_url:
                        bill.add_document(document_name, document_url)
                # amendments
                for admendments in bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[3]/table/tr[1]/td[2]/table/tr/td/a'):
                    admendment_name = admendments.text
                    admendment_url = admendments.attrib['href']
                    admendment_url = 'http://nebraskalegislature.gov/' + admendment_url[3:len(admendment_url)]
                    bill.add_document(admendment_name, admendment_url)
                # related transcripts
                for transcripts in bill_page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[3]/table/tr[2]/td[2]/a'):
                    transcript_name = transcripts.text
                    transcript_url = transcripts.attrib['href']
                    bill.add_document(transcript_name, transcript_url)
                self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """ assemble information on a bill from a number of DBF files

    Pipeline: MAINBILL (bill records) -> BILLSPON (sponsors) -> BILLWP
    (documents/versions) -> per-chamber vote ZIP files from the FTP site
    -> BILLHIST (actions) -> BILLSUBJ (subjects); all bills are saved at
    the end.
    """
    # Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')
    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}
    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A...' ids are Assembly (lower); everything else Senate (upper)
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"
        # some bills have a blank title.. just skip it
        if not title:
            continue
        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')
    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        # document path like '...\<dir>\<file>'; keep last two components
        document = rec["document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))
        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['doctype']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['doctype'], bill_id))
        if rec['comment']:
            doc_name += ' ' + rec['comment']
        # version doctypes become bill versions, everything else documents
        if rec['doctype'] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes: one zip per chamber/year, plus committee ('C...') files
    next_year = int(year_abr) + 1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]
    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        s_vote_zip, resp = self.urlretrieve(s_vote_url)
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)
        votes = {}
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"
        # 'C...' files are committee votes and use a different CSV schema
        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'
        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]
            date = datetime.strptime(date, "%m/%d/%Y")
            # one Vote object per (bill, chamber, motion); rows add members
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")
            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]
            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)
        # remove temp file
        os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote
        # NOTE(review): `votes` is rebuilt for each vote file, so this tally
        # runs inside the per-file loop; confirm placement against upstream.
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = rec["house"]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape Vermont bills, resolutions, sponsors, actions, and roll-call
    votes for *session* from the legislature's private JSON API and the
    public bill-status pages, saving each Bill via ``self.save_bill``.

    ``chambers`` is accepted for interface compatibility but not used:
    the JSON endpoints return both chambers' legislation at once.
    """
    # Used to strip markup out of action/motion text pulled from JSON.
    HTML_TAGS_RE = r'<.*?>'

    # Session names look like 'YYYY-<slug>'; the API is keyed by the slug.
    year_slug = session[5:]

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    # 'data' may be JSON null; normalize to an empty list.
    bills = json.loads(bills_json)['data'] or []

    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.iteritems()}

        # Identify the bill type and chamber from the number prefix.
        # Longer prefixes (e.g. 'J.R.H.') must be tested before their
        # shorter substrings (e.g. 'H.'), so order matters here.
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            # Constitutional amendments carry no chamber in the number;
            # fall back to the 'Body' field.
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info['BillNumber']))

        # Create the bill using its basic information
        bill = Bill(
            session=session,
            bill_id=info['BillNumber'],
            title=info['Title'],
            chamber=bill_chamber,
            type=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = \
            'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li')
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everything after the 'Additional Sponsors' divider is a
            # cosponsor rather than a primary sponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # NOTE(review): "Less" is 4 characters, so a 5-character name
            # can never equal it via [:5]; this exclusion looks like it
            # never fires — confirm what it was meant to filter out.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsor(sponsor_type, sponsor_name)

        # Capture bill text versions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a')
        for version in versions:
            bill.add_version(
                name=version.xpath('text()')[0],
                url=version.xpath('@href')[0].replace(' ', '%20'),
                mimetype='application/pdf')

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/{}/(\d+)"'.format(year_slug),
                lxml.etree.tostring(doc)).group(1)
        except AttributeError:
            # re.search returned None: no detailed-status link on the page.
            self.warning("Bill {} appears to have no activity".\
                format(info['BillNumber']))
            self.save_bill(bill)
            continue

        # Capture actions
        actions_url = \
            'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        # Tracks which chambers have passed the bill, so the
        # governor-signed assertion below can sanity-check the history.
        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.iteritems()}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'governor'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # A bill must pass both chambers before it can be signed.
                assert chambers_passed == set("HS")
                action_type = 'governor:signed'
            elif actor == 'lower' and \
                    any(x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                action_type = 'bill:passed'
                chambers_passed.add("H")
            # NOTE(review): the upper-chamber branch also tests for a
            # leading-space variant ' aspassed' — presumably the site's
            # keyword lists are inconsistently delimited; confirm.
            elif actor == 'upper' and \
                    any(x.lower().startswith(' aspassed') or
                        x.lower().startswith('aspassed')
                        for x in action['keywords'].split(';')):
                action_type = 'bill:passed'
                chambers_passed.add("S")
            else:
                action_type = 'other'

            bill.add_action(
                actor=actor,
                action=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strptime(
                    action['StatusDate'], '%m/%d/%Y'),
                type=action_type)

        # Capture votes
        votes_url = \
            'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = \
                'http://legislature.vermont.gov/bill/loadBillRollCallDetails/{0}/{1}'.\
                format(year_slug, roll_call_id)
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_other = []
            for member in roll_call:
                # Member names come back as 'Name of District'.
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_other.append(member_name)

            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif "Failed -- " in vote['FullStatus']:
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = \
                int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = \
                int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = Vote(
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'),
                motion=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                passed=did_pass,
                yes_count=yea_count,
                no_count=nay_count,
                other_count=len(roll_call_other))
            vote_to_add.add_source(roll_call_url)

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_other:
                vote_to_add.other(member)

            # Validation failures (e.g. count mismatches between the
            # status text and the roll call) are logged, not fatal.
            try:
                vote_to_add.validate()
            except ValueError as e:
                self.warning(e)
            bill.add_vote(vote_to_add)

        # Capture extra information
        # This is not in the OpenStates spec, but is available
        # Not yet implemented
        # Witnesses: http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members: http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings: http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Oregon (OLIS) bills for one *chamber* of *session*: bill
    metadata, sponsors, PDF versions, action history, votes, and proposed
    amendments, saving each via ``self.save_bill``.
    """
    self.all_bills = {}
    self.slug = self.metadata['session_details'][session]['slug']

    page = self.lxmlize(self.bill_directory_url.format(self.slug.upper()))
    page.make_links_absolute(self.base_url)

    ulid = 'senateBills' if chamber == 'upper' else 'houseBills'  # id of <ul>
    header = page.xpath("//ul[@id='{0}_search']".format(ulid))[0]
    # Every ul with a data-load-action and an id
    bill_list_pages = header.xpath(".//ul[boolean(@data-load-action)"
                                   " and boolean(@id)]/@data-load-action")

    # Collect every anchor from every paginated bill list.
    bill_anchors = []
    for bill_list_url in bill_list_pages:
        bill_list_page = self.lxmlize('{}{}'.format(
            self.base_url, bill_list_url))
        bill_list_page.make_links_absolute(self.base_url)
        bill_anchors.extend(bill_list_page.xpath('//a') or [])

    ws = re.compile(r"\s+")

    def _clean_ws(txt):
        """Remove extra whitespace from text."""
        return ws.sub(' ', txt).strip()

    for a in bill_anchors:
        bid = ws.sub('', a.text_content())  # bill id
        # The anchor's title attribute holds a short summary.
        bill_summary = _clean_ws(a.get('title'))
        # bill title is added below
        bill = Bill(session, chamber, bid, title='', summary=bill_summary)

        page = self.lxmlize(a.get('href'))
        versions = page.xpath('//ul[@class="dropdown-menu"]/li/span/' +
                              'a[contains(@title, "Get the Pdf")]/@href')

        # Build a {label: cell-element} map from the overview table.
        measure_info = {}
        info = page.xpath("//table[@id='measureOverviewTable']/tr")
        for row in info:
            key, value = row.xpath("./*")
            key = key.text.replace(':', '').strip()
            measure_info[key] = value

        for sponsor in measure_info['Chief Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='primary', name=sponsor.text_content())

        for sponsor in measure_info['Regular Sponsors'].xpath("./a"):
            if sponsor.text_content().strip():
                bill.add_sponsor(type='cosponsor', name=sponsor.text_content())

        title = _clean_ws(measure_info['Bill Title'].text_content())
        # some bill titles need to be added manually
        if self.slug == "2013R1" and bid == "HB2010":
            title = ("Relating to Water Resources Department contested"
                     "case proceedings.")
        bill['title'] = title

        for version in versions:
            # The last URL path segment doubles as the version name.
            name = version.split("/")[-1]
            bill.add_version(name=name, url=version,
                             mimetype='application/pdf')

        history_url = self.create_url(
            'Measures/Overview/GetHistory/{bill}', bid)
        history = self.lxmlize(history_url).xpath("//table/tr")
        for entry in history:
            wwhere, action = [
                _clean_ws(x.text_content()) for x in entry.xpath("*")
            ]
            # Strip trailing vote tallies ('Ayes ...' / 'Nays, ...') off
            # the action text, keeping only the leading description.
            # NOTE(review): the alternation binds as (Ayes)|((Nays),\s.*)
            # — the ',\s.*' applies only to the Nays branch; confirm
            # whether ((Ayes|Nays),\s.*) was intended.
            vote_cleaning_re = r'(.*?)((Ayes)|(Nays),\s.*)'
            if re.match(vote_cleaning_re, action):
                action = re.search(vote_cleaning_re, action).groups()[0]

            # 'MM-DD (H)' -> {'when': 'MM-DD', 'where': 'H'}
            wwhere = re.match(
                r"(?P<when>.*) \((?P<where>.*)\)", wwhere).groupdict()
            action_chamber = {"S": "upper", "H": "lower"}[wwhere['where']]
            # History dates lack a year; prepend it from the session slug.
            when = "%s-%s" % (self.slug[:4], wwhere['when'])
            when = dt.datetime.strptime(when, "%Y-%m-%d")

            types = []
            for expr, types_ in self.action_classifiers:
                m = re.match(expr, action)
                if m:
                    types += types_

            if types == []:
                types = ['other']

            # actor, action, date, type, committees, legislators
            bill.add_action(action_chamber, action, when, type=types)

            # Parse and store Vote information
            vote_id = entry.xpath('./td/a[contains(@href, "otes-")]/@href')
            if not vote_id:
                continue
            elif "#measureVotes-" in vote_id[0]:
                # Floor vote: numeric id is the last '-' field.
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                    "{0}/Measures/MeasureVotes?id={1}". \
                    format(self.slug, vote_id)
            else:
                # Committee vote: served from the majority-report page.
                vote_id = vote_id[0].split("-")[-1]
                vote_url = "https://olis.leg.state.or.us/liz/" + \
                    "{0}/CommitteeReports/MajorityReport/{1}". \
                    format(self.slug, vote_id)
            votes = self._get_votes(vote_url)

            if not any(len(x) for x in votes.values()):
                self.warning("The votes webpage was empty for " +
                             "action {0} on bill {1}.".format(action, bid))
                continue

            # Passed iff yes votes are a strict majority of yes+no.
            passed = (
                float(len(votes["yes_votes"])) /
                (len(votes["yes_votes"]) + len(votes["no_votes"]))
                > 0.5
            )

            vote = Vote(chamber=chamber, date=when, motion=action,
                        passed=passed,
                        yes_count=len(votes["yes_votes"]),
                        no_count=len(votes["no_votes"]),
                        other_count=len(votes["other_votes"]),
                        session=session, bill_id=bid,
                        bill_chamber=action_chamber)
            vote.update(votes)
            bill_url = "https://olis.leg.state.or.us/liz/" + \
                "{0}/Measures/Overview/{1}".format(self.slug, bid)
            vote.add_source(bill_url)
            bill.add_vote(vote)

        amendments_url = self.create_url(
            'Measures/ProposedAmendments/{bill}', bid)
        amendments = self.lxmlize(amendments_url).xpath(
            "//div[@id='amendments']/table//tr")
        for amendment in amendments:
            nodes = amendment.xpath("./td")
            if nodes == []:
                # Header/formatting rows have no <td> cells.
                continue
            pdf_href, date, committee, adopted, when = nodes
            pdf_href, = pdf_href.xpath("./a")
            pdf_link = pdf_href.attrib['href']
            # NOTE(review): 'Ammendment' is misspelled in the emitted
            # document name; left as-is since downstream consumers may
            # match on the existing string.
            name = "Ammendment %s" % (pdf_href.text_content())
            adopted = adopted.text
            bill.add_document(name=name, url=pdf_link, adopted=adopted,
                              mimetype='application/pdf')

        bill.add_source(a.get('href'))
        self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """Assemble information on New Jersey bills from a number of DBF files.

    Reads MAINBILL (bill metadata), BILLSPON (sponsors), BILLWP
    (documents/versions), zipped vote files from the legislature's FTP
    site, BILLHIST (actions), and BILLSUBJ (subjects), then saves every
    assembled Bill via ``self.save_bill``.
    """
    # Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, "MAINBILL")

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        # e.g. 'A' + '12' -> 'A12'; int() strips leading zeros.
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # Assembly bill types start with 'A'; everything else is Senate.
        if bill_type[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, "BILLSPON")

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == "P":
            sponsor_type = "Primary"
        else:
            sponsor_type = "Co-sponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, "BILLWP")
    # print bill_document_db[2]
    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in document database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["document"]
        # Records store Windows-style paths; keep the last two segments.
        document = document.split("\\")
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = "http://www.njleg.state.nj.us/%s/Bills/%s" % (
            year_abr,
            document.replace(".DOC", ".HTM"))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec["doctype"]]
        except KeyError:
            raise Exception("unknown doctype %s on %s" %
                            (rec["doctype"], bill_id))
        if rec["comment"]:
            doc_name += " " + rec["comment"]

        if rec["doctype"] in self._version_types:
            bill.add_version(doc_name, htm_url)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    # One zip per chamber per year, plus committee ('C'-prefixed) files
    # that span the two-year session.
    vote_info_list = [
        "A%s" % year_abr,
        "A%s" % next_year,
        "S%s" % year_abr,
        "S%s" % next_year,
        "CA%s-%s" % (year_abr, next_year),
        "CS%s-%s" % (year_abr, next_year),
    ]
    for filename in vote_info_list:
        s_vote_url = "ftp://www.njleg.state.nj.us/votes/%s.zip" % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            # Not every chamber/year combination exists; skip quietly.
            self.warning("could not find %s" % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        # 'U' = universal-newlines mode for the CSV reader.
        vote_file = zipedfile.open(vfile, "U")
        vdict_file = csv.DictReader(vote_file)

        # One Vote object per (bill, chamber, motion), built up row by row.
        votes = {}
        if filename.startswith("A") or filename.startswith("CA"):
            chamber = "lower"
        else:
            chamber = "upper"

        # 'C'-prefixed files use a different CSV schema (committee votes).
        if filename.startswith("C"):
            vote_file_type = "committee"
        else:
            vote_file_type = "chamber"

        for rec in vdict_file:
            if vote_file_type == "chamber":
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                leg = rec["Name"]
                # drop time portion
                date = rec["Agenda_Date"].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec["BillAction"]]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec["LegislatorVote"][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = "_".join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                # Counts and passed are filled in after all rows are read.
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == "committee":
                votes[vote_id]["committee"] = self._committees[
                    rec["Committee_House"]]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, "BILLHIST")
    actor_map = {"A": "lower", "G": "executive", "S": "upper"}

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = actor_map[rec["house"]]
        comment = rec["comment"]
        action, atype = self.categorize_action(action)
        if comment:
            action += " " + comment
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, "BILLSUBJ")
    for rec in subject_db:
        bill_id = rec["billtype"] + str(int(rec["billnumber"]))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault("subjects", []).append(rec["subjectkey"])
        else:
            self.warning("invalid bill id in BILLSUBJ.DBF: %s" % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
class SenateBillPage(object):
    '''Used for categories, senate votes, events.

    Wraps one already-fetched Senate bill page (*doc*) and builds a Bill
    populated with subjects, the sponsor's memo, floor/committee votes,
    and version links. ``self.succeeded`` is True only if ``_build``
    ran to completion.
    '''
    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        # bill_id_parts is a 3-tuple, e.g. ('S', '1234', 'A').
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False
        self._build()
        self.bill.add_source(self.url)

    def _build(self):
        # Run every extraction step; only then mark the page as succeeded.
        self.get_senate_votes()
        self.get_sponsors_memo()
        self.get_subjects()
        self.get_versions()
        self.succeeded = True

    def url2lxml(self, url):
        # Record every fetched URL as a source before delegating.
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_subjects(self):
        # Law-section links double as the bill's subject list.
        subjects = []
        for link in self.doc.xpath("//a[contains(@href, 'lawsection')]"):
            subjects.append(link.text.strip())
        self.bill['subjects'] = subjects

    def get_sponsors_memo(self):
        # The sponsor's memo only exists for Senate (upper) bills.
        if self.chamber == 'upper':
            self.bill.add_document("Sponsor's Memorandum", self.url)

    def get_senate_votes(self):
        # Floor votes: a bold 'VOTE: FLOOR VOTE: - <date>' header followed
        # by a blockquote alternating <b> count headers and <a> names.
        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
            date = b.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0

            # Maps the specific 'other' category (Excused/Absent/...) to
            # the names under it.
            actual_vote = collections.defaultdict(list)
            vtype = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count = int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count = int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif (text.startswith('Excused') or
                          text.startswith('Abstain') or
                          text.startswith('Absent')
                          ):
                        vtype = 'other'
                        # Several 'other' categories may appear; accumulate.
                        other_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    # Names are attributed to the most recent header.
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append((name, tag.text))

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name, vote_val in other_votes:
                vote.other(name)
                actual_vote[vote_val].append(name)
            vote['actual_vote'] = actual_vote
            vote.add_source(self.url)
            self.bill.add_vote(vote)

        # Committee votes: same structure, with a tab-delimited header
        # 'VOTE: COMMITTEE VOTE: <committee> - <date>'.
        for b in self.doc.xpath("//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"):
            _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
            date = date.strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0

            vtype = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    elif (text.startswith('Excused') or
                          text.startswith('Abstain') or
                          text.startswith('Absent')
                          ):
                        vtype = 'other'
                        other_count += int(re.search(
                            r'\((\d+)\):', text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append(name)

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, '%s Committee Vote' % committee,
                        passed, yes_count, no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name in other_votes:
                vote.other(name)

            vote.add_source(self.url)
            self.bill.add_vote(vote)

    def get_versions(self):
        # The 'Versions:' label is followed by whitespace-separated
        # version ids like 'S1234-2013'.
        text = self.doc.xpath(
            '//*[contains(., "Versions:")]')[-1].text_content()
        version_text = text
        _, version_text = text.split('Versions:')
        url_tmpl = 'http://open.nysenate.gov/legislation/bill/'
        for version_bill_id in re.findall('\S+', version_text):
            # NOTE(review): bare rsplit('-') only yields exactly two
            # pieces when the id contains a single '-'; an id with more
            # (or no) hyphens would raise ValueError here — confirm the
            # id format is always 'XNNNN-YYYY'.
            version_bill_id_noyear, _ = version_bill_id.rsplit('-')
            version_url = url_tmpl + version_bill_id
            self.bill.add_version(version_bill_id_noyear, version_url,
                                  mimetype='text/html')
def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
    """Scrape all California bills of one measure type (*type_abbr*) for
    *chamber* in *session* out of the CA SQL mirror (``self.session``),
    attaching versions, sponsors, actions, and roll-call votes, and
    saving each via ``self.save_bill``.
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        # Non-zero session_num marks a special session.
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        # Title is filled in later, once versions have been examined.
        fsbill = Bill(bill_session, chamber, bill_id, '')

        # Construct session for web query, going from '20092010' to '0910'
        source_session = session[2:4] + session[6:8]

        # Turn 'AB 10' into 'ab_10'
        source_num = "%s_%s" % (bill.measure_type.lower(),
                                bill.measure_num)

        # Construct a fake source url
        source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                      "bill_number=%s&sess=%s" %
                      (source_num, source_session))

        fsbill.add_source(source_url)

        scraped_versions = self.scrape_site_versions(bill, source_url)

        title = ''
        short_title = ''
        type = ['bill']
        subject = ''
        all_titles = set()

        # i tracks our position in scraped_versions (sorted by date), so
        # each DB version can be matched to its scraped URL.
        i = 0
        for version in bill.versions:
            if not version.bill_xml:
                continue

            title = clean_title(version.title)
            all_titles.add(title)
            short_title = clean_title(version.short_title)
            type = [bill_type]

            if version.appropriation == 'Yes':
                type.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type.append('fiscal committee')
            if version.local_program == 'Yes':
                type.append('local program')
            if version.urgency == 'Yes':
                type.append('urgency')
            if version.taxlevy == 'Yes':
                type.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

            date = version.bill_version_action_date.date()

            url = ''
            try:
                scraped_version = scraped_versions[i]
                if scraped_version[0] == date:
                    url = scraped_version[1]
                    i += 1
            except IndexError:
                # More DB versions than scraped ones: leave url empty.
                pass

            fsbill.add_version(
                version.bill_version_id, url,
                date=date,
                title=title,
                short_title=short_title,
                subject=[subject],
                type=type)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        # title/short_title/type/subject hold the values from the last
        # version processed above.
        fsbill['title'] = title
        fsbill['short_title'] = short_title
        fsbill['type'] = type
        fsbill['subjects'] = [subject]

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        fsbill['alternate_titles'] = list(all_titles)

        # NOTE(review): 'version' here is the loop variable left over
        # from the versions loop, i.e. sponsors come from the *last*
        # version only — confirm that is intentional.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        introduced = False

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                # e.g. 'Assembly Appropriations' -> 'lower Appropriations'
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)

            type = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            if act_str.startswith('Introduced'):
                introduced = True
                type.append('bill:introduced')

            if 'Read first time.' in act_str:
                if not introduced:
                    type.append('bill:introduced')
                    introduced = True
                type.append('bill:reading:1')

            if 'To Com' in act_str or 'referred to' in act_str.lower():
                type.append('committee:referred')

            if 'Read third time. Passed.' in act_str:
                type.append('bill:passed')

            if 'Approved by Governor' in act_str:
                type.append('governor:signed')

            if 'Item veto' in act_str:
                type.append('governor:vetoed:line-item')

            if 'Vetoed by Governor' in act_str:
                type.append('governor:vetoed')

            if 'To Governor' in act_str:
                type.append('governor:received')

            if 'Read second time' in act_str:
                type.append('bill:reading:2')

            if not type:
                type = ['other']

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              type=type)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            # Location descriptions look like 'Asm. Appropriations' or
            # 'Senate Floor'; the first word identifies the chamber.
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion,
                          result,
                          int(vote.ayes),
                          int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold,
                          type=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            # The abstain count field in CA's database includes
            # vacancies, which we aren't interested in.
            fsvote['other_count'] = len(fsvote['other_votes'])

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def scrape(self, session, chambers):
    """Scrape Georgia legislation for *session* from the legislature's
    SOAP services (``self.lservice`` / ``self.vservice``), building each
    Bill with votes, classified actions, sponsors, and sorted versions,
    and saving it via ``self.save_bill``.

    All remote calls go through ``backoff`` to retry flaky endpoints.
    """
    sid = self.metadata['session_details'][session]['_guid']
    legislation = backoff(
        self.lservice.GetLegislationForSession,
        sid
    )['LegislationIndex']

    for leg in legislation:
        lid = leg['Id']
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument['StatusHistory'][0]]

        # The service returns newest-first; reverse to chronological order.
        actions = reversed([{
            "code": x['Code'],
            "action": x['Description'],
            "_guid": x['Id'],
            "date": x['Date']
        } for x in history])

        guid = instrument['Id']

        bill_type = instrument['DocumentType']
        chamber = {
            "H": "lower",
            "S": "upper",
            "J": "joint"
        }[bill_type[0]]  # XXX: This is a bit of a hack.

        bill_id = "%s %s" % (
            bill_type,
            instrument['Number'],
        )
        if instrument['Suffix']:
            bill_id += instrument['Suffix']

        title = instrument['Caption']
        description = instrument['Summary']

        if title is None:
            continue

        bill = Bill(
            session, chamber, bill_id, title,
            description=description, _guid=guid
        )

        if instrument['Votes']:
            for vote_ in instrument['Votes']:
                # Each entry is a (key, value) pair; keep the value.
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])

                vote = Vote(
                    {"House": "lower", "Senate": "upper"}[vote_['Branch']],
                    vote_['Date'],
                    vote_['Caption'] or "Vote on Bill",
                    # Passed iff more yeas than nays.
                    (vote_['Yeas'] > vote_['Nays']),
                    vote_['Yeas'],
                    vote_['Nays'],
                    (vote_['Excused'] + vote_['NotVoting']),
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=chamber)
                vote.add_source(self.vsource)

                # Anything other than Yea/Nay is recorded as 'other'.
                methods = {"Yea": vote.yes, "Nay": vote.no,}

                for vdetail in vote_['Votes'][0]:
                    whom = vdetail['Member']
                    how = vdetail['MemberVoted']
                    try:
                        m = methods[how]
                    except KeyError:
                        m = vote.other
                    m(whom['Name'])

                bill.add_vote(vote)

        # Maps GA status codes to OpenStates action-type lists.
        types = {
            "HI": ["other"],
            "SI": ["other"],
            "HH": ["other"],
            "SH": ["other"],
            "HPF": ["bill:introduced"],
            "HDSAS": ["other"],
            "SPF": ["bill:introduced"],
            "HSR": ["bill:reading:2"],
            "SSR": ["bill:reading:2"],
            "HFR": ["bill:reading:1"],
            "SFR": ["bill:reading:1"],
            "HRECM": ["bill:withdrawn", "committee:referred"],
            "SRECM": ["bill:withdrawn", "committee:referred"],
            "SW&C": ["bill:withdrawn", "committee:referred"],
            "HW&C": ["bill:withdrawn", "committee:referred"],
            "HRA": ["bill:passed"],
            "SRA": ["bill:passed"],
            "HPA": ["bill:passed"],
            "HRECO": ["other"],
            "SPA": ["bill:passed"],
            "HTABL": ["other"],  # "House Tabled" - what is this?
            "SDHAS": ["other"],
            "HCFR": ["committee:passed:favorable"],
            "SCFR": ["committee:passed:favorable"],
            "HRAR": ["committee:referred"],
            "SRAR": ["committee:referred"],
            "STR": ["bill:reading:3"],
            "SAHAS": ["other"],
            "SE": ["bill:passed"],
            "SR": ["committee:referred"],
            "HTRL": ["bill:reading:3", "bill:failed"],
            "HTR": ["bill:reading:3"],
            "S3RLT": ["bill:reading:3", "bill:failed"],
            "HASAS": ["other"],
            "S3RPP": ["other"],
            "STAB": ["other"],
            "SRECO": ["other"],
            "SAPPT": ["other"],
            "HCA": ["other"],
            "HNOM": ["other"],
            "HTT": ["other"],
            "STT": ["other"],
            "SRECP": ["other"],
            "SCRA": ["other"],
            "SNOM": ["other"],
            "S2R": ["bill:reading:2"],
            "H2R": ["bill:reading:2"],
            "SENG": ["bill:passed"],
            "HENG": ["bill:passed"],
            "HPOST": ["other"],
            "HCAP": ["other"],
            "SDSG": ["governor:signed"],
            "SSG": ["governor:received"],
            "Signed Gov": ["governor:signed"],
            "HDSG": ["governor:signed"],
            "HSG": ["governor:received"],
            "EFF": ["other"],
            "HRP": ["other"],
            "STH": ["other"],
            "HTS": ["other"],
        }

        # Per-chamber committee names, used to annotate committee actions.
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    "House": "lower",
                    "Senate": "upper",
                }[committee['Type']]].append(committee['Name'])

        for action in actions:
            chamber = {
                "H": "lower",
                "S": "upper",
                "E": "other",  # Effective Date
            }[action['code'][0]]

            try:
                _types = types[action['code']]
            except KeyError:
                # Unmapped code: log it and fall back to 'other'.
                self.debug(action)
                _types = ["other"]

            committees = []
            if any(('committee' in x for x in _types)):
                committees = [str(x) for x in ccommittees.get(chamber, [])]

            bill.add_action(chamber, action['action'], action['date'],
                            _types,
                            committees=committees,
                            _code=action['code'],
                            _code_id=action['_guid'])

        sponsors = []
        if instrument['Authors']:
            sponsors = instrument['Authors']['Sponsorship']
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors += instrument['Sponsors']['Sponsorship']

        sponsors = [
            (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
        ]

        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor['Name']))
            # NOTE(review): 'seconday' looks like a typo for 'secondary';
            # left as-is because downstream data consumers may already
            # match on the misspelled value — confirm before fixing.
            bill.add_sponsor(
                'primary' if 'Author' in typ else 'seconday',
                name
            )

        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in [
                    'Description',
                    'Url',
                    'Id',
                    'Version'
                ]
            ]
            bill.add_version(
                name, url,
                mimetype='application/pdf',
                _internal_document_id=doc_id,
                _version_id=version_id
            )

        # Keep versions in a stable order by internal document id.
        versions = sorted(
            bill['versions'],
            key=lambda x: x['_internal_document_id']
        )
        bill['versions'] = versions

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(**{
            "session": session,
            "bid": guid,
        }))

        self.save_bill(bill)
def scrape_bill(self, link, chamber, session):
    """Scrape one Delaware bill detail page and return the Bill.

    Extracts the bill id, title, sponsors, documents, amendments,
    actions and vote documents from the page at *link*, scrapes each
    referenced vote page, and returns the assembled Bill (so a caller
    can re-use a substituted bill).  Returns None when the page 404s,
    is a nominee page, or the bill id cannot be parsed.
    """
    legislation_types = {
        'House Bill': 'HB',
        'House Concurrent Resolution': 'HCR',
        'House Joint Resolution': 'HJR',
        'House Resolution': 'HR',
        'Senate Bill': 'SB',
        'Senate Concurrent Resolution': 'SCR',
        'Senate Joint Resolution': 'SJR',
        'Senate Resolution': 'SR',
    }

    text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open"

    try:
        page = self.lxmlize(link, True)
    except requests.exceptions.HTTPError:
        self.logger.warning('404. Apparently the bill hasn\'t been posted')
        return

    # Nominee pages share the bill URL scheme but are not bills.
    nominee = self.get_node(page, './/div[@id="page_header"]/text()')
    if nominee is not None and nominee.strip().lower() == "nominee information":
        self.logger.info("Nominee, skipping")
        return

    bill_id = self.get_node(
        page, './/div[@align="center" or @style="text-align:center"]')
    try:
        bill_id = bill_id.text_content().strip()
    except IndexError:
        self.logger.warning("Can't find bill number, skipping")
        return

    # some bill_ids include relevant amendments
    # in the form "SB 10 w/SA1", so we fix it here
    bill_id = bill_id.split("w/")[0]
    bill_id = bill_id.split("(")[0]

    leg_type = None
    for long_name, short_name in legislation_types.items():
        if long_name in bill_id:
            leg_type = short_name
            bill_num = bill_id.replace(long_name, "").strip()
            break
    if leg_type:
        bill_id = leg_type + " " + bill_num
    elif "for" in bill_id:
        # e.g. "Substitute 1 for SB 30" -- keep the underlying id
        bill_id = bill_id.split("for")[1]
    else:
        self.logger.warning("Unknown bill type for {}".format(bill_id))
        return

    # collapse runs of whitespace instead of deleting spaces outright:
    # the branch above deliberately builds "<type> <num>" with a space
    bill_id = " ".join(bill_id.split())
    bill_id = bill_id.strip()

    # each row is in its own table; there are no classes/ids, so we
    # loop through the tables and use keywords in the first td to tell
    # what each row holds
    tables = self.get_nodes(page, './/div[@id="page_content"]/table')

    bill_title = None
    primary_sponsors = []
    # hoisted: was only bound inside the primary-sponsor branch, which
    # made the sponsor loop below a NameError when that row was absent
    addl_sponsors = []
    cosponsors = []
    bill_url = None
    bill_documents = {}
    action_list = []
    vote_documents = {}
    sub_link = None

    if tables is None or not tables:
        self.logger.warning('First xpath didn\'t work.')
        tables = self.get_nodes(page, './/table[@style="width:837.0px"]/tr')

    for table in tables:
        tds = table.xpath('.//td')
        if len(tds) == 0:
            # some kind of empty table for formatting reasons
            continue
        title_text = tds[0].text_content().strip().lower()

        if title_text.startswith('primary sponsor'):
            pri_sponsor_text = tds[1].text_content()
            primary_sponsors = self.separate_names(pri_sponsor_text)
            # sometimes additional sponsors are in a 3rd td;
            # other times the 3rd td contains a blank image
            add_spons_text = tds[2].text_content().strip()
            if add_spons_text:
                add_spons_text = add_spons_text.replace(
                    "Additional Sponsor(s):", "")
                if "on behalf of all representatives" not in add_spons_text.lower():
                    addl_sponsors = self.separate_names(add_spons_text)

        elif title_text.startswith('co-sponsor'):
            cosponsor_text = tds[1].text_content()
            if "none..." in cosponsor_text.lower():
                cosponsors = []
                continue
            cosponsors = self.separate_names(cosponsor_text)

        elif title_text.startswith('long title'):
            bill_title = tds[1].text_content().strip()

        elif title_text.startswith('amendment'):
            for a in tds[1].xpath('.//a'):
                amm = a.text
                # fix: was "Amendment".format(...) with no placeholder,
                # so every amendment collapsed onto the same dict key
                # and only the last link survived
                amm_text = "Amendment: {}".format(amm.strip())
                amm_slg = "+".join(amm.split())
                amm_link = text_base_url.format(session=session,
                                                bill_id=amm_slg)
                bill_documents[amm_text] = amm_link
                amm_page = self.lxmlize(a.attrib["href"])
                for tr in amm_page.xpath('//tr'):
                    # renamed from `tds` to stop shadowing the outer row
                    amm_tds = tr.xpath("./td")
                    if len(amm_tds) > 1:
                        if "voting" in amm_tds[0].text_content().lower():
                            self.find_vote(amm_tds, vote_documents,
                                           "Amendment: ")

        elif title_text.startswith('engrossed version'):
            if tds[1].text_content().strip():
                engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open"
                engrossment_link = engrossment_base.format(
                    session=session, bill_id="+".join(bill_id.split()))
                if bill_url not in bill_documents.values():
                    bill_documents["Engrossed Version"] = engrossment_link

        elif title_text.startswith('substituted'):
            content = tds[1].text_content().strip()
            if "Substitute" in content and "Original" not in content:
                sub_link = tds[1].xpath(".//a/@href")[0]

        elif ("full text" in title_text
              and ("(" not in title_text or "html" in title_text)):
            if tds[1].text_content().strip():
                # it is totally unclear which version of the bill is
                # referred to here, so just call it "bill text"
                bill_url = text_base_url.format(
                    session=session, bill_id=bill_id.replace(" ", "+"))
                if bill_url not in bill_documents.values():
                    bill_documents["Bill Text"] = bill_url

        elif title_text.startswith('fiscal notes'):
            # skipping fiscal notes for now, they are really ugly, but
            # leaving this branch as a placeholder so we remember to do
            # it someday, if we feel like it
            pass

        elif title_text.startswith('committee reports'):
            # the committee reports let a legislator comment on a bill
            # as "favorable", "unfavorable" or "on its merits", but
            # these are NOT votes (per conversation w/ secretary of the
            # DE senate 3/16/15).  The bill is considered if the
            # majority sign it, which will appear in the bill's action
            # history as being reported out of committee
            pass

        elif title_text.startswith('voting'):
            self.find_vote(tds, vote_documents)

        elif title_text.startswith('actions history'):
            action_list = tds[1].text_content().split("\n")

    sub_versions = []
    use_sub = False
    if sub_link:
        # a substitute replaces this bill: scrape it recursively and
        # attach this bill's id to it as an alternate title
        bill = self.scrape_bill(sub_link, chamber, session)
        if bill:
            sub_versions = [v["url"] for v in bill["versions"]]
            bill.add_title(bill_id)
            use_sub = True

    if not use_sub:
        bill = Bill(session, chamber, bill_id, bill_title)
        for s in primary_sponsors:
            bill.add_sponsor("primary", s)
        for s in addl_sponsors:
            # it is not totally clear whether "additional sponsors"
            # are co or primary, but primary is my best guess based on
            # the bill text, bc they're on the first line with the
            # primary sponsor
            bill.add_sponsor("primary", s)
        for s in cosponsors:
            bill.add_sponsor("cosponsor", s)

    for name, doc_link in bill_documents.items():
        if "Engrossment" in name or "Bill Text" in name:
            # skip versions the substitute already provided
            if doc_link not in sub_versions:
                bill.add_version(name, doc_link, mimetype="text/html")
        else:
            bill.add_document(name, doc_link, mimetype="text/html")

    for a in action_list:
        if a.strip():
            date, action = a.split('-', 1)
            try:
                date = datetime.strptime(date.strip(), '%b %d, %Y')
            except ValueError:
                # a few actions spell the month out in full. XXX: ugh.
                date = datetime.strptime(date.strip(), '%B %d, %Y')
            action = action.strip()
            actor = actions.get_actor(action, bill['chamber'])
            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            attrs["action"] = " ".join(attrs["action"].split())
            bill.add_action(**attrs)

    for name, doc in vote_documents.items():
        vote_chamber = "lower" if "house" in name.lower() else "upper"
        try:
            self.head(doc)
        except requests.exceptions.HTTPError:
            self.logger.warning("could not access vote document")
            continue
        vote_page = self.lxmlize(doc)
        try:
            vote_info = vote_page.xpath('.//div[@id="page_content"]/p')[-1]
            vote_tds = vote_page.xpath(".//table//td")
        except IndexError:
            # older page layout: the data lives in the first form
            vote_info = vote_page.xpath('.//form[1]')[0]
            vote_tds = vote_page.xpath('.//table[@border="0"]//td')

        yes_votes = []
        no_votes = []
        other_votes = []
        # NOTE(review): date/passed/voice_vote are only bound when the
        # "Date"/"Vote Type" lines are present; a malformed vote page
        # still raises NameError below -- TODO confirm acceptable
        lines = vote_info.text_content().split("\n")
        lines = filter(None, lines)
        for line in lines:
            if line.strip().startswith("Date"):
                date_str = " ".join(line.split()[1:4])
                date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p")
                passage_status = line.strip().split()[-1]
                # we've essentially never seen anything but "Passed",
                # so raise on unknown statuses so we can figure it out
                passed_statuses = ["Passed"]
                failed_statuses = ["Defeated", "Rescinded"]
                if passage_status not in passed_statuses + failed_statuses:
                    raise AssertionError(
                        "Unknown passage state {}".format(passage_status))
                passed = passage_status in passed_statuses
            if line.strip().startswith("Vote Type"):
                if "voice" in line.lower():
                    voice_vote = True
                else:
                    voice_vote = False
                    yes_count = int(re.findall(r"Yes: (\d+)", line)[0])
                    no_count = int(re.findall(r"No: (\d+)", line)[0])
                    other_count = int(re.findall(r"Not Voting: (\d+)", line)[0])
                    other_count += int(re.findall(r"Absent: (\d+)", line)[0])

        # roll-call cells alternate: name, vote, name, vote, ...
        person_seen = False
        for td in vote_tds:
            if person_seen:
                person_vote = td.text_content().strip()
                if person_vote == "Y":
                    yes_votes.append(person)
                elif person_vote == "N":
                    no_votes.append(person)
                elif person_vote in ["NV", "A", "X", "C"]:
                    other_votes.append(person)
                else:
                    raise AssertionError(
                        "Unknown vote '{}'".format(person_vote))
                person_seen = False
            else:
                person = td.text_content().strip()
                if person:
                    person_seen = True

        if voice_vote:
            # voice votes have no tallies; record zeros
            vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0)
        else:
            vote = Vote(vote_chamber, date, "passage",
                        passed, yes_count, no_count, other_count,
                        yes_votes=[], no_votes=[], other_votes=[])
            vote["yes_votes"] = yes_votes
            vote["no_votes"] = no_votes
            vote["other_votes"] = other_votes

        if (passed
                and vote["yes_count"] <= vote["no_count"]
                and not voice_vote):
            raise AssertionError("Vote passed with more N than Y votes?")
        if not passed and vote["yes_count"] > vote["no_count"]:
            self.logger.warning("Vote did not pass but had a majority, "
                                "probably worth checking")

        vote["type"] = "amendment" if "Amendment" in name else "passage"
        vote.add_source(doc)
        bill.add_vote(vote)

    bill.add_source(link)
    return bill
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.
    """
    # BILL_URL expects the id without internal spaces
    url = BILL_URL % (session, bill_id.replace(' ', ''))
    with self.urlopen(url) as bill_page:
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            'http://legislature.idaho.gov/legislation/%s/' % session)
        # second column of the outer layout table holds the bill tables:
        # [0] sponsors, [1] title, [2] action history
        bill_tables = html.xpath('./body/table/tr/td[2]')[0].xpath(
            './/table')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

        # keep the short title as an alternate title when it differs
        if short_title and bill['title'].lower() != short_title.lower():
            bill.add_title(short_title)

        # documents
        doc_links = html.xpath('//span/a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version(name, href)
            else:
                bill.add_document(name, href)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                for person in sponsors.split(','):
                    bill.add_sponsor('primary', person)

        # actions: actor carries over between rows until a row names a
        # chamber; dates likewise carry over when a row's date is blank
        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            # dates on the page omit the year; take it from the session
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y")
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                vote = self.parse_vote(actor, date, row[2])
                vote.add_source(url)
                bill.add_vote(vote)
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(actor, action, date, type=atype)
            # after voice vote/roll call and some actions the bill is
            # sent 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'

        self.save_bill(bill)
def scrape_bill_sheet(self, session, chamber):
    """
    Scrape the bill sheet (the page full of bills and other small bits of data)
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # column positions of the fields we read from each sheet row
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    sheet_html = self.urlopen(sheet_url)
    sheet_page = lxml.html.fromstring(sheet_html)
    sheet_page.make_links_absolute(sheet_url)

    bills = sheet_page.xpath('//table/tr')

    for bill in bills:
        bill_id = self.read_td(bill[index["id"]][0])

        if bill_id == None:
            # Every other entry is null for some reason
            continue

        dot_loc = bill_id.find('.')
        if dot_loc != -1:
            # budget bills are missing the .pdf, don't truncate
            bill_id = bill_id[:dot_loc]

        title_and_sponsor = bill[index["title_sponsor"]][0]
        bill_title = title_and_sponsor.text
        bill_title_and_sponsor = title_and_sponsor.text_content()
        if bill_title is None:
            continue  # Odd ...

        # sponsors are whatever text follows the title, "--"-separated
        sponsors = bill_title_and_sponsor.replace(bill_title, "").\
            replace(" & ...", "").split("--")

        # bill-id prefix -> openstates bill type
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        bill_type = None
        for cat in cats:
            if bill_id[:len(cat)] == cat:
                bill_type = cats[cat]

        b = Bill(session, bill_chamber, bill_id, bill_title,
                 type=bill_type)
        b.add_source(sheet_url)

        versions_url = \
            bill[index["version"]].xpath('font/a')[0].attrib["href"]
        versions_url = versions_url
        versions = self.parse_versions(versions_url)

        for version in versions:
            b.add_version(version['name'], version['link'],
                          mimetype=version['mimetype'])

        bill_history_href = bill[index["history"]][0][0].attrib['href']
        history = self.parse_history(bill_history_href)
        b.add_source(bill_history_href)

        chamber_map = dict(Senate='upper', House='lower')
        for action, date in history:
            action_actor = chamber_map.get(chamber, chamber)
            attrs = dict(actor=action_actor, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            b.add_action(**attrs)

        for sponsor in sponsors:
            if sponsor != None and sponsor != "(NONE)" and \
               sponsor != "":
                if "&" in sponsor:
                    # NOTE: the inner loop deliberately rebinds
                    # `sponsor` with the split-out names
                    for sponsor in [x.strip() for x in sponsor.split("&")]:
                        b.add_sponsor("primary", sponsor)
                else:
                    b.add_sponsor("primary", sponsor)

        # Now that we have history, let's see if we can't grab some
        # votes
        bill_vote_href, = bill.xpath(".//a[contains(text(), 'Votes')]")
        bill_vote_href = bill_vote_href.attrib['href']
        #bill_vote_href = self.get_vote_url(bill_id, session)
        votes = self.parse_votes(bill_vote_href)

        # a frames-only error page means there are simply no votes
        if (votes['sanity-check'] == 'This site only supports frames '
                'compatible browsers!'):
            votes['votes'] = []
        elif votes['sanity-check'] != bill_id:
            self.warning("XXX: READ ME! Sanity check failed!")
            self.warning(" -> Scraped ID: " + votes['sanity-check'])
            self.warning(" -> 'Real' ID: " + bill_id)
            assert votes['sanity-check'] == bill_id

        for vote in votes['votes']:
            filed_votes = vote['votes']
            passage = vote['meta']
            result = vote['result']

            composite_time = "%s %s" % (passage['x-parent-date'],
                                        passage['TIME'])
            # It's now like: 04/01/2011 02:10:14 PM
            pydate = dt.datetime.strptime(composite_time,
                                          "%m/%d/%Y %I:%M:%S %p")
            hasHouse = "House" in passage['x-parent-ctty']
            hasSenate = "Senate" in passage['x-parent-ctty']

            if hasHouse and hasSenate:
                actor = "joint"
            elif hasHouse:
                actor = "lower"
            else:
                actor = "upper"

            other = (int(result['EXC']) + int(result['ABS']))
            # OK, sometimes the Other count is wrong.
            local_other = 0
            for voter in filed_votes:
                l_vote = filed_votes[voter].lower().strip()
                if l_vote != "yes" and l_vote != "no":
                    local_other = local_other + 1

            if local_other != other:
                self.warning( \
                    "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                self.warning(" -> Old: %s // New: %s" % (
                    other, local_other))
                other = local_other

            passed = (result['FINAL_ACTION'] == "PASS")
            if passage['MOTION'].strip() == "":
                continue

            if "without objection" in passage['MOTION'].lower():
                passed = True

            v = Vote(actor, pydate, passage['MOTION'],
                     passed,
                     int(result['YES']), int(result['NO']),
                     other,
                     moved=passage['MOVED'],
                     seconded=passage['SECONDED'])

            v.add_source(vote['meta']['url'])
            # v.add_source( bill_vote_href )

            # XXX: Add more stuff to kwargs, we have a ton of data
            seen = set([])
            for voter in filed_votes:
                who = voter
                if who in seen:
                    raise Exception("Seeing the double-thing. - bug #702")
                seen.add(who)
                # NOTE: rebinds the outer `vote` loop variable; safe
                # because it is reassigned at the top of the loop
                vote = filed_votes[who]
                if vote.lower() == "yes":
                    v.yes(who)
                elif vote.lower() == "no":
                    v.no(who)
                else:
                    v.other(who)
            b.add_vote(v)
        self.save_bill(b)
def scrape(self, session, chambers):
    """Scrape all Georgia legislation for *session* from the SOAP
    services (self.lservice / self.vservice), including votes,
    actions, sponsors and PDF versions, and save each bill."""
    sid = self.metadata["session_details"][session]["_guid"]
    legislation = backoff(self.lservice.GetLegislationForSession,
                          sid)["LegislationIndex"]
    for leg in legislation:
        lid = leg["Id"]
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument["StatusHistory"][0]]

        # service returns newest-first; store oldest-first
        actions = reversed([{"code": x["Code"],
                             "action": x["Description"],
                             "_guid": x["Id"],
                             "date": x["Date"]} for x in history])

        guid = instrument["Id"]

        bill_type = instrument["DocumentType"]
        chamber = {"H": "lower",
                   "S": "upper",
                   "J": "joint"}[bill_type[0]]  # XXX: This is a bit of a hack.

        bill_id = "%s %s" % (bill_type, instrument["Number"])
        if instrument["Suffix"]:
            bill_id += instrument["Suffix"]

        title = instrument["Caption"]
        description = instrument["Summary"]

        if title is None:
            continue

        bill = Bill(session, chamber, bill_id, title,
                    description=description, _guid=guid)

        if instrument["Votes"]:
            for vote_ in instrument["Votes"]:
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])

                vote = Vote(
                    {"House": "lower",
                     "Senate": "upper"}[vote_["Branch"]],
                    vote_["Date"],
                    vote_["Caption"] or "Vote on Bill",
                    (vote_["Yeas"] > vote_["Nays"]),
                    vote_["Yeas"],
                    vote_["Nays"],
                    (vote_["Excused"] + vote_["NotVoting"]),
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=chamber,
                )
                vote.add_source(self.vsource)

                # anything that isn't Yea/Nay is recorded as "other"
                methods = {"Yea": vote.yes, "Nay": vote.no}
                for vdetail in vote_["Votes"][0]:
                    whom = vdetail["Member"]
                    how = vdetail["MemberVoted"]
                    try:
                        m = methods[how]
                    except KeyError:
                        m = vote.other
                    m(whom["Name"])

                bill.add_vote(vote)

        # action code -> openstates action type(s)
        types = {
            "HI": ["other"], "SI": ["other"],
            "HH": ["other"], "SH": ["other"],
            "HPF": ["bill:introduced"], "HDSAS": ["other"],
            "SPF": ["bill:introduced"],
            "HSR": ["bill:reading:2"], "SSR": ["bill:reading:2"],
            "HFR": ["bill:reading:1"], "SFR": ["bill:reading:1"],
            "HRECM": ["bill:withdrawn", "committee:referred"],
            "SRECM": ["bill:withdrawn", "committee:referred"],
            "SW&C": ["bill:withdrawn", "committee:referred"],
            "HW&C": ["bill:withdrawn", "committee:referred"],
            "HRA": ["bill:passed"], "SRA": ["bill:passed"],
            "HPA": ["bill:passed"], "HRECO": ["other"],
            "SPA": ["bill:passed"],
            "HTABL": ["other"],  # "House Tabled" - what is this?
            "SDHAS": ["other"],
            "HCFR": ["committee:passed:favorable"],
            "SCFR": ["committee:passed:favorable"],
            "HRAR": ["committee:referred"],
            "SRAR": ["committee:referred"],
            "STR": ["bill:reading:3"], "SAHAS": ["other"],
            "SE": ["bill:passed"], "SR": ["committee:referred"],
            "HTRL": ["bill:reading:3", "bill:failed"],
            "HTR": ["bill:reading:3"],
            "S3RLT": ["bill:reading:3", "bill:failed"],
            "HASAS": ["other"], "S3RPP": ["other"],
            "STAB": ["other"], "SRECO": ["other"],
            "SAPPT": ["other"], "HCA": ["other"],
            "HNOM": ["other"], "HTT": ["other"],
            "STT": ["other"], "SRECP": ["other"],
            "SCRA": ["other"], "SNOM": ["other"],
            "S2R": ["bill:reading:2"], "H2R": ["bill:reading:2"],
            "SENG": ["bill:passed"], "HENG": ["bill:passed"],
            "HPOST": ["other"], "HCAP": ["other"],
            "SDSG": ["governor:signed"], "SSG": ["governor:received"],
            "Signed Gov": ["governor:signed"],
            "HDSG": ["governor:signed"], "HSG": ["governor:received"],
            "EFF": ["other"], "HRP": ["other"],
            "STH": ["other"], "HTS": ["other"],
        }

        # chamber -> committee names, used to tag committee actions
        ccommittees = defaultdict(list)
        committees = instrument['Committees']
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    "House": "lower",
                    "Senate": "upper",
                }[committee['Type']]].append(committee['Name'])

        for action in actions:
            chamber = {
                "H": "lower",
                "S": "upper",
                "E": "other",  # Effective Date
            }[action['code'][0]]

            try:
                _types = types[action['code']]
            except KeyError:
                # unmapped codes are logged so the table can grow
                self.debug(action)
                _types = ["other"]

            committees = []
            if any(('committee' in x for x in _types)):
                committees = [str(x) for x in ccommittees.get(chamber, [])]

            bill.add_action(chamber, action['action'], action['date'],
                            _types,
                            committees=committees,
                            _code=action['code'],
                            _code_id=action['_guid'])

        sponsors = []
        if instrument['Authors']:
            sponsors = instrument['Authors']['Sponsorship']
        if 'Sponsors' in instrument and instrument['Sponsors']:
            sponsors += instrument['Sponsors']['Sponsorship']

        sponsors = [
            (x['Type'], self.get_member(x['MemberId'])) for x in sponsors
        ]

        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor['Name']))
            # NOTE(review): 'seconday' looks like a typo for
            # 'secondary' but it is a persisted value -- confirm what
            # downstream consumers expect before changing it
            bill.add_sponsor(
                'primary' if 'Author' in typ else 'seconday',
                name
            )

        for version in instrument['Versions']['DocumentDescription']:
            name, url, doc_id, version_id = [
                version[x] for x in [
                    'Description',
                    'Url',
                    'Id',
                    'Version'
                ]
            ]
            bill.add_version(
                name,
                url,
                mimetype='application/pdf',
                _internal_document_id=doc_id,
                _version_id=version_id
            )

        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(SOURCE_URL.format(**{"session": session,
                                             "bid": guid}))

        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Alaska bills for one chamber and session.

    Walks the BASIS bill-range listing for the session's calendar
    years, then for each bill scrapes sponsors, actions (with votes),
    subjects and version links, and saves the Bill.

    Raises NoDataForPeriod when *session* is not in the metadata.
    """
    for term in self.metadata['terms']:
        if term['sessions'][0] == session:
            year = str(term['start_year'])
            year2 = str(term['end_year'])
            break
    else:
        raise NoDataForPeriod(session)

    if chamber == 'upper':
        bill_abbr = 'SB|SCR|SJR'
    elif chamber == 'lower':
        bill_abbr = 'HB|HCR|HJR'

    # Full calendar year
    date1 = '0101' + year[2:]
    date2 = '1231' + year2[2:]

    # Get bill list
    bill_list_url = 'http://www.legis.state.ak.us/'\
        'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
    self.log("Getting bill list for %s %s (this may take a long time)." %
             (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links
    re_str = r"bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(' ', '')
        bill_name = link.parent.parent.findNext('td').find(
            'font').contents[0].strip()

        # the regex above guarantees one of these prefixes matches
        if bill_id.startswith('HB') or bill_id.startswith('SB'):
            btype = ['bill']
        elif bill_id.startswith('SJR') or bill_id.startswith('HJR'):
            btype = ['joint resolution']
        elif bill_id.startswith('SR') or bill_id.startswith('HR'):
            btype = ['resolution']
        elif bill_id.startswith('SCR') or bill_id.startswith('HCR'):
            btype = ['concurrent resolution']

        if re.match(r'CONST\.? AM:', bill_name):
            btype.append('constitutional amendment')

        bill = Bill(session, chamber, bill_id, bill_name, type=btype)

        # Get the bill info page and strip malformed t
        info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Get sponsors
        spons_str = info_page.find(
            text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(
            r' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            # first named legislator is primary, the rest cosponsors
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()
            if sponsor:
                bill.add_sponsor('primary', sponsors[0])
            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsor('cosponsor', sponsor)
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()
            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                                   '', spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")
            if spons_str:
                bill.add_sponsor('committee', spons_str)

        # Get actions
        self._current_comm = None
        act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
        for row in act_rows:
            cols = row.findAll('td')
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            # rows like "PASSED Y25 N10" carry a roll call
            if re.match(r"\w+ Y(\d+) N(\d+)", action):
                try:
                    vote = self.parse_vote(bill, action,
                                           act_chamber, act_date,
                                           cols[1].a['href'])
                    bill.add_vote(vote)
                except Exception:
                    # fix: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit; vote parsing is
                    # still best-effort
                    self.log("Failed parsing vote")

            action, atype = self.clean_action(action)

            match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = dt.datetime.strptime(match.group(1), '%m/%d/%y')

            bill.add_action(act_chamber, action, act_date, type=atype)

        # Get subjects
        bill['subjects'] = []
        subject_link_re = re.compile(r'.*subject=\w+$')
        for subject_link in info_page.findAll('a', href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill['subjects'].append(subject)

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile('^get_bill_text?')
        for text_link in text_list.findAll('a', href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()

            text_url = "http://www.legis.state.ak.us/basis/%s" % (
                text_link['href'])

            bill.add_version(text_name, text_url)

        self.save_bill(bill)
def scrape_bill_sheet(self, session, chamber):
    """
    Scrape the bill sheet (the page full of bills and other small bits of data)
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # column positions of the fields we read from each sheet row
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    sheet_html = self.urlopen(sheet_url)
    sheet_page = lxml.html.fromstring(sheet_html)

    bills = sheet_page.xpath('//table/tr')

    for bill in bills:
        bill_id = self.read_td(bill[index["id"]][0])

        if bill_id == None:
            # Every other entry is null for some reason
            continue

        dot_loc = bill_id.find('.')
        if dot_loc != -1:
            # budget bills are missing the .pdf, don't truncate
            bill_id = bill_id[:dot_loc]

        title_and_sponsor = bill[index["title_sponsor"]][0]
        bill_title = title_and_sponsor.text
        bill_title_and_sponsor = title_and_sponsor.text_content()
        if bill_title is None:
            continue  # Odd ...

        # sponsors are whatever text follows the title, "--"-separated
        sponsors = bill_title_and_sponsor.replace(bill_title, "").\
            replace(" & ...", "").split("--")

        # bill-id prefix -> openstates bill type
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        bill_type = None
        for cat in cats:
            if bill_id[:len(cat)] == cat:
                bill_type = cats[cat]

        b = Bill(session, bill_chamber, bill_id, bill_title,
                 type=bill_type)
        b.add_source(sheet_url)

        versions_url = \
            bill[index["version"]].xpath('font/a')[0].attrib["href"]
        versions_url = CO_URL_BASE + versions_url
        versions = self.parse_versions(versions_url)

        for version in versions:
            b.add_version(version['name'], version['link'],
                          mimetype=version['mimetype'])

        bill_history_href = CO_URL_BASE + \
            bill[index["history"]][0][0].attrib['href']
        # ^^^^^^^ We assume this is a full path to the target.
        # might want to consider some better rel-path support
        # XXX: Look at this ^

        history = self.parse_history(bill_history_href)
        b.add_source(bill_history_href)

        chamber_map = dict(Senate='upper', House='lower')
        for action, date in history:
            action_actor = chamber_map.get(chamber, chamber)
            attrs = dict(actor=action_actor, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            b.add_action(**attrs)

        for sponsor in sponsors:
            if sponsor != None and sponsor != "(NONE)" and \
               sponsor != "":
                b.add_sponsor("primary", sponsor)

        # Now that we have history, let's see if we can't grab some
        # votes
        bill_vote_href = self.get_vote_url(bill_id, session)
        votes = self.parse_votes(bill_vote_href)

        if votes['sanity-check'] != bill_id:
            self.warning("XXX: READ ME! Sanity check failed!")
            self.warning(" -> Scraped ID: " + votes['sanity-check'])
            self.warning(" -> 'Real' ID: " + bill_id)
            assert votes['sanity-check'] == bill_id

        for vote in votes['votes']:
            filed_votes = vote['votes']
            passage = vote['meta']
            result = vote['result']

            composite_time = "%s %s" % (
                passage['x-parent-date'],
                passage['TIME']
            )
            # It's now like: 04/01/2011 02:10:14 PM
            pydate = dt.datetime.strptime(composite_time,
                                          "%m/%d/%Y %I:%M:%S %p")
            hasHouse = "House" in passage['x-parent-ctty']
            hasSenate = "Senate" in passage['x-parent-ctty']

            if hasHouse and hasSenate:
                actor = "joint"
            elif hasHouse:
                actor = "lower"
            else:
                actor = "upper"

            other = (int(result['EXC']) + int(result['ABS']))
            # OK, sometimes the Other count is wrong.
            local_other = 0
            for voter in filed_votes:
                l_vote = filed_votes[voter].lower().strip()
                if l_vote != "yes" and l_vote != "no":
                    local_other = local_other + 1

            if local_other != other:
                self.warning( \
                    "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                self.warning(" -> Old: %s // New: %s" % (
                    other,
                    local_other
                ))
                other = local_other

            v = Vote(actor, pydate, passage['MOTION'],
                     (result['FINAL_ACTION'] == "PASS"),
                     int(result['YES']), int(result['NO']),
                     other,
                     moved=passage['MOVED'],
                     seconded=passage['SECONDED'])

            v.add_source(vote['meta']['url'])
            # v.add_source( bill_vote_href )

            # XXX: Add more stuff to kwargs, we have a ton of data
            for voter in filed_votes:
                who = voter
                # NOTE: rebinds the outer `vote` loop variable; safe
                # because it is reassigned at the top of the loop
                vote = filed_votes[who]
                if vote.lower() == "yes":
                    v.yes(who)
                elif vote.lower() == "no":
                    v.no(who)
                else:
                    v.other(who)
            b.add_vote(v)
        self.save_bill(b)
def scrape_bills(self, chamber_to_scrape, session):
    """Scrape all Mississippi measures for *session*, keeping only the
    ones belonging to *chamber_to_scrape*, and save each Bill with its
    sponsors, versions, actions, votes and subjects."""
    url = 'http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml' % session

    bill_dir_page = self.get(url)
    root = lxml.etree.fromstring(bill_dir_page.content)
    for mr in root.xpath('//LASTACTION/MSRGROUP'):
        bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        # second character of the id encodes the measure type
        bill_type = {'B':'bill', 'C': 'concurrent resolution',
                     'R': 'resolution', 'N': 'nomination'}[bill_id[1]]

        # just skip past bills that are of the wrong chamber
        if chamber != chamber_to_scrape:
            continue

        link = mr.xpath('string(ACTIONLINK)').replace("..", "")
        main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
        # NOTE(review): main_doc_url is computed but never used below
        main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
        bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf/%s' % (session, link)
        details_page = self.get(bill_details_url)
        page = details_page.content.replace(chr(11), "")
        # Some pages have the (invalid) byte 11 sitting around. Just drop
        # them out. Might as well.

        details_root = lxml.etree.fromstring(page)
        title = details_root.xpath('string(//SHORTTITLE)')
        longtitle = details_root.xpath('string(//LONGTITLE)')

        bill = Bill(session, chamber, bill_id, title,
                    type=bill_type, summary=longtitle)

        #sponsors
        main_sponsor = details_root.xpath('string(//P_NAME)').split()
        if main_sponsor:
            main_sponsor = main_sponsor[0]
            main_sponsor_link = details_root.xpath('string(//P_LINK)').replace(" ", "_")
            main_sponsor_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, main_sponsor_link)
            # NOTE: `type` shadows the builtin here (pre-existing)
            type = "primary"
            bill.add_sponsor(type, main_sponsor,
                             main_sponsor_url = main_sponsor_url)
        for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
            leg = author.xpath('string(CO_NAME)').replace(" ", "_")
            if leg:
                leg_url = 'http://billstatus.ls.state.ms.us/%s/pdf/House_authors/%s.xml' % (session, leg)
                type = "cosponsor"
                bill.add_sponsor(type, leg, leg_url=leg_url)

        #Versions
        curr_version = details_root.xpath('string(//CURRENT_OTHER)').replace("../../../../", "")
        if curr_version != "":
            curr_version_url = "http://billstatus.ls.state.ms.us/" \
                + curr_version
            bill.add_version("Current version", curr_version_url,
                             on_duplicate='use_new',
                             mimetype='text/html')

        intro_version = details_root.xpath('string(//INTRO_OTHER)').replace("../../../../", "")
        if intro_version != "":
            intro_version_url = "http://billstatus.ls.state.ms.us/"\
                + intro_version
            bill.add_version("As Introduced", intro_version_url,
                             on_duplicate='use_new',
                             mimetype='text/html')

        comm_version = details_root.xpath('string(//CMTESUB_OTHER)').replace("../../../../", "")
        if comm_version.find("documents") != -1:
            comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
            bill.add_version("Committee Substitute", comm_version_url,
                             on_duplicate='use_new',
                             mimetype='text/html')

        passed_version = details_root.xpath('string(//PASSED_OTHER)').replace("../../../../", "")
        if passed_version.find("documents") != -1:
            passed_version_url = "http://billstatus.ls.state.ms.us/" + passed_version
            title = "As Passed the " + chamber
            bill.add_version(title, passed_version_url,
                             on_duplicate='use_new',
                             mimetype='text/html')

        asg_version = details_root.xpath('string(//ASG_OTHER)').replace("../../../../", "")
        if asg_version.find("documents") != -1:
            asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
            bill.add_version("Approved by the Governor", asg_version_url,
                             on_duplicate='use_new',
                             mimetype='text/html')

        # avoid duplicate votes
        seen_votes = set()

        #Actions
        for action in details_root.xpath('//HISTORY/ACTION'):
            action_num = action.xpath('string(ACT_NUMBER)').strip()
            action_num = int(action_num)
            act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
            action_desc = action.xpath('string(ACT_DESC)')
            # action text starts with "mm/dd " -- year comes from session
            date, action_desc = action_desc.split(" ", 1)
            date = date + "/" + session[0:4]
            date = datetime.strptime(date, "%m/%d/%Y")

            if action_desc.startswith("(H)"):
                actor = "lower"
                action = action_desc[4:]
            elif action_desc.startswith("(S)"):
                actor = "upper"
                action = action_desc[4:]
            else:
                actor = "executive"
                action = action_desc

            if action.find("Veto") != -1:
                version_path = details_root.xpath("string(//VETO_OTHER)")
                version_path = version_path.replace("../../../../", "")
                version_url = "http://billstatus.ls.state.ms.us/" + version_path
                bill.add_document("Veto", version_url)

            atype = 'other'
            for prefix, prefix_type in self._action_types:
                if action.startswith(prefix):
                    atype = prefix_type
                    break

            bill.add_action(actor, action, date,
                            type=atype,
                            action_num=action_num)

            # use committee names as scraped subjects
            subjects = details_root.xpath('//H_NAME/text()')
            subjects += details_root.xpath('//S_NAME/text()')
            bill['subjects'] = subjects

            if act_vote:
                vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                if vote_url not in seen_votes:
                    seen_votes.add(vote_url)
                    vote = self.scrape_votes(vote_url, action,
                                             date, actor)
                    vote.add_source(vote_url)
                    bill.add_vote(vote)

        bill.add_source(bill_details_url)
        self.save_bill(bill)
def scrape_bill(self, link, chamber, session):
    """Scrape a single Delaware bill detail page into a billy Bill.

    Returns the Bill (also used when recursing into a substituted
    bill), or None when the page 404s, is a nominee page, or the bill
    id cannot be parsed.
    """
    legislation_types = {
        'House Bill': 'HB',
        'House Concurrent Resolution': 'HCR',
        'House Joint Resolution': 'HJR',
        'House Resolution': 'HR',
        'Senate Bill': 'SB',
        'Senate Concurrent Resolution': 'SCR',
        'Senate Joint Resolution': 'SJR',
        'Senate Resolution': 'SR',
    }
    text_base_url = "http://legis.delaware.gov/LIS/lis{session}.nsf/vwLegislation/{bill_id}/$file/legis.html?open"

    try:
        page = self.lxmlize(link, True)
    except requests.exceptions.HTTPError:
        self.logger.warning('404. Apparently the bill hasn\'t been posted')
        return

    # Nomination pages share this URL space but are not bills.
    nominee = self.get_node(page, './/div[@id="page_header"]/text()')
    if nominee is not None and nominee.strip().lower(
            ) == "nominee information":
        self.logger.info("Nominee, skipping")
        return

    bill_id = self.get_node(
        page,
        './/div[@align="center" or @style="text-align:center"]')
    try:
        bill_id = bill_id.text_content().strip()
    # FIX: get_node returning None raises AttributeError, not
    # IndexError; catch both so a malformed page is skipped, not fatal.
    except (IndexError, AttributeError):
        self.logger.warning("Can't find bill number, skipping")
        return

    # some bill_ids include relevant amendments
    # in the form "SB 10 w/SA1", so we fix it here
    bill_id = bill_id.split("w/")[0]
    bill_id = bill_id.split("(")[0]

    # Normalize "House Bill 10" -> "HB 10".
    leg_type = None
    for long_name, short_name in legislation_types.items():
        if long_name in bill_id:
            leg_type = short_name
            bill_num = bill_id.replace(long_name, "").strip()
            break
    if leg_type:
        bill_id = leg_type + " " + bill_num
    elif "for" in bill_id:
        # e.g. "Substitute 1 for HB 10"
        bill_id = bill_id.split("for")[1]
    else:
        self.logger.warning("Unknown bill type for {}".format(bill_id))
        return

    # NOTE(review): the replaced character is presumably a non-breaking
    # space from the page text -- confirm against a live page.
    bill_id = bill_id.replace(' ', "")
    bill_id = bill_id.strip()

    # each row is in its own table
    # there are no classes/ids or anything, so we're going to loop
    # through the individual tables and look for keywords
    # in the first td to tell us what we're looking at
    tables = self.get_nodes(page, './/div[@id="page_content"]/table')

    bill_title = None
    primary_sponsors = []
    # FIX: initialize here so a page with no "Primary Sponsor" row
    # cannot raise NameError in the sponsor loop below.
    addl_sponsors = []
    cosponsors = []
    bill_url = None
    bill_documents = {}
    action_list = []
    vote_documents = {}
    sub_link = None

    if tables is None or not tables:
        self.logger.warning('First xpath didn\'t work.')
        tables = self.get_nodes(page,
                                './/table[@style="width:837.0px"]/tr')

    for table in tables:
        tds = table.xpath('.//td')
        if len(tds) == 0:
            # some kind of empty table for formatting reasons
            continue
        title_text = tds[0].text_content().strip().lower()

        if title_text.startswith('primary sponsor'):
            pri_sponsor_text = tds[1].text_content()
            primary_sponsors = self.separate_names(pri_sponsor_text)
            # sometimes additional sponsors are in a 3rd td
            # other times the 3rd td contains a blank image
            add_spons_text = tds[2].text_content().strip()
            if add_spons_text:
                add_spons_text = add_spons_text.replace(
                    "Additional Sponsor(s):", "")
                if not "on behalf of all representatives" in \
                        add_spons_text.lower():
                    addl_sponsors = self.separate_names(add_spons_text)

        elif title_text.startswith('co-sponsor'):
            cosponsor_text = tds[1].text_content()
            if "none..." in cosponsor_text.lower():
                cosponsors = []
                continue
            cosponsors = self.separate_names(cosponsor_text)

        elif title_text.startswith('long title'):
            bill_title = tds[1].text_content().strip()

        elif title_text.startswith('amendment'):
            amendments = tds[1].xpath('.//a')
            for a in amendments:
                amm = a.text
                # FIX: was "Amendment".format(amm.strip()) -- no
                # placeholder, so every amendment collapsed onto the
                # same bill_documents key and all but one were lost.
                amm_text = "Amendment {}".format(amm.strip())
                amm_slg = "+".join(amm.split())
                amm_link = text_base_url.format(session=session,
                                                bill_id=amm_slg)
                bill_documents[amm_text] = amm_link
                amm_page = self.lxmlize(a.attrib["href"])
                for tr in amm_page.xpath('//tr'):
                    tds = tr.xpath("./td")
                    if len(tds) > 1:
                        if "voting" in tds[0].text_content().lower():
                            self.find_vote(tds, vote_documents,
                                           "Amendment: ")

        elif title_text.startswith('engrossed version'):
            if tds[1].text_content().strip():
                engrossment_base = "http://legis.delaware.gov/LIS/lis{session}.nsf/EngrossmentsforLookup/{bill_id}/$file/Engross.html?open"
                engrossment_link = engrossment_base.format(
                    session=session, bill_id="+".join(bill_id.split()))
                # FIX: was "if bill_url not in ..." -- bill_url is not
                # the engrossment link (and may still be None here), so
                # the dedupe check never tested the right value.
                if engrossment_link not in bill_documents.values():
                    bill_documents["Engrossed Version"] = engrossment_link

        elif title_text.startswith('substituted'):
            content = tds[1].text_content().strip()
            if ("Substitute" in content and
                    not "Original" in content):
                sub_link = tds[1].xpath(".//a/@href")[0]

        elif ("full text" in title_text
                and ("(" not in title_text or "html" in title_text)):
            if tds[1].text_content().strip():
                # it is totally unclear which version of the bill is
                # referred to here so I'm just calling it "bill text"
                bill_url = text_base_url.format(
                    session=session, bill_id=bill_id.replace(" ", "+"))
                if bill_url not in bill_documents.values():
                    bill_documents["Bill Text"] = bill_url

        elif title_text.startswith('fiscal notes'):
            # skipping fiscal notes for now, they are really ugly
            # but leaving in as a placeholder so we can remember to
            # do this someday, if we feel like it
            pass

        elif title_text.startswith('committee reports'):
            # the committee reports let a legislator comment on a bill.
            # They can comment as "favorable", "unfavorable" or "on its
            # merits" but these are NOT votes (per conversation w
            # secretary of the DE senate 3/16/15). The bill is
            # considered if the majority sign it, which will appear in
            # the bill's action history as being reported out of
            # committee
            pass

        elif title_text.startswith('voting'):
            self.find_vote(tds, vote_documents)

        elif title_text.startswith('actions history'):
            action_list = tds[1].text_content().split("\n")

    sub_versions = []
    use_sub = False
    if sub_link:
        # A substitute supersedes this bill: scrape it instead and
        # record this bill_id as an alternate title on it.
        bill = self.scrape_bill(sub_link, chamber, session)
        if bill:
            sub_versions = [v["url"] for v in bill["versions"]]
            bill.add_title(bill_id)
            use_sub = True

    if not use_sub:
        bill = Bill(session, chamber, bill_id, bill_title)

        for s in primary_sponsors:
            bill.add_sponsor("primary", s)

        for s in addl_sponsors:
            # it is not totally clear whether "additional sponsors"
            # are co or primary but primary is my best guess
            # based on the bill text, bc they're on the first
            # line with the primary sponsor
            bill.add_sponsor("primary", s)

        for s in cosponsors:
            bill.add_sponsor("cosponsor", s)

    for name, doc_link in bill_documents.items():
        if "Engrossment" in name or "Bill Text" in name:
            # these are versions; skip any already carried over from a
            # substituted bill
            if doc_link not in sub_versions:
                bill.add_version(name, doc_link, mimetype="text/html")
        else:
            bill.add_document(name, doc_link, mimetype="text/html")

    for a in action_list:
        if a.strip():
            date, action = a.split('-', 1)
            try:
                date = datetime.strptime(date.strip(), '%b %d, %Y')
            except ValueError:
                # XXX: ugh. Some dates use the full month name.
                date = datetime.strptime(date.strip(), '%B %d, %Y')
            action = action.strip()
            actor = actions.get_actor(action, bill['chamber'])
            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(**self.categorizer.categorize(action))
            attrs["action"] = " ".join(attrs["action"].split())
            bill.add_action(**attrs)

    for name, doc in vote_documents.items():
        vote_chamber = "lower" if "house" in name.lower() else "upper"
        try:
            self.head(doc)
        except requests.exceptions.HTTPError:
            self.logger.warning("could not access vote document")
            continue
        vote_page = self.lxmlize(doc)
        vote_info = vote_page.xpath(".//div[@id='page_content']/p")[-1]
        yes_votes = []
        no_votes = []
        other_votes = []
        lines = vote_info.text_content().split("\n")
        # NOTE(review): voice_vote/passed/date stay unbound if the page
        # lacks the "Date"/"Vote Type" lines -- confirm they always
        # appear on DE vote pages.
        for line in lines:
            if line.strip().startswith("Date"):
                date_str = " ".join(line.split()[1:4])
                date = datetime.strptime(date_str, "%m/%d/%Y %I:%M %p")
                passage_status = line.strip().split()[-1]
                # we've never seen a vote with anything but "passed"
                # so throw an error otherwise so we can figure it out
                passed_statuses = ["Passed"]
                failed_statuses = ["Defeated", "Rescinded"]
                if passage_status not in passed_statuses + failed_statuses:
                    raise AssertionError(
                        "Unknown passage state {}".format(passage_status))
                passed = passage_status in passed_statuses
            if line.strip().startswith("Vote Type"):
                if "voice" in line.lower():
                    voice_vote = True
                else:
                    voice_vote = False
                    yes_count = int(re.findall("Yes: (\d+)", line)[0])
                    no_count = int(re.findall("No: (\d+)", line)[0])
                    other_count = int(
                        re.findall("Not Voting: (\d+)", line)[0])
                    other_count += int(
                        re.findall("Absent: (\d+)", line)[0])
                    # roll-call roster: name tds and vote tds alternate
                    vote_tds = vote_page.xpath(".//table//td")
                    person_seen = False
                    for td in vote_tds:
                        if person_seen:
                            person_vote = td.text_content().strip()
                            if person_vote == "Y":
                                yes_votes.append(person)
                            elif person_vote == "N":
                                no_votes.append(person)
                            elif person_vote in ["NV", "A", "X", "C"]:
                                other_votes.append(person)
                            else:
                                raise AssertionError(
                                    "Unknown vote '{}'".format(
                                        person_vote))
                            person_seen = False
                        else:
                            person = td.text_content().strip()
                            if person:
                                person_seen = True

        if voice_vote:
            vote = Vote(vote_chamber, date, "passage", passed, 0, 0, 0)
        else:
            vote = Vote(vote_chamber, date, "passage", passed,
                        yes_count, no_count, other_count,
                        yes_votes=[], no_votes=[], other_votes=[])
            vote["yes_votes"] = yes_votes
            vote["no_votes"] = no_votes
            vote["other_votes"] = other_votes

        if (passed and vote["yes_count"] <= vote["no_count"]
                and not voice_vote):
            raise AssertionError("Vote passed with more N than Y votes?")
        if not passed and vote["yes_count"] > vote["no_count"]:
            self.logger.warning("Vote did not pass but had a majority \
probably worth checking")

        if "Amendment" in name:
            vote["type"] = "amendment"
        else:
            vote["type"] = "passage"
        vote.add_source(doc)
        bill.add_vote(vote)

    bill.add_source(link)
    return bill
def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
    """Scrape every CA bill of one measure type (e.g. 'AB') for a session.

    Reads bills from the CA SQL mirror (self.session is a query session
    over CABill), builds a billy Bill with versions, sponsors, actions
    and votes, and saves each one.
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            # e.g. '20092010 Special Session 1'
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_session, chamber, bill_id, '')

        # Construct session for web query, going from '20092010' to '0910'
        source_session = session[2:4] + session[6:8]

        # Turn 'AB 10' into 'ab_10'
        source_num = "%s_%s" % (bill.measure_type.lower(),
                                bill.measure_num)

        # Construct a fake source url
        source_url = ("http://www.leginfo.ca.gov/cgi-bin/postquery?"
                      "bill_number=%s&sess=%s" % (source_num,
                                                  source_session))
        fsbill.add_source(source_url)

        scraped_versions = self.scrape_site_versions(source_url)

        title = ''
        short_title = ''
        type = ['bill']  # NOTE: shadows the builtin; kept as-is
        subject = ''
        all_titles = set()
        i = 0
        for version in bill.versions:
            if not version.bill_xml:
                continue
            title = clean_title(version.title)
            if title:
                all_titles.add(title)
            short_title = clean_title(version.short_title)
            type = [bill_type]

            if version.appropriation == 'Yes':
                type.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type.append('fiscal committee')
            if version.local_program == 'Yes':
                type.append('local program')
            if version.urgency == 'Yes':
                type.append('urgency')
            if version.taxlevy == 'Yes':
                type.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

            date = version.bill_version_action_date.date()

            # Pair this DB version with the site-scraped URL by date;
            # `i` only advances when the dates match.
            url = ''
            try:
                scraped_version = scraped_versions[i]
                if scraped_version[0] == date:
                    url = scraped_version[1]
                    i += 1
            except IndexError:
                pass

            fsbill.add_version(
                version.bill_version_id, url,
                date=date, title=title,
                short_title=short_title, subject=[subject],
                type=type)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['short_title'] = short_title
        fsbill['type'] = type
        fsbill['subjects'] = [subject]

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        fsbill['alternate_titles'] = list(all_titles)

        # `version` is the last xml-bearing version from the loop above.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        introduced = False
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)

            type = []
            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            # Categorize the action text into billy action types.
            if act_str.startswith('Introduced'):
                introduced = True
                type.append('bill:introduced')
            if 'Read first time.' in act_str:
                if not introduced:
                    type.append('bill:introduced')
                    introduced = True
                type.append('bill:reading:1')
            if 'To Com' in act_str or 'referred to' in act_str.lower():
                type.append('committee:referred')
            if 'Read third time. Passed.' in act_str:
                type.append('bill:passed')
            if 'Approved by Governor' in act_str:
                type.append('governor:signed')
            if 'Item veto' in act_str:
                type.append('governor:vetoed:line-item')
            if 'Vetoed by Governor' in act_str:
                type.append('governor:vetoed')
            if 'To Governor' in act_str:
                type.append('governor:received')
            if 'Read second time' in act_str:
                type.append('bill:reading:2')
            if not type:
                type = ['other']

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              type=type)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            # Location strings look like "Asm Appropriations" / "Senate
            # Floor": first word picks the chamber, rest the location.
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$', '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion, result,
                          int(vote.ayes), int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold, type=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            # The abstain count field in CA's database includes
            # vacancies, which we aren't interested in.
            fsvote['other_count'] = len(fsvote['other_votes'])
            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape every CA bill of one measure type for a session.

    Newer variant: reads from the CA SQL mirror, extracts the digest
    ("summary") from the latest version XML, categorizes actions via
    self.categorizer plus committee-abbreviation expansion, and dedupes
    vote rosters.

    NOTE: committee_abbr_regex is evaluated once at definition time and
    shared across calls (default-argument idiom for a precompiled regex).
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_session, chamber, bill_id, '')

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version(bill_id, source_url, 'text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        # NOTE(review): `summary` stays unbound when bill.versions is
        # empty -- confirm every bill row has at least one version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')
            if version.fiscal_committee == 'Yes':
                type_.append('fiscal committee')
            if version.local_program == 'Yes':
                type_.append('local program')
            if version.urgency == 'Yes':
                type_.append('urgency')
            if version.taxlevy == 'Yes':
                type_.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['summary'] = summary
        fsbill['type'] = type_
        fsbill['subjects'] = filter(None, [subject])
        fsbill['impact_clause'] = impact_clause

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        fsbill['alternate_titles'] = list(all_titles)

        # `version` is the last xml-bearing version from the loop above.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(SPONSOR_TYPES[author.contribution],
                                   author.name,
                                   official_type=author.contribution)

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'other'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees,
            # if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if 'Com. on' in action.action and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    # Expand the abbreviation to the full committee name.
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)
                    else:
                        committees.append(name)

                committees = filter(None, committees)
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)

            # Collapse the actor string down to upper/lower/joint/other.
            changed = False
            for string in ['upper', 'lower', 'joint']:
                if actor.startswith(string):
                    actor = string
                    changed = True
                    break
            if not changed:
                actor = 'other'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = '(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              **kwargs)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$', '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # NOTE(review): the sibling scraper passes type=vtype here;
            # confirm Vote really accepts a `type_` keyword.
            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion, result,
                          int(vote.ayes), int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold, type_=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            for s in ('yes', 'no', 'other'):
                # Kill dupe votes.
                key = s + '_votes'
                fsvote[key] = list(set(fsvote[key]))

            # In a small percentage of bills, the integer vote counts
            # are inaccurate, so let's ignore them.
            for k in ('yes', 'no', 'other'):
                fsvote[k + '_count'] = len(fsvote[k + '_votes'])

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape a single Puerto Rico bill page, including vote tallies
    embedded in action text of the form "Motion, YY-NN-AA-XX".

    Raises NoSuchBill when the page has no title row.
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.get(url).text
    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return
    doc = lxml.html.fromstring(html)
    # search for Titulo, accent over i messes up lxml, so use 'tulo'
    title = doc.xpath(
        u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()
    bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsor('primary', aname)
    co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        # index [1]: entry [0] is the label text, names follow it
        for co_author in co_authors[1].split(','):
            bill.add_sponsor('cosponsor',
                             self.clean_name(co_author).strip());
    action_table = doc.xpath('//table')[-1]
    for row in action_table[1:]:
        tds = row.xpath('td')
        # ignore row missing date
        if len(tds) != 2:
            continue
        if tds[0].text_content():
            # NOTE(review): if the first dated cell is empty, `date` is
            # unbound when used below -- confirm rows always start with
            # a dated entry.
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()
        # parse the text to see if it's a new version or a unrelated
        # document; if has a hyphen let's assume it's a vote document
        # get url of action
        action_url = tds[1].xpath('a/@href')
        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)

        # Some lower-house roll calls could be parsed, but finnicky
        # Most roll lists are just images embedded within a document,
        # and offer no alt text to scrape
        # Instead, just scrape the vote counts
        vote_info = re.search(r'(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$', action)
        if vote_info and re.search(r'\d{1,2}', action):
            vote_name = vote_info.group(1)

            if u"Votación Final" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^\w+ por (.*?) en (.*)$', vote_name).groups()
                if "Senado" in vote_chamber:
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'
            elif "Cuerpo de Origen" in vote_name:
                vote_name = re.search(
                    r'(?u)^Cuerpo de Origen (.*)$', vote_name).group(1)
                vote_chamber = chamber
            elif u"informe de Comisión de Conferencia" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r'(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$',
                    vote_name).groups()
                if vote_chamber == "Senado":
                    vote_chamber = 'upper'
                else:
                    vote_chamber = 'lower'
            elif u"Se reconsideró" in vote_name:
                # reconsideration: reuse the chamber of the latest vote
                if bill['votes']:
                    vote_chamber = bill['votes'][-1]['chamber']
                else:
                    vote_chamber = chamber
            else:
                raise AssertionError(
                    u"Unknown vote text found: {}".format(vote_name))

            vote_name = vote_name.title()

            yes = int(vote_info.group(2))
            no = int(vote_info.group(3))
            other = 0
            if vote_info.group(4).strip():
                other += int(vote_info.group(4))
            if vote_info.group(5).strip():
                other += int(vote_info.group(5))

            vote = Vote(
                chamber=vote_chamber,
                date=date,
                motion=vote_name,
                passed=(yes > no),
                yes_count=yes,
                no_count=no,
                other_count=other
            )
            vote.add_source(url)
            bill.add_vote(vote)
    bill.add_source(url)
    self.save_bill(bill)
def scrape_bill(self, chamber, session, url):
    """Scrape one Florida bill page: actions are parsed out of the
    fixed-width 'billhistory' <pre> block; sponsors, versions and the
    per-chamber vote pages are pulled from links on the page.
    """
    url = url + "&Year=%s" % session
    with self.urlopen(url) as page:
        # NOTE(review): the first replace target looks like a
        # non-breaking space lost in transcription -- confirm against
        # the original source.
        page = page.replace(' ', ' ').replace('<br>', '\n')
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        title = page.xpath('//h3')[0].text.strip()
        title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

        bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
        bill_id = bill_id.split()[0].strip()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(url)

        hist = page.xpath("string(//pre[@class='billhistory'])")

        # date, chamber column, then the action text plus any
        # 16-space-indented continuation lines
        act_re = re.compile(r'^\s+(\d\d/\d\d/\d\d) (SENATE|HOUSE|\s)'
                            r'([^\n]+\n?(\s{16,16}.*\n){0,})',
                            re.MULTILINE)

        # Actions
        for match in act_re.finditer(hist):
            action = match.group(3).replace('\n', ' ')
            action = re.sub(r'\s+', ' ', action).strip()

            actor = match.group(2)
            if actor == 'SENATE':
                actor = 'upper'
            elif actor == 'HOUSE':
                actor = 'lower'
            else:
                # blank chamber column means executive action
                actor = 'executive'

            date = match.group(1)
            date = datetime.datetime.strptime(date, "%m/%d/%y")

            # split on journal citations like ' -HJ 123;' so each
            # sub-action is recorded separately
            for act_text in re.split(' -[HS]J \d+;? ?', action):
                act_text = act_text.strip()
                if not act_text:
                    continue

                types = []
                act_lower = act_text.lower()
                if act_lower.startswith('introduced'):
                    types.append('bill:introduced')
                if 'referred to' in act_lower:
                    types.append('committee:referred')
                if 'died in committee' in act_lower:
                    types.append('committee:failed')
                if 'favorable by' in act_lower:
                    types.append('committee:passed:favorable')
                if 'amendment(s) adopted' in act_lower:
                    types.append('amendment:passed')

                bill.add_action(actor, act_text, date, type=types)

        # Sponsors
        primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                    hist).group(1).strip('; ')
        bill.add_sponsor('primary', primary_sponsor)

        cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                               '([\w .]+(;[\w .\n]+){0,})',
                               re.MULTILINE)
        match = cospon_re.search(hist)
        if match:
            for cosponsor in match.group(2).split(';'):
                cosponsor = cosponsor.replace('\n', '').strip()
                bill.add_sponsor('cosponsor', cosponsor)

        # Versions
        for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
            version = link.xpath('string(../../td[1])').strip()
            bill.add_version(version, link.attrib['href'])

        # House Votes
        for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
            bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

        # Senate Votes
        for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
            bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

        self.save_bill(bill)
def scrape_bill_sheet(self, session, chamber):
    """
    Scrape the bill sheet (the page full of bills and other
    small bits of data), then follow per-bill links for versions,
    history and votes, saving one billy Bill per row.
    """
    sheet_url = self.get_bill_folder(session, chamber)

    bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

    # column positions on the sheet
    index = {
        "id": 0,
        "title_sponsor": 1,
        "version": 2,
        "history": 3,
        "votes": 7
    }

    with self.urlopen(sheet_url) as sheet_html:
        sheet_page = lxml.html.fromstring(sheet_html)

        bills = sheet_page.xpath('//table/tr')

        for bill in bills:
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id == None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_loc]

            title_and_sponsor = bill[index["title_sponsor"]][0]

            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            # prefix -> billy bill type
            cats = {
                "SB": "bill",
                "HB": "bill",
                "HR": "resolution",
                "SR": "resolution",
                "SCR": "concurrent resolution",
                "HCR": "concurrent resolution",
                "SJR": "joint resolution",
                "HJR": "joint resolution",
                "SM": "memorial",
                "HM": "memorial"
            }

            bill_type = None

            # no break: assumes at most one prefix matches a given id
            for cat in cats:
                if bill_id[:len(cat)] == cat:
                    bill_type = cats[cat]

            b = Bill(session, bill_chamber, bill_id, bill_title,
                     type=bill_type)
            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            versions_url = CO_URL_BASE + versions_url
            versions = self.parse_versions(versions_url)

            for version in versions:
                b.add_version(version['name'], version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = CO_URL_BASE + \
                bill[index["history"]][0][0].attrib['href']
            # ^^^^^^^ We assume this is a full path to the target.
            # might want to consider some better rel-path support
            # XXX: Look at this ^

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            for action in history:
                self.add_action_to_bill(b, action)

            for sponsor in sponsors:
                if sponsor != None and sponsor != "(NONE)" and \
                        sponsor != "":
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab
            # some votes
            bill_vote_href = self.get_vote_url(bill_id, session)
            votes = self.parse_votes(bill_vote_href)

            if votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID: " + bill_id)
                assert votes['sanity-check'] == bill_id

            for vote in votes['votes']:
                filed_votes = vote['votes']
                passage = vote['meta']
                result = vote['result']

                composite_time = "%s %s" % (passage['x-parent-date'],
                                            passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")

                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']

                if hasHouse and hasSenate:
                    actor = "joint"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong.
                local_other = 0
                for voter in filed_votes:
                    l_vote = filed_votes[voter].lower().strip()
                    if l_vote != "yes" and l_vote != "no":
                        local_other = local_other + 1

                if local_other != other:
                    self.warning( \
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES" )
                    self.warning(" -> Old: %s // New: %s" % (
                        other, local_other))
                    other = local_other

                v = Vote(actor, pydate, passage['MOTION'],
                         (result['FINAL_ACTION'] == "PASS"),
                         int(result['YES']), int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(vote['meta']['url'])
                # v.add_source( bill_vote_href )

                # XXX: Add more stuff to kwargs, we have a ton of data
                # NOTE: `vote` is rebound below to each voter's string;
                # the outer loop variable is not used again afterwards.
                for voter in filed_votes:
                    who = voter
                    vote = filed_votes[who]
                    if vote.lower() == "yes":
                        v.yes(who)
                    elif vote.lower() == "no":
                        v.no(who)
                    else:
                        v.other(who)

                b.add_vote(v)
            self.save_bill(b)
def scrape_xml(self, chamber, session):
    """Scrape Georgia bills for one chamber from the BillSummary.xml feed.

    Builds a Bill per <Bill> element (title/summary, sponsors, versions,
    status-history actions), attaches votes previously collected into
    ``self.votes``, and saves each bill.
    """
    # Bill numbers are prefixed S*/H*; use the prefix to filter by chamber.
    start_letter = 'S' if chamber == 'upper' else 'H'
    sponsor_type_dict = {'3': 'cosponsor', '4': 'primary', '5': 'primary'}
    version_url = 'http://www1.legis.ga.gov/legis/%s/versions/' % session
    summary_url = (
        'http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml' % session)
    xml = self.urlopen(summary_url).bytes
    doc = lxml.etree.fromstring(xml)

    for bxml in doc.xpath('//Bill'):
        type = bxml.get('Type')

        # if this is from the other chamber skip it
        if not type.startswith(start_letter):
            continue

        bill_id = type + bxml.get('Num') + bxml.get('Suffix')
        if type in ('HB', 'SB'):
            type = 'bill'
        elif type in ('HR', 'SR'):
            type = 'resolution'
        else:
            raise ValueError('unknown type: %s' % type)

        # use short_title as title and long as summary
        title = bxml.xpath('Short_Title/text()')
        summary = bxml.xpath('Title/text()')
        if summary:
            summary = summary[0]
        else:
            summary = ''
        if title:
            title = title[0]
        else:
            # No short title: promote the long title, leaving summary empty.
            title = summary
            summary = ''
        if not title and not summary:
            self.warning('no title or summary for %s, skipping' % bill_id)
            continue

        bill = Bill(session, chamber, bill_id, title, type=type,
                    summary=summary)
        bill_url = 'http://www1.legis.ga.gov/legis/%s/sum/%s.htm' % (
            session, bill_id.lower())
        bill.add_source(bill_url)
        bill.add_source(summary_url)

        # get votes from ids — vote objects were stashed in self.votes earlier
        bhtml = lxml.html.fromstring(self.urlopen(bill_url))
        vote_links = bhtml.xpath('//a[contains(@href, "/votes/")]/@href')
        vote_ids = [l.rsplit('/')[-1].split('.')[0] for l in vote_links]
        for vid in vote_ids:
            bill.add_vote(self.votes[vid])

        for sponsor in bxml.xpath('Sponsor'):
            # "Last,First CODE" -> name + trailing member code
            sponsor_name, code = sponsor.text.rsplit(' ', 1)
            sponsor_name = sponsor_name.replace(',', ', ')
            bill.add_sponsor(sponsor_type_dict[sponsor.get('Type')],
                             sponsor_name, _code=code)

        for version in bxml.xpath('Versions/Version'):
            # NOTE: it is possible to get PDF versions by using .get('Id')
            # ex. URL: legis.ga.gov/Legislation/20112012/108025.pdf
            # for now we just get HTML
            description, file_id = version.xpath('*/text()')
            bill.add_version(description, version_url + file_id,
                             mimetype='text/html')

        for action in bxml.xpath('StatusHistory/Status'):
            date = datetime.datetime.strptime(action.get('StatusDate'),
                                              "%Y-%m-%dT%H:%M:%S")
            code = action.get('StatusCode')
            if code in ('EFF', 'Signed Gov'):
                actor = 'executive'
            elif code[0] == 'S':
                actor = 'upper'
            elif code[0] == 'H':
                actor = 'lower'
            else:
                # BUGFIX: previously `actor` was left unbound (NameError on
                # first action, or a stale value from the prior iteration)
                # for unrecognized codes; fall back to the bill's chamber.
                actor = chamber

            try:
                atype = self._action_codes[code]
            except KeyError:
                self.warning("unknown action code %s on %s" % (code,
                                                               action.text))
                # BUGFIX: previously `atype` was unbound here, so the
                # add_action call below raised NameError. 'other' is the
                # conventional uncategorized action type.
                atype = 'other'

            bill.add_action(actor, action.text, date, atype)

        self.save_bill(bill)
def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
    """bills from 2008 and below are in a 'pre' element and is simpler to
    parse them as text

    Walks the <pre> text line by line: date-prefixed lines become actions
    (and may open a Vote), other lines feed the in-progress vote roll call.
    Relies on scraper-level state (self.flag/self.vote/self.in_vote/
    self.last_date/self.ayes/self.nays/self.other) defined elsewhere in
    this class — presumably self.flag() with no args clears all roll-call
    flags; verify against the rest of the file.
    """
    url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
        session, bill_id.replace(' ', ''))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    # The whole bill history is plain text inside the first <pre>.
    text = html.xpath('//pre')[0].text.split('\r\n')

    # title: keep only the ALL-CAPS fragments of the second line
    title = " - ".join([x.strip() for x in text[1].split('-')
                        if x.isupper()])

    # bill type
    bill_type = get_bill_type(bill_id)

    bill = Bill(session, chamber, bill_id, title, type=bill_type)

    # sponsors: first line reads "... by SPONSOR1, SPONSOR2"
    sponsors = text[0].split('by')[-1]
    for sponsor in sponsors.split(','):
        bill.add_sponsor('primary', sponsor)

    actor = chamber
    self.flag()  # clear last bills vote flags
    self.vote = None
    #
    for line in text:
        if re.match(r'^\d\d/\d\d', line):
            # Action line: starts with MM/DD; year comes from the session.
            # (sic: the duplicated `date = date =` is harmless)
            date = date = datetime.datetime.strptime(
                line[0:5] + '/' + session[0:4], "%m/%d/%Y")
            self.last_date = date
            action_text = line[5:].strip()
            # actor
            if action_text.lower().startswith('house') or \
                    action_text.lower().startswith('senate'):
                actor = {'H': 'lower', 'S': 'upper'}[action_text[0]]

            action = get_action(actor, action_text)
            bill.add_action(actor, action_text, date, type=action)

            if "bill:passed" in action or "bill:failed" in action:
                passed = False if 'FAILED' in action_text else True
                # vote totals look like "34-0-1" (yes-no-other)
                votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                if votes:
                    yes, no, other = votes.groups()
                    self.in_vote = True
                    self.vote = Vote(chamber, date, action_text, passed,
                                     int(yes), int(no), int(other))
        else:
            # Continuation line: part of the current action's roll call.
            date = self.last_date
            # nothing to do if its not a vote
            if "Floor Sponsor" in line:
                # "Floor Sponsor" marks the end of the roll-call section.
                self.in_vote = False
                if self.vote:
                    bill.add_vote(self.vote)
                    self.vote = None
            if not self.in_vote:
                continue
            # Section headers flip which bucket subsequent names go into.
            if 'AYES --' in line:
                self.flag(ayes=True)
            elif 'NAYS --' in line:
                self.flag(nays=True)
            elif 'Absent and excused' in line:
                self.flag(other=True)

            if self.ayes:
                for name in line.replace('AYES --', '').split(','):
                    # strip trailing "(district)" style parentheticals
                    name = name.split('(')[0].strip()
                    if name:
                        self.vote.yes(name)
            if self.nays:
                for name in line.replace('NAYS --', '').split(','):
                    name = name.split('(')[0].strip()
                    if name:
                        self.vote.no(name)
            if self.other:
                for name in line.replace('Absent and excused --',
                                         '').split(','):
                    name = name.split('(')[0].strip()
                    if name:
                        self.vote.other(name)
    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill from legislature.mi.gov.

    Tries the first year of the session, then the second; returns None if
    the bill page is not found in either year, True after saving the bill.
    Collects title, sponsors (first is primary), subjects, actions (with
    roll-call votes parsed out of the journals), versions and documents.
    """
    # try and get bill for current year
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.urlopen(url)
    # if first page isn't found, try second year
    if 'Page Not Found' in html:
        html = self.urlopen('http://legislature.mi.gov/doc.aspx?%s-%s' %
                            (session[-4:], bill_id.replace(' ', '-')))
        if 'Page Not Found' in html:
            return None

    doc = lxml.html.fromstring(html)

    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(session=session, chamber=chamber, bill_id=bill_id,
                title=title, type=bill_type)
    bill.add_source(url)

    # sponsors: first listed sponsor is primary, the rest cosponsors
    sp_type = 'primary'
    for sponsor in doc.xpath(
            '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
        sponsor = sponsor.replace(u'\xa0', ' ')  # non-breaking spaces
        bill.add_sponsor(sp_type, sponsor)
        sp_type = 'cosponsor'

    bill['subjects'] = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')
        # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = datetime.datetime.strptime(date, "%m/%d/%Y")
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        type = categorize_action(action)
        bill.add_action(actor, action, date, type=type)

        # check if action mentions a vote
        rcmatch = re.search('Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate',
                                'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                # counts start at 0/0/0 and are filled in from the journal
                vote = Vote(actor, date, action, False, 0, 0, 0)
                self.parse_roll_call(vote, vote_url, rc_num)

                # check the expected counts vs actual
                count = re.search('YEAS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote['yes_votes']):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(vote['yes_votes'])))
                count = re.search('NAYS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote['no_votes']):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(vote['no_votes'])))

                # trust the parsed roll call over the action-text counts
                vote['yes_count'] = len(vote['yes_votes'])
                vote['no_count'] = len(vote['no_votes'])
                vote['other_count'] = len(vote['other_votes'])
                vote['passed'] = vote['yes_count'] > vote['no_count']
                vote.add_source(vote_url)
                bill.add_vote(vote)
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        version = self.parse_doc_row(row)
        if version:
            bill.add_version(*version)

    # documents (House and Senate fiscal-agency tables)
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            bill.add_document(*document)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            bill.add_document(*document)

    self.save_bill(bill)
    return True
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill from legislature.mi.gov.

    Tries the first year of the session, then the second; returns None if
    the bill page is not found (or not yet available) in either year,
    True after saving the bill. Collects title, sponsors (first is
    primary), subjects, actions (with roll-call votes parsed out of the
    journals), versions and documents.
    """
    # try and get bill for current year
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # if first page isn't found, try second year
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        html = self.get('http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            return None

    doc = lxml.html.fromstring(html)

    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(session=session, chamber=chamber, bill_id=bill_id,
                title=title, type=bill_type)
    bill.add_source(url)

    # sponsors: first listed sponsor is primary, the rest cosponsors
    sp_type = 'primary'
    for sponsor in doc.xpath(
            '//span[@id="frg_billstatus_SponsorList"]/a/text()'):
        sponsor = sponsor.replace(u'\xa0', ' ')  # non-breaking spaces
        bill.add_sponsor(sp_type, sponsor)
        sp_type = 'cosponsor'

    bill['subjects'] = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')
        # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = datetime.datetime.strptime(date, "%m/%d/%Y")
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        type = categorize_action(action)
        bill.add_action(actor, action, date, type=type)

        # check if action mentions a vote
        rcmatch = re.search('Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                # counts start at 0/0/0 and are filled in from the journal
                vote = Vote(actor, date, action, False, 0, 0, 0)
                self.parse_roll_call(vote, vote_url, rc_num)

                # check the expected counts vs actual
                count = re.search('YEAS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote['yes_votes']):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count,
                                  len(vote['yes_votes'])))
                count = re.search('NAYS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(vote['no_votes']):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count,
                                  len(vote['no_votes'])))

                # trust the parsed roll call over the action-text counts
                vote['yes_count'] = len(vote['yes_votes'])
                vote['no_count'] = len(vote['no_votes'])
                vote['other_count'] = len(vote['other_votes'])
                vote['passed'] = vote['yes_count'] > vote['no_count']
                vote.add_source(vote_url)
                bill.add_vote(vote)
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        version = self.parse_doc_row(row)
        if version:
            if version[1].endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version[1].endswith('.htm'):
                mimetype = 'text/html'
            else:
                # BUGFIX: previously `mimetype` was unbound here (NameError
                # on the first unrecognized extension) or silently reused
                # the previous row's value. Unknown extensions now get no
                # mimetype rather than a wrong one.
                mimetype = None
            bill.add_version(*version, mimetype=mimetype)

    # documents (House and Senate fiscal-agency tables)
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            bill.add_document(*document)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            bill.add_document(*document)

    self.save_bill(bill)
    return True
def _scrape_bill(self, session, bill_data):
    """Build a Bill from one NY Open Legislation API bill payload.

    Adds sponsors, companion bill, actions, chamber-specific votes,
    per-amendment HTML/PDF versions, and sources; returns the Bill
    (caller is responsible for saving it).
    """
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        session, bill_chamber, bill_id, title,
        type=bill_type, summary=bill_data['summary'])

    if bill_data['title'] is None:
        bill['title'] = bill_data['summary']

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor']['rules'] == True:
        bill.add_sponsor('primary', 'Rules Committee',
                         chamber=bill_chamber)
    elif not bill_data['sponsor']['budget']:
        primary_sponsor = bill_data['sponsor']['member']
        bill.add_sponsor('primary', primary_sponsor['shortName'])

        # There *shouldn't* be cosponsors if there is no sponsor.
        cosponsors = bill_active_version['coSponsors']['items']
        for cosponsor in cosponsors:
            bill.add_sponsor('cosponsor', cosponsor['shortName'])

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether "sameAs" property is populated with at least one bill.
    # BUGFIX: use .get() — same_as defaults to {} above, so same_as['items']
    # raised KeyError whenever the amendment had no "sameAs" property.
    if same_as.get('items'):
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Determine companion bill chamber.
        companion_bill_prefix = self._parse_bill_number(
            same_as['items'][0]['basePrintNo'])[0]
        companion_bill_chamber = self._parse_bill_prefix(
            companion_bill_prefix)[0]

        # Attach companion bill data.
        bill.add_companion(
            companion_bill_id,
            companion_bill_session,
            companion_bill_chamber,
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }
    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(action['date'],
                                                     '%Y-%m-%d')
        action_date = action_datetime.date()
        types, attrs = NYBillScraper.categorizer.categorize(action['text'])
        bill.add_action(
            chamber, action['text'], action_date, type=types, **attrs)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            vote = self._parse_senate_votes(vote_data)
            bill.add_vote(vote)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()
        assembly_bill_data = assembly.bill

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.iteritems():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        # BUGFIX: previously the version *name* was passed as the URL and
        # html_url was never used.
        bill.add_version(html_version, html_url, mimetype='text/html')

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        # BUGFIX: same as above — use pdf_url, not the name, as the URL.
        bill.add_version(pdf_version, pdf_url, mimetype='application/pdf')

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    bill.add_source(self.api_client.root + self.api_client.
                    resources['bill'].format(
                        session_year=session,
                        bill_id=bill_id,
                        summary='',
                        detail=''))
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    return bill
def scrape_bill(self, url, kw,
                re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                re_digits=re.compile(r'\d{,5}'),
                actions_get_actor=actions.get_actor):
    """Scrape one Delaware bill page.

    `kw` holds the Bill constructor kwargs; the regex defaults are
    compiled once at definition time as a deliberate micro-optimization.
    Collects sponsors, versions, actions, votes, amendments,
    engrossments, fiscal notes, and a few extra fields, then saves.
    """
    bill = Bill(**kw)
    bill.add_source(url)

    #---------------------------------------------------------------------
    # A few helpers.
    _url_2_lxml = self._url_2_lxml
    _cleanup_sponsors = self._cleanup_sponsors

    # Shortcut function partial to get text at a particular xpath:
    doc = _url_2_lxml(url)
    _get_text = partial(get_text, doc, 0)

    # Get session number--needed for fetching related documents (see below).
    xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
    session_num = doc.xpath(xpath)[0].text_content()
    session_num = re_digits.match(session_num).group()

    #---------------------------------------------------------------------
    # Sponsors
    chamber = bill['chamber']

    sponsor_types = {
        'Additional Sponsor(s):': 'cosponsor',
        'CoSponsors:': 'cosponsor',
        'Primary Sponsor:': 'primary'}

    xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
    headings = doc.xpath(xpath + '/text()')
    sponsors = doc.xpath(xpath + '/../../following-sibling::td/font/text()')

    # headings and sponsor strings are parallel lists on the page
    for h, s in zip(headings, sponsors):
        names = _cleanup_sponsors(s, chamber)
        type_ = sponsor_types[h.strip()]
        if names:
            for name, _chamber in names:
                bill.add_sponsor(type_, name, chamber=_chamber)

    #---------------------------------------------------------------------
    # Versions
    tmp = '/'.join([
        'http://www.legis.delaware.gov',
        'LIS/lis{session_num}.nsf/vwLegislation',
        '{moniker}/$file/(unknown){format_}?open'])

    documents = self.scrape_documents(source=url,
                                      docname="introduced",
                                      filename="Legis", tmp=tmp,
                                      session_num=session_num)
    for d in documents:
        bill.add_version(**d)

    # If bill is a substitution, add the original as a version.
    names = doc.xpath('//*[contains(text(), "Substituted '
                      'Legislation for Bill:")]/text()')
    urls = doc.xpath('//*[contains(text(), "Substituted '
                     'Legislation for Bill:")]'
                     '/following-sibling::a/@href')
    for name, url in zip(names, urls):
        name = re_substitution.match(name).group(1)
        bill.add_version(name, url, description='original bill')

    #---------------------------------------------------------------------
    # Actions
    actions = doc.xpath('//font[contains(., "Actions History")]'
                        '/../following-sibling::table/descendant::td[2]')
    actions = actions[0].text_content()
    actions = filter(None, actions.splitlines())
    # Page lists actions newest-first; reverse for chronological order.
    for a in reversed(actions):
        date, action = a.split(' - ', 1)
        try:
            date = datetime.strptime(date, '%b %d, %Y')
        except ValueError:
            # Some rows spell the month out in full.
            date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.
        actor = actions_get_actor(action, bill['chamber'])
        attrs = dict(actor=actor, action=action, date=date)
        attrs.update(**self.categorizer.categorize(action))
        bill.add_action(**attrs)

    #---------------------------------------------------------------------
    # Votes
    vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

    # Sometimes vote strings are contained in weird, separate elements. Probably
    # hand edited.
    if not all(re.search('\d', string) for string in vote_strings):
        # Use the parent's text_content instead.
        vote_strings = []
        for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
            vote_strings.append(el.text_content())

    vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                          '/following-sibling::a/@href')
    for string, url in zip(vote_strings, vote_urls):
        vote_data = parse_votestring(string)
        vote = self.scrape_vote(url, **vote_data)
        if vote:
            bill.add_vote(vote)

    #---------------------------------------------------------------------
    # Amendments
    xpath = ("//font[contains(., 'Amendments')]/"
             "../../../td[2]/font/a")
    tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
           'vwLegislation/{id_}/$file/(unknown){format_}?open')
    for source, id_ in zip(doc.xpath(xpath + '/@href'),
                           doc.xpath(xpath + '/text()')):
        short_id = re_amendment.match(id_).group(1)
        documents = self.scrape_documents(
            source=source,
            docname='amendment (%s)' % short_id,
            filename='Legis',
            tmp=tmp, session_num=session_num,
            id_=id_)
        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Add any related "Engrossments".
    # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
    # an explanation of the engrossment process in DE.
    source = doc.xpath('//img[@alt="Engrossment"]/../@href')
    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
            '{moniker}/$file/(unknown){format_}?open'])
        documents = self.scrape_documents(
            source=source[0],
            docname="Engrossment",
            filename="Engross",
            tmp=tmp, session_num=session_num,
            id_=bill['bill_id'])
        for d in documents:
            bill.add_version(**d)

    # --------------------------------------------------------------------
    # Add any fiscal notes.
    source = doc.xpath("//img[@alt='Fiscal Note']/../@href")
    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/FiscalforLookup',
            '{docnum}/$file/(unknown){format_}?open'])
        documents = self.scrape_documents(
            source=source[0],
            docname="Fiscal Note",
            filename="Fiscal",
            tmp=tmp, session_num=session_num)
        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Extra fields
    # Helper to get the first td sibling of certain nodes.
    tmp = '//font[contains(., "%s")]/../../../td[2]'
    first_sibling_text = lambda heading: _get_text(tmp % heading)
    extra_fields = {
        # A long description of the legislation.
        "summary": "Synopsis",
        # Codification details for enacted legislation.
        "volume_chapter": "Volume Chapter",
        # Presumably the date of approval/veto.
        "date_governor_acted": "Date Governor Acted",
        "fiscal_notes": "Fiscal Notes",
    }
    for key, name in extra_fields.iteritems():
        try:
            bill[key] = first_sibling_text(name)
        except IndexError:
            # xpath lookup failed.
            pass

    self.save_bill(bill)
def scrape_bill(self, chamber, bill):
    """Scrape one Delaware bill from the dict produced by the listing page
    (expects keys 'id', 'url', 'session', 'chamber'): title, primary
    sponsor, introduction action, action history, and vote pages.
    """
    bill_id = bill['id'].replace('w/', 'with ')

    page = lxml.html.fromstring(self.urlopen(bill['url']))
    page.make_links_absolute(bill['url'])

    title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
    # text_content() == make sure any tags in the title don't cause issues
    title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

    # now we can create a bill object
    b = Bill(bill['session'], bill['chamber'], bill_id, title)
    b.add_source(bill['url'])

    sponsors_row = page.xpath(
        '//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
    sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text
    # BUGFIX: guard against a missing sponsor (the sibling variant of this
    # scraper has the same check) so we don't record a None sponsor.
    if sponsor is not None:
        b.add_sponsor('primary', sponsor)

    # scraping these and co-sponsors, but not doing anything with them until
    # it's decided whether or not to attempt to split 'em up
    additional = sponsors_row.xpath('td[@width="48%"]/font')
    additional_sponsors = additional[0].text if len(additional) > 0 else ""
    additional_sponsors = additional_sponsors.replace(
        '&nbsp&nbsp&nbsp', '')
    cosponsors_row = page.xpath(
        '//tr[td/b[contains(font,"CoSponsors")]]')[0]
    cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
    cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

    introduced_row = page.xpath(
        '//tr[td/b[contains(font,"Introduced On")]]')
    if len(introduced_row) > 0:
        # BUGFIX: was `.expath('/td[...]')` — a typo for .xpath, and the
        # leading slash would have selected from the document root rather
        # than relative to this row.
        introduced = introduced_row[0].xpath(
            'td[@width="31%"]/font')[0].text
        introduced = datetime.strptime(introduced, '%b %d, %Y')
        b.add_action(bill['chamber'], 'introduced', introduced,
                     'bill:introduced')

    actions = page.xpath(
        '//table[preceding-sibling::b[contains(font,"Actions History:")]]/tr/td[@width="79%"]/font'
    )
    if len(actions) > 0:
        actions = actions[0].text_content().split('\n')
        for act in actions:
            # "Mon DD, YYYY - action text"
            act = act.partition(' - ')
            date = datetime.strptime(act[0], '%b %d, %Y')
            b.add_action(bill['chamber'], act[2], date)

    # resources = page.xpath('//tr[td/b[contains(font, "Full text of Legislation")]]')

    # save vote urls for scraping later
    vote_urls = []
    voting_reports = page.xpath(
        '//tr[td/b[contains(font, "Voting Reports")]]')
    if (len(voting_reports) > 0):
        for report in voting_reports[0].xpath('td/font/a'):
            vote_urls.append(report.attrib['href'])

    # Scrape votes
    for url in vote_urls:
        vote = self.scrape_votes(chamber, title, bill_id, url)
        b.add_vote(vote)

    # Save bill
    self.save_bill(b)
def scrape_bill(self, url, kw,
                re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                re_digits=re.compile(r'\d{,5}'),
                actions_categorize=actions.categorize,
                actions_get_actor=actions.get_actor):
    """Scrape one Delaware bill page.

    `kw` holds the Bill constructor kwargs; the regex/function defaults
    are bound once at definition time as a deliberate micro-optimization.
    Collects sponsors, versions, actions, votes, amendments,
    engrossments, fiscal notes, and a few extra fields, then saves.
    """
    bill = Bill(**kw)
    bill.add_source(url)

    #---------------------------------------------------------------------
    # A few helpers.
    _url_2_lxml = self._url_2_lxml
    _cleanup_sponsors = self._cleanup_sponsors

    # Shortcut function partial to get text at a particular xpath:
    doc = _url_2_lxml(url)
    _get_text = partial(get_text, doc, 0)

    # Get session number--needed for fetching related documents (see below).
    xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
    session_num = doc.xpath(xpath)[0].text_content()
    session_num = re_digits.match(session_num).group()

    #---------------------------------------------------------------------
    # Sponsors
    chamber = bill['chamber']

    sponsor_types = {
        'Additional Sponsor(s):': 'cosponsor',
        'CoSponsors:': 'cosponsor',
        'Primary Sponsor:': 'primary'
    }

    xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
    headings = doc.xpath(xpath + '/text()')
    sponsors = doc.xpath(xpath +
                         '/../../following-sibling::td/font/text()')

    # headings and sponsor strings are parallel lists on the page
    for h, s in zip(headings, sponsors):
        names = _cleanup_sponsors(s, chamber)
        type_ = sponsor_types[h.strip()]
        if names:
            for name, _chamber in names:
                bill.add_sponsor(type_, name, chamber=_chamber)

    #---------------------------------------------------------------------
    # Versions
    tmp = '/'.join([
        'http://www.legis.delaware.gov',
        'LIS/lis{session_num}.nsf/vwLegislation',
        '{moniker}/$file/(unknown){format_}?open'
    ])

    documents = self.scrape_documents(source=url,
                                      docname="introduced",
                                      filename="Legis",
                                      tmp=tmp,
                                      session_num=session_num)
    for d in documents:
        bill.add_version(**d)

    # If bill is a substitution, add the original as a version.
    names = doc.xpath('//*[contains(text(), "Substituted '
                      'Legislation for Bill:")]/text()')
    urls = doc.xpath('//*[contains(text(), "Substituted '
                     'Legislation for Bill:")]'
                     '/following-sibling::a/@href')
    for name, url in zip(names, urls):
        name = re_substitution.match(name).group(1)
        bill.add_version(name, url, description='original bill')

    #---------------------------------------------------------------------
    # Actions
    actions = doc.xpath('//font[contains(., "Actions History")]'
                        '/../following-sibling::table/descendant::td[2]')
    actions = actions[0].text_content()
    actions = filter(None, actions.splitlines())
    # Page lists actions newest-first; reverse for chronological order.
    for a in reversed(actions):
        date, action = a.split(' - ', 1)
        try:
            date = datetime.strptime(date, '%b %d, %Y')
        except ValueError:
            # Some rows spell the month out in full.
            date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.
        actor = actions_get_actor(action, bill['chamber'])
        type_ = actions_categorize(action)
        bill.add_action(actor, action, date, type_)

    #---------------------------------------------------------------------
    # Votes
    vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

    # Sometimes vote strings are contained in weird, separate elements. Probably
    # hand edited.
    if not all(re.search('\d', string) for string in vote_strings):
        # Use the parent's text_content instead.
        vote_strings = []
        for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
            vote_strings.append(el.text_content())

    vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                          '/following-sibling::a/@href')
    for string, url in zip(vote_strings, vote_urls):
        vote_data = parse_votestring(string)
        vote = self.scrape_vote(url, **vote_data)
        if vote:
            bill.add_vote(vote)

    #---------------------------------------------------------------------
    # Amendments
    xpath = ("//font[contains(., 'Amendments')]/"
             "../../../td[2]/font/a")
    tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
           'vwLegislation/{id_}/$file/(unknown){format_}?open')
    for source, id_ in zip(doc.xpath(xpath + '/@href'),
                           doc.xpath(xpath + '/text()')):
        short_id = re_amendment.match(id_).group(1)
        documents = self.scrape_documents(source=source,
                                          docname='amendment (%s)' % short_id,
                                          filename='Legis',
                                          tmp=tmp,
                                          session_num=session_num,
                                          id_=id_)
        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Add any related "Engrossments".
    # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
    # an explanation of the engrossment process in DE.
    source = doc.xpath('//img[@alt="Engrossment"]/../@href')
    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
            '{moniker}/$file/(unknown){format_}?open'
        ])
        documents = self.scrape_documents(source=source[0],
                                          docname="Engrossment",
                                          filename="Engross",
                                          tmp=tmp,
                                          session_num=session_num,
                                          id_=bill['bill_id'])
        for d in documents:
            bill.add_version(**d)

    # --------------------------------------------------------------------
    # Add any fiscal notes.
    source = doc.xpath("//img[@alt='Fiscal Note']/../@href")
    if source:
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/FiscalforLookup',
            '{docnum}/$file/(unknown){format_}?open'
        ])
        documents = self.scrape_documents(source=source[0],
                                          docname="Fiscal Note",
                                          filename="Fiscal",
                                          tmp=tmp,
                                          session_num=session_num)
        for d in documents:
            bill.add_document(**d)

    #---------------------------------------------------------------------
    # Extra fields
    # Helper to get the first td sibling of certain nodes.
    tmp = '//font[contains(., "%s")]/../../../td[2]'
    first_sibling_text = lambda heading: _get_text(tmp % heading)
    extra_fields = {
        # A long description of the legislation.
        "summary": "Synopsis",
        # Codification details for enacted legislation.
        "volume_chapter": "Volume Chapter",
        # Presumably the date of approval/veto.
        "date_governor_acted": "Date Governor Acted",
        "fiscal_notes": "Fiscal Notes",
    }
    for key, name in extra_fields.iteritems():
        try:
            bill[key] = first_sibling_text(name)
        except IndexError:
            # xpath lookup failed.
            pass

    self.save_bill(bill)
def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
    """Scrape every CA bill of one measure type for a session/chamber.

    Reads bill records from the local mirror of the CA legislative
    database (``self.session``, a SQLAlchemy session over ``CABill``),
    builds a ``Bill`` for each with its versions, sponsors, actions and
    votes, then saves it.

    :param chamber: ``'upper'`` or ``'lower'``.
    :param session: session year string, e.g. ``'20092010'``.
    :param bill_type: openstates bill type to record, e.g. ``'bill'``.
    :param type_abbr: CA measure-type abbreviation used in the DB query,
        e.g. ``'AB'``.
    :raises ScrapeError: when a vote location string cannot be mapped to
        a chamber.
    """
    if chamber == 'upper':
        chamber_name = 'SENATE'
    else:
        chamber_name = 'ASSEMBLY'

    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_session, chamber, bill_id, '')

        # Construct a fake source url; the real bill page is reachable
        # through this navigation endpoint.
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version(bill_id, source_url, 'text/html')

        title = ''
        short_title = ''
        # Renamed from `type` to avoid shadowing the builtin.
        bill_types = ['bill']
        subject = ''
        all_titles = set()

        for version in bill.versions:
            if not version.bill_xml:
                continue

            title = clean_title(version.title)
            if title:
                all_titles.add(title)
            short_title = clean_title(version.short_title)
            bill_types = [bill_type]
            if version.appropriation == 'Yes':
                bill_types.append('appropriation')
            if version.fiscal_committee == 'Yes':
                bill_types.append('fiscal committee')
            if version.local_program == 'Yes':
                bill_types.append('local program')
            if version.urgency == 'Yes':
                bill_types.append('urgency')
            if version.taxlevy == 'Yes':
                bill_types.append('tax levy')
            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill['title'] = title
        fsbill['short_title'] = short_title
        fsbill['type'] = bill_types
        fsbill['subjects'] = filter(None, [subject])

        # We don't want the current title in alternate_titles; it was
        # necessarily added above, so remove() cannot raise here.
        all_titles.remove(title)
        fsbill['alternate_titles'] = list(all_titles)

        # NOTE: `version` here is the last version processed by the loop
        # above — sponsors are taken from the most recent version only.
        for author in version.authors:
            if author.house == chamber_name:
                fsbill.add_sponsor(author.contribution, author.name)

        introduced = False
        committee_code_regex = self.committee_code_regex()
        committee_slug_regex = self.committee_slug_regex()

        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing.
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                actor = re.sub('^Assembly', 'lower', actor)
                actor = re.sub('^Senate', 'upper', actor)

            act_types = []
            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            if act_str.startswith('Introduced'):
                introduced = True
                act_types.append('bill:introduced')
            if 'Read first time.' in act_str:
                if not introduced:
                    act_types.append('bill:introduced')
                    introduced = True
                act_types.append('bill:reading:1')
            if 'To Com' in act_str or 'referred to' in act_str.lower():
                act_types.append('committee:referred')
            # BUG FIX: the original tested the exact same
            # 'Read third time. Passed' substring twice in a row and so
            # appended 'bill:passed' twice; the duplicate check is gone.
            if 'Read third time. Passed' in act_str:
                act_types.append('bill:passed')
            if 'Read third time, passed' in act_str:
                act_types.append('bill:passed')
            if re.search(r'Read third time.+?Passed and', act_str):
                act_types.append('bill:passed')
            if 'Approved by Governor' in act_str:
                act_types.append('governor:signed')
            if 'Item veto' in act_str:
                act_types.append('governor:vetoed:line-item')
            if 'Vetoed by Governor' in act_str:
                act_types.append('governor:vetoed')
            if 'To Governor' in act_str:
                act_types.append('governor:received')
            if 'Read second time' in act_str:
                act_types.append('bill:reading:2')
            if not act_types:
                act_types = ['other']

            # Add in the committee ID of the related committee, if any.
            kwargs = {}
            code = re.search(committee_code_regex, actor, re.I)
            if code:
                code = code.group()
                committee_id = self.committee_code_to_id(code)
                if committee_id:
                    kwargs['actor_id'] = committee_id
                    kwargs['actor_collection'] = 'committees'
                    actor_text = re.search(committee_slug_regex,
                                           action.action)
                    if actor_text:
                        actor_text = actor_text.group()
                        kwargs['actor_text'] = actor_text
                    else:
                        kwargs['actor_text'] = 'committee'

            fsbill.add_action(actor, act_str, action.action_date.date(),
                              type=act_types, **kwargs)

        for vote in bill.votes:
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            motion = vote.motion.motion_text or ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Strip session/house/bill-number noise from the motion
            # text. (Why did it take until 2.7 to get a flags argument
            # on re.sub?)
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            fsvote = Vote(vote_chamber,
                          self._tz.localize(vote.vote_date_time),
                          motion,
                          result,
                          int(vote.ayes),
                          int(vote.noes),
                          int(vote.abstain),
                          threshold=vote.threshold,
                          type=vtype)

            if vote_location != 'Floor':
                fsvote['committee'] = vote_location

            for record in vote.votes:
                if record.vote_code == 'AYE':
                    fsvote.yes(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    fsvote.no(record.legislator_name)
                else:
                    fsvote.other(record.legislator_name)

            # Kill dupe votes.
            for s in ('yes', 'no', 'other'):
                key = s + '_votes'
                fsvote[key] = list(set(fsvote[key]))

            # In a small percentage of bills, the integer vote counts
            # are inaccurate, so ignore them and recount from the
            # per-legislator records instead.
            for k in ('yes', 'no', 'other'):
                fsvote[k + '_count'] = len(fsvote[k + '_votes'])

            fsbill.add_vote(fsvote)

        self.save_bill(fsbill)