def process_bill(self, data):
    """Translate a pupa-format bill dict into a billy Bill and save it."""
    # A unicameral 'legislature' maps onto billy's 'upper' chamber.
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    if chamber == 'legislature':
        chamber = 'upper'

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])

    abstracts = data['abstracts']
    if abstracts:
        bill['summary'] = abstracts[0]['abstract']
    # Carry any extras straight onto the billy document.
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        related = action['related_entities']
        # Split related entities into committees vs. legislators.
        committees = [ent['name'] for ent in related
                      if ent['entity_type'] == 'organization']
        legislators = [ent['name'] for ent in related
                       if ent['entity_type'] == 'person']
        bill.add_action(actor, action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsorship in data['sponsorships']:
        bill.add_sponsor(sponsorship['classification'], sponsorship['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for document in data['documents']:
        for link in document['links']:
            bill.add_document(document['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(document['date']),
                              **document.get('extras', {}))

    for other_title in data['other_titles']:
        bill.add_title(other_title['title'])

    # Companions are recorded against this bill's own chamber.
    for related_bill in data['related_bills']:
        bill.add_companion(related_bill['identifier'],
                           related_bill['legislative_session'],
                           chamber)

    self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    # Normalize chamber names to billy's 'lower'/'upper'.
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Get the basic parts of the bill
    bill_id = self.get_node(doc, '//h1/text()')
    self.logger.debug(bill_id)
    bill_title_text = self.get_node(
        doc, '//h2[text()[contains(.,"Description")]]'
        '/following-sibling::p/text()')
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        # Fall back to the "Long Description" page when the short
        # description is missing.
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(.,"Long Description")]]/@href')
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, '//h1/following-sibling::p/text()')
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
    self.logger.debug(bill_title)

    # The second character of the bill ID encodes its type.
    bill_type = {'F': 'bill', 'R': 'resolution',
                 'C': 'concurrent resolution'}[bill_id[1]]
    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    # Add subjects.  Currently we are not mapping to Open States
    # standardized subjects, so use 'scraped_subjects'
    bill['scraped_subjects'] = self._subject_mapping[bill_id]

    # Get companion bill.
    companion = doc.xpath(
        '//table[@class="status_info"]//tr[1]/td[2]'
        '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    if companion is not None:
        # BUGFIX: derive the companion's chamber only when a companion
        # actually exists; previously chamber_from_bill(None) was called
        # unconditionally.
        companion_chamber = self.chamber_from_bill(companion)
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    # Normalize chamber names to billy's 'lower'/'upper'.
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    bill_html = self.urlopen(bill_detail_url)
    doc = lxml.html.fromstring(bill_html)

    # Get the basic parts of the bill
    bill_id = doc.xpath('//h1/text()')[0]
    bill_title = doc.xpath('//h2/following-sibling::p/text()')[0].strip()
    # The second character of the bill ID encodes its type.
    bill_type = {
        'F': 'bill',
        'R': 'resolution',
        'C': 'concurrent resolution'
    }[bill_id[1]]
    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    # Add subjects.  Currently we are not mapping to Open States
    # standardized subjects, so use 'scraped_subjects'
    bill['scraped_subjects'] = self._subject_mapping[bill_id]

    # Get companion bill.
    companion = doc.xpath(
        '//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()'
    )
    companion = self.make_bill_id(
        companion[0]) if len(companion) > 0 else None
    if companion is not None:
        # BUGFIX: derive the companion's chamber only when a companion
        # actually exists; previously chamber_from_bill(None) was called
        # unconditionally.
        companion_chamber = self.chamber_from_bill(companion)
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one bill's status page into a Bill object and save it."""
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    scraped_bill_id = bill_page.xpath(
        "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
    bill_id = scraped_bill_id.split(' ')[0]

    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # 'Report Title' holds a semicolon-delimited subject list.
    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(session, chamber, bill_id,
             title=meta['Measure Title'],
             summary=meta['Description'],
             referral=meta['Current Referral'],
             subjects=subs,
             type=bill_type)
    b.add_source(url)

    companion = meta['Companion'].strip()
    if companion:
        b['companion'] = companion

    # A bill "carried over" from the prior session is recorded as its
    # own companion in that session.
    prior = bill_page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridViewStatus']"
        "/tr/td/font/text()")[-1]
    if 'carried over' in prior.lower():
        prior_session = '{} Regular Session'.format(
            str(int(session[:4]) - 1))
        b.add_companion(bill_id, prior_session, chamber)

    for sponsor in meta['Introducer(s)']:
        b.add_sponsor(type='primary', name=sponsor)

    # These helpers mutate `b` in place; the unused `tables` lookup and
    # the never-read result assignments were removed as dead code.
    self.parse_bill_actions_table(b, action_table)
    self.parse_bill_versions_table(b, versions)

    self.save_bill(b)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    # Normalize chamber names to billy's 'lower'/'upper'.
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    bill_html = self.urlopen(bill_detail_url)
    doc = lxml.html.fromstring(bill_html)

    # Get the basic parts of the bill
    bill_id = doc.xpath('//h1/text()')[0]
    bill_title = doc.xpath('//h2/following-sibling::p/text()')[0].strip()
    # The second character of the bill ID encodes its type.
    bill_type = {'F': 'bill', 'R': 'resolution',
                 'C': 'concurrent resolution'}[bill_id[1]]
    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    # Add subjects.  Currently we are not mapping to Open States
    # standardized subjects, so use 'scraped_subjects'
    bill['scraped_subjects'] = self._subject_mapping[bill_id]

    # Get companion bill.
    companion = doc.xpath(
        '//table[@class="status_info"]//tr[1]/td[2]/a[starts-with(@href, "?")]/text()'
    )
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    if companion is not None:
        # BUGFIX: derive the companion's chamber only when a companion
        # actually exists; previously chamber_from_bill(None) was called
        # unconditionally.
        companion_chamber = self.chamber_from_bill(companion)
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one bill's status page into a Bill object and save it."""
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    scraped_bill_id = bill_page.xpath(
        "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
    bill_id = scraped_bill_id.split(' ')[0]

    versions = bill_page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # 'Report Title' holds a semicolon-delimited subject list.
    subs = [s.strip() for s in meta['Report Title'].split(";")]
    if "" in subs:
        subs.remove("")

    b = Bill(session, chamber, bill_id,
             title=meta['Measure Title'],
             summary=meta['Description'],
             referral=meta['Current Referral'],
             subjects=subs,
             type=bill_type)
    b.add_source(url)

    companion = meta['Companion'].strip()
    if companion:
        b['companion'] = companion

    # A bill "carried over" from the prior session is recorded as its
    # own companion in that session.
    prior = bill_page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridViewStatus']"
        "/tr/td/font/text()")[-1]
    if 'carried over' in prior.lower():
        prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
        b.add_companion(bill_id, prior_session, chamber)

    for sponsor in meta['Introducer(s)']:
        b.add_sponsor(type='primary', name=sponsor)

    # These helpers mutate `b` in place; the unused `tables` lookup and
    # the never-read result assignments were removed as dead code.
    self.parse_bill_actions_table(b, action_table)
    self.parse_bill_versions_table(b, versions)

    self.save_bill(b)
def scrape_bills(self, session, year_abr):
    """Scrape NJ bills for one session from the legislature's Access-DB
    CSV exports, then merge in votes (zipped CSVs from the FTP site),
    actions, and subjects before saving.
    """
    #Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        # e.g. 'A' + '1234' -> 'A1234'
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        # Assembly bills start with 'A'; everything else is Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec['IdenticalBillNumber'].strip():
            bill.add_companion(rec['IdenticalBillNumber'].split()[0])

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    #Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')

    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        # 'P' marks the primary sponsor; all others are cosponsors.
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    #Documents
    bill_document_csv = self.access_to_csv('BillWP')

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        # Windows path from the DB; keep only the last two components.
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))

        #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr,
            document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        if rec['DocType'] in self._version_types:
            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            # NOTE(review): if the URL ends with neither 'HTM' nor 'wpd',
            # `mimetype` is unbound (or stale from a prior record) on the
            # next line -- confirm every version document matches one of
            # the two cases above.
            bill.add_version(doc_name, htm_url, mimetype=mimetype)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    # Chamber vote files (A*/S*) plus committee vote files (CA*/CS*).
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        # NOTE(review): `votes` is re-initialized for each vote file
        # below, so only the last successfully-opened file's votes reach
        # the counting/saving pass -- confirm this is intended.
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                # 'U' = universal-newline mode (Python 2).
                vote_file = zipedfile.open(vfile, 'U')
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            # 'C' prefix marks committee vote files; their CSV schema
            # differs from the chamber files.
            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'],
                                        rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]

                date = datetime.strptime(date, "%m/%d/%Y")
                # One Vote object per (bill, chamber, motion) triple.
                vote_id = '_'.join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, None,
                                          None, None, None,
                                          bill_id=bill_id)
                if vote_file_type == 'committee':
                    votes[vote_id]['committee'] = self._committees[
                        rec['Committee_House']]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        #Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count

            # Veto override.
            if vote['motion'] == 'OVERRIDE':
                # Per the NJ leg's glossary, a veto override requires
                # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                vote['passed'] = False
                if vote['chamber'] == 'lower':
                    if vote_yes_count >= 54:
                        vote['passed'] = True
                elif vote['chamber'] == 'upper':
                    if vote_yes_count >= 27:
                        vote['passed'] = True

            # Regular vote.
            elif vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False

            vote_bill_id = vote["bill_id"]
            # NOTE(review): raises KeyError if a vote references a bill
            # that was skipped (e.g. blank title) -- confirm acceptable.
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    #Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        # A bill with no actions and no versions is assumed to be a
        # placeholder row and is not saved.
        if not bill['actions'] and not bill['versions']:
            self.warning('probable phony bill detected %s',
                         bill['bill_id'])
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_bill(bill)

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary and assembly
    floor votes from the assembly page.
    '''
    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        # e.g. letter 'S', number '1234', version 'A'.
        self.letter, self.number, self.version = bill_id_parts
        # Cache for the summary/actions text chunks (see _get_chunks).
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        # Set True only after every section was scraped without error.
        self.succeeded = False
        self._build()

    def _build(self):
        """Scrape every section of the page; no-op for empty pages."""
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        """Return (summary, actions) <pre> text blobs, fetching once."""
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y') % self.bill_id
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre/text()')
            self.data['summary'], self.data['actions'] = summary, actions
            return summary, actions
        else:
            return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        """Fetch a url via the scraper, recording it as a bill source."""
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        """Add the print-bill page as the bill's (HTML) version."""
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=' + self.bill_id
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        """Add same-as (companion) bills listed in the summary chunk."""
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS ', '')
                if companions != 'No same as':
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        # Drop the version suffix, e.g. 'S123-A' -> 'S123'.
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        """Attach the sponsor's memorandum (Assembly bills only)."""
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=&Memo=Y') % self.bill_id
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        """The last blank-line-separated chunk is the summary text."""
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        """Strip committee prefixes and titles from a sponsor name."""
        junk = [
            r'^Rules\s+',
            '\(2nd Vice Chairperson\)',
            '\(MS\)',
            'Assemblyman',
            'Assemblywoman',
            'Senator']
        for rgx in junk:
            # BUGFIX: re.I was previously passed positionally, landing in
            # re.sub's `count` parameter (re.I == 2, i.e. "replace at most
            # twice") instead of enabling case-insensitive matching; pass
            # it as flags= so it behaves as intended.
            name = re.sub(rgx, '', name, flags=re.I)
        return name.strip('(), ')

    def get_sponsors(self):
        """Parse SPONSOR/COSPNSR/MLTSPNSR lines from the summary chunk."""
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(' ', 1)
                    for sponsor in re.split(r',\s+', data.strip()):
                        if not sponsor:
                            continue

                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith('Rules'):
                            self.bill.add_sponsor('primary',
                                                  'Rules Committee',
                                                  chamber='lower')
                        sponsor = self._scrub_name(sponsor)

                        # Figure out sponsor type.
                        spons_swap = {'SPONSOR': 'primary'}
                        _sponsor_type = spons_swap.get(
                            sponsor_type, 'cosponsor')

                        self.bill.add_sponsor(_sponsor_type,
                                              sponsor.strip(),
                                              official_type=sponsor_type)

    def get_actions(self):
        """Parse dated action lines; all-caps actions are the Senate's."""
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types,
                                 **attrs)

            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        """Scrape Assembly floor vote tables for this bill."""
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=&Votes=Y')
        doc = self.url2lxml(url % self.bill_id)
        if doc is None:
            return
        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return

        actual_vote = collections.defaultdict(list)
        for table in doc.xpath('//table'):

            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = date[0].itersiblings().next().text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = votes[0].itersiblings().next().text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # Cells alternate: name, vote value, name, vote value, ...
            tds = table.xpath('tr/td/text()')
            votes = iter(tds)
            while True:
                try:
                    data = list(islice(votes, 2))
                    name, vote_val = data
                except (StopIteration, ValueError):
                    # End of data. Stop.
                    break
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
def process_bill(self, data):
    """Translate a pupa-format bill dict into a billy Bill and save it."""
    # A unicameral 'legislature' maps onto billy's 'upper' chamber.
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    if chamber == 'legislature':
        chamber = 'upper'

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])

    abstracts = data['abstracts']
    if abstracts:
        bill['summary'] = abstracts[0]['abstract']
    # Carry any extras straight onto the billy document.
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        related = action['related_entities']
        # Split related entities into committees vs. legislators.
        committees = [ent['name'] for ent in related
                      if ent['entity_type'] == 'organization']
        legislators = [ent['name'] for ent in related
                       if ent['entity_type'] == 'person']
        bill.add_action(actor, action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsorship in data['sponsorships']:
        bill.add_sponsor(sponsorship['classification'], sponsorship['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for document in data['documents']:
        for link in document['links']:
            bill.add_document(document['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(document['date']),
                              **document.get('extras', {}))

    for other_title in data['other_titles']:
        bill.add_title(other_title['title'])

    # Companions are recorded against this bill's own chamber.
    for related_bill in data['related_bills']:
        bill.add_companion(related_bill['identifier'],
                           related_bill['legislative_session'],
                           chamber)

    bill['alternate_bill_ids'] = [ident['identifier']
                                  for ident in data['other_identifiers']]

    self.save_bill(bill)
def _scrape_bill(self, session, bill_data):
    """Build and return a Bill from an Open Legislation API payload.

    The caller is responsible for saving the returned bill.
    """
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        session,
        bill_chamber,
        bill_id,
        title,
        type=bill_type,
        summary=bill_data['summary'])

    # Some bills have no title; fall back to the summary text.
    if bill_data['title'] is None:
        bill['title'] = bill_data['summary']

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor']['rules'] == True:
        bill.add_sponsor('primary', 'Rules Committee',
                         chamber=bill_chamber)
    elif not bill_data['sponsor']['budget']:
        primary_sponsor = bill_data['sponsor']['member']
        bill.add_sponsor('primary', primary_sponsor['shortName'])

        # There *shouldn't* be cosponsors if there is no sponsor.
        cosponsors = bill_active_version['coSponsors']['items']
        for cosponsor in cosponsors:
            bill.add_sponsor('cosponsor', cosponsor['shortName'])

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether "sameAs" property is populated with at least one bill.
    if same_as['items']:
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Determine companion bill chamber.
        companion_bill_prefix = self._parse_bill_number(
            same_as['items'][0]['basePrintNo'])[0]
        companion_bill_chamber = self._parse_bill_prefix(
            companion_bill_prefix)[0]

        # Attach companion bill data.
        bill.add_companion(
            companion_bill_id,
            companion_bill_session,
            companion_bill_chamber,
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(action['date'],
                                                     '%Y-%m-%d')
        action_date = action_datetime.date()
        types, attrs = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(
            chamber,
            action['text'],
            action_date,
            type=types,
            **attrs)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            vote = self._parse_senate_votes(vote_data)
            bill.add_vote(vote)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()
        assembly_bill_data = assembly.bill

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.iteritems():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        # BUGFIX: pass the document *URL* as the second argument;
        # previously the version name was passed as both name and url,
        # and html_url was computed but never used.
        bill.add_version(html_version, html_url, mimetype='text/html')

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        # BUGFIX: same correction for the PDF link.
        bill.add_version(pdf_version, pdf_url, mimetype='application/pdf')

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    bill.add_source(self.api_client.root + self.api_client.
                    resources['bill'].format(
                        session_year=session,
                        bill_id=bill_id,
                        summary='',
                        detail=''))
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    return bill
def _scrape_bill(self, session, bill_data):
    """Build and return a Bill from an Open Legislation API payload.

    The caller is responsible for saving the returned bill.
    """
    details = self._parse_bill_details(bill_data)

    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(session, bill_chamber, bill_id, title,
                type=bill_type, summary=bill_data['summary'])

    # Some bills have no title; fall back to the summary text.
    if bill_data['title'] is None:
        bill['title'] = bill_data['summary']

    bill_active_version = bill_data['amendments']['items'][active_version]

    # Parse sponsors.
    if bill_data['sponsor']['rules'] == True:
        bill.add_sponsor('primary', 'Rules Committee',
                         chamber=bill_chamber)
    elif not bill_data['sponsor']['budget']:
        primary_sponsor = bill_data['sponsor']['member']
        bill.add_sponsor('primary', primary_sponsor['shortName'])

        # There *shouldn't* be cosponsors if there is no sponsor.
        cosponsors = bill_active_version['coSponsors']['items']
        for cosponsor in cosponsors:
            bill.add_sponsor('cosponsor', cosponsor['shortName'])

    # List companion bill.
    same_as = bill_active_version.get('sameAs', {})
    # Check whether "sameAs" property is populated with at least one bill.
    if same_as['items']:
        # Get companion bill ID.
        companion_bill_id = same_as['items'][0]['basePrintNo']

        # Build companion bill session.
        start_year = same_as['items'][0]['session']
        end_year = start_year + 1
        companion_bill_session = '-'.join([str(start_year), str(end_year)])

        # Determine companion bill chamber.
        companion_bill_prefix = self._parse_bill_number(
            same_as['items'][0]['basePrintNo'])[0]
        companion_bill_chamber = self._parse_bill_prefix(
            companion_bill_prefix)[0]

        # Attach companion bill data.
        bill.add_companion(
            companion_bill_id,
            companion_bill_session,
            companion_bill_chamber,
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        action_datetime = datetime.datetime.strptime(
            action['date'], '%Y-%m-%d')
        action_date = action_datetime.date()
        types, attrs = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(chamber, action['text'], action_date,
                        type=types, **attrs)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            vote = self._parse_senate_votes(vote_data)
            bill.add_vote(vote)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()
        assembly_bill_data = assembly.bill

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    amendments = bill_data['amendments']['items']
    for key, amendment in amendments.iteritems():
        version = amendment['printNo']

        html_version = version + ' HTML'
        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        # BUGFIX: pass the document *URL* as the second argument;
        # previously the version name was passed as both name and url,
        # and html_url was computed but never used.
        bill.add_version(html_version, html_url, mimetype='text/html')

        pdf_version = version + ' PDF'
        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        # BUGFIX: same correction for the PDF link.
        bill.add_version(pdf_version, pdf_url, mimetype='application/pdf')

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    bill.add_source(self.api_client.root + self.api_client.
                    resources['bill'].format(
                        session_year=session,
                        bill_id=bill_id,
                        summary='',
                        detail=''))
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    return bill
def scrape_bill_pages(self, session, year_abr):
    """Assemble information on a bill from a number of DBF files.

    Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ databases
    for the given session/year (plus the FTP vote dumps), builds Bill
    objects keyed by bill_id, and saves them all at the end.
    """
    # Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A*' prefixes are Assembly bills; everything else is Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec['identicalb']:
            bill.add_companion(rec['identicalb'].split()[0])
            # TODO: last session info is in there too

        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    # print bill_document_db[2]

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['doctype']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['doctype'], bill_id))
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        if rec['doctype'] in self._version_types:
            # BUG FIX: mimetype was previously only bound inside the two
            # branches below, so an unexpected extension either raised a
            # NameError (first record) or silently reused the previous
            # record's stale value. Reset it for each record.
            mimetype = None
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            bill.add_version(doc_name, htm_url, mimetype=mimetype)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        # Assembly and committee-Assembly files are lower chamber votes.
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"

        # 'C'-prefixed dumps are committee votes with a different schema.
        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'

        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = actor_map[rec["house"]]
        comment = rec["comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    # Normalize chamber names to billy's lower/upper vocabulary.
    chamber = 'lower' if chamber.lower() == 'house' else chamber
    chamber = 'upper' if chamber.lower() == 'senate' else chamber

    # Get html and parse
    doc = self.lxmlize(bill_detail_url)

    # Check if bill hasn't been transmitted to the other chamber yet
    transmit_check = self.get_node(
        doc,
        '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
    )
    if (transmit_check is not None and
            'has not been transmitted' in transmit_check.strip()):
        self.logger.debug('Bill has not been transmitted to other chamber '
                          '... skipping {0}'.format(bill_detail_url))
        return

    # Get the basic parts of the bill
    bill_id = self.get_node(doc, '//h1/text()')
    self.logger.debug(bill_id)
    bill_title_text = self.get_node(
        doc,
        '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()')
    if bill_title_text is not None:
        bill_title = bill_title_text.strip()
    else:
        # No inline description; follow the "Long Description" link instead.
        long_desc_url = self.get_node(
            doc, '//a[text()[contains(.,"Long Description")]]/@href')
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(
            long_desc_page, '//h1/following-sibling::p/text()')
        if long_desc_text is not None:
            bill_title = long_desc_text.strip()
        else:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
    self.logger.debug(bill_title)

    # Second character of the bill id encodes the bill type.
    bill_type = {
        'F': 'bill',
        'R': 'resolution',
        'C': 'concurrent resolution'
    }[bill_id[1]]
    bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

    # Add source
    bill.add_source(bill_detail_url)

    # Add subjects. Currently we are not mapping to Open States
    # standardized subjects, so use 'scraped_subjects'
    bill['scraped_subjects'] = self._subject_mapping[bill_id]

    # Get companion bill.
    companion = doc.xpath(
        '//table[@class="status_info"]//tr[1]/td[2]'
        '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(companion[0]) if len(companion) > 0 else None
    if companion is not None:
        # BUG FIX: only derive the companion's chamber when a companion
        # actually exists; previously chamber_from_bill() was invoked
        # with None even though its result was unused in that case.
        companion_chamber = self.chamber_from_bill(companion)
        bill.add_companion(companion, chamber=companion_chamber)

    # Grab sponsors
    bill = self.extract_sponsors(bill, doc, chamber)

    # Add Actions performed on the bill.
    bill = self.extract_actions(bill, doc, chamber)

    # Get all versions of the bill.
    bill = self.extract_versions(bill, doc, chamber, version_list_url)

    self.save_bill(bill)
def scrape_bill_pages(self, session, year_abr):
    """Assemble information on a bill from a number of DBF files.

    Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ databases
    for the given session/year (plus the FTP vote dumps), builds Bill
    objects keyed by bill_id, and saves them all at the end.
    """
    # Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A*' prefixes are Assembly bills; everything else is Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec['identicalb']:
            bill.add_companion(rec['identicalb'].split()[0])
            # TODO: last session info is in there too

        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(
        year_abr, 'BILLSPON')

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')
    # print bill_document_db[2]

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['doctype']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['doctype'], bill_id))
        if rec['comment']:
            doc_name += ' ' + rec['comment']

        if rec['doctype'] in self._version_types:
            # BUG FIX: mimetype was previously only bound inside the two
            # branches below, so an unexpected extension either raised a
            # NameError (first record) or silently reused the previous
            # record's stale value. Reset it for each record.
            mimetype = None
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            bill.add_version(doc_name, htm_url, mimetype=mimetype)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, 'U')
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        # Assembly and committee-Assembly files are lower chamber votes.
        if filename.startswith('A') or filename.startswith('CA'):
            chamber = "lower"
        else:
            chamber = "upper"

        # 'C'-prefixed dumps are committee votes with a different schema.
        if filename.startswith('C'):
            vote_file_type = 'committee'
        else:
            vote_file_type = 'chamber'

        for rec in vdict_file:
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = actor_map[rec["house"]]
        comment = rec["comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
    for rec in subject_db:
        bill_id = rec['billtype'] + str(int(rec['billnumber']))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['subjectkey'])
        else:
            self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)
        self.save_bill(bill)
class AssemblyBillPage(object):
    '''Get the actions, sponsors, sponsors memo and summary and assembly
    floor votes from the assembly page.
    '''
    metadata = metadata('ny')

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        self.scraper = scraper
        self.session = session
        self.term = term_for_session('ny', session)
        # Resolve the term record (and its start year) for this session.
        for data in self.metadata['terms']:
            if session in data['sessions']:
                self.termdata = data
                self.term_start_year = data['start_year']

        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False
        self._build()

    def _build(self):
        # Bail out early if the page has no <pre> payload to parse.
        if not self.doc.xpath('//pre/text()'):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        '''Fetch (and memoize in self.data) the summary and actions text.'''
        if 'summary' not in self.data:
            url = ('http://assembly.state.ny.us/leg/?default_fld=&'
                   'bn=%s&Summary=Y&Actions=Y&term=%s')
            url = url % (self.bill_id, self.term_start_year)
            doc = self.url2lxml(url)
            summary, actions = doc.xpath('//pre')[:2]
            summary = summary.text_content()
            actions = actions.text_content()
            self.data['summary'] = summary
            self.data['actions'] = actions
            return summary, actions
        else:
            return self.data['summary'], self.data['actions']

    def url2lxml(self, url):
        # Every page we fetch also becomes a source on the bill.
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=%s&term=%s'
        url = url % (self.bill_id, self.term_start_year)
        version = self.bill_id
        self.bill.add_version(version, url, mimetype='text/html')

    def get_companions(self):
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            if chunk.startswith('SAME AS'):
                companions = chunk.replace('SAME AS ', '')
                if companions != 'No same as':
                    for companion in re.split(r'\s*[\,\\]\s*', companions):
                        companion = re.sub(r'^Same as ', '', companion)
                        companion = re.sub(r'^Uni', '', companion)
                        # Strip any trailing '-<suffix>' amendment marker.
                        companion = re.sub(r'\-\w+$', '', companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        if self.chamber == 'lower':
            url = ('http://assembly.state.ny.us/leg/?'
                   'default_fld=&bn=%s&term=%s&Memo=Y')
            url = url % (self.bill_id, self.term_start_year)
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        self.bill['summary'] = chunks[-1]

    def _scrub_name(self, name):
        '''Strip titles, committee prefixes and junk tokens from a name.'''
        junk = [
            r'^Rules\s+',
            r'\(2nd Vice Chairperson\)',
            r'\(MS\)',
            'Assemblyman',
            'Assemblywoman',
            'Senator']
        for rgx in junk:
            # BUG FIX: re.sub's fourth positional argument is `count`,
            # not `flags`; the original re.sub(rgx, '', name, re.I)
            # passed re.I (== 2) as a replacement limit and stayed
            # case-sensitive. Pass it as a keyword flag instead.
            name = re.sub(rgx, '', name, flags=re.I)

        # Collapse whitespace.
        name = re.sub(r'\s+', ' ', name)
        return name.strip('(), ')

    def get_sponsors(self):
        summary, _ = self._get_chunks()
        chunks = summary.split('\n\n')
        for chunk in chunks:
            for sponsor_type in ('SPONSOR', 'COSPNSR', 'MLTSPNSR'):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(' ', 1)
                    for sponsor in re.split(r',\s+', data.strip()):
                        if not sponsor:
                            continue

                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith('Rules'):
                            self.bill.add_sponsor('primary',
                                                  'Rules Committee',
                                                  chamber='lower')

                        sponsor = self._scrub_name(sponsor)

                        # Figure out sponsor type.
                        spons_swap = {'SPONSOR': 'primary'}
                        _sponsor_type = spons_swap.get(
                            sponsor_type, 'cosponsor')

                        self.bill.add_sponsor(_sponsor_type, sponsor.strip(),
                                              official_type=sponsor_type)

    def get_actions(self):
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r'(\d{2}/\d{2}/\d{4})\s+(.+)'
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r'%m/%d/%Y')
            # All-caps action text indicates a Senate (upper) action.
            act_chamber = ('upper' if action.isupper() else 'lower')
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date,
                                 type=types, **attrs)
            # Bail if the bill has been substituted by another.
            if 'substituted by' in action:
                return

    def get_lower_votes(self):
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=%s&Votes=Y')
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return
        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return

        actual_vote = collections.defaultdict(list)
        for table in doc.xpath('//table'):

            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = date[0].itersiblings().next().text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = votes[0].itersiblings().next().text
            yes_count, no_count = map(int, votes.split('/'))
            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            tds = table.xpath('tr/td/text()')
            votes = iter(tds)
            while True:
                try:
                    # Consume (name, vote) pairs; an exhausted or odd
                    # iterator raises and ends the loop.
                    data = list(islice(votes, 2))
                    name, vote_val = data
                except (StopIteration, ValueError):
                    # End of data. Stop.
                    break
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                # NOTE(review): recording every raw vote value here;
                # original grouping was ambiguous in the source dump —
                # confirm whether only 'other' values were tracked.
                actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
def scrape_bill_pages(self, session, year_abr):
    """Assemble information on a bill from a number of DBF files.

    Reads the MAINBILL, BILLSPON, BILLWP, BILLHIST and BILLSUBJ databases
    for the given session/year (plus the FTP vote dumps), builds Bill
    objects keyed by bill_id, and saves every bill that has at least one
    action or version ("phony" bills are skipped with a warning).
    """
    # Main Bill information
    main_bill_url, main_bill_db = self.get_dbf(year_abr, "MAINBILL")

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["synopsis"]
        # 'A*' prefixes are Assembly bills; everything else is Senate.
        if bill_type[0] == "A":
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec["identicalb"]:
            bill.add_companion(rec["identicalb"].split()[0])
            # TODO: last session info is in there too

        bill.add_source(main_bill_url)
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, "BILLSPON")

    for rec in bill_sponsors_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        name = rec["sponsor"]
        sponsor_type = rec["type"]
        if sponsor_type == "P":
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_url, bill_document_db = self.get_dbf(year_abr, "BILLWP")
    # print bill_document_db[2]

    for rec in bill_document_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning("unknown bill %s in document database" % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["document"]
        document = document.split("\\")
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = "http://www.njleg.state.nj.us/%s/Bills/%s" % (
            year_abr, document.replace(".DOC", ".HTM"))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec["doctype"]]
        except KeyError:
            raise Exception("unknown doctype %s on %s" %
                            (rec["doctype"], bill_id))
        if rec["comment"]:
            doc_name += " " + rec["comment"]

        if rec["doctype"] in self._version_types:
            # BUG FIX: mimetype was previously only bound inside the two
            # branches below, so an unexpected extension either raised a
            # NameError (first record) or silently reused the previous
            # record's stale value. Reset it for each record.
            mimetype = None
            if htm_url.endswith("HTM"):
                mimetype = "text/html"
            elif htm_url.endswith("wpd"):
                mimetype = "application/vnd.wordperfect"
            bill.add_version(doc_name, htm_url, mimetype=mimetype)
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        "A%s" % year_abr,
        "A%s" % next_year,
        "S%s" % year_abr,
        "S%s" % next_year,
        "CA%s-%s" % (year_abr, next_year),
        "CS%s-%s" % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = "ftp://www.njleg.state.nj.us/votes/%s.zip" % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning("could not find %s" % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        vfile = "%s.txt" % filename
        vote_file = zipedfile.open(vfile, "U")
        vdict_file = csv.DictReader(vote_file)

        votes = {}
        # Assembly and committee-Assembly files are lower chamber votes.
        if filename.startswith("A") or filename.startswith("CA"):
            chamber = "lower"
        else:
            chamber = "upper"

        # 'C'-prefixed dumps are committee votes with a different schema.
        if filename.startswith("C"):
            vote_file_type = "committee"
        else:
            vote_file_type = "chamber"

        for rec in vdict_file:
            if vote_file_type == "chamber":
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = "%s%s" % (rec["Bill_Type"], rec["Bill_Number"])
                leg = rec["Name"]
                # drop time portion
                date = rec["Agenda_Date"].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec["BillAction"]]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec["LegislatorVote"][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = "_".join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == "committee":
                votes[vote_id]["committee"] = \
                    self._committees[rec["Committee_House"]]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote
        for vote in votes.itervalues():
            vote_yes_count = len(vote["yes_votes"])
            vote_no_count = len(vote["no_votes"])
            vote_other_count = len(vote["other_votes"])
            vote["yes_count"] = vote_yes_count
            vote["no_count"] = vote_no_count
            vote["other_count"] = vote_other_count
            if vote_yes_count > vote_no_count:
                vote["passed"] = True
            else:
                vote["passed"] = False
            vote_bill_id = vote["bill_id"]
            bill = bill_dict[vote_bill_id]
            bill.add_vote(vote)

    # Actions
    bill_action_url, bill_action_db = self.get_dbf(year_abr, "BILLHIST")
    actor_map = {"A": "lower", "G": "executive", "S": "upper"}

    for rec in bill_action_db:
        bill_type = rec["billtype"]
        bill_number = int(rec["billnumber"])
        bill_id = bill_type + str(bill_number)
        bill = bill_dict[bill_id]
        action = rec["action"]
        date = rec["dateaction"]
        actor = actor_map[rec["house"]]
        comment = rec["comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += " " + comment
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_url, subject_db = self.get_dbf(year_abr, "BILLSUBJ")
    for rec in subject_db:
        bill_id = rec["billtype"] + str(int(rec["billnumber"]))
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault("subjects", []).append(rec["subjectkey"])
        else:
            self.warning("invalid bill id in BILLSUBJ.DBF: %s" % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        bill.add_source(bill_sponsors_url)
        bill.add_source(bill_document_url)
        bill.add_source(bill_action_url)
        bill.add_source(subject_url)

        # Bills with no actions and no versions are almost certainly
        # database artifacts; count them instead of saving.
        if not bill["actions"] and not bill["versions"]:
            self.warning("probable phony bill detected %s",
                         bill["bill_id"])
            phony_bill_count += 1
        else:
            self.save_bill(bill)

    if phony_bill_count:
        self.warning("%s total phony bills detected", phony_bill_count)
def scrape_bills(self, session, year_abr):
    """Assemble information on a bill from the Access-export CSV files.

    Reads MainBill, BillSpon, BillWP, BillHist and BillSubj (plus the FTP
    vote dumps), builds Bill objects keyed by bill_id, and saves every
    bill that has at least one action or version.
    """
    # Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        # 'A*' prefixes are Assembly bills; everything else is Senate.
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(str(session), chamber, bill_id, title,
                    type=self._bill_types[bill_type[1:]])
        if rec['IdenticalBillNumber'].strip():
            bill.add_companion(rec['IdenticalBillNumber'].split()[0])
            # TODO: last session info is in there too

        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')

    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsor(sponsor_type, name)

    # Documents
    bill_document_csv = self.access_to_csv('BillWP')

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        year = str(year_abr) + str((year_abr + 1))
        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
            year_abr, document.replace('.DOC', '.HTM'))

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            # BUG FIX: mimetype was previously only bound inside the two
            # branches below, so an unexpected extension either raised a
            # NameError (first record) or silently reused the previous
            # record's stale value. Reset it for each record.
            mimetype = None
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version(doc_name, htm_url, mimetype=mimetype)
            except ValueError:
                self.warning("Couldn't find a document for bill {}".format(
                    bill_id))
                pass
        else:
            bill.add_document(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = ['A%s' % year_abr,
                      'A%s' % next_year,
                      'S%s' % year_abr,
                      'S%s' % next_year,
                      'CA%s-%s' % (year_abr, next_year),
                      'CS%s-%s' % (year_abr, next_year),
                      ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zipedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = zipedfile.open(vfile, 'U')
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            votes = {}
            # Assembly and committee-Assembly files are lower chamber.
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            # 'C'-prefixed dumps are committee votes, different schema.
            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]
                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'],
                                        rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join((bill_id, chamber, action))
                vote_id = vote_id.replace(" ", "_")

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, None,
                                          None, None, None,
                                          bill_id=bill_id)
                if vote_file_type == 'committee':
                    votes[vote_id]['committee'] = self._committees[
                        rec['Committee_House']]

                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count

                # Veto override.
                if vote['motion'] == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the
                    # house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    vote['passed'] = False
                    if vote['chamber'] == 'lower':
                        if vote_yes_count >= 54:
                            vote['passed'] = True
                    elif vote['chamber'] == 'upper':
                        if vote_yes_count >= 27:
                            vote['passed'] = True

                # Regular vote.
                elif vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        # remove temp file
        # BUG FIX: moved outside the per-vfile loop; removing it per
        # vote file raised OSError when both the plain and the "End"
        # dump existed in the same zip.
        os.remove(s_vote_zip)

    # Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(actor, action, date, type=atype)

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.setdefault('subjects', []).append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.itervalues():
        # add sources
        # Bills with no actions and no versions are almost certainly
        # database artifacts; count them instead of saving.
        if not bill['actions'] and not bill['versions']:
            self.warning('probable phony bill detected %s',
                         bill['bill_id'])
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            self.save_bill(bill)

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
class AssemblyBillPage(object):
    """Scrape one bill's NY Assembly page.

    Collects the actions, sponsors, sponsor's memo, summary, companion
    bills, bill version, and Assembly floor votes for a single bill,
    accumulating everything on ``self.bill`` (a billy ``Bill``).
    ``self.succeeded`` is True only when the page actually contained
    bill data and every scrape step completed.
    """

    # Statewide metadata (terms/sessions) for New York.
    metadata = metadata("ny")

    def __init__(self, scraper, session, chamber, url, doc, bill_type,
                 bill_id, title, bill_id_parts):
        """Store context and build the Bill object immediately.

        :param scraper: parent scraper (provides url2lxml and categorizer)
        :param session: legislative session key
        :param chamber: 'upper' or 'lower'
        :param url: URL of the Assembly bill page
        :param doc: lxml document of the already-fetched bill page
        :param bill_type: billy bill type (e.g. 'bill', 'resolution')
        :param bill_id: normalized bill id (e.g. 'A123')
        :param title: bill title
        :param bill_id_parts: (letter, number, version) triple
        """
        self.scraper = scraper
        self.session = session
        self.term = term_for_session("ny", session)
        # Find this session's term to learn its start year; the Assembly
        # site uses the start year as the 'term' query parameter.
        for data in self.metadata["terms"]:
            if session in data["sessions"]:
                self.termdata = data
                self.term_start_year = data["start_year"]
        self.chamber = chamber
        self.url = url
        self.doc = doc
        self.bill_id = bill_id
        self.letter, self.number, self.version = bill_id_parts
        self.data = {}  # memo cache used by _get_chunks()
        self.bill = Bill(session, chamber, bill_id, title, type=bill_type)
        self.succeeded = False
        self._build()

    def _build(self):
        """Run every scrape step; bail early if the page has no data."""
        # Pages without a <pre> block carry no bill text/data.
        if not self.doc.xpath("//pre/text()"):
            return
        self.get_actions()
        self.get_sponsors_memo()
        self.get_sponsors()
        self.get_summary()
        self.get_companions()
        self.get_lower_votes()
        self.get_version()
        self.succeeded = True
        self.bill.add_source(self.url)

    def _get_chunks(self):
        """Return (summary, actions) text, fetching and caching on first use."""
        if "summary" not in self.data:
            url = ("http://assembly.state.ny.us/leg/?default_fld=&"
                   "bn=%s&Summary=Y&Actions=Y&term=%s")
            url = url % (self.bill_id, self.term_start_year)
            doc = self.url2lxml(url)
            # First <pre> is the summary, second is the action list.
            summary, actions = doc.xpath("//pre")[:2]
            summary = summary.text_content()
            actions = actions.text_content()
            self.data["summary"] = summary
            self.data["actions"] = actions
            return summary, actions
        else:
            return self.data["summary"], self.data["actions"]

    def url2lxml(self, url):
        """Fetch *url* via the parent scraper, recording it as a source."""
        self.bill.add_source(url)
        return self.scraper.url2lxml(url)

    def get_version(self):
        """Add the print-friendly bill text as the bill's version."""
        url = "http://assembly.state.ny.us/leg/?sh=printbill&bn=%s&term=%s"
        url = url % (self.bill_id, self.term_start_year)
        version = self.bill_id
        self.bill.add_version(version, url, mimetype="text/html")

    def get_companions(self):
        """Add any same-as (companion) bills listed in the summary."""
        summary, _ = self._get_chunks()
        chunks = summary.split("\n\n")
        for chunk in chunks:
            if chunk.startswith("SAME AS"):
                companions = chunk.replace("SAME AS ", "")
                if companions != "No same as":
                    # Companion ids are separated by commas or backslashes.
                    for companion in re.split(r"\s*[\,\\]\s*", companions):
                        companion = re.sub(r"^Same as ", "", companion)
                        # Strip a leading "Uni" (uni-bill marker).
                        companion = re.sub(r"^Uni", "", companion)
                        # Drop any amendment suffix like "-A".
                        companion = re.sub(r"\-\w+$", "", companion)
                        self.bill.add_companion(companion)

    def get_sponsors_memo(self):
        """Attach the sponsor's memorandum document (Assembly bills only)."""
        if self.chamber == "lower":
            url = ("http://assembly.state.ny.us/leg/?"
                   "default_fld=&bn=%s&term=%s&Memo=Y")
            url = url % (self.bill_id, self.term_start_year)
            self.bill.add_document("Sponsor's Memorandum", url)

    def get_summary(self):
        """Store the last paragraph of the summary text as the bill summary."""
        summary, _ = self._get_chunks()
        chunks = summary.split("\n\n")
        self.bill["summary"] = chunks[-1]

    def _scrub_name(self, name):
        """Strip titles, committee prefixes, and noise from a member name."""
        junk = [r"^Rules\s+", r"\(2nd Vice Chairperson\)", r"\(MS\)",
                "Assemblyman", "Assemblywoman", "Senator"]
        for rgx in junk:
            # BUGFIX: re.I was previously passed as re.sub's positional
            # *count* argument (so at most 2 case-sensitive replacements
            # were made); it is a flag and must be passed as flags=.
            name = re.sub(rgx, "", name, flags=re.I)
        # Collapse whitespace.
        name = re.sub(r"\s+", " ", name)
        return name.strip("(), ")

    def get_sponsors(self):
        """Add primary/cosponsor/multi-sponsor entries from the summary."""
        summary, _ = self._get_chunks()
        chunks = summary.split("\n\n")
        for chunk in chunks:
            for sponsor_type in ("SPONSOR", "COSPNSR", "MLTSPNSR"):
                if chunk.startswith(sponsor_type):
                    _, data = chunk.split(" ", 1)
                    # Only "SPONSOR" maps to primary; COSPNSR/MLTSPNSR are
                    # cosponsors. (Hoisted out of the per-name loop.)
                    spons_swap = {"SPONSOR": "primary"}
                    _sponsor_type = spons_swap.get(sponsor_type, "cosponsor")
                    for sponsor in re.split(r",\s+", data.strip()):
                        if not sponsor:
                            continue
                        # If it's a "Rules" bill, add the Rules committee
                        # as the primary.
                        if sponsor.startswith("Rules"):
                            self.bill.add_sponsor("primary", "Rules Committee",
                                                  chamber="lower")
                        sponsor = self._scrub_name(sponsor)
                        self.bill.add_sponsor(_sponsor_type, sponsor.strip(),
                                              official_type=sponsor_type)

    def get_actions(self):
        """Parse dated action lines and add categorized actions."""
        _, actions = self._get_chunks()
        categorizer = self.scraper.categorizer
        actions_rgx = r"(\d{2}/\d{2}/\d{4})\s+(.+)"
        actions_data = re.findall(actions_rgx, actions)
        for date, action in actions_data:
            date = datetime.datetime.strptime(date, r"%m/%d/%Y")
            # All-upper-case action text indicates a Senate (upper) action.
            act_chamber = "upper" if action.isupper() else "lower"
            types, attrs = categorizer.categorize(action)
            self.bill.add_action(act_chamber, action, date, type=types,
                                 **attrs)
            # Bail if the bill has been substituted by another.
            if "substituted by" in action:
                return

    def get_lower_votes(self):
        """Scrape Assembly floor votes (one HTML table per roll call)."""
        url = ("http://assembly.state.ny.us/leg/?"
               "default_fld=&bn=%s&term=%s&Votes=Y")
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return
        pre = doc.xpath("//pre")[0].text_content()
        no_votes = ("There are no votes for this bill in this "
                    "legislative session.")
        if pre == no_votes:
            return
        # Raw vote strings keyed by vote value; shared across all tables.
        actual_vote = collections.defaultdict(list)
        for table in doc.xpath("//table"):
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = date[0].itersiblings().next().text  # py2 iterator API
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = votes[0].itersiblings().next().text
            yes_count, no_count = map(int, votes.split("/"))
            passed = yes_count > no_count
            vote = Vote("lower", date, "Floor Vote", passed, yes_count,
                        no_count, other_count=0)
            # Cells alternate: name, vote value, name, vote value, ...
            tds = table.xpath("tr/td/text()")
            votes = iter(tds)
            while True:
                try:
                    data = list(islice(votes, 2))
                    name, vote_val = data
                except (StopIteration, ValueError):
                    # End of data. Stop.
                    break
                name = self._scrub_name(name)
                if vote_val.strip() == "Y":
                    vote.yes(name)
                elif vote_val.strip() in ("N", "NO"):
                    vote.no(name)
                else:
                    vote.other(name)
                    # NOTE(review): only non-Y/N values are recorded in
                    # actual_vote here — confirm Y/N voters were meant
                    # to be excluded.
                    actual_vote[vote_val].append(name)
            # The page doesn't provide an other_count.
            vote["other_count"] = len(vote["other_votes"])
            vote["actual_vote"] = actual_vote
            self.bill.add_vote(vote)