def scrape(self):
    """Yield a Bill for every search result that has a non-empty title.

    Sets ``self.session`` to '2011', walks every page returned by
    ``searchLegislation`` and, for each summary on the page, builds a Bill
    enriched with related files, sponsors, topics and PDF attachments taken
    from ``expandLegislationSummary``.
    """
    self.session = '2011'

    for page in self.searchLegislation():
        for summary in self.parseSearchResults(page):
            title = summary['Title'].strip()
            if not title:
                # Untitled records are skipped entirely.
                continue

            bill = Bill(
                name=summary['Record #'],
                session=self.session,
                title=title,
                type=[summary['Type'].lower()],
                organization=self.jurisdiction.name,
            )
            bill.add_source(summary['URL'])

            details = self.expandLegislationSummary(summary)

            for related_name in details.get('Related files', []):
                bill.add_related_bill(
                    name=related_name,
                    session=self.session,
                    relation='other-session',
                    chamber=None,
                )

            # The first listed sponsor is treated as the primary one.
            for position, sponsor in enumerate(details.get('Sponsors', [])):
                is_primary = position == 0
                label = "Primary" if is_primary else "Regular"
                bill.add_sponsor(sponsor, label, 'person', is_primary)

            for topic in details.get(u'Topics', []):
                bill.add_subject(topic)

            for attached in details.get(u'Attachments', []):
                bill.add_version_link(
                    'PDF', attached['url'], mimetype="application/pdf")

            yield bill
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one bill page and yield its actions' results, then the Bill.

    The bill identifier is built from the ``billtype`` and ``billnumber``
    query parameters of ``url``.  Everything yielded by
    ``parse_bill_actions_table`` is passed through before the Bill itself.
    """
    page = lxml.html.fromstring(self.get(url).text)

    query = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = '{}{}'.format(query['billtype'], query['billnumber'])

    # Each lookup takes the first match and raises IndexError if absent.
    versions_table = page.xpath(
        "//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = page.xpath(
        '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = page.xpath(
        '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    subjects = [part.strip() for part in meta['Report Title'].split(";")]
    try:
        # Only the first empty entry is dropped.
        subjects.remove("")
    except ValueError:
        pass

    bill = Bill(bill_id, session, meta['Measure Title'],
                chamber=chamber, classification=bill_type)

    description = meta['Description']
    if description:
        bill.add_abstract(description, 'description')

    for subject in subjects:
        bill.add_subject(subject)

    if url:
        bill.add_source(url)

    # Session label for the year before the first four characters of
    # ``session``.
    prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))

    companion = meta['Companion'].strip()
    if companion:
        bill.add_related_bill(
            identifier=companion.replace(u'\xa0', ' '),
            legislative_session=prior_session,
            relation_type="companion",
        )

    # Last text cell of the status grid.
    last_status = page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
    if 'carried over' in last_status.lower():
        bill.add_related_bill(
            identifier=bill_id.replace(u'\xa0', ' '),
            legislative_session=prior_session,
            relation_type="companion",
        )

    for introducer in meta['Introducer(s)']:
        bill.add_sponsorship(introducer, 'primary', 'person', True)

    # Called for its effect on ``bill``; the return value is not used.
    self.parse_bill_versions_table(bill, versions_table)

    yield from self.parse_bill_actions_table(
        bill, action_table, bill_id, session, url, chamber)
    yield bill
def _scrape_bill(self, session, bill_data):
    """Build a Bill from one API bill record and yield it.

    When the bill chamber is 'upper', the result of
    ``_parse_senate_votes`` is yielded for every vote item before the
    Bill.  When it is 'lower', an ``AssemblyBillPage`` is built for the
    bill.

    :param session: value used as the bill's legislative session and in
        the API source URL.
    :param bill_data: mapping for one bill; the keys read here are
        'summary', 'amendments', 'sponsor', 'actions' and 'votes'.
    """
    details = self._parse_bill_details(bill_data)
    (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
     title, (prefix, number, active_version)) = details

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=bill_chamber,
        title=title or bill_data['summary'],
        classification=bill_type,
    )

    if bill_data['summary']:
        bill.add_abstract(bill_data['summary'], note='')

    amendments = bill_data['amendments']['items']
    bill_active_version = amendments[active_version]

    # Parse sponsors.
    sponsor = bill_data['sponsor']
    if sponsor is not None:
        if sponsor['rules'] is True:
            bill.add_sponsorship(
                'Rules Committee',
                entity_type='organization',
                classification='primary',
                primary=True,
            )
        elif not sponsor['budget']:
            bill.add_sponsorship(
                sponsor['member']['shortName'],
                entity_type='person',
                classification='primary',
                primary=True,
            )

        # There *shouldn't* be cosponsors if there is no sponsor.
        for cosponsor in bill_active_version['coSponsors']['items']:
            bill.add_sponsorship(
                cosponsor['shortName'],
                entity_type='person',
                classification='cosponsor',
                primary=False,
            )

    # List companion bill.  "sameAs" may be missing or empty, so both
    # levels are looked up defensively instead of indexing ['items'].
    same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
    if same_as_items:
        companion = same_as_items[0]
        # Companion session is "<start>-<start + 1>".
        start_year = companion['session']
        companion_bill_session = '-'.join(
            [str(start_year), str(start_year + 1)])
        bill.add_related_bill(
            companion['basePrintNo'],
            companion_bill_session,
            relation_type='companion',
        )

    # Parse actions.
    chamber_map = {
        'senate': 'upper',
        'assembly': 'lower',
    }

    for action in bill_data['actions']['items']:
        chamber = chamber_map[action['chamber'].lower()]
        # Round-trip through strptime so a malformed date raises here.
        action_date = datetime.datetime.strptime(
            action['date'], '%Y-%m-%d').date()
        types, _ = NYBillScraper.categorizer.categorize(action['text'])

        bill.add_action(
            action['text'],
            action_date.strftime('%Y-%m-%d'),
            chamber=chamber,
            classification=types,
        )

    # Handling of sources follows. Sources serving either chamber
    # maintain duplicate data, so we can see certain bill data
    # through either chamber's resources. However, we have to refer
    # to a specific chamber's resources if we want to grab certain
    # specific information such as vote data.
    #
    # As such, I'm placing all potential sources in the interest of
    # thoroughness. - Andy Lo

    # List Open Legislation API endpoint as a source.
    api_url = self.api_client.root + self.api_client.resources['bill'].format(
        session_year=session,
        bill_id=bill_id,
        summary='',
        detail='')
    bill.add_source(api_url)
    bill.add_source(senate_url)
    bill.add_source(assembly_url)

    # Chamber-specific processing.
    if bill_chamber == 'upper':
        # Collect votes.
        for vote_data in bill_data['votes']['items']:
            yield self._parse_senate_votes(vote_data, bill, api_url)
    elif bill_chamber == 'lower':
        assembly = AssemblyBillPage(self, session, bill, details)
        assembly.build()

    # A little strange the way it works out, but the Assembly
    # provides the HTML version documents and the Senate provides
    # the PDF version documents.
    for amendment in amendments.values():
        version = amendment['printNo']

        html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
            '{}&term={}'.format(bill_id, self.term_start_year)
        bill.add_version_link(
            version + ' HTML',
            html_url,
            on_duplicate='ignore',
            media_type='text/html',
        )

        pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
            .format(self.term_start_year, bill_id)
        bill.add_version_link(
            version + ' PDF',
            pdf_url,
            on_duplicate='ignore',
            media_type='application/pdf',
        )

    yield bill
def scrape(self):
    """Yield Bills and VoteEvents for legislation created after 2015-05-17.

    Summaries without a title or an intro date are skipped.  When the
    detail page cannot be parsed (``legDetails`` raises IndexError) the
    bare Bill is yielded and its URL is remembered; the list of such URLs
    is printed once the scrape finishes.
    """
    unreachable_urls = []

    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()

        if not title or not leg_summary['Intro\xa0Date']:
            continue
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            # doesn't have an intro date

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))

        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})
        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        # Case-insensitive check; the title is the same for every related
        # file, so the test is evaluated once.
        lower_title = title.lower()
        is_omnibus = "sundry" in lower_title or "miscellaneous" in lower_title

        for related_bill in leg_details.get('Related files', []):
            if is_omnibus:
                # these are omnibus
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')
            # for now we're skipping related bills if they
            # don't contain words that make us think they're
            # in an omnibus relationship with each other

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # The first listed sponsor is the primary one.
            primary = i == 0
            sponsorship_type = "Primary" if primary else "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legislation as
            # individual role holders or as the Office of City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name, sponsorship_type, entity_type, primary,
                    entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']

            try:
                action_date = self.toTime(action['Date']).date().isoformat()
            except AttributeError:
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if not action_description:
                continue

            # 'Action By' is either a mapping with a 'label' or a plain
            # value; indexing the plain value raises TypeError.
            try:
                responsible_org = action['Action\xa0By']['label']
            except TypeError:
                responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'Chicago City Council'

            act = bill.add_action(
                action_description,
                action_date,
                organization={'name': responsible_org},
                classification=ACTION_CLASSIFICATION[action_description])

            if action_description == 'Referred':
                # A single body is a mapping; several bodies come as a
                # sequence, for which the ['label'] probe raises TypeError.
                try:
                    leg_details['Current Controlling Legislative Body']['label']
                    controlling_bodies = [
                        leg_details['Current Controlling Legislative Body']]
                except TypeError:
                    controlling_bodies = leg_details[
                        'Current Controlling Legislative Body']

                if controlling_bodies:
                    for controlling_body in controlling_bodies:
                        body_name = controlling_body['label']
                        if body_name.startswith("Joint Committee"):
                            act.add_related_entity(body_name, 'organization')
                        else:
                            act.add_related_entity(
                                body_name,
                                'organization',
                                entity_id=_make_pseudo_id(name=body_name))

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                result, votes = self.extractVotes(action_detail_url)

                if votes and result:
                    # see https://github.com/datamade/municipal-scrapers-us/issues/15
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=None,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url)

                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

    print(unreachable_urls)
def scrape_bills(self, session, year_abr):
    """Yield VoteEvents and Bills assembled from the legislature's tables.

    Bills are created from the 'MainBill' table and then enriched from
    'BillSpon', 'BillWP', 'BillHist' and 'BillSubj'.  Vote events are read
    from zipped vote files fetched over FTP and are yielded as each zip is
    processed; bills are yielded at the very end.

    :param session: legislative session assigned to bills and votes.
    :param year_abr: first year of the session; used in document URLs and,
        together with the following year, in vote file names.
    :raises Exception: when a document row has an unknown 'DocType'.
    """
    # Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec['IdenticalBillNumber'].strip():
            bill.add_related_bill(
                rec['IdenticalBillNumber'].split()[0],
                legislative_session=session,
                relation_type='companion',
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')

    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(name, classification=sponsor_type,
                             entity_type='person',
                             primary=sponsor_type == 'primary')

    # Documents
    bill_document_csv = self.access_to_csv('BillWP')

    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue

        bill = bill_dict[bill_id]
        # Keep only the last two backslash-separated path components.
        document = rec["Document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
            year_abr,
            document.replace('.DOC', '.HTM'),
        )

        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' % (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            # NOTE(review): if the URL ends in neither 'HTM' nor 'wpd',
            # ``mimetype`` keeps the value from an earlier row (or is
            # unbound on the first one) -- confirm the intended fallback.
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                self.warning("Couldn't find a document for bill {}".format(bill_id))
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                # ZipFile.open() only accepts 'r' or 'w'; the legacy 'rU'
                # mode raises ValueError.  TextIOWrapper already provides
                # universal-newline decoding.
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'r'))
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue

            vdict_file = csv.DictReader(vote_file)

            # NOTE(review): ``votes`` is reset for every member file but
            # only consumed after both were read, so rows from the first
            # file are dropped when the "End" file exists, and the name is
            # stale/unbound when neither exists -- confirm intent.
            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]
                    committee = rec['Committee_House']
                    vote_parts = (bill_id, chamber, action, committee)

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join(vote_parts).replace(' ', '_')

                if bill_id[0] == 'A':
                    b_chamber = "lower"
                else:
                    b_chamber = "upper"

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification='passage',
                        result=None,
                        bill=bill_id,
                        bill_chamber=b_chamber,
                        legislative_session=session,
                    )
                if leg_vote == "Y":
                    votes[vote_id].vote('yes', leg)
                elif leg_vote == "N":
                    votes[vote_id].vote('no', leg)
                else:
                    votes[vote_id].vote('other', leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Counts yes/no/other votes and saves overall vote
        for vote in votes.values():
            counts = collections.defaultdict(int)
            for count in vote.votes:
                counts[count['option']] += 1
            vote.set_count('yes', counts['yes'])
            vote.set_count('no', counts['no'])
            vote.set_count('other', counts['other'])

            # Veto override.
            if vote.motion_text == 'OVERRIDE':
                # Per the NJ leg's glossary, a veto override requires
                # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                # ``chamber`` is the value every VoteEvent from this zip
                # was created with, so it is used directly.
                if chamber == 'lower':
                    vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                elif chamber == 'upper':
                    vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
            else:
                # Regular vote.
                vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

            vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield vote

    # Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill_dict[bill_id].subject.append(rec['SubjectKey'])

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning('probable phony bill detected %s', bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield bill

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
def scrape_bill(self, bill_id):
    """Convert one legacy API bill record into a Bill and its VoteEvents.

    The record returned by ``self.api`` is consumed key by key with
    ``pop``; the closing assertions check that nothing was left
    unconverted, both for each vote and for the bill itself.  VoteEvents
    are yielded first, then the Bill.  Nothing is yielded when the first
    classification is 'miscellaneous', 'jres' or 'cres'.
    """
    old = self.api('bills/' + bill_id + '?')

    # not needed -- keys popped without a default must be present
    for key in ('id', 'state'):
        old.pop(key)
    for key in ('level', 'country'):
        old.pop(key, None)
    for key in ('created_at', 'updated_at', 'action_dates'):
        old.pop(key)
    for key in ('+bill_type', '+subject', '+scraped_subjects', 'subjects'):
        old.pop(key, None)

    classification = old.pop('type')

    # ca weirdness
    for noise in ('fiscal committee', 'urgency', 'local program', 'tax levy'):
        if noise in classification:
            classification.remove(noise)

    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc') or chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        actor = act['actor']
        lowered = actor.lower()
        if lowered in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif lowered == 'house' or (lowered.startswith('lower (')
                                    and self.state == 'ca'):
            actor = 'lower'
        elif lowered in ('senate', 'upper') or (lowered.startswith('upper (')
                                                and self.state == 'ca'):
            # The literal used to carry a stray backtick ('upper`') and
            # therefore could never match.
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst',
                       'Became Law w', 'conference') or (
                           lowered.startswith('legislature (')
                           and self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(
                act['action'], act['date'][:10], chamber=actor,
                classification=[action_types[c] for c in act['type']
                                if c != 'other'])
            # Loop variable renamed so it no longer shadows the ``re``
            # module name.
            for entity in act.get('related_entities', []):
                if entity['type'] == 'committee':
                    entity['type'] = 'organization'
                elif entity['type'] == 'legislator':
                    entity['type'] = 'person'
                newact.add_related_entity(entity['name'], entity['type'])

    for comp in old.pop('companions', []):
        # NOTE(review): ``rtype`` is only assigned for nj/ny/mn; any other
        # state with companions hits an unbound name -- confirm whether
        # that is a deliberate tripwire.
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
        new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
        new.add_identifier(abid)

    # generic OpenStates stuff
    for legacy_id in old.pop('all_ids'):
        new.add_identifier(legacy_id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    # Remaining known keys are copied to extras with any '+' removed.
    to_extras = ['+status', '+final_disposition', '+volume_chapter',
                 '+ld_number', '+referral', '+companion', '+description',
                 '+fiscal_note_probable:', '+preintroduction_required:',
                 '+drafter', '+category:', '+chapter', '+requester',
                 '+transmittal_date:', '+by_request_of',
                 '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num',
                 '+fiscal_note', '+impact_clause', '+fiscal_notes',
                 '+short_title', '+type_', '+conference_committee',
                 'conference_committee', '+companion_bill_ids',
                 '+additional_information']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        for key in ('id', 'state', 'bill_id'):
            vote.pop(key)
        for key in ('bill_chamber', '+state', '+country', '+level',
                    '+vacant', '+not_voting', '+amended', '+excused',
                    '+NV', '+AB', '+P', '+V', '+E', '+EXC', '+EMER',
                    '+present', '+absent', '+seconded', '+moved',
                    '+vote_type', '+actual_vote', '+skip_votes'):
            vote.pop(key, None)
        vote.pop('vote_id')
        for key in ('+bill_chamber', '+session', '+bill_id',
                    '+bill_session', 'committee', 'committee_id'):
            vote.pop(key, None)

        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = VoteEvent(legislative_session=vote.pop('session'),
                            motion_text=vote.pop('motion'),
                            result='pass' if vote.pop('passed') else 'fail',
                            chamber=chamber,
                            start_date=vote.pop('date'),
                            classification=vtype,
                            bill=new,
                            identifier=identifier)

        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])

        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)

        # Fall back to the bill's sources when the vote has none.
        if not newvote.sources:
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_',
                     '+threshold', '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        # Every key of the vote must have been consumed by now.
        assert not vote, vote.keys()
        yield newvote

    # Every key of the bill record must have been consumed by now.
    assert not old, old.keys()

    yield new
def _scrape_bills(self):
    """
    Does the following

    1) Scrapes bill data from unitedstates project and saves the data to
       path specified in UnitedStates module
    2) Iterates over bill data and converts each one to an OCD-compliant
       bill model.
    3) Yields the OCD-compliant bill model instance

    A file that cannot be read or converted is reported on stdout and
    skipped; it does not stop the iteration.

    @return: generator for federal US bills in OCD-compliant format
    @rtype: generator
    """
    # run scraper first to pull in all the bill data
    self._run_unitedstates_bill_scraper()

    # iterate over all the files and build and yield Bill objects
    for filename in find_files(
            settings.SCRAPED_DATA_DIR,
            r'.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
        try:
            # The file is only needed for parsing; close it right away.
            with open(filename) as json_file:
                json_data = json.load(json_file)

            type_info = constants.TYPE_MAP[json_data['bill_type']]
            chamber = type_info['chamber']

            # Initialize Object
            bill = Bill(type_info['canonical'] + ' ' + json_data['number'],
                        json_data['congress'],
                        json_data['official_title'],
                        chamber=chamber)

            # add source of data
            bill.add_source(json_data['url'], note='all')

            # add subjects
            for subject in json_data['subjects']:
                bill.add_subject(subject)

            # add summary
            summary = json_data.get('summary')
            if summary is not None:
                bill.add_abstract(summary['text'],
                                  summary['as'],
                                  summary['date'])

            # add titles
            for item in json_data['titles']:
                bill.add_title(item['title'], item['type'])

            # add other/related Bills
            for b in json_data['related_bills']:
                if b.get('type') == 'bill':
                    # 'bill_id' is split on '-': the first part is matched
                    # against BILL_SPLIT, the second is the session.
                    split = b['bill_id'].split('-')
                    m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])
                    bill.add_related_bill(
                        constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2),
                        legislative_session=split[1],
                        relation_type='companion')

            # add sponsor
            bill.add_sponsorship_by_identifier(
                json_data['sponsor']['name'], 'person', 'person', True,
                scheme='thomas_id',
                identifier=json_data['sponsor']['thomas_id'],
                chamber=chamber)

            # add cosponsors
            for cs in json_data['cosponsors']:
                bill.add_sponsorship_by_identifier(
                    cs['name'], 'person', 'person', False,
                    scheme='thomas_id',
                    identifier=cs['thomas_id'],
                    chamber=chamber)

            # add introduced_at and actions
            bill.add_action('date of introduction',
                            datetime_to_date(json_data['introduced_at']),
                            chamber=chamber,
                            related_entities=[])

            # add other actions
            for action in json_data['actions']:
                bill.actions.append({
                    'date': datetime_to_date(action['acted_at']),
                    'type': [action['type']],
                    'description': action['text'],
                    'actor': chamber,
                    'related_entities': [],
                })

            # add bill versions
            versions_dir = os.path.join(
                settings.SCRAPED_DATA_DIR,
                'data',
                bill.legislative_session,
                'bills',
                json_data['bill_type'],
                json_data['bill_type'] + json_data['number'],
                'text-versions')
            for version_path in find_files(versions_dir, r'/.*/*\.json'):
                try:
                    with open(version_path) as version_file:
                        version_json_data = json.load(version_file)
                    for k, v in version_json_data['urls'].items():
                        bill.versions.append({
                            'date': datetime_to_date(version_json_data['issued_on']),
                            'type': version_json_data['version_code'],
                            'name': constants.VERSION_MAP[version_json_data['version_code']],
                            'links': [{'mimetype': k, 'url': v}],
                        })
                except IOError:
                    print("Unable to open or parse file with path " + version_path)
                    continue

            # finally yield bill object
            yield bill

        except IOError:
            print("Unable to open file with path " + filename)
            print(traceback.format_exc())
            continue
        except KeyError:
            print("Unable to parse file with path " + filename)
            print(traceback.format_exc())
            continue
        except Exception:
            # Deliberately broad so one bad file does not abort the run,
            # but not a bare ``except``: that would also swallow
            # GeneratorExit raised at the ``yield`` above (making
            # ``close()`` fail) as well as KeyboardInterrupt.
            print('Unknown error with ' + filename)
            print(traceback.format_exc())
            continue
def test_full_bill():
    """Import a fully populated scraped bill and check every stored piece.

    Two bills are imported (a prior-session 'HB 99' and 'HB 1'); the
    assertions then verify the core fields of 'HB 1', its actions, related
    bill resolution, sponsorships, versions, documents and sources.
    """
    create_jurisdiction()
    adam = Person.objects.create(id='person-id', name='Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(name='Arbitrary Committee',
                                   classification='committee',
                                   parent_id=house._id)

    prior_bill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                            classification='tax bill',
                            from_organization=house._id)

    scraped = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                         classification='tax bill',
                         from_organization=house._id)
    scraped.subject = ['taxes', 'axes']
    scraped.add_identifier('SB 9')
    scraped.add_title('Tack & Axe Tax Act')
    scraped.add_action('introduced in house', '1900-04-01', chamber='lower')
    referral = scraped.add_action('sent to arbitrary committee', '1900-04-04',
                                  chamber='lower')
    referral.add_related_entity('arbitrary committee', 'organization',
                                committee._id)
    scraped.add_related_bill("HB 99", legislative_session="1899",
                             relation_type="prior-session")
    scraped.add_sponsorship('Adam Smith', classification='extra sponsor',
                            entity_type='person', primary=False,
                            entity_id=adam.id)
    scraped.add_sponsorship('Jane Smith', classification='lead sponsor',
                            entity_type='person', primary=True)
    scraped.add_abstract('This is an act about axes and taxes and tacks.',
                         note="official")
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                              media_type='application/pdf')
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                              media_type='text/html')
    scraped.add_version_link('Fiscal Note', 'http://example.com/v/1',
                             media_type='text/html')
    scraped.add_source('http://example.com/source')

    # import bill
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])

    # The person was created behind the back of the import transaction,
    # so the json-id to db-id mapping is faked (they are identical here).
    # This leans on an implementation detail, but it is the cleanest way
    # to short-circuit the json id lookup.
    person_importer = PersonImporter('jid')
    person_importer.json_to_db_id['person-id'] = 'person-id'

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([prior_bill.as_dict(), scraped.as_dict()])

    # get bill from db and assert it imported correctly
    stored = Bill.objects.get(identifier='HB 1')
    assert stored.from_organization.classification == 'lower'
    assert stored.identifier == scraped.identifier
    assert stored.title == scraped.title
    assert stored.classification == scraped.classification
    assert stored.subject == ['taxes', 'axes']
    assert stored.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert stored.other_titles.get().title == 'Tack & Axe Tax Act'
    assert stored.other_identifiers.get().identifier == 'SB 9'

    # actions
    stored_actions = list(stored.actions.all())
    assert len(stored_actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    first_action, second_action = stored_actions[0], stored_actions[1]
    assert first_action.organization == Organization.objects.get(
        classification='lower')
    assert first_action.description == "introduced in house"
    assert second_action.description == "sent to arbitrary committee"
    assert (second_action.related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    related = stored.related_bills.get()
    assert related.identifier == 'HB 99'

    # and bill got resolved
    assert related.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    stored_sponsorships = stored.sponsorships.all()
    assert len(stored_sponsorships) == 2
    for sponsorship in stored_sponsorships:
        if not sponsorship.primary:
            # The non-primary sponsor was given an entity_id.
            assert sponsorship.person == adam
            continue
        # The primary sponsor had no entity_id, so nothing is linked.
        assert sponsorship.person is None
        assert sponsorship.organization is None

    # versions & documents with their links
    stored_versions = stored.versions.all()
    assert len(stored_versions) == 1
    assert stored_versions[0].links.count() == 1
    stored_documents = stored.documents.all()
    assert len(stored_documents) == 1
    assert stored_documents[0].links.count() == 2

    # sources
    assert stored.sources.count() == 1
def get_bill(self, matter):
    """Make Bill object from given matter.

    Currently, NYC Legistar does not have conventional "Types" for
    three newly added committees:
    https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.

    Returns None instead of a Bill when the matter belongs to one of
    those bodies, is listed in DUPLICATED_ACTIONS, lacks an intro date,
    name or file number, contains the known duplicate attachment, or when
    a related matter cannot be fetched.  A KeyError while reading
    sponsorships or text also yields None, after recording the web URL in
    ``self.version_errors``.
    """
    orgs_without_type = [
        'Charter Revision Commission 2019',
        'New York City Advisory Commission on Property Tax Reform',
        'Democratic Conference of the Council of the City of New York',
    ]
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    intro_date = matter['MatterIntroDate']
    title = matter['MatterName']
    file_number = matter['MatterFile']
    if not all((intro_date, title, file_number)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]

    # NOTE(review): this call uses ``self.sessions`` while the related
    # bill lookup below uses ``self.session`` -- confirm both exist.
    bill_session = self.sessions(self.toTime(intro_date))

    bill = Bill(identifier=file_number,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    long_title = matter['MatterTitle']
    if long_title:
        bill.add_title(long_title)

    summary = matter['MatterEXText5']
    if summary:
        bill.add_abstract(summary, note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None

        attachment_name = attachment['MatterAttachmentName']
        if attachment_name:
            bill.add_document_link(attachment_name,
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_matter = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None

        related_session = self.session(
            self.toTime(related_matter['MatterIntroDate']))
        bill.add_related_bill(identifier=related_matter['MatterFile'],
                              legislative_session=related_session,
                              relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # NUL characters are stripped from both text flavours.
        plain = text['MatterTextPlain']
        if plain:
            bill.extras['plain_text'] = plain.replace(u'\u0000', '')

        rtf = text['MatterTextRtf']
        if rtf:
            bill.extras['rtf_text'] = rtf.replace(u'\u0000', '')

    return bill