def prepare_votes(abbr, session, bill_id, scraped_votes):
    """
    Assign ids to freshly scraped votes and link them to committees and
    legislators.  Every vote dict in ``scraped_votes`` is modified in place.

    abbr - abbreviation passed to VoteMatcher and the id lookup helpers
    session - session passed to get_legislator_id
    bill_id - id of the already-stored bill, or a falsy value when the bill
              is new (in which case no existing votes are looked up)
    scraped_votes - list of vote dicts to prepare
    """
    matcher = VoteMatcher(abbr)

    # when the bill is already stored, teach the matcher the votes we have
    # so that previously assigned vote ids can be preserved
    if bill_id:
        known_votes = list(db.votes.find({'bill_id': bill_id}))
        if known_votes:
            matcher.learn_ids(known_votes)

    matcher.set_ids(scraped_votes)

    for vote in scraped_votes:
        # resolve the committee name, if the vote names one
        if 'committee' in vote:
            vote['committee_id'] = get_committee_id(abbr, vote['chamber'],
                                                    vote['committee'])

        # replace each bare voter name with a {'name', 'leg_id'} dict and
        # collect every resolved id (unresolved ones included) in '_voters'
        voters = []
        vote['_voters'] = voters
        for bucket in ('yes_votes', 'no_votes', 'other_votes'):
            resolved = []
            for voter_name in vote[bucket]:
                leg_id = get_legislator_id(abbr, session, vote['chamber'],
                                           voter_name)
                resolved.append({'name': voter_name, 'leg_id': leg_id})
                voters.append(leg_id)
            vote[bucket] = resolved
def match_sponsor_ids(abbr, bill):
    """
    Resolve every sponsor on ``bill`` to a legislator id, falling back to a
    committee id.  Sponsor dicts are updated in place.

    abbr - abbreviation passed to the id lookup helpers
    bill - bill dict providing 'sponsors', 'session' and 'chamber'
    """
    for sponsor in bill['sponsors']:
        session = bill['session']
        # the sponsor's own chamber wins over the bill's when it is given
        chamber = sponsor.get('chamber', bill['chamber'])
        name = sponsor['name']

        sponsor['leg_id'] = get_legislator_id(abbr, session, chamber, name)

        # second attempt: same name, no chamber restriction
        if sponsor['leg_id'] is None:
            sponsor['leg_id'] = get_legislator_id(abbr, bill['session'],
                                                  None, sponsor['name'])

        # still nothing: try the name as a committee of the bill's chamber
        if sponsor['leg_id'] is None:
            sponsor['committee_id'] = get_committee_id(abbr, bill['chamber'],
                                                       sponsor['name'])
def match_sponsor_ids(abbr, bill):
    """
    Fill in 'leg_id' (and, when no legislator matches, 'committee_id') on
    each sponsor dict of ``bill``.  Works in place and returns nothing.

    abbr - abbreviation passed to the id lookup helpers
    bill - bill dict; 'sponsors', 'session' and 'chamber' are read
    """
    for sponsor in bill['sponsors']:
        # lookup 1: chamber-specific, preferring the sponsor's own chamber
        preferred_chamber = sponsor.get('chamber', bill['chamber'])
        sponsor['leg_id'] = get_legislator_id(
            abbr, bill['session'], preferred_chamber, sponsor['name'])

        # lookup 2: any chamber
        if sponsor['leg_id'] is None:
            sponsor['leg_id'] = get_legislator_id(
                abbr, bill['session'], None, sponsor['name'])

        # lookup 3: maybe the sponsor is a committee rather than a person
        if sponsor['leg_id'] is None:
            sponsor['committee_id'] = get_committee_id(
                abbr, bill['chamber'], sponsor['name'])
def _match_committee(name):
    """
    Look up the committee id for ``name``.

    ``abbr`` and ``action`` are not parameters; they are read from the
    surrounding scope at call time, and the action's 'actor' is used as the
    chamber argument.
    """
    chamber = action['actor']
    return get_committee_id(abbr, chamber, name)
def import_bills(state, data_dir):
    """
    Load every scraped bill JSON file for ``state`` and insert it into, or
    update it in, ``db.bills``.

    state - key of the metadata document; also passed to the id lookups
    data_dir - base directory; files are read from
               <data_dir>/<state>/bills/*.json
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # session -> name of the term containing it
    sessions = {}
    for term in meta['terms']:
        sessions.update((session, term['name'])
                        for session in term['sessions'])

    # separately scraped votes, keyed by (chamber, session, bill_id)
    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects (empty/missing ones are dropped)
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data; matched entries leave the votes dict so
        # whatever remains at the end can be reported as unmatched
        vote_key = (data['chamber'], data['session'], data['bill_id'])
        bill_votes = votes.pop(vote_key, [])
        data['votes'].extend(bill_votes)

        # previously stored version of this bill, if any
        bill = db.bills.find_one({
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id'],
        })

        # learn the stored votes first, then assign ids to the new ones
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # vote leg_ids: bare names become {'name', 'leg_id'} dicts
            for bucket in ('yes_votes', 'no_votes', 'other_votes'):
                resolved = []
                for voter_name in vote[bucket]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], voter_name)
                    resolved.append({'name': voter_name, 'leg_id': leg_id})
                vote[bucket] = resolved

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            for title_key in ('title', '+short_title'):
                if title_key in version:
                    alt_titles.add(version[title_key])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data['_keywords'] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    print('imported %s bill files' % len(paths))

    # anything still in votes never found its bill
    for leftover in votes.keys():
        key_parts = tuple([part.encode('ascii', 'replace')
                           for part in leftover])
        print('Failed to match vote %s %s %s' % key_parts)

    populate_current_fields(state)
    ensure_indexes()
def _resolve_ctty(committee):
    """
    Return the committee id for a participant entry.

    committee - dict; its "chamber" and "participant" values are passed to
                get_committee_id

    ``data`` is read from the surrounding scope, not passed in.
    """
    level = data[settings.LEVEL_FIELD]
    return get_committee_id(level,
                            committee["chamber"],
                            committee["participant"])
def _resolve_ctty(committee):
    """
    Map a participant dict to a committee id via get_committee_id, using
    the level value stored on ``data`` (taken from the surrounding scope)
    together with the participant's 'chamber' and 'participant' entries.
    """
    lookup_args = (data[settings.LEVEL_FIELD],
                   committee['chamber'],
                   committee['participant'])
    return get_committee_id(*lookup_args)
def import_bills(state, data_dir):
    """
    Load every scraped bill JSON file for ``state`` and insert it into, or
    update it in, ``db.bills``.

    state - key of the metadata document; also passed to the id lookups
    data_dir - base directory; files are read from
               <data_dir>/<state>/bills/*.json
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # session -> name of the term containing it
    sessions = {}
    for term in meta['terms']:
        sessions.update((session, term['name'])
                        for session in term['sessions'])

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # subjects are kept under scraped_subjects; empty ones are dropped
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        # previously stored version of this bill, if any
        bill = db.bills.find_one({
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id'],
        })

        # sponsors are matched by name only (no chamber restriction)
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # bare voter names become {'name', 'leg_id'} dicts
            for bucket in ('yes_votes', 'no_votes', 'other_votes'):
                resolved = []
                for voter_name in vote[bucket]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], voter_name)
                    resolved.append({'name': voter_name, 'leg_id': leg_id})
                vote[bucket] = resolved

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            for title_key in ('title', '+short_title'):
                if title_key in version:
                    alt_titles.add(version[title_key])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data['_keywords'] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    print('imported %s bill files' % len(paths))

    populate_current_fields(state)
    ensure_indexes()
def import_bills(state, data_dir):
    """
    Load every scraped bill JSON file for ``state`` and insert it into, or
    update it in, ``db.bills``.

    state - key of the metadata document; also passed to the id lookups
    data_dir - base directory; files are read from
               <data_dir>/<state>/bills/*.json
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # session -> name of the term containing it
    sessions = {}
    for term in meta['terms']:
        sessions.update((session, term['name'])
                        for session in term['sessions'])

    # separately scraped votes, keyed by (chamber, session, bill_id)
    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data; matched entries leave the votes dict so
        # whatever remains at the end can be reported as unmatched
        vote_key = (data['chamber'], data['session'], data['bill_id'])
        bill_votes = votes.pop(vote_key, [])
        data['votes'].extend(bill_votes)

        # previously stored version of this bill, if any
        bill = db.bills.find_one({
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id'],
        })

        # learn the stored votes first, then assign ids to the new ones
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # vote leg_ids: bare names become {'name', 'leg_id'} dicts
            for bucket in ('yes_votes', 'no_votes', 'other_votes'):
                resolved = []
                for voter_name in vote[bucket]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], voter_name)
                    resolved.append({'name': voter_name, 'leg_id': leg_id})
                vote[bucket] = resolved

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            for title_key in ('title', '+short_title'):
                if title_key in version:
                    alt_titles.add(version[title_key])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data['_keywords'] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    print('imported %s bill files' % len(paths))

    # anything still in votes never found its bill
    for leftover in votes.keys():
        key_parts = tuple([part.encode('ascii', 'replace')
                           for part in leftover])
        print('Failed to match vote %s %s %s' % key_parts)

    populate_current_fields(state)
    ensure_indexes()
def import_bill(data, standalone_votes, categorizer):
    """
    insert or update a bill

    data - raw bill JSON (modified in place before being stored)
    standalone_votes - votes scraped separately, keyed by
                       (chamber, session, bill_id); the entry matching this
                       bill is popped from the mapping
    categorizer - SubjectCategorizer (None - no categorization)

    Returns "insert" when no prior version of the bill was found and a new
    document was created, "update" otherwise.
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        # a falsy chamber means "unknown", so don't constrain the query on it
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        # use sponsor's chamber if specified
        leg_id = get_legislator_id(abbr, data['session'],
                                   sponsor.get('chamber'),
                                   sponsor['name'])
        sponsor['leg_id'] = leg_id
        if leg_id is None:
            # not a known legislator; the sponsor may be a committee
            cid = get_committee_id(abbr, data['chamber'], sponsor['name'])
            if cid is not None:
                sponsor['committee_id'] = cid

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    # a vote and an action are considered the same event when they are
    # less than this far apart (built once instead of per comparison)
    match_window = datetime.timedelta(hours=20)
    already_linked = set()
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        # both resolvers read the current ``action``; they are only called
        # within this same loop iteration
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                entity['id'] = resolver(entity['name'])

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates (only the first occurrence of each is kept)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                delta = abs(vote['date'] - action['date'])
                if (delta < match_window and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        # a vote claimed by a second action is ambiguous;
                        # remember it so it can be unlinked everywhere below
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        # the id returned by insert_with_id is not needed here
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        git_add_bill(bill)
        update(bill, data, db.bills)
        save_votes(bill, bill_votes)
        return "update"
def import_bills(state, data_dir):
    """
    Load every scraped bill JSON file for ``state`` and insert it into, or
    update it in, ``db.bills``.

    state - key of the metadata document; also passed to the id lookups
    data_dir - base directory; files are read from
               <data_dir>/<state>/bills/*.json
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "bills", "*.json")

    meta = db.metadata.find_one({"_id": state})

    # session -> name of the term containing it
    sessions = {}
    for term in meta["terms"]:
        sessions.update((session, term["name"])
                        for session in term["sessions"])

    # separately scraped votes, keyed by (chamber, session, bill_id)
    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data["bill_id"] = fix_bill_id(data["bill_id"])

        # move subjects to scraped_subjects
        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        subjects = data.pop("subjects", None)
        if subjects:
            data["scraped_subjects"] = subjects

        # add loaded votes to data; matched entries leave the votes dict so
        # whatever remains at the end can be reported as unmatched
        vote_key = (data["chamber"], data["session"], data["bill_id"])
        bill_votes = votes.pop(vote_key, [])
        data["votes"].extend(bill_votes)

        # previously stored version of this bill, if any
        bill = db.bills.find_one({
            "state": data["state"],
            "session": data["session"],
            "chamber": data["chamber"],
            "bill_id": data["bill_id"],
        })

        # learn the stored votes first, then assign ids to the new ones
        vote_matcher = VoteMatcher(data["state"])
        if bill:
            vote_matcher.learn_vote_ids(bill["votes"])
        vote_matcher.set_vote_ids(data["votes"])

        # match sponsor leg_ids
        for sponsor in data["sponsors"]:
            sponsor["leg_id"] = get_legislator_id(state, data["session"],
                                                  None, sponsor["name"])

        for vote in data["votes"]:
            # committee_ids
            if "committee" in vote:
                vote["committee_id"] = get_committee_id(state,
                                                        vote["chamber"],
                                                        vote["committee"])

            # vote leg_ids: bare names become {"name", "leg_id"} dicts
            for bucket in ("yes_votes", "no_votes", "other_votes"):
                resolved = []
                for voter_name in vote[bucket]:
                    leg_id = get_legislator_id(state, data["session"],
                                               vote["chamber"], voter_name)
                    resolved.append({"name": voter_name, "leg_id": leg_id})
                vote[bucket] = resolved

        data["_term"] = sessions[data["session"]]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get("alternate_titles", []))
        for version in data["versions"]:
            for title_key in ("title", "+short_title"):
                if title_key in version:
                    alt_titles.add(version[title_key])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data["title"])
        except KeyError:
            pass
        data["alternate_titles"] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data["_keywords"] = list(bill_keywords(data))
        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    print("imported %s bill files" % len(paths))

    # anything still in votes never found its bill
    for leftover in votes.keys():
        key_parts = tuple([part.encode("ascii", "replace")
                           for part in leftover])
        print("Failed to match vote %s %s %s" % key_parts)

    populate_current_fields(state)
    ensure_indexes()