def test_insert_with_idlevels():
    """Ids start with the upper-cased abbreviation named by the object's level."""
    cases = [
        ({'_type': 'person', 'level': 'state',
          'state': 'ex', 'country': 'us'}, 'EX'),
        ({'_type': 'person', 'level': 'country',
          'state': 'ex', 'country': 'us'}, 'US'),
    ]
    # Same two objects, inserted in the same order (state first, then country).
    for obj, expected_prefix in cases:
        new_id = utils.insert_with_id(obj)
        assert new_id.startswith(expected_prefix)
def test_insert_with_id_increments():
    """Two inserted legislators get distinct, well-formed ids and timestamps."""
    first = {'full_name': 'a test legislator', '_type': 'person',
             'level': 'state', 'state': 'ex'}
    second = {'full_name': 'another legislator', '_type': 'person',
              'level': 'state', 'state': 'ex'}
    id_pattern = r'^EXL\d{6,6}$'

    first_id = utils.insert_with_id(first)
    assert re.match(id_pattern, first_id)
    stored = db.legislators.find_one({'_id': first_id})
    assert stored['_all_ids'] == [first_id]

    second_id = utils.insert_with_id(second)
    assert re.match(id_pattern, second_id)
    assert second_id != first_id
    stored = db.legislators.find_one({'_id': second_id})
    assert stored
    assert stored['_all_ids'] == [second_id]

    # the stored document must also carry matching creation timestamps
    assert stored['created_at'] == stored['updated_at']
    assert isinstance(stored['created_at'], datetime.datetime)
def test_insert_with_id_increments():
    """Consecutive inserts yield different EXL ids; the second is timestamped."""
    legislator_a = {'full_name': 'a test legislator', '_type': 'person',
                    'state': 'ex'}
    legislator_b = {'full_name': 'another legislator', '_type': 'person',
                    'state': 'ex'}
    id_pattern = r'^EXL\d{6,6}$'

    id_a = utils.insert_with_id(legislator_a)
    assert re.match(id_pattern, id_a)
    doc = db.legislators.find_one({'_id': id_a})
    assert doc['_all_ids'] == [id_a]

    id_b = utils.insert_with_id(legislator_b)
    assert re.match(id_pattern, id_b)
    assert id_b != id_a
    doc = db.legislators.find_one({'_id': id_b})
    assert doc
    assert doc['_all_ids'] == [id_b]

    # timestamps are created on insert and start out equal
    assert doc['created_at'] == doc['updated_at']
    assert isinstance(doc['created_at'], datetime.datetime)
def import_committees_from_legislators(current_term, abbr):
    """Create or extend committees based on legislators' committee roles.

    Looks at every legislator that has a role in ``current_term`` for
    ``abbr``; each 'committee member' role that is not yet linked to a
    committee gets a committee looked up (or inserted) and the legislator
    added to its member list.
    """
    current_legislators = db.legislators.find({
        'roles': {
            '$elemMatch': {
                'term': current_term,
                settings.LEVEL_FIELD: abbr,
            }
        }
    })

    for legislator in current_legislators:
        for role in legislator['roles']:
            # only committee roles that are not yet linked to a committee
            if role['type'] != 'committee member' or 'committee_id' in role:
                continue

            spec = {
                settings.LEVEL_FIELD: abbr,
                'chamber': role['chamber'],
                'committee': role['committee'],
            }
            if 'subcommittee' in role:
                spec['subcommittee'] = role['subcommittee']

            committee = db.committees.find_one(spec)
            if not committee:
                # no such committee yet: the lookup spec becomes the new doc
                committee = spec
                committee['_type'] = 'committee'
                # copy LEVEL_FIELD from legislator to committee
                committee[settings.LEVEL_FIELD] = \
                    legislator[settings.LEVEL_FIELD]
                committee['members'] = []
                committee['sources'] = []
                if 'subcommittee' not in committee:
                    committee['subcommittee'] = None
                insert_with_id(committee)

            already_member = any(
                member['leg_id'] == legislator['leg_id']
                for member in committee['members'])
            if already_member:
                continue

            committee['members'].append({
                'name': legislator['full_name'],
                'leg_id': legislator['leg_id'],
                'role': role.get('position') or 'member',
            })
            for source in legislator['sources']:
                if source not in committee['sources']:
                    committee['sources'].append(source)
            db.committees.save(committee, safe=True)

            role['committee_id'] = committee['_id']

        db.legislators.save(legislator, safe=True)
def import_legislator(data):
    """Insert a scraped legislator or merge it into a matching stored one.

    A stored legislator matches when it has the same level/abbreviation and
    scraped name, and one of its roles matches the first scraped role in the
    same, previous or next term.

    Returns "update" when an existing legislator was updated, "insert" when
    a new document was inserted.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']

    for role in data['roles']:
        # Rename 'role' -> 'type'
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over country and/or state into role
        # TODO: base this on all possible level fields
        role['level'] = data['level']
        for field in ('country', 'state'):
            if field in data:
                role[field] = data[field]

    cur_role = data['roles'][0]
    term = cur_role['term']

    level = data['level']
    abbrev = data[level]

    prev_term = get_previous_term(abbrev, term)
    next_term = get_next_term(abbrev, term)

    spec = {
        level: abbrev,
        'type': cur_role['type'],
        'term': {'$in': [term, prev_term, next_term]},
    }
    for optional in ('district', 'chamber'):
        if optional in cur_role:
            spec[optional] = cur_role[optional]

    query = {
        'level': level,
        level: abbrev,
        '_scraped_name': data['full_name'],
        'roles': {'$elemMatch': spec},
    }
    leg = db.legislators.find_one(query)

    if not leg:
        insert_with_id(data)
        return "insert"

    leg.setdefault('old_roles', {})
    stored_term = leg['roles'][0]['term']

    if stored_term == prev_term:
        # stored roles are from the previous term: archive them
        leg['old_roles'][stored_term] = leg['roles']
    elif stored_term == next_term:
        # stored roles are newer: archive the scraped roles instead
        leg['old_roles'][term] = data['roles']
        data['roles'] = leg['roles']

    update(leg, data, db.legislators)
    return "update"
def import_legislator(data):
    """Insert a scraped legislator or merge it into a matching stored one.

    A stored legislator matches when it has the same level/abbreviation and
    scraped name, and one of its roles matches the first scraped role in the
    same, previous or next term.

    Returns:
        "update" when an existing legislator was updated, "insert" when a
        new document was inserted. Callers that ignored the former ``None``
        result are unaffected.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']

    for role in data['roles']:
        # Rename 'role' -> 'type'
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over country and/or state into role
        # TODO: base this on all possible level fields
        role['level'] = data['level']
        if 'country' in data:
            role['country'] = data['country']
        if 'state' in data:
            role['state'] = data['state']

    cur_role = data['roles'][0]
    term = cur_role['term']

    level = data['level']
    abbrev = data[level]

    prev_term = get_previous_term(abbrev, term)
    next_term = get_next_term(abbrev, term)

    spec = {level: abbrev,
            'type': cur_role['type'],
            'term': {'$in': [term, prev_term, next_term]}}
    if 'district' in cur_role:
        spec['district'] = cur_role['district']
    if 'chamber' in cur_role:
        spec['chamber'] = cur_role['chamber']

    leg = db.legislators.find_one(
        {'level': level, level: abbrev,
         '_scraped_name': data['full_name'],
         'roles': {'$elemMatch': spec}})

    if leg:
        if 'old_roles' not in leg:
            leg['old_roles'] = {}

        if leg['roles'][0]['term'] == prev_term:
            # Move to old
            leg['old_roles'][leg['roles'][0]['term']] = leg['roles']
        elif leg['roles'][0]['term'] == next_term:
            # stored roles are newer: archive the scraped ones and keep
            # the stored roles as current
            leg['old_roles'][term] = data['roles']
            data['roles'] = leg['roles']

        update(leg, data, db.legislators)
        return "update"

    insert_with_id(data)
    return "insert"
def test_insert_with_id_types():
    """Each known _type gets its own id prefix; an unknown _type raises."""
    person = {'_type': 'person', 'level': 'state', 'state': 'ex'}
    legislator = {'_type': 'person', 'level': 'state', 'state': 'ex'}
    committee = {'_type': 'committee', 'level': 'state', 'state': 'ex'}
    bill = {'_type': 'bill', 'level': 'state', 'state': 'ex'}
    other = {'_type': 'other', 'level': 'state', 'state': 'ex'}

    expectations = [
        (person, 'EXL'),
        (legislator, 'EXL'),
        (committee, 'EXC'),
        (bill, 'EXB'),
    ]
    for obj, prefix in expectations:
        assert utils.insert_with_id(obj).startswith(prefix)

    assert_raises(ValueError, utils.insert_with_id, other)
def test_insert_with_id_types():
    """Id prefixes depend on the object's _type; unsupported types raise."""
    person = {'_type': 'person', 'state': 'ex'}
    legislator = {'_type': 'person', 'state': 'ex'}
    committee = {'_type': 'committee', 'state': 'ex'}
    bill = {'_type': 'bill', 'state': 'ex'}
    other = {'_type': 'other', 'state': 'ex'}

    person_id = utils.insert_with_id(person)
    assert person_id.startswith('EXL')

    legislator_id = utils.insert_with_id(legislator)
    assert legislator_id.startswith('EXL')

    committee_id = utils.insert_with_id(committee)
    assert committee_id.startswith('EXC')

    bill_id = utils.insert_with_id(bill)
    assert bill_id.startswith('EXB')

    # anything else is rejected
    assert_raises(ValueError, utils.insert_with_id, other)
def test_activate_legislators():
    """Only a current-term role without an end date activates a legislator."""

    def _legislator(term, district, end_date):
        # one upper-chamber Democrat with a single 'member' role
        role = {'type': 'member', 'chamber': 'upper', 'level': 'state',
                'state': 'ex', 'term': term, 'district': district,
                'party': 'Democrat', 'start_date': None,
                'end_date': end_date}
        return {'_type': 'person', 'level': 'state', 'state': 'ex',
                'roles': [role]}

    # Previous term
    leg1 = _legislator('2009-2010', '1', None)
    # Current term, no end date
    leg2 = _legislator('2011-2012', '2', None)
    # Current term, end date
    leg3 = _legislator('2011-2012', '3', datetime.datetime(2011, 1, 1))

    id1 = utils.insert_with_id(leg1)
    id2 = utils.insert_with_id(leg2)
    id3 = utils.insert_with_id(leg3)

    legislators.activate_legislators('2011-2012', 'ex', 'state')

    promoted_fields = ('active', 'district', 'chamber', 'party')

    leg1 = db.legislators.find_one({'_id': id1})
    for field in promoted_fields:
        assert field not in leg1

    leg2 = db.legislators.find_one({'_id': id2})
    assert leg2['active'] == True
    assert leg2['district'] == '2'
    assert leg2['chamber'] == 'upper'
    assert leg2['party'] == 'Democrat'

    leg3 = db.legislators.find_one({'_id': id3})
    for field in promoted_fields:
        assert field not in leg3
def import_legislator(data):
    """Insert a scraped legislator or merge it into a matching stored one.

    A stored legislator matches on state and scraped name plus a role equal
    to the first scraped role in the same, previous or next term.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

    cur_role = data['roles'][0]
    term = cur_role['term']

    prev_term = get_previous_term(data['state'], term)
    next_term = get_next_term(data['state'], term)

    spec = {
        'state': data['state'],
        'type': cur_role['type'],
        'term': {'$in': [term, prev_term, next_term]},
    }
    for optional in ('district', 'chamber'):
        if optional in cur_role:
            spec[optional] = cur_role[optional]

    leg = db.legislators.find_one({
        'state': data['state'],
        '_scraped_name': data['full_name'],
        'roles': {'$elemMatch': spec},
    })

    if not leg:
        insert_with_id(data)
        return

    leg.setdefault('old_roles', {})
    stored_term = leg['roles'][0]['term']

    if stored_term == prev_term:
        # Move to old
        leg['old_roles'][stored_term] = leg['roles']
    elif stored_term == next_term:
        leg['old_roles'][term] = data['roles']
        data['roles'] = leg['roles']

    update(leg, data, db.legislators)
def import_committees_from_legislators(current_term, level, abbr):
    """Create or extend committees based on legislators' committee roles.

    Considers every legislator of ``level`` with a role in ``current_term``
    for ``abbr``; each 'committee member' role not yet linked to a committee
    gets a committee looked up (or inserted) and the legislator added to its
    member list.
    """
    query = {
        'level': level,
        'roles': {'$elemMatch': {'term': current_term, level: abbr}},
    }

    for legislator in db.legislators.find(query):
        for role in legislator['roles']:
            # only committee roles that are not yet linked to a committee
            if role['type'] != 'committee member' or 'committee_id' in role:
                continue

            spec = {
                'level': level,
                level: abbr,
                'chamber': role['chamber'],
                'committee': role['committee'],
            }
            if 'subcommittee' in role:
                spec['subcommittee'] = role['subcommittee']

            committee = db.committees.find_one(spec)
            if not committee:
                # no such committee yet: the lookup spec becomes the new doc
                committee = spec
                committee['_type'] = 'committee'
                # copy required fields from legislator to committee
                for field in settings.BILLY_LEVEL_FIELDS:
                    committee[field] = legislator[field]
                committee['members'] = []
                committee['sources'] = []
                if 'subcommittee' not in committee:
                    committee['subcommittee'] = None
                insert_with_id(committee)

            already_member = any(
                member['leg_id'] == legislator['leg_id']
                for member in committee['members'])
            if already_member:
                continue

            committee['members'].append({
                'name': legislator['full_name'],
                'leg_id': legislator['leg_id'],
                'role': role.get('position') or 'member',
            })
            db.committees.save(committee, safe=True)

            role['committee_id'] = committee['_id']

        db.legislators.save(legislator, safe=True)
def test_update():
    """Check timestamp handling and locked fields in ``utils.update``.

    Updating an object with itself must leave ``updated_at`` alone, while a
    real change must bump it, overwrite ordinary fields and leave fields
    listed in ``_locked_fields`` untouched.
    """
    dt = datetime.datetime.utcnow()
    # _locked_fields is a list of field names; as a bare string any `in`
    # membership test would match substrings such as 'field'.
    obj1 = {'_type': 'bill', 'state': 'ex', 'field1': 'stuff',
            'field2': 'original', '_locked_fields': ['field2'],
            'created_at': dt, 'updated_at': dt}

    id1 = utils.insert_with_id(obj1)
    obj1 = db.bills.find_one(id1)

    # Updating a bill with itself shouldn't cause 'updated_at' to be changed
    utils.update(obj1, obj1, db.bills)
    obj2 = db.bills.find_one({'_id': id1})
    assert obj2['created_at'] == obj2['updated_at']
    assert obj1['updated_at'] == obj2['updated_at']

    utils.update(obj1, {'_type': 'bill', 'field1': 'more stuff',
                        'field2': 'a change', 'state': 'ex'},
                 db.bills)
    obj2 = db.bills.find_one({'_id': id1})
    assert obj2['created_at'] != obj2['updated_at']
    assert obj1['updated_at'] != obj2['updated_at']
    assert obj2['field1'] == 'more stuff'

    # make sure locked fields don't get overwritten
    assert obj2['field2'] == 'original'
def test_update():
    """A no-op update keeps timestamps; a real one bumps updated_at and
    respects locked fields."""
    original = {'_type': 'bill', 'level': 'state', 'state': 'ex',
                'field1': 'stuff', 'field2': 'original',
                '_locked_fields': ['field2']}

    bill_id = utils.insert_with_id(original)
    stored = db.bills.find_one(bill_id)

    # Updating a bill with itself shouldn't cause 'updated_at' to be changed
    utils.update(stored, stored, db.bills)
    reloaded = db.bills.find_one({'_id': bill_id})
    assert reloaded['created_at'] == reloaded['updated_at']
    assert reloaded['updated_at'] == stored['updated_at']
    first_timestamp = reloaded['created_at']  # compared against further down

    # update with a few fields changed
    changes = {'field1': 'more stuff', 'field2': 'a change'}
    time.sleep(0.005)  # sleep long enough to avoid created_at == updated_at
    utils.update(stored, changes, db.bills)
    reloaded = db.bills.find_one({'_id': bill_id})

    # check that timestamps have updated
    assert reloaded['created_at'] < reloaded['updated_at']
    assert first_timestamp < reloaded['updated_at']

    # field1 is overwritten, the locked field2 is not
    assert reloaded['field1'] == 'more stuff'
    assert reloaded['field2'] == 'original'
def test_update():
    """Check timestamp handling and locked fields in ``utils.update``.

    Updating an object with itself must leave ``updated_at`` alone, while a
    real change must bump it, overwrite ordinary fields and leave fields
    listed in ``_locked_fields`` untouched.
    """
    dt = datetime.datetime.utcnow()
    # _locked_fields is a list of field names; as a bare string any `in`
    # membership test would match substrings such as "field".
    obj1 = {
        "_type": "bill",
        "state": "ex",
        "field1": "stuff",
        "field2": "original",
        "_locked_fields": ["field2"],
        "created_at": dt,
        "updated_at": dt,
    }

    id1 = utils.insert_with_id(obj1)
    obj1 = db.bills.find_one(id1)

    # Updating a bill with itself shouldn't cause 'updated_at' to be changed
    utils.update(obj1, obj1, db.bills)
    obj2 = db.bills.find_one({"_id": id1})
    assert obj2["created_at"] == obj2["updated_at"]
    assert obj1["updated_at"] == obj2["updated_at"]

    utils.update(
        obj1,
        {"_type": "bill", "field1": "more stuff", "field2": "a change",
         "state": "ex"},
        db.bills)
    obj2 = db.bills.find_one({"_id": id1})
    assert obj2["created_at"] != obj2["updated_at"]
    assert obj1["updated_at"] != obj2["updated_at"]
    assert obj2["field1"] == "more stuff"

    # make sure locked fields don't get overwritten
    assert obj2["field2"] == "original"
def test_update():
    """Check timestamp handling and locked fields in ``utils.update``.

    Updating an object with itself must leave ``updated_at`` alone, while a
    real change must bump it, overwrite ordinary fields and leave fields
    listed in ``_locked_fields`` untouched.
    """
    dt = datetime.datetime.utcnow()
    # _locked_fields is a list of field names; as a bare string any `in`
    # membership test would match substrings such as 'field'.
    obj1 = {
        '_type': 'bill',
        'state': 'ex',
        'field1': 'stuff',
        'field2': 'original',
        '_locked_fields': ['field2'],
        'created_at': dt,
        'updated_at': dt
    }

    id1 = utils.insert_with_id(obj1)
    obj1 = db.bills.find_one(id1)

    # Updating a bill with itself shouldn't cause 'updated_at' to be changed
    utils.update(obj1, obj1, db.bills)
    obj2 = db.bills.find_one({'_id': id1})
    assert obj2['created_at'] == obj2['updated_at']
    assert obj1['updated_at'] == obj2['updated_at']

    utils.update(
        obj1, {
            '_type': 'bill',
            'field1': 'more stuff',
            'field2': 'a change',
            'state': 'ex'
        }, db.bills)
    obj2 = db.bills.find_one({'_id': id1})
    assert obj2['created_at'] != obj2['updated_at']
    assert obj1['updated_at'] != obj2['updated_at']
    assert obj2['field1'] == 'more stuff'

    # make sure locked fields don't get overwritten
    assert obj2['field2'] == 'original'
def test_update_sneaky_filter():
    """A per-field comparison callable decides whether a field counts as changed."""
    bill = {
        '_type': 'bill',
        'state': 'ex',
        'normal_field': 1,
        'set_field': [1, 2, 3]
    }

    def _differs_as_set(old, new):
        # order-insensitive comparison: only the membership matters
        return set(old) != set(new)

    sneaky_filter = {'set_field': _differs_as_set}

    bill_id = utils.insert_with_id(bill)
    bill = db.bills.find_one(bill_id)

    # the set will be the same, shouldn't update
    reordered = {'set_field': [3, 2, 1]}
    utils.update(bill, reordered, db.bills, sneaky_filter)
    assert bill['set_field'] == [1, 2, 3]
    assert bill['updated_at'] == bill['created_at']

    # the set now differs, should update
    extended = {'set_field': [4, 3, 2, 1]}
    utils.update(bill, extended, db.bills, sneaky_filter)
    assert bill['set_field'] == [4, 3, 2, 1]
    assert bill['updated_at'] > bill['created_at']
def test_update():
    """A no-op update keeps timestamps; a real one bumps updated_at and
    respects locked fields."""
    original = {'_type': 'bill', 'state': 'ex',
                'field1': 'stuff', 'field2': 'original',
                '_locked_fields': ['field2']}

    bill_id = utils.insert_with_id(original)
    stored = db.bills.find_one(bill_id)

    # Updating a bill with itself shouldn't cause 'updated_at' to be changed
    utils.update(stored, stored, db.bills)
    reloaded = db.bills.find_one({'_id': bill_id})
    assert reloaded['created_at'] == reloaded['updated_at']
    assert reloaded['updated_at'] == stored['updated_at']
    first_timestamp = reloaded['created_at']  # compared against further down

    # update with a few fields changed
    changes = {'field1': 'more stuff', 'field2': 'a change'}
    time.sleep(0.005)  # sleep long enough to avoid created_at == updated_at
    utils.update(stored, changes, db.bills)
    reloaded = db.bills.find_one({'_id': bill_id})

    # check that timestamps have updated
    assert reloaded['created_at'] < reloaded['updated_at']
    assert first_timestamp < reloaded['updated_at']

    # field1 is overwritten, the locked field2 is not
    assert reloaded['field1'] == 'more stuff'
    assert reloaded['field2'] == 'original'
def test_insert_with_id():
    """Inserted legislators get distinct EXL ids recorded in _all_ids."""
    first = {"full_name": "a test legislator", "_type": "person",
             "state": "ex"}
    second = {"full_name": "another legislator", "_type": "person",
              "state": "ex"}
    id_pattern = re.compile(r"^EXL\d{6,6}$")

    first_id = utils.insert_with_id(first)
    assert id_pattern.match(first_id)
    stored = db.legislators.find_one({"_id": first_id})
    assert stored["_all_ids"] == [first_id]

    second_id = utils.insert_with_id(second)
    assert id_pattern.match(second_id)
    assert second_id != first_id
    stored = db.legislators.find_one({"_id": second_id})
    assert stored
    assert stored["_all_ids"] == [second_id]
def test_deactivate_legislators():
    """A legislator whose only role is from an earlier term is deactivated
    and its roles archived; a current-term legislator is left as is."""
    # Previous term
    previous_roles = [{'type': 'member', 'chamber': 'upper', 'state': 'ex',
                       'term': '2009-2010', 'district': '1',
                       'party': 'Democrat', 'start_date': None,
                       'end_date': None}]
    previous_leg = {'_type': 'person', 'state': 'ex',
                    'roles': previous_roles,
                    'active': True, 'district': '1', 'chamber': 'upper',
                    'party': 'Democrat'}

    # Current term, no end date
    current_roles = [{'type': 'member', 'chamber': 'upper', 'state': 'ex',
                      'term': '2011-2012', 'district': '2',
                      'party': 'Democrat', 'start_date': None,
                      'end_date': None}]
    current_leg = {'_type': 'person', 'state': 'ex',
                   'roles': current_roles,
                   'active': True, 'district': '2', 'chamber': 'upper',
                   'party': 'Democrat'}

    previous_id = utils.insert_with_id(previous_leg)
    current_id = utils.insert_with_id(current_leg)

    legislators.deactivate_legislators('ex', '2011-2012')

    previous_leg = db.legislators.find_one({'_id': previous_id})
    assert previous_leg['active'] == False
    for field in ('chamber', 'district', 'party'):
        assert field not in previous_leg
    assert previous_leg['roles'] == []
    assert previous_leg['old_roles']['2009-2010'] == previous_roles

    current_leg = db.legislators.find_one({'_id': current_id})
    assert current_leg['active'] == True
    assert current_leg['chamber'] == 'upper'
    assert current_leg['district'] == '2'
    assert current_leg['party'] == 'Democrat'
    assert current_leg['roles'] == current_roles
    assert 'old_roles' not in current_leg
def import_legislator(data):
    """Insert a scraped legislator, or update the stored one it matches.

    Matching is done on state and scraped name plus a stored role that
    equals the first scraped role in the same, previous or next term.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for scraped in data['roles']:
        if 'role' in scraped:
            scraped['type'] = scraped.pop('role')

    first_role = data['roles'][0]
    term = first_role['term']

    prev_term = get_previous_term(data['state'], term)
    next_term = get_next_term(data['state'], term)

    role_spec = {'state': data['state'],
                 'type': first_role['type'],
                 'term': {'$in': [term, prev_term, next_term]}}
    if 'district' in first_role:
        role_spec['district'] = first_role['district']
    if 'chamber' in first_role:
        role_spec['chamber'] = first_role['chamber']

    query = {'state': data['state'],
             '_scraped_name': data['full_name'],
             'roles': {'$elemMatch': role_spec}}
    existing = db.legislators.find_one(query)

    if existing:
        if 'old_roles' not in existing:
            existing['old_roles'] = {}

        existing_term = existing['roles'][0]['term']
        if existing_term == prev_term:
            # Move to old
            existing['old_roles'][existing_term] = existing['roles']
        elif existing_term == next_term:
            # stored roles are newer: archive the scraped ones instead
            existing['old_roles'][term] = data['roles']
            data['roles'] = existing['roles']

        update(existing, data, db.legislators)
    else:
        insert_with_id(data)
def test_insert_with_id():
    """Inserted legislators get distinct EXL ids recorded in _all_ids."""
    first = {'full_name': 'a test legislator', '_type': 'person',
             'state': 'ex'}
    second = {'full_name': 'another legislator', '_type': 'person',
              'state': 'ex'}
    id_pattern = re.compile(r'^EXL\d{6,6}$')

    first_id = utils.insert_with_id(first)
    assert id_pattern.match(first_id)
    stored = db.legislators.find_one({'_id': first_id})
    assert stored['_all_ids'] == [first_id]

    second_id = utils.insert_with_id(second)
    assert id_pattern.match(second_id)
    assert second_id != first_id
    stored = db.legislators.find_one({'_id': second_id})
    assert stored
    assert stored['_all_ids'] == [second_id]
def test_insert_with_id():
    """Each insert returns a fresh, well-formed id stored in _all_ids."""
    legislator_a = {
        'full_name': 'a test legislator',
        '_type': 'person',
        'state': 'ex'
    }
    legislator_b = {
        'full_name': 'another legislator',
        '_type': 'person',
        'state': 'ex'
    }
    matches_id_format = re.compile(r'^EXL\d{6,6}$').match

    id_a = utils.insert_with_id(legislator_a)
    assert matches_id_format(id_a)
    doc = db.legislators.find_one({'_id': id_a})
    assert doc['_all_ids'] == [id_a]

    id_b = utils.insert_with_id(legislator_b)
    assert matches_id_format(id_b)
    assert id_b != id_a
    doc = db.legislators.find_one({'_id': id_b})
    assert doc
    assert doc['_all_ids'] == [id_b]
def test_update_sneaky_filter():
    """A per-field comparison callable decides whether a field counts as changed."""
    bill = {'_type': 'bill', 'level': 'state', 'state': 'ex',
            'normal_field': 1, 'set_field': [1, 2, 3]}

    def _differs_as_set(old, new):
        # order-insensitive comparison: only the membership matters
        return set(old) != set(new)

    sneaky_filter = {'set_field': _differs_as_set}

    bill_id = utils.insert_with_id(bill)
    bill = db.bills.find_one(bill_id)

    # the set will be the same, shouldn't update
    reordered = {'set_field': [3, 2, 1]}
    utils.update(bill, reordered, db.bills, sneaky_filter)
    assert bill['set_field'] == [1, 2, 3]
    assert bill['updated_at'] == bill['created_at']

    # the set now differs, should update
    extended = {'set_field': [4, 3, 2, 1]}
    utils.update(bill, extended, db.bills, sneaky_filter)
    assert bill['set_field'] == [4, 3, 2, 1]
    assert bill['updated_at'] > bill['created_at']
def import_bills(state, data_dir):
    """Import every scraped bill JSON file for ``state`` into ``db.bills``.

    Reads ``<data_dir>/<state>/bills/*.json``. For each file it normalises
    the bill id, resolves legislator and committee ids for sponsors and
    votes, merges version titles into ``alternate_titles``, then inserts the
    bill or updates the stored bill with the same state, session, chamber
    and bill id.

    Args:
        state: abbreviation used as the metadata ``_id`` and as the data
            sub-directory name.
        data_dir: directory that contains the per-state data directories.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # an empty/missing subject list is deliberately not copied over
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(
                state, data['session'], None, sponsor['name'])

        for vote in data['votes']:
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(
                    state, vote['chamber'], vote['committee'])

            # replace plain voter names with {'name', 'leg_id'} records
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                vote[vtype] = [
                    {'name': voter,
                     'leg_id': get_legislator_id(state, data['session'],
                                                 vote['chamber'], voter)}
                    for voter in vote[vtype]]

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        # keywords are computed the same way for new and existing bills
        data['_keywords'] = list(bill_keywords(data))

        if bill:
            update(bill, data, db.bills)
        else:
            insert_with_id(data)

    # single parenthesised argument: same output under Python 2 and 3
    print('imported %s bill files' % len(paths))

    populate_current_fields(state)
    ensure_indexes()
def import_legislator(data):
    """Insert a scraped legislator or merge it into a matching stored one.

    Three lookups are tried in order, all restricted to the same
    abbreviation and scraped name:

    1. a stored legislator with a current role matching the scraped role
       and term;
    2. a stored legislator with a matching role archived in ``old_roles``
       for the scraped term;
    3. a stored legislator with a matching current role in any term.

    Returns "update" when a stored legislator was updated, "insert" when a
    new document was inserted.
    """
    data = prepare_obj(data)

    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    for role in data['roles']:
        # Rename 'role' -> 'type'
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']

    abbr = data[settings.LEVEL_FIELD]

    spec = {
        settings.LEVEL_FIELD: abbr,
        'type': scraped_role['type'],
        'term': scraped_term,
    }
    for optional in ('district', 'chamber'):
        if optional in scraped_role:
            spec[optional] = scraped_role[optional]

    def _find_by_roles(roles_key):
        # match `spec` (in its current state) against the role list at roles_key
        return db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            roles_key: {'$elemMatch': spec},
        })

    # find matching legislator in current term
    leg = _find_by_roles('roles')

    # legislator with a matching old_role
    if not leg:
        spec.pop('term')
        leg = _find_by_roles('old_roles.%s' % scraped_term)

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            # don't overwrite their current roles
            data['roles'] = leg['roles']

    # active matching legislator from different term
    if not leg:
        spec.pop('term', None)
        leg = _find_by_roles('roles')

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            stored_term = leg['roles'][0]['term']
            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, stored_term):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                data['old_roles'][stored_term] = leg['roles']

    data = apply_filters(filters, data)

    if not leg:
        insert_with_id(data)
        return "insert"

    update(leg, data, db.legislators)
    return "update"
def import_bill(data, standalone_votes, categorizer):
    """
        insert or update a bill

        data - raw bill JSON
        standalone_votes - votes scraped separately; looked up (and removed)
                           by a (chamber, session, bill_id) key
        categorizer - SubjectCategorizer (None - no categorization)

        Returns "insert" when no prior version of the bill was found and a
        new document was inserted, "update" when the stored bill was updated.
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])
        # query based on companion
        spec = companion.copy()
        spec[settings.LEVEL_FIELD] = abbr
        # a falsy chamber is dropped so it does not restrict the lookup
        if not spec['chamber']:
            spec.pop('chamber')
        companion_obj = db.bills.find_one(spec)
        if companion_obj:
            companion['internal_id'] = companion_obj['_id']
        else:
            logger.warning('Unknown companion: {chamber} {session} {bill_id}'
                           .format(**companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({settings.LEVEL_FIELD: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        # pull off numeric portion of bill_id
        # NOTE(review): takes the second whitespace-separated token, so a
        # bill_id without whitespace raises IndexError here
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes += standalone_votes.pop((data['chamber'], data['session'],
                                            data['bill_id']), [])

    # do id matching and other vote prep
    if bill:
        prepare_votes(abbr, data['session'], bill['_id'], bill_votes)
    else:
        prepare_votes(abbr, data['session'], None, bill_votes)

    # process actions ###########

    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}

    # action types for which a matching vote is searched
    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    # vote ids attached to at least one action so far
    already_linked = set()
    # vote ids matched by more than one action; stripped again below
    remove_vote = set()

    for action in data['actions']:
        adate = action['date']

        # the resolvers close over the current `action` and are only
        # called within this iteration
        def _match_committee(name):
            return get_committee_id(abbr, action['actor'], name)

        def _match_legislator(name):
            return get_legislator_id(abbr,
                                     data['session'],
                                     action['actor'],
                                     name)

        resolvers = {
            "committee": _match_committee,
            "legislator": _match_legislator
        }

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolver = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                id = resolver(entity['name'])
                entity['id'] = id

        # first & last dates
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed dates
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

        # vote-action matching
        action_attached = False
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if set(action['type']).intersection(vote_flags) and action['date']:
            for vote in bill_votes:
                if not vote['date']:
                    continue

                # a vote matches when it is from the same chamber and less
                # than 20 hours away from the action
                delta = abs(vote['date'] - action['date'])
                if (delta < datetime.timedelta(hours=20) and
                        vote['chamber'] == action['actor']):
                    if action_attached:
                        # multiple votes match, we can't guess
                        action.pop('related_votes', None)
                    else:
                        related_vote = vote['vote_id']
                        if related_vote in already_linked:
                            remove_vote.add(related_vote)

                        already_linked.add(related_vote)
                        action['related_votes'] = [related_vote]
                        action_attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for vote in remove_vote:
            if vote in action.get('related_votes', []):
                action['related_votes'].remove(vote)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # add/update tracked_versions collection
        track_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)
    data = apply_filters(filters, data)

    if not bill:
        insert_with_id(data)
        git_add_bill(data)
        save_votes(data, bill_votes)
        return "insert"
    else:
        update(bill, data, db.bills)
        git_add_bill(bill)
        save_votes(bill, bill_votes)
        return "update"
def import_bill(data, votes):
    """Insert a scraped bill into ``db.bills`` or update the stored copy.

    ``data`` is the scraped bill dict and is modified in place: the bill id
    is run through ``fix_bill_id``, non-empty subjects are moved to
    ``scraped_subjects``, separately loaded votes are appended, legislator
    and committee ids are attached, and ``_term``, ``alternate_titles`` and
    ``_keywords`` are filled in.

    ``votes`` is keyed by ``(chamber, session, bill_id)``; the entry that
    belongs to this bill is popped off of it.
    """
    level = data['level']
    abbr = data[level]

    # clean up the bill_id before it is used as a lookup key
    data['bill_id'] = fix_bill_id(data['bill_id'])

    # Subjects are stored under scraped_subjects.  Blank lists are
    # intentionally not copied: a bill is often re-run at a point where
    # subjects can no longer be obtained.
    scraped = data.pop('subjects', None)
    if scraped:
        data['scraped_subjects'] = scraped

    # attach the votes that were loaded separately for this bill
    chamber, session, bill_id = (data['chamber'], data['session'],
                                 data['bill_id'])
    loaded_votes = votes.pop((chamber, session, bill_id), [])
    data['votes'].extend(loaded_votes)

    existing = db.bills.find_one({'level': level,
                                  level: abbr,
                                  'session': session,
                                  'chamber': chamber,
                                  'bill_id': bill_id})

    # let the matcher see the stored votes first, then assign vote ids
    matcher = VoteMatcher(abbr)
    if existing:
        matcher.learn_vote_ids(existing['votes'])
    matcher.set_vote_ids(data['votes'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        sponsor['leg_id'] = get_legislator_id(abbr, session, None,
                                              sponsor['name'])

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            vote['committee_id'] = get_committee_id(level, abbr,
                                                    vote['chamber'],
                                                    vote['committee'])

        # vote leg_ids: each scraped name becomes a name/leg_id pair
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            vote[vtype] = [
                {'name': voter,
                 'leg_id': get_legislator_id(abbr, session,
                                             vote['chamber'], voter)}
                for voter in vote[vtype]
            ]

    data['_term'] = term_for_session(abbr, session)

    # Merge any version titles into the alternate_titles list
    titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        for key in ('title', '+short_title'):
            if key in version:
                titles.add(version[key])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(titles)

    # update keywords
    data['_keywords'] = list(bill_keywords(data))

    if existing:
        update(existing, data, db.bills)
    else:
        insert_with_id(data)
def import_bill(data, votes, categorizer):
    """Insert or update a single scraped bill.

    data        - scraped bill dict; modified in place
    votes       - separately loaded votes keyed by
                  ``(chamber, session, bill_id)``; this bill's entry is
                  popped off of it
    categorizer - object with a ``categorize_bill(data)`` method, or a
                  falsy value to skip subject categorization

    Returns ``"insert"`` when no matching bill was stored yet and
    ``"update"`` otherwise.
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [fix_bill_id(bid) for bid in
                                      data['alternate_bill_ids']]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    subjects = data.pop('subjects', None)
    if subjects:
        data['scraped_subjects'] = subjects

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        numeric_bill_id = data['bill_id'].split()[1]
        bill_votes = votes.pop((data['chamber'], data['session'],
                                numeric_bill_id), [])
    else:
        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({'level': level, level: abbr,
                              'session': data['session'],
                              'chamber': data['chamber'],
                              'bill_id': data['bill_id']})

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        leg_id = get_legislator_id(abbr, data['session'], None,
                                   sponsor['name'])
        sponsor['leg_id'] = leg_id
        if leg_id is None:
            # no legislator matched; the sponsor may be a committee
            cid = get_committee_id(level, abbr, data['chamber'],
                                   sponsor['name'])
            if cid is not None:
                sponsor['committee_id'] = cid

    # process votes
    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            committee_id = get_committee_id(level, abbr, vote['chamber'],
                                            vote['committee'])
            vote['committee_id'] = committee_id

        # vote leg_ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in vote[vtype]:
                leg_id = get_legislator_id(abbr, data['session'],
                                           vote['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': leg_id})
            vote[vtype] = svlist

    # process actions
    dates = {'first': None, 'last': None, 'passed_upper': None,
             'passed_lower': None, 'signed': None}
    for action in data['actions']:
        # We'll try to recover some Committee IDs here.
        if "committee" in action:
            cid = get_committee_id(level, abbr, data['chamber'],
                                   action['committee'])
            action['_scraped_committee_name'] = action['committee']
            if cid is not None:
                action['committee'] = cid
            else:
                del action['committee']

        adate = action['date']

        # first & last
        # These are two independent checks: with an ``elif`` the first
        # action (or any action that lowers 'first') could never update
        # 'last', leaving it None for single-action bills and for actions
        # listed newest-first.
        if not dates['first'] or adate < dates['first']:
            dates['first'] = adate
        if not dates['last'] or adate > dates['last']:
            dates['last'] = adate

        # passed & signed (only the first occurrence of each is kept)
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = adate
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = adate
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = adate

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        if 'title' in version:
            alt_titles.add(version['title'])
        if '+short_title' in version:
            alt_titles.add(version['+short_title'])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alt_titles)

    if not bill:
        bill_id = insert_with_id(data)
        denormalize_votes(data, bill_id)
        return "insert"
    else:
        update(bill, data, db.bills)
        denormalize_votes(data, bill['_id'])
        return "update"
def import_legislator(data):
    """Insert a scraped legislator or merge it into a stored one.

    The stored legislator is searched for in three steps, all restricted to
    the same ``settings.LEVEL_FIELD`` value and ``_scraped_name``:

    1. a current role matching the first scraped role, including its term
    2. an entry in ``old_roles`` for the scraped term
    3. a current role matching the scraped role with the term ignored

    Depending on which step matched, the scraped roles are filed under
    ``old_roles`` or replace the stored roles.

    Returns ``"update"`` if a stored legislator was found and ``"insert"``
    otherwise.
    """
    data = prepare_obj(data)

    # fall back to the full name when the scraper gave no explicit
    # _scraped_name
    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')

        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    # only the first scraped role is used for matching
    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']

    abbr = data[settings.LEVEL_FIELD]
    spec = {settings.LEVEL_FIELD: abbr,
            'type': scraped_role['type'],
            'term': scraped_term}
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one(
        {settings.LEVEL_FIELD: abbr,
         '_scraped_name': data['_scraped_name'],
         'roles': {'$elemMatch': spec}})

    # legislator with a matching old_role
    if not leg:
        # the term is part of the old_roles key, so it is dropped from the
        # element spec for this query
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {'$elemMatch': spec}
        })

        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        # 'term' was already removed above; the default keeps this pop safe
        spec.pop('term', None)
        leg = db.legislators.find_one(
            {settings.LEVEL_FIELD: abbr,
             '_scraped_name': data['_scraped_name'],
             'roles': {'$elemMatch': spec}})
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})

            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                # the scraped term is not older: the stored roles are filed
                # under old_roles and the scraped roles stay current
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    # NOTE(review): `filters` is not defined in this function; presumably a
    # module-level value -- confirm.
    data = apply_filters(filters, data)

    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
def import_committee(data, current_session, current_term):
    """Insert or update one scraped committee and link its members.

    data            - scraped committee dict
    current_session - session passed to ``get_legislator_id`` when
                      resolving member names
    current_term    - term recorded on newly created committee roles

    The committee is matched on level, jurisdiction, chamber, committee
    name and (when present) subcommittee.  Each named member is resolved
    to a legislator; resolved legislators get a 'committee member' role
    for ``current_term`` unless they already have one for this committee.
    Members that cannot be resolved get ``leg_id`` set to ``None``.
    """
    level = data['level']
    abbr = data[level]

    spec = {'level': level,
            level: abbr,
            'chamber': data['chamber'],
            'committee': data['committee']}
    if 'subcommittee' in data:
        spec['subcommittee'] = data['subcommittee']

    # insert/update the actual committee object
    committee = db.committees.find_one(spec)

    if not committee:
        insert_with_id(data)
        committee = data
    else:
        update(committee, data, db.committees)

    # deal with the members, add roles
    for member in committee['members']:
        if not member['name']:
            continue

        leg_id = get_legislator_id(abbr, current_session,
                                   data['chamber'], member['name'])

        if not leg_id:
            print("No matches for %s" % member['name'].encode(
                'ascii', 'ignore'))
            member['leg_id'] = None
            continue

        legislator = db.legislators.find_one({'_id': leg_id})

        if not legislator:
            # find_one returns None when nothing matches; without this
            # guard the role loop below fails with a TypeError
            print('No legislator with ID %s' % leg_id)
            member['leg_id'] = None
            continue

        member['leg_id'] = leg_id

        for role in legislator['roles']:
            if (role['type'] == 'committee member' and
                    role['term'] == current_term and
                    role.get('committee_id') == committee['_id']):
                # the legislator already carries this committee role
                break
        else:
            new_role = {'type': 'committee member',
                        'committee': committee['committee'],
                        'term': current_term,
                        'chamber': committee['chamber'],
                        'committee_id': committee['_id'],
                        'level': level,
                        }
            # copy over all necessary fields from committee
            for f in settings.BILLY_LEVEL_FIELDS:
                new_role[f] = committee[f]

            if 'subcommittee' in committee:
                new_role['subcommittee'] = committee['subcommittee']
            legislator['roles'].append(new_role)
            legislator['updated_at'] = datetime.datetime.utcnow()
            db.legislators.save(legislator, safe=True)

    db.committees.save(committee, safe=True)
def import_bills(state, data_dir):
    """Import every ``bills/*.json`` file found under ``data_dir/state``.

    Each file is loaded, normalised and either inserted into ``db.bills``
    or merged into the stored bill with the same state, session, chamber
    and bill_id.  Votes returned by ``import_votes`` are attached to their
    bills; any that remain unmatched are reported at the end.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "bills", "*.json")

    meta = db.metadata.find_one({"_id": state})

    # Build a session to term mapping
    sessions = {session: term["name"]
                for term in meta["terms"]
                for session in term["sessions"]}

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as handle:
            data = prepare_obj(json.load(handle))

        # clean up bill_id
        data["bill_id"] = fix_bill_id(data["bill_id"])

        # Subjects are stored under scraped_subjects.  Blank lists are
        # intentionally not copied: a bill is quite commonly re-run at a
        # point where its subjects can no longer be obtained.
        scraped = data.pop("subjects", None)
        if scraped:
            data["scraped_subjects"] = scraped

        # add loaded votes to data
        vote_key = (data["chamber"], data["session"], data["bill_id"])
        loaded_votes = votes.pop(vote_key, [])
        data["votes"].extend(loaded_votes)

        existing = db.bills.find_one({"state": data["state"],
                                      "session": data["session"],
                                      "chamber": data["chamber"],
                                      "bill_id": data["bill_id"]})

        matcher = VoteMatcher(data["state"])
        if existing:
            matcher.learn_vote_ids(existing["votes"])
        matcher.set_vote_ids(data["votes"])

        # match sponsor leg_ids
        for sponsor in data["sponsors"]:
            sponsor["leg_id"] = get_legislator_id(state, data["session"],
                                                  None, sponsor["name"])

        for vote in data["votes"]:
            # committee_ids
            if "committee" in vote:
                vote["committee_id"] = get_committee_id(state,
                                                        vote["chamber"],
                                                        vote["committee"])

            # vote leg_ids
            for vtype in ("yes_votes", "no_votes", "other_votes"):
                vote[vtype] = [
                    {"name": voter,
                     "leg_id": get_legislator_id(state, data["session"],
                                                 vote["chamber"], voter)}
                    for voter in vote[vtype]
                ]

        data["_term"] = sessions[data["session"]]

        # Merge any version titles into the alternate_titles list
        titles = set(data.get("alternate_titles", []))
        for version in data["versions"]:
            for key in ("title", "+short_title"):
                if key in version:
                    titles.add(version[key])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            titles.remove(data["title"])
        except KeyError:
            pass
        data["alternate_titles"] = list(titles)

        # keywords are recomputed for inserts and updates alike
        data["_keywords"] = list(bill_keywords(data))
        if existing:
            update(existing, data, db.bills)
        else:
            insert_with_id(data)

    print("imported %s bill files" % len(paths))

    # whatever is still in `votes` never found its bill
    for remaining in votes.keys():
        print("Failed to match vote %s %s %s" % tuple(
            part.encode("ascii", "replace") for part in remaining))

    populate_current_fields(state)
    ensure_indexes()
def import_committees(state, data_dir):
    """Import committees for ``state`` and link legislators to them.

    All stored committees of the state first have their member lists
    cleared.  If ``data_dir/state/committees`` holds no JSON files,
    committees are derived from the committee roles on legislators of the
    latest term.  Otherwise every file is inserted or merged, and each
    named member is resolved to a legislator who then receives a
    'committee member' role for the latest term if it is not already
    present.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': state})
    # the last term and its last session are treated as current
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]

    paths = glob.glob(pattern)

    # start from empty member lists; they are rebuilt below
    for committee in db.committees.find({'state': state}):
        committee['members'] = []
        db.committees.save(committee)

    if not paths:
        # Not standalone committees
        for legislator in db.legislators.find({
                'roles': {'$elemMatch': {'term': current_term,
                                         'state': state}}}):

            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                        'committee_id' not in role):

                    spec = {'state': role['state'],
                            'chamber': role['chamber'],
                            'committee': role['committee']}
                    if 'subcommittee' in role:
                        spec['subcommittee'] = role['subcommittee']

                    committee = db.committees.find_one(spec)

                    if not committee:
                        committee = spec
                        committee['_type'] = 'committee'
                        committee['members'] = []
                        committee['sources'] = []
                        if 'subcommittee' not in committee:
                            committee['subcommittee'] = None
                        insert_with_id(committee)

                    # NOTE(review): nesting of this for/else was rebuilt
                    # from a whitespace-collapsed source -- confirm.
                    for member in committee['members']:
                        if member['leg_id'] == legislator['leg_id']:
                            break
                    else:
                        committee['members'].append(
                            {'name': legislator['full_name'],
                             'leg_id': legislator['leg_id'],
                             'role': role.get('position') or 'member'})
                        db.committees.save(committee, safe=True)

                        role['committee_id'] = committee['_id']

            db.legislators.save(legislator, safe=True)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        spec = {'state': state,
                'chamber': data['chamber'],
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']

        committee = db.committees.find_one(spec)

        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)

        for member in committee['members']:
            if not member['name']:
                continue

            leg_id = get_legislator_id(state, current_session,
                                       data['chamber'], member['name'])

            if not leg_id:
                print("No matches for %s" % member['name'].encode(
                    'ascii', 'ignore'))
                member['leg_id'] = None
                continue

            legislator = db.legislators.find_one({'_id': leg_id})

            if not legislator:
                # find_one returns None when nothing matches; without this
                # guard the role loop below fails with a TypeError
                print('No legislator with ID %s' % leg_id)
                member['leg_id'] = None
                continue

            member['leg_id'] = leg_id

            for role in legislator['roles']:
                # .get(): committee roles that have not been linked to a
                # committee yet carry no 'committee_id' key
                if (role['type'] == 'committee member' and
                        role['term'] == current_term and
                        role.get('committee_id') == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}
                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
                legislator['updated_at'] = datetime.datetime.utcnow()
                db.legislators.save(legislator, safe=True)

        db.committees.save(committee, safe=True)

    print('imported %s committee files' % len(paths))

    link_parents(state)

    ensure_indexes()
def import_bill(data, votes):
    """Store one scraped bill, inserting it or merging into the stored one.

    ``data`` (the scraped bill dict) is modified in place.  ``votes`` is
    keyed by ``(chamber, session, bill_id)``; this bill's entry is removed
    from it and appended to ``data['votes']``.
    """
    level = data['level']
    abbr = data[level]

    def legislator_for(chamber, name):
        # resolve a scraped name within this bill's session
        return get_legislator_id(abbr, data['session'], chamber, name)

    # clean up bill_id
    data['bill_id'] = fix_bill_id(data['bill_id'])

    # Move subjects to scraped_subjects, but never copy a blank list:
    # re-runs frequently cannot fetch subjects any more and must not wipe
    # the ones saved earlier.
    found_subjects = data.pop('subjects', None)
    if found_subjects:
        data['scraped_subjects'] = found_subjects

    # add loaded votes to data
    key = (data['chamber'], data['session'], data['bill_id'])
    extra_votes = votes.pop(key, [])
    data['votes'].extend(extra_votes)

    lookup = {
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    }
    stored = db.bills.find_one(lookup)

    id_matcher = VoteMatcher(abbr)
    if stored:
        id_matcher.learn_vote_ids(stored['votes'])
    id_matcher.set_vote_ids(data['votes'])

    # match sponsor leg_ids
    for sponsor in data['sponsors']:
        sponsor['leg_id'] = legislator_for(None, sponsor['name'])

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            vote['committee_id'] = get_committee_id(
                level, abbr, vote['chamber'], vote['committee'])

        # vote leg_ids
        for bucket in ('yes_votes', 'no_votes', 'other_votes'):
            resolved = []
            for voter in vote[bucket]:
                resolved.append({
                    'name': voter,
                    'leg_id': legislator_for(vote['chamber'], voter),
                })
            vote[bucket] = resolved

    data['_term'] = term_for_session(abbr, data['session'])

    # Merge any version titles into the alternate_titles list
    alternates = set(data.get('alternate_titles', []))
    for version in data['versions']:
        if 'title' in version:
            alternates.add(version['title'])
        if '+short_title' in version:
            alternates.add(version['+short_title'])
    try:
        # the primary title must not show up among the alternates
        alternates.remove(data['title'])
    except KeyError:
        pass
    data['alternate_titles'] = list(alternates)

    # update keywords
    data['_keywords'] = list(bill_keywords(data))

    if stored:
        update(stored, data, db.bills)
    else:
        insert_with_id(data)
def test_deactivate_legislators():
    """Check ``deactivate_legislators('T2', 'ex')`` on three legislators.

    A legislator whose only role is from term T1 and one whose T2 role has
    an end date must end up inactive with their roles moved to
    ``old_roles``; a T2 legislator without an end date must be untouched.
    """
    def upper_role(term, district, end_date=None):
        # all fixture roles are upper-chamber Democrats in state 'ex'
        return {
            'type': 'member',
            'chamber': 'upper',
            'state': 'ex',
            'term': term,
            'district': district,
            'party': 'Democrat',
            'start_date': None,
            'end_date': end_date
        }

    def active_person(role):
        # a legislator carrying the top-level fields of an active member
        return {
            '_type': 'person',
            'state': 'ex',
            'roles': [role],
            'active': True,
            'district': role['district'],
            'chamber': 'upper',
            'party': 'Democrat'
        }

    def check_deactivated(leg_id, term, roles):
        stored = db.legislators.find_one({'_id': leg_id})
        assert stored['active'] is False
        assert 'chamber' not in stored
        assert 'district' not in stored
        assert 'party' not in stored
        assert stored['roles'] == []
        assert stored['old_roles'][term] == roles

    # Previous term
    previous = active_person(upper_role('T1', '1'))
    previous_roles = previous['roles']

    # Current term, no end date
    current = active_person(upper_role('T2', '2'))
    current_roles = current['roles']

    # Current term, with end date
    ended = {
        '_type': 'person',
        'state': 'ex',
        'roles': [upper_role('T2', '3', datetime.datetime(2012, 1, 1))]
    }
    ended_roles = ended['roles']

    previous_id = utils.insert_with_id(previous)
    current_id = utils.insert_with_id(current)
    ended_id = utils.insert_with_id(ended)

    legislators.deactivate_legislators('T2', 'ex')

    check_deactivated(previous_id, 'T1', previous_roles)

    stored_current = db.legislators.find_one({'_id': current_id})
    assert stored_current['active'] is True
    assert stored_current['chamber'] == 'upper'
    assert stored_current['district'] == '2'
    assert stored_current['party'] == 'Democrat'
    assert stored_current['roles'] == current_roles
    assert 'old_roles' not in stored_current

    check_deactivated(ended_id, 'T2', ended_roles)
def import_bills(state, data_dir):
    """Load all bill JSON files for ``state`` and store them in ``db.bills``.

    Files are read from ``data_dir/state/bills``.  Every bill is inserted,
    or merged into the stored bill with the same state, session, chamber
    and bill_id.  Votes from ``import_votes`` that match no bill are
    printed once all files have been handled.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    sessions = dict((session, term['name'])
                    for term in meta['terms']
                    for session in term['sessions'])

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as bill_file:
            data = prepare_obj(json.load(bill_file))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects (only when there are any)
        found_subjects = data.pop('subjects', None)
        if found_subjects:
            data['scraped_subjects'] = found_subjects

        # add loaded votes to data
        extra_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])
        data['votes'].extend(extra_votes)

        lookup = {
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id']
        }
        stored = db.bills.find_one(lookup)

        id_matcher = VoteMatcher(data['state'])
        if stored:
            id_matcher.learn_vote_ids(stored['votes'])
        id_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(
                state, data['session'], None, sponsor['name'])

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(
                    state, vote['chamber'], vote['committee'])

            # vote leg_ids
            for bucket in ('yes_votes', 'no_votes', 'other_votes'):
                resolved = []
                for voter in vote[bucket]:
                    resolved.append({
                        'name': voter,
                        'leg_id': get_legislator_id(
                            state, data['session'], vote['chamber'], voter),
                    })
                vote[bucket] = resolved

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alternates = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alternates.add(version['title'])
            if '+short_title' in version:
                alternates.add(version['+short_title'])
        try:
            # the primary title must not show up among the alternates
            alternates.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alternates)

        # keywords are computed before either kind of write
        data['_keywords'] = list(bill_keywords(data))
        if stored:
            update(stored, data, db.bills)
        else:
            insert_with_id(data)

    print('imported %s bill files' % len(paths))

    # entries still left in `votes` were never claimed by a bill
    for remaining in votes.keys():
        encoded = tuple(
            [piece.encode('ascii', 'replace') for piece in remaining])
        print('Failed to match vote %s %s %s' % encoded)

    populate_current_fields(state)
    ensure_indexes()
def import_bill(data, votes, categorizer):
    """Insert or update one scraped bill.

    data        - scraped bill dict; modified in place
    votes       - separately loaded votes keyed by
                  ``(chamber, session, bill_id)``; this bill's entry is
                  popped off of it
    categorizer - object with a ``categorize_bill(data)`` method, or a
                  falsy value to skip categorization

    Returns ``"insert"`` or ``"update"``.
    """
    level = data["level"]
    abbr = data[level]

    # clean up bill_ids
    data["bill_id"] = fix_bill_id(data["bill_id"])
    if "alternate_bill_ids" in data:
        data["alternate_bill_ids"] = [
            fix_bill_id(alt_id) for alt_id in data["alternate_bill_ids"]
        ]

    # Move subjects to scraped_subjects.  Blank lists are intentionally
    # not copied: re-runs commonly cannot fetch subjects any more and must
    # not erase the ones saved earlier.
    scraped = data.pop("subjects", None)
    if scraped:
        data["scraped_subjects"] = scraped

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # Hack added for Rhode Island, where the full bill_id of a vote can't
    # be determined: if the metadata has this key only the numeric portion
    # is used.  Not ideal, as it won't work in states where HB/SB numbers
    # overlap, but in RI they never do.
    if metadata(abbr).get("_partial_vote_bill_id"):
        vote_bill_id = data["bill_id"].split()[1]
    else:
        vote_bill_id = data["bill_id"]

    # add loaded votes to data
    loaded_votes = votes.pop(
        (data["chamber"], data["session"], vote_bill_id), [])
    data["votes"].extend(loaded_votes)

    existing = db.bills.find_one({
        "level": level,
        level: abbr,
        "session": data["session"],
        "chamber": data["chamber"],
        "bill_id": data["bill_id"],
    })

    # keep vote/doc ids consistent
    vote_ids = VoteMatcher(abbr)
    doc_ids = DocumentMatcher(abbr)
    if existing:
        vote_ids.learn_ids(existing["votes"])
        doc_ids.learn_ids(existing["versions"] + existing["documents"])
    vote_ids.set_ids(data["votes"])
    doc_ids.set_ids(data["versions"] + data["documents"])

    # match sponsor leg_ids
    for sponsor in data["sponsors"]:
        sponsor["leg_id"] = get_legislator_id(abbr, data["session"], None,
                                              sponsor["name"])

    for vote in data["votes"]:
        # committee_ids
        if "committee" in vote:
            vote["committee_id"] = get_committee_id(level, abbr,
                                                    vote["chamber"],
                                                    vote["committee"])

        # vote leg_ids
        for vtype in ("yes_votes", "no_votes", "other_votes"):
            vote[vtype] = [
                {"name": voter,
                 "leg_id": get_legislator_id(abbr, data["session"],
                                             vote["chamber"], voter)}
                for voter in vote[vtype]
            ]

    data["_term"] = term_for_session(abbr, data["session"])

    titles = set(data.get("alternate_titles", []))
    for version in data["versions"]:
        # push versions to oyster
        if settings.ENABLE_OYSTER and "url" in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        for key in ("title", "+short_title"):
            if key in version:
                titles.add(version[key])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        titles.remove(data["title"])
    except KeyError:
        pass
    data["alternate_titles"] = list(titles)

    if existing:
        update(existing, data, db.bills)
        return "update"

    insert_with_id(data)
    return "insert"
def import_committee(data, current_session, current_term):
    """Insert or update one scraped committee and link its members.

    The committee is matched on level, jurisdiction, chamber, committee
    name and (when present) subcommittee.  Each named member is resolved
    to a legislator; resolved legislators receive a 'committee member'
    role for ``current_term`` unless they already have one for this
    committee.  Unresolvable members get ``leg_id`` set to ``None``.

    Returns ``"insert"`` or ``"update"`` depending on whether the
    committee was already stored.
    """
    level = data['level']
    abbr = data[level]

    spec = {
        'level': level,
        level: abbr,
        'chamber': data['chamber'],
        'committee': data['committee'],
    }
    if 'subcommittee' in data:
        spec['subcommittee'] = data['subcommittee']

    # insert/update the actual committee object
    committee = db.committees.find_one(spec)
    if committee:
        update(committee, data, db.committees)
        status = "update"
    else:
        insert_with_id(data)
        committee = data
        status = "insert"

    # deal with the members, add roles
    for member in committee['members']:
        if not member['name']:
            continue

        leg_id = get_legislator_id(abbr, current_session, data['chamber'],
                                   member['name'])
        if not leg_id:
            logger.debug("No matches for %s" %
                         member['name'].encode('ascii', 'ignore'))
            member['leg_id'] = None
            continue

        legislator = db.legislators.find_one({'_id': leg_id})
        if not legislator:
            logger.warning('No legislator with ID %s' % leg_id)
            member['leg_id'] = None
            continue

        member['leg_id'] = leg_id

        # nothing to add when the role for this committee and term exists
        has_role = any(
            role['type'] == 'committee member' and
            role['term'] == current_term and
            role.get('committee_id') == committee['_id']
            for role in legislator['roles'])
        if has_role:
            continue

        new_role = {
            'type': 'committee member',
            'committee': committee['committee'],
            'term': current_term,
            'chamber': committee['chamber'],
            'committee_id': committee['_id'],
            'level': level,
        }
        # copy over all necessary fields from committee
        for field in settings.BILLY_LEVEL_FIELDS:
            new_role[field] = committee[field]
        if 'subcommittee' in committee:
            new_role['subcommittee'] = committee['subcommittee']

        legislator['roles'].append(new_role)
        legislator['updated_at'] = datetime.datetime.utcnow()
        db.legislators.save(legislator, safe=True)

    db.committees.save(committee, safe=True)
    return status
def import_bills(state, data_dir):
    """Import the scraped bills of ``state`` from ``data_dir`` into the db.

    Reads ``data_dir/state/bills/*.json``, attaches the votes returned by
    ``import_votes``, resolves legislator and committee ids and inserts or
    updates each bill in ``db.bills``.  Votes that matched no bill are
    printed afterwards.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})

    # Build a session to term mapping
    term_of_session = {}
    for term in meta['terms']:
        term_of_session.update(
            (session, term['name']) for session in term['sessions'])

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)

    for path in paths:
        with open(path) as source:
            data = prepare_obj(json.load(source))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # Move subjects to scraped_subjects.  Blank lists are intentionally
        # left out: a re-run quite commonly can't get subjects any more and
        # must not overwrite the ones stored before.
        bill_subjects = data.pop('subjects', None)
        if bill_subjects:
            data['scraped_subjects'] = bill_subjects

        # add loaded votes to data
        standalone = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
        data['votes'].extend(standalone)

        previous = db.bills.find_one({'state': data['state'],
                                      'session': data['session'],
                                      'chamber': data['chamber'],
                                      'bill_id': data['bill_id']})

        vote_ids = VoteMatcher(data['state'])
        if previous:
            vote_ids.learn_vote_ids(previous['votes'])
        vote_ids.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            sponsor['leg_id'] = get_legislator_id(state, data['session'],
                                                  None, sponsor['name'])

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                vote['committee_id'] = get_committee_id(state,
                                                        vote['chamber'],
                                                        vote['committee'])

            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                vote[vtype] = [
                    {'name': person,
                     'leg_id': get_legislator_id(state, data['session'],
                                                 vote['chamber'], person)}
                    for person in vote[vtype]
                ]

        data['_term'] = term_of_session[data['session']]

        # Merge any version titles into the alternate_titles list
        alt = set(data.get('alternate_titles', []))
        for version in data['versions']:
            for title_key in ('title', '+short_title'):
                if title_key in version:
                    alt.add(version[title_key])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt)

        # keywords are refreshed whether the bill is new or not
        data['_keywords'] = list(bill_keywords(data))
        if previous:
            update(previous, data, db.bills)
        else:
            insert_with_id(data)

    print('imported %s bill files' % len(paths))

    # anything left in `votes` could not be tied to a bill
    for remaining in votes.keys():
        print('Failed to match vote %s %s %s' % tuple(
            item.encode('ascii', 'replace') for item in remaining))

    populate_current_fields(state)
    ensure_indexes()
def import_committee(data, current_session, current_term):
    """Insert or update one scraped committee and sync member roles.

    The committee is matched on ``settings.LEVEL_FIELD``, chamber,
    committee name and (when present) subcommittee.  Each named member is
    resolved to a legislator through ``_all_ids``.  A legislator who
    already has the role for this committee and term only gets its
    position refreshed; otherwise a new 'committee member' role is added.
    Unresolvable members get ``leg_id`` set to ``None``.

    Returns ``"insert"`` or ``"update"`` depending on whether the
    committee was already stored.
    """
    level_field = settings.LEVEL_FIELD
    abbr = data[level_field]

    spec = {
        level_field: abbr,
        'chamber': data['chamber'],
        'committee': data['committee'],
    }
    if 'subcommittee' in data:
        spec['subcommittee'] = data['subcommittee']

    # insert/update the actual committee object
    committee = db.committees.find_one(spec)
    if committee:
        update(committee, data, db.committees)
        status = "update"
    else:
        insert_with_id(data)
        committee = data
        status = "insert"

    # deal with the members, add roles
    for member in committee['members']:
        if not member['name']:
            continue

        leg_id = get_legislator_id(abbr, current_session, data['chamber'],
                                   member['name'])
        if not leg_id:
            logger.debug("No matches for %s" %
                         member['name'].encode('ascii', 'ignore'))
            member['leg_id'] = None
            continue

        legislator = db.legislators.find_one({'_all_ids': leg_id})
        if not legislator:
            logger.warning('No legislator with ID %s' % leg_id)
            member['leg_id'] = None
            continue

        # store the id of the record that was found, which may differ from
        # the id that was looked up through _all_ids
        member['leg_id'] = legislator['_id']

        # first role (if any) tying this legislator to the committee
        existing_role = next(
            (role for role in legislator['roles']
             if role['type'] == 'committee member' and
             role['term'] == current_term and
             role.get('committee_id') == committee['_id']),
            None)

        if existing_role is not None:
            # if the position hadn't been copied over before, copy it now
            if existing_role.get('position') != member['role']:
                existing_role['position'] = member['role']
                db.legislators.save(legislator, safe=True)
            continue

        new_role = {
            'type': 'committee member',
            'committee': committee['committee'],
            'term': current_term,
            'chamber': committee['chamber'],
            'committee_id': committee['_id'],
            'position': member['role'],
        }
        # copy over all necessary fields from committee
        new_role[settings.LEVEL_FIELD] = committee[settings.LEVEL_FIELD]
        if 'subcommittee' in committee:
            new_role['subcommittee'] = committee['subcommittee']

        legislator['roles'].append(new_role)
        legislator['updated_at'] = datetime.datetime.utcnow()
        db.legislators.save(legislator, safe=True)

    db.committees.save(committee, safe=True)
    return status
def test_deactivate_legislators():
    """Exercise ``legislators.deactivate_legislators('T2', 'ex')``.

    Three legislators are inserted: one whose role is from term T1, one
    with an open-ended T2 role and one whose T2 role has an end date.
    Only the open-ended T2 legislator may stay active.
    """
    top_level_fields = ('chamber', 'district', 'party')

    # Previous term
    old_term_roles = [{'type': 'member', 'chamber': 'upper',
                       'state': 'ex', 'term': 'T1', 'district': '1',
                       'party': 'Democrat', 'start_date': None,
                       'end_date': None}]
    old_term_leg = {'_type': 'person', 'state': 'ex',
                    'roles': old_term_roles,
                    'active': True, 'district': '1', 'chamber': 'upper',
                    'party': 'Democrat'}

    # Current term, no end date
    serving_roles = [{'type': 'member', 'chamber': 'upper',
                      'state': 'ex', 'term': 'T2', 'district': '2',
                      'party': 'Democrat', 'start_date': None,
                      'end_date': None}]
    serving_leg = {'_type': 'person', 'state': 'ex',
                   'roles': serving_roles,
                   'active': True, 'district': '2', 'chamber': 'upper',
                   'party': 'Democrat'}

    # Current term, with end date
    departed_roles = [{'type': 'member', 'chamber': 'upper',
                       'state': 'ex', 'term': 'T2', 'district': '3',
                       'party': 'Democrat', 'start_date': None,
                       'end_date': datetime.datetime(2012, 1, 1)}]
    departed_leg = {'_type': 'person', 'state': 'ex',
                    'roles': departed_roles}

    old_term_id = utils.insert_with_id(old_term_leg)
    serving_id = utils.insert_with_id(serving_leg)
    departed_id = utils.insert_with_id(departed_leg)

    legislators.deactivate_legislators('T2', 'ex')

    # the previous-term legislator is deactivated and its roles archived
    stored = db.legislators.find_one({'_id': old_term_id})
    assert stored['active'] is False
    for field in top_level_fields:
        assert field not in stored
    assert stored['roles'] == []
    assert stored['old_roles']['T1'] == old_term_roles

    # the open-ended current-term legislator is left alone
    stored = db.legislators.find_one({'_id': serving_id})
    assert stored['active'] is True
    assert stored['chamber'] == 'upper'
    assert stored['district'] == '2'
    assert stored['party'] == 'Democrat'
    assert stored['roles'] == serving_roles
    assert 'old_roles' not in stored

    # a current-term role with an end date is deactivated as well
    stored = db.legislators.find_one({'_id': departed_id})
    assert stored['active'] is False
    for field in top_level_fields:
        assert field not in stored
    assert stored['roles'] == []
    assert stored['old_roles']['T2'] == departed_roles
def import_bill(data, standalone_votes, categorizer):
    """
    Insert a new bill or update the stored one.

    data - raw bill JSON
    standalone_votes - votes scraped separately, keyed by
                       (chamber, session, bill_id); matching entries are
                       popped from this mapping
    categorizer - SubjectCategorizer (None - no categorization)

    Returns "insert" if no bill with the same level/session/chamber/bill_id
    was stored yet, otherwise "update".
    """
    abbr = data[settings.LEVEL_FIELD]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(alternate) for alternate in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    scraped = data.pop('subjects', None)
    if scraped:
        data['scraped_subjects'] = scraped

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # companions: normalise the id, then try to link to a stored bill
    for companion in data['companions']:
        companion['bill_id'] = fix_bill_id(companion['bill_id'])

        # query on every companion field; a falsy chamber is left out
        query = companion.copy()
        query[settings.LEVEL_FIELD] = abbr
        if not query['chamber']:
            del query['chamber']

        match = db.bills.find_one(query)
        if match:
            companion['internal_id'] = match['_id']
        else:
            logger.warning(
                'Unknown companion: {chamber} {session} {bill_id}'.format(
                    **companion))

    # look for a prior version of this bill
    bill = db.bills.find_one({
        settings.LEVEL_FIELD: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep doc ids consistent
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids
    match_sponsor_ids(abbr, data)

    # process votes ############

    # pull votes off bill
    bill_votes = data.pop('votes', [])

    # grab the external bill votes if present
    if metadata(abbr).get('_partial_vote_bill_id'):
        # this is a hack initially added for Rhode Island where we can't
        # determine the full bill_id, if this key is in the metadata
        # we just use the numeric portion, not ideal as it won't work
        # where HB/SBs overlap, but in RI they never do
        vote_bill_id = data['bill_id'].split()[1]
    else:
        vote_bill_id = data['bill_id']
    bill_votes += standalone_votes.pop(
        (data['chamber'], data['session'], vote_bill_id), [])

    # do id matching and other vote prep (no bill id for a brand new bill)
    prepare_votes(abbr, data['session'], bill['_id'] if bill else None,
                  bill_votes)

    # process actions ###########

    dates = {
        'first': None,
        'last': None,
        'passed_upper': None,
        'passed_lower': None,
        'signed': None
    }

    vote_flags = {
        "bill:passed",
        "bill:failed",
        "bill:veto_override:passed",
        "bill:veto_override:failed",
        "amendment:passed",
        "amendment:failed",
        "committee:passed",
        "committee:passed:favorable",
        "committee:passed:unfavorable",
        "committee:passed:failed"
    }
    already_linked = set()
    remove_vote = set()

    # related-entity resolvers, keyed by entity type; each one receives the
    # action so the action's actor is used as the chamber for the lookup
    def _resolve_committee(act, name):
        return get_committee_id(abbr, act['actor'], name)

    def _resolve_legislator(act, name):
        return get_legislator_id(abbr, data['session'], act['actor'], name)

    resolvers = {
        "committee": _resolve_committee,
        "legislator": _resolve_legislator
    }

    for action in data['actions']:
        when = action['date']

        if "related_entities" in action:
            for entity in action['related_entities']:
                try:
                    resolve = resolvers[entity['type']]
                except KeyError as e:
                    # We don't know how to deal.
                    logger.error("I don't know how to sort a %s" % e)
                    continue

                entity['id'] = resolve(action, entity['name'])

        # first & last dates
        if not dates['first'] or when < dates['first']:
            dates['first'] = when
        if not dates['last'] or when > dates['last']:
            dates['last'] = when

        # passed & signed dates: only the first qualifying action counts
        if (not dates['passed_upper'] and action['actor'] == 'upper'
                and 'bill:passed' in action['type']):
            dates['passed_upper'] = when
        elif (not dates['passed_lower'] and action['actor'] == 'lower'
                and 'bill:passed' in action['type']):
            dates['passed_lower'] = when
        elif (not dates['signed'] and 'governor:signed' in action['type']):
            dates['signed'] = when

        # vote-action matching
        # only attempt vote matching if action has a date and is one of the
        # designated vote action types
        if not (set(action['type']).intersection(vote_flags) and when):
            continue

        attached = False
        for vote in bill_votes:
            if not vote['date']:
                continue

            gap = abs(vote['date'] - when)
            if not (gap < datetime.timedelta(hours=20) and
                    vote['chamber'] == action['actor']):
                continue

            if attached:
                # multiple votes match, we can't guess
                action.pop('related_votes', None)
                continue

            vote_id = vote['vote_id']
            if vote_id in already_linked:
                # same vote linked from an earlier action: flag for removal
                remove_vote.add(vote_id)
            already_linked.add(vote_id)
            action['related_votes'] = [vote_id]
            attached = True

    # remove related_votes that we linked to multiple actions
    for action in data['actions']:
        for flagged in remove_vote:
            if flagged in action.get('related_votes', []):
                action['related_votes'].remove(flagged)

    # save action dates to data
    data['action_dates'] = dates

    data['_term'] = term_for_session(abbr, data['session'])

    # Merge any version titles into the alternate_titles list
    alt_titles = set(data.get('alternate_titles', []))
    for version in data['versions']:
        for title_key in ('title', '+short_title'):
            if title_key in version:
                alt_titles.add(version[title_key])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        # primary title not among the alternates (or no 'title' key at all)
        pass
    data['alternate_titles'] = list(alt_titles)

    # `filters` is not defined in this function; it is resolved from the
    # enclosing module
    data = apply_filters(filters, data)

    if bill:
        update(bill, data, db.bills)
        saved, status = bill, "update"
    else:
        insert_with_id(data)
        saved, status = data, "insert"

    elasticsearch_push(saved)
    git_add_bill(saved)
    save_votes(saved, bill_votes)
    return status
def import_bill(data, votes, categorizer):
    """
    Insert a new bill or update the stored one (level-keyed variant).

    data - raw bill JSON; data['level'] names the key that holds the
           jurisdiction abbreviation
    votes - separately loaded votes keyed by (chamber, session, bill_id);
            the entry for this bill is popped and appended to data['votes']
    categorizer - object with categorize_bill(data), or a falsy value to
                  skip categorization

    Returns "insert" if no matching bill was stored yet, else "update".
    """
    level = data['level']
    abbr = data[level]

    # clean up bill_ids
    data['bill_id'] = fix_bill_id(data['bill_id'])
    if 'alternate_bill_ids' in data:
        data['alternate_bill_ids'] = [
            fix_bill_id(alternate) for alternate in data['alternate_bill_ids']
        ]

    # move subjects to scraped_subjects
    # NOTE: intentionally doesn't copy blank lists of subjects
    # this avoids the problem where a bill is re-run but we can't
    # get subjects anymore (quite common)
    scraped = data.pop('subjects', None)
    if scraped:
        data['scraped_subjects'] = scraped

    # update categorized subjects
    if categorizer:
        categorizer.categorize_bill(data)

    # this is a hack added for Rhode Island where we can't
    # determine the full bill_id, if this key is in the metadata
    # we just use the numeric portion, not ideal as it won't work
    # in states where HB/SBs overlap, but in RI they never do
    if metadata(abbr).get('_partial_vote_bill_id'):
        # pull off numeric portion of bill_id
        vote_bill_id = data['bill_id'].split()[1]
    else:
        vote_bill_id = data['bill_id']

    # add loaded votes to data
    bill_votes = votes.pop(
        (data['chamber'], data['session'], vote_bill_id), [])
    data['votes'].extend(bill_votes)

    bill = db.bills.find_one({
        'level': level,
        level: abbr,
        'session': data['session'],
        'chamber': data['chamber'],
        'bill_id': data['bill_id']
    })

    # keep vote/doc ids consistent
    vote_matcher = VoteMatcher(abbr)
    doc_matcher = DocumentMatcher(abbr)
    if bill:
        vote_matcher.learn_ids(bill['votes'])
        doc_matcher.learn_ids(bill['versions'] + bill['documents'])
    vote_matcher.set_ids(data['votes'])
    doc_matcher.set_ids(data['versions'] + data['documents'])

    # match sponsor leg_ids (no chamber is passed for sponsors)
    for sponsor in data['sponsors']:
        sponsor['leg_id'] = get_legislator_id(abbr, data['session'], None,
                                              sponsor['name'])

    for vote in data['votes']:
        # committee_ids
        if 'committee' in vote:
            vote['committee_id'] = get_committee_id(level, abbr,
                                                    vote['chamber'],
                                                    vote['committee'])

        # vote leg_ids: replace each bare name with a name/leg_id record
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            vote[vtype] = [
                {'name': voter,
                 'leg_id': get_legislator_id(abbr, data['session'],
                                             vote['chamber'], voter)}
                for voter in vote[vtype]
            ]

    data['_term'] = term_for_session(abbr, data['session'])

    alt_titles = set(data.get('alternate_titles', []))

    for version in data['versions']:
        # push versions to oyster
        if settings.ENABLE_OYSTER and 'url' in version:
            oysterize_version(data, version)

        # Merge any version titles into the alternate_titles list
        for title_key in ('title', '+short_title'):
            if title_key in version:
                alt_titles.add(version[title_key])
    try:
        # Make sure the primary title isn't included in the
        # alternate title list
        alt_titles.remove(data['title'])
    except KeyError:
        # primary title not among the alternates (or no 'title' key at all)
        pass
    data['alternate_titles'] = list(alt_titles)

    if bill:
        update(bill, data, db.bills)
        return "update"

    insert_with_id(data)
    return "insert"