def import_speeches(abbr, data_dir):
    """Import scraped speech JSON files for ``abbr`` from ``data_dir``.

    Two passes over the same glob: the first collects every
    (chamber, record_id) pair per session so stale speeches can be
    cleared, the second actually imports each file.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'speeches', '*.json')
    # session -> set of (chamber, record_id) seen in the scraped files
    speech_record_ids = defaultdict(set)
    for path in glob.iglob(pattern):
        # OK, We need to first go through all the JSON and load the document
        # IDs to clear out.
        with open(path) as f:
            data = prepare_obj(json.load(f))
        session = data['session']
        chamber = data['chamber']
        speech_record_ids[session].add((chamber, data['record_id']))
    for session in speech_record_ids:
        for obj in speech_record_ids[session]:
            chamber, record = obj
            # XXX: Should we really be clearing them all up front? Should
            # we clear as we process each record block? Is it OK to
            # store everything in memory? (there's a lot)
            #
            # this will result in broken data if the import breaks
            # below.
            # -- PRT
            clear_old_speeches(abbr, chamber, session, record)
    for path in glob.iglob(pattern):
        # OK, now we need to import all the JSON. We don't keep the objects
        # from above, since that'd really dent memory, and a few more ms on
        # import isn't the end of the world.
        with open(path) as f:
            data = prepare_obj(json.load(f))
        import_speech(data)
def import_events(state, data_dir, import_actions=True):
    """Import scraped event JSON files for ``state``.

    Each event is matched against the database first by ``_guid`` (when
    the scraper provided one), then by (when, end, type, description);
    unmatched events are inserted, matched ones updated in place.
    Note: ``import_actions`` is currently unused (see disabled code below).
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'events', '*.json')
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))
        event = None
        # prefer the scraper-provided GUID as the dedup key
        if '_guid' in data:
            event = db.events.find_one({'state': data['state'],
                                        '_guid': data['_guid']})
        # fall back to matching on the event's descriptive fields
        if not event:
            event = db.events.find_one({'state': data['state'],
                                        'when': data['when'],
                                        'end': data['end'],
                                        'type': data['type'],
                                        'description': data['description']})
        if not event:
            data['created_at'] = datetime.datetime.utcnow()
            data['updated_at'] = data['created_at']
            _insert_with_id(data)
        else:
            update(event, data, db.events)
    # if import_actions:
    #     actions_to_events(state)
    ensure_indexes()
def import_bills(abbr, data_dir):
    """Import scraped bill JSON for ``abbr`` under a git checkpoint.

    Loads standalone votes first so ``import_bill`` can attach them,
    then imports every ``bills/*.json`` file.  Returns a dict with
    ``insert``/``update``/``total`` counts.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'bills', '*.json')
    git_prelod(abbr)  # (sic) external helper name; checkpoint before import
    counts = {"update": 0, "insert": 0, "total": 0}
    votes = load_standalone_votes(data_dir)
    try:
        categorizer = SubjectCategorizer(abbr)
    except Exception as e:
        # subject categorization is best-effort; proceed without it
        logger.debug('Proceeding without subject categorizer: %s' % e)
        categorizer = None
    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        counts["total"] += 1
        ret = import_bill(data, votes, categorizer)
        counts[ret] += 1
    logger.info('imported %s bill files' % len(paths))
    # anything left in votes never matched a bill
    for remaining in votes.keys():
        logger.debug('Failed to match vote %s %s %s' % tuple(
            [r.encode('ascii', 'replace') for r in remaining]))
    populate_current_fields(abbr)
    git_commit("Import Update")
    # keep indexes in sync after import, matching the other bill importer
    ensure_indexes()
    return counts
def import_committees(abbr, data_dir):
    """Import scraped committee JSON for one jurisdiction.

    Resets membership on existing committees (it is rebuilt during
    import) and falls back to deriving committees from legislator roles
    when no standalone committee files were scraped.  Returns
    insert/update/total counts.
    """
    pattern = os.path.join(data_dir, abbr, 'committees', '*.json')
    counts = {"update": 0, "insert": 0, "total": 0}

    meta = db.metadata.find_one({'_id': abbr})
    # the last term and its last session are the current ones
    latest_term = meta['terms'][-1]
    current_term = latest_term['name']
    current_session = latest_term['sessions'][-1]

    paths = glob.glob(pattern)

    # membership is rebuilt from scratch on every import
    for committee in db.committees.find({settings.LEVEL_FIELD: abbr}):
        committee['members'] = []
        db.committees.save(committee, safe=True)

    if not paths:
        # import committees from legislator roles, no standalone committees scraped
        import_committees_from_legislators(current_term, abbr)

    for path in paths:
        with open(path) as fp:
            committee_data = prepare_obj(json.load(fp))
        counts["total"] += 1
        counts[import_committee(committee_data, current_session,
                                current_term)] += 1

    logger.info('imported %s committee files' % len(paths))
    link_parents(abbr)
    ensure_indexes()
    return counts
def import_committees(abbr, data_dir):
    """Import scraped committee JSON for one jurisdiction (level-keyed variant).

    Clears committee membership, falls back to legislator roles when no
    standalone committee files exist, then imports each file.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'committees', '*.json')
    meta = db.metadata.find_one({'_id': abbr})
    # last term / last session in metadata are the current ones
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]
    level = meta['level']
    paths = glob.glob(pattern)
    # membership is rebuilt during import, so wipe it first
    for committee in db.committees.find({'level': level, level: abbr}):
        committee['members'] = []
        db.committees.save(committee, safe=True)
    # import committees from legislator roles, no standalone committees scraped
    if not paths:
        import_committees_from_legislators(current_term, level, abbr)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        import_committee(data, current_session, current_term)
    # use the module logger instead of a py2 print statement, matching the
    # logging convention of the other importer functions
    logger.info('imported %s committee files' % len(paths))
    link_parents(level, abbr)
    ensure_indexes()
def import_bills(abbr, data_dir):
    """Import scraped bill JSON for *abbr*; returns insert/update/total counts."""
    counts = {"update": 0, "insert": 0, "total": 0}
    bill_dir = os.path.join(data_dir, abbr)
    votes = import_votes(bill_dir)
    try:
        categorizer = SubjectCategorizer(abbr)
    except Exception as e:
        # categorization is optional; log and continue without it
        logger.debug("Proceeding without subject categorizer: %s" % e)
        categorizer = None
    bill_paths = glob.glob(os.path.join(bill_dir, "bills", "*.json"))
    for bill_path in bill_paths:
        with open(bill_path) as fp:
            bill_data = prepare_obj(json.load(fp))
        counts["total"] += 1
        counts[import_bill(bill_data, votes, categorizer)] += 1
    logger.info("imported %s bill files" % len(bill_paths))
    # whatever remains in `votes` never matched any bill
    for remaining in votes.keys():
        logger.debug("Failed to match vote %s %s %s" % tuple(
            [r.encode("ascii", "replace") for r in remaining]))
    meta = db.metadata.find_one({"_id": abbr})
    populate_current_fields(meta["level"], abbr)
    ensure_indexes()
    return counts
def import_bills(abbr, data_dir):
    """Import bill JSON files for one jurisdiction and return counts."""
    data_dir = os.path.join(data_dir, abbr)
    counts = dict(update=0, insert=0, total=0)
    votes = import_votes(data_dir)
    try:
        categorizer = SubjectCategorizer(abbr)
    except Exception as e:
        logger.debug('Proceeding without subject categorizer: %s' % e)
        categorizer = None
    paths = glob.glob(os.path.join(data_dir, 'bills', '*.json'))
    for path in paths:
        with open(path) as f:
            obj = prepare_obj(json.load(f))
        counts['total'] += 1
        result = import_bill(obj, votes, categorizer)
        counts[result] += 1
    logger.info('imported %s bill files' % len(paths))
    # report standalone votes that never matched a bill
    for key in votes.keys():
        logger.debug('Failed to match vote %s %s %s' %
                     tuple([part.encode('ascii', 'replace') for part in key]))
    meta = db.metadata.find_one({'_id': abbr})
    level = meta['level']
    populate_current_fields(level, abbr)
    ensure_indexes()
    return counts
def import_events(abbr, data_dir, import_actions=False):
    """Import scraped event JSON files for ``abbr``.

    Resolves event participants (committees/legislators) to database
    ids and links related bills to their database ``_id`` before
    handing each event to ``import_event``.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'events', '*.json')
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # closures over `data`: resolve a participant to a committee id
        def _resolve_ctty(committee):
            return get_committee_id(data[settings.LEVEL_FIELD],
                                    committee['chamber'],
                                    committee['participant'])

        # ... or to a legislator id (chamber only passed if upper/lower)
        def _resolve_leg(leg):
            chamber = leg['chamber'] if leg['chamber'] in ['upper', 'lower'] \
                else None
            return get_legislator_id(abbr, data['session'], chamber,
                                     leg['participant'])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}
        for entity in data['participants']:
            # NOTE: `type`/`id` shadow builtins; kept as-is (doc-only change)
            type = entity['participant_type']
            id = None
            if type in resolvers:
                id = resolvers[type](entity)
            else:
                logger.warning("I don't know how to resolve a %s" % type)
            entity['id'] = id

        for bill in data['related_bills']:
            # keep the raw scraped id before normalizing
            bill['_scraped_bill_id'] = bill['bill_id']
            bill_id = bill['bill_id']
            bill_id = fix_bill_id(bill_id)
            bill['bill_id'] = ""
            db_bill = db.bills.find_one({
                "$or": [{
                    settings.LEVEL_FIELD: abbr,
                    'session': data['session'],
                    'bill_id': bill_id
                }, {
                    settings.LEVEL_FIELD: abbr,
                    'session': data['session'],
                    'alternate_bill_ids': bill_id
                }]
            })
            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {}
                db_bill['_id'] = None
            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            bill['bill_id'] = db_bill['_id']
        import_event(data)
    ensure_indexes()
def import_legislator(data):
    """Insert or update a single scraped legislator record.

    Matches against existing legislators on scraped name plus a role
    spec spanning the scraped term and its neighbors; on a term change
    the superseded roles are shelved into ``old_roles``.
    Returns "update" or "insert".
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']
    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')
        # copy over country and/or state into role
        # TODO: base this on all possible level fields
        role['level'] = data['level']
        if 'country' in data:
            role['country'] = data['country']
        if 'state' in data:
            role['state'] = data['state']
    # roles[0] is the legislator's current role
    cur_role = data['roles'][0]
    term = cur_role['term']
    level = data['level']
    abbrev = data[level]
    prev_term = get_previous_term(abbrev, term)
    next_term = get_next_term(abbrev, term)
    # match on the scraped term or an adjacent one
    spec = {level: abbrev,
            'type': cur_role['type'],
            'term': {'$in': [term, prev_term, next_term]}}
    if 'district' in cur_role:
        spec['district'] = cur_role['district']
    if 'chamber' in cur_role:
        spec['chamber'] = cur_role['chamber']
    leg = db.legislators.find_one(
        {'level': level, level: abbrev,
         '_scraped_name': data['full_name'],
         'roles': {'$elemMatch': spec}})
    if leg:
        if 'old_roles' not in leg:
            leg['old_roles'] = {}
        if leg['roles'][0]['term'] == prev_term:
            # Move to old
            leg['old_roles'][leg['roles'][0]['term']] = leg['roles']
        elif leg['roles'][0]['term'] == next_term:
            # scraped data is older than what's stored; keep stored roles
            leg['old_roles'][term] = data['roles']
            data['roles'] = leg['roles']
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
def import_events(abbr, data_dir, import_actions=False):
    """Load every scraped event JSON file for *abbr* and import it."""
    event_glob = os.path.join(data_dir, abbr, 'events', '*.json')
    for filename in glob.iglob(event_glob):
        with open(filename) as fp:
            import_event(prepare_obj(json.load(fp)))
    ensure_indexes()
def import_legislator(data):
    """Insert or update one scraped legislator record (level-aware variant).

    Like the other variants, matches on scraped name plus a role spec
    over the scraped term and its neighbors, shelving superseded roles
    into ``old_roles``.  Returns nothing.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']
    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role['role']
            del role['role']
        # copy over country and/or state into role
        # TODO: base this on all possible level fields
        role['level'] = data['level']
        if 'country' in data:
            role['country'] = data['country']
        if 'state' in data:
            role['state'] = data['state']
    # roles[0] is the current role
    cur_role = data['roles'][0]
    term = cur_role['term']
    level = data['level']
    abbrev = data[level]
    prev_term = get_previous_term(abbrev, term)
    next_term = get_next_term(abbrev, term)
    spec = {level: abbrev,
            'type': cur_role['type'],
            'term': {'$in': [term, prev_term, next_term]}}
    if 'district' in cur_role:
        spec['district'] = cur_role['district']
    if 'chamber' in cur_role:
        spec['chamber'] = cur_role['chamber']
    leg = db.legislators.find_one(
        {'level': level, level: abbrev,
         '_scraped_name': data['full_name'],
         'roles': {'$elemMatch': spec}})
    if leg:
        if 'old_roles' not in leg:
            leg['old_roles'] = {}
        if leg['roles'][0]['term'] == prev_term:
            # Move to old
            leg['old_roles'][leg['roles'][0]['term']] = leg['roles']
        elif leg['roles'][0]['term'] == next_term:
            # stored roles are newer; shelve the scraped ones instead
            leg['old_roles'][term] = data['roles']
            data['roles'] = leg['roles']
        update(leg, data, db.legislators)
    else:
        insert_with_id(data)
def import_events(abbr, data_dir, import_actions=False):
    """Import scraped event JSON for ``abbr`` (double-quoted variant).

    Resolves participants to committee/legislator ids and links related
    bills to their database ``_id`` before importing each event.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, "events", "*.json")
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # closure over `data`: resolve an event participant to a committee id
        def _resolve_ctty(committee):
            return get_committee_id(data[settings.LEVEL_FIELD],
                                    committee["chamber"],
                                    committee["participant"])

        # ... or to a legislator id (chamber only if upper/lower)
        def _resolve_leg(leg):
            chamber = leg["chamber"] if leg["chamber"] in ["upper", "lower"] else None
            return get_legislator_id(abbr, data["session"], chamber,
                                     leg["participant"])

        resolvers = {"committee": _resolve_ctty, "legislator": _resolve_leg}
        for entity in data["participants"]:
            # NOTE: `type`/`id` shadow builtins; kept as-is (doc-only change)
            type = entity["participant_type"]
            id = None
            if type in resolvers:
                id = resolvers[type](entity)
            else:
                logger.warning("I don't know how to resolve a %s" % type)
            entity["id"] = id

        for bill in data["related_bills"]:
            # preserve the raw scraped id before normalization
            bill["_scraped_bill_id"] = bill["bill_id"]
            bill_id = bill["bill_id"]
            bill_id = fix_bill_id(bill_id)
            bill["bill_id"] = ""
            db_bill = db.bills.find_one(
                {
                    "$or": [
                        {settings.LEVEL_FIELD: abbr, "session": data["session"],
                         "bill_id": bill_id},
                        {settings.LEVEL_FIELD: abbr, "session": data["session"],
                         "alternate_bill_ids": bill_id},
                    ]
                }
            )
            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {}
                db_bill["_id"] = None
            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            bill["bill_id"] = db_bill["_id"]
        import_event(data)
    ensure_indexes()
def import_legislator(data):
    """Insert or update one scraped legislator record (state-keyed variant).

    Matches on scraped full name plus a role spec over the scraped term
    and its neighbors; superseded roles are shelved into ``old_roles``.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']
    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role['role']
            del role['role']
    # roles[0] is the current role
    cur_role = data['roles'][0]
    term = cur_role['term']
    prev_term = get_previous_term(data['state'], term)
    next_term = get_next_term(data['state'], term)
    # match the scraped term or an adjacent one
    spec = {
        'state': data['state'],
        'type': cur_role['type'],
        'term': {
            '$in': [term, prev_term, next_term]
        }
    }
    if 'district' in cur_role:
        spec['district'] = cur_role['district']
    if 'chamber' in cur_role:
        spec['chamber'] = cur_role['chamber']
    leg = db.legislators.find_one({
        'state': data['state'],
        '_scraped_name': data['full_name'],
        'roles': {
            '$elemMatch': spec
        }
    })
    if leg:
        if 'old_roles' not in leg:
            leg['old_roles'] = {}
        if leg['roles'][0]['term'] == prev_term:
            # Move to old
            leg['old_roles'][leg['roles'][0]['term']] = leg['roles']
        elif leg['roles'][0]['term'] == next_term:
            # stored roles are newer; shelve scraped roles instead
            leg['old_roles'][term] = data['roles']
            data['roles'] = leg['roles']
        update(leg, data, db.legislators)
    else:
        insert_with_id(data)
def import_votes(state, data_dir):
    """Import standalone vote JSON files for ``state`` onto their bills.

    Each vote is matched to a bill by (state, bill_chamber, session,
    bill_id); voter names are resolved to legislator ids, and a vote
    with the same motion+date replaces the stored one, otherwise it is
    appended.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id, needs to match the one already in the database
        data['bill_id'] = fix_bill_id(data['bill_id'])

        bill = db.bills.find_one({'state': state,
                                  'chamber': data['bill_chamber'],
                                  'session': data['session'],
                                  'bill_id': data['bill_id']})
        if not bill:
            _log.warning("Couldn't find bill %s" % data['bill_id'])
            continue

        # bill_id/filename are bookkeeping fields, not part of the vote
        del data['bill_id']
        try:
            del data['filename']
        except KeyError:
            pass

        # resolve voter names to legislator ids
        for vtype in ('yes_votes', 'no_votes', 'other_votes'):
            svlist = []
            for svote in data[vtype]:
                # renamed from `id` to avoid shadowing the builtin
                leg_id = get_legislator_id(state, data['session'],
                                           data['chamber'], svote)
                svlist.append({'name': svote, 'leg_id': leg_id})
            data[vtype] = svlist

        # replace an existing vote with the same motion+date, else append
        for vote in bill['votes']:
            if (vote['motion'] == data['motion'] and
                    vote['date'] == data['date']):
                vote.update(data)
                break
        else:
            bill['votes'].append(data)

        db.bills.save(bill, safe=True)

    # use the module's logger (already used for warnings above) instead of
    # a bare py2 print statement
    _log.info('imported %s vote files' % len(paths))
def import_metadata(state, data_dir):
    """Load ``state_metadata.json`` for *state*, keeping preserved fields."""
    existing = db.metadata.find_one({'_id': state}) or {}
    # carry over fields that are maintained by hand, not by scrapers
    preserved = dict((field, existing[field])
                     for field in PRESERVED_FIELDS if field in existing)
    with open(os.path.join(data_dir, state, 'state_metadata.json')) as f:
        data = json.load(f)
    data['_type'] = 'metadata'
    data = prepare_obj(data)
    data['_id'] = state
    data.update(preserved)
    db.metadata.save(data, safe=True)
def import_votes(data_dir):
    """Load standalone vote files, keyed by (bill_chamber, session, bill_id)."""
    paths = glob.glob(os.path.join(data_dir, "votes", "*.json"))
    votes = defaultdict(list)
    for path in paths:
        with open(path) as fp:
            vote = prepare_obj(json.load(fp))
        # normalize the id so the key matches the bill already in the database
        key = (vote["bill_chamber"], vote["session"],
               fix_bill_id(vote.pop("bill_id")))
        votes[key].append(vote)
    logger.info("imported %s vote files" % len(paths))
    return votes
def load_standalone_votes(data_dir):
    """Read standalone votes; map (bill_chamber, session, bill_id) -> votes."""
    votes = defaultdict(list)
    filenames = glob.glob(os.path.join(data_dir, 'votes', '*.json'))
    for filename in filenames:
        with open(filename) as fp:
            vote = prepare_obj(json.load(fp))
        # bill_id must be normalized to match the stored bill
        bill_id = fix_bill_id(vote.pop('bill_id'))
        votes[(vote['bill_chamber'], vote['session'], bill_id)].append(vote)
    logger.info('imported %s vote files' % len(filenames))
    return votes
def import_votes(state, data_dir):
    """Load standalone vote JSON files.

    Returns a dict mapping (bill_chamber, session, bill_id) to a list
    of vote dicts so the caller can attach them to bills.  ``state`` is
    kept for signature compatibility; the path comes from ``data_dir``.
    """
    pattern = os.path.join(data_dir, 'votes', '*.json')
    paths = glob.glob(pattern)
    votes = defaultdict(list)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        # need to match bill_id already in the database
        bill_id = fix_bill_id(data.pop('bill_id'))
        votes[(data['bill_chamber'], data['session'], bill_id)].append(data)
    # use the module logger instead of a py2 print statement, matching the
    # identical vote-loading variants that already log via logger.info
    logger.info('imported %s vote files' % len(paths))
    return votes
def import_events(abbr, data_dir, import_actions=False):
    """Import scraped event JSON for ``abbr`` (committee_id variant).

    Resolves committee participants to database ids and links related
    bills to their stored ``_id`` before importing each event.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'events', '*.json')
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))
        for committee in data['participants']:
            # all participants are treated as committees here
            cttyid = get_committee_id(data['level'],
                                      data['state'],
                                      committee['participant'],
                                      committee['chamber'])
            if cttyid:
                committee['committee_id'] = cttyid
        for bill in data['related_bills']:
            # keep the raw scraped id before normalizing
            bill['_scraped_bill_id'] = bill['bill_id']
            bill_id = bill['bill_id']
            bill_id = fix_bill_id(bill_id)
            bill['bill_id'] = ""
            db_bill = db.bills.find_one({
                "$or": [
                    {
                        "state": abbr,
                        'session': data['session'],
                        'bill_id': bill_id
                    },
                    {
                        "state": abbr,
                        'session': data['session'],
                        'alternate_bill_ids': bill_id
                    }
                ]
            })
            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                db_bill = {}
                db_bill['_id'] = None
            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            bill['bill_id'] = db_bill['_id']
        import_event(data)
    ensure_indexes()
def import_metadata(state, data_dir):
    """Import ``state_metadata.json``, preserving hand-maintained fields."""
    old = db.metadata.find_one({'_id': state}) or {}
    preserved = {}
    for name in PRESERVED_FIELDS:
        if name in old:
            preserved[name] = old[name]
    path = os.path.join(data_dir, state, 'state_metadata.json')
    with open(path) as fp:
        metadata = json.load(fp)
    metadata['_type'] = 'metadata'
    metadata = prepare_obj(metadata)
    metadata['_id'] = state
    # preserved fields win over freshly scraped values
    metadata.update(preserved)
    metadata['latest_update'] = datetime.datetime.utcnow()
    db.metadata.save(metadata, safe=True)
def import_metadata(abbr, data_dir):
    """Import ``metadata.json`` for *abbr*, keeping preserved fields."""
    previous = db.metadata.find_one({'_id': abbr}) or {}
    # fields maintained by hand survive the re-import
    preserved = dict((field, previous[field])
                     for field in PRESERVED_FIELDS if field in previous)
    with open(os.path.join(data_dir, abbr, 'metadata.json')) as f:
        data = json.load(f)
    data['_type'] = 'metadata'
    data = prepare_obj(data)
    data['_id'] = abbr
    data.update(preserved)
    data['latest_update'] = datetime.datetime.utcnow()
    db.metadata.save(data, safe=True)
def import_legislator(data):
    """Insert or update one scraped legislator record (compact state variant).

    Identical matching strategy to the other variants: scraped name plus
    a role spec over the scraped term and its neighbors; superseded
    roles move into ``old_roles``.
    """
    data = prepare_obj(data)
    data['_scraped_name'] = data['full_name']
    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role['role']
            del role['role']
    # roles[0] is the current role
    cur_role = data['roles'][0]
    term = cur_role['term']
    prev_term = get_previous_term(data['state'], term)
    next_term = get_next_term(data['state'], term)
    spec = {'state': data['state'],
            'type': cur_role['type'],
            'term': {'$in': [term, prev_term, next_term]}}
    if 'district' in cur_role:
        spec['district'] = cur_role['district']
    if 'chamber' in cur_role:
        spec['chamber'] = cur_role['chamber']
    leg = db.legislators.find_one(
        {'state': data['state'],
         '_scraped_name': data['full_name'],
         'roles': {'$elemMatch': spec}})
    if leg:
        if 'old_roles' not in leg:
            leg['old_roles'] = {}
        if leg['roles'][0]['term'] == prev_term:
            # Move to old
            leg['old_roles'][leg['roles'][0]['term']] = leg['roles']
        elif leg['roles'][0]['term'] == next_term:
            # stored roles are newer; shelve the scraped ones
            leg['old_roles'][term] = data['roles']
            data['roles'] = leg['roles']
        update(leg, data, db.legislators)
    else:
        insert_with_id(data)
def import_bills(abbr, data_dir):
    """Import bill JSON under a git checkpoint; returns count summary."""
    data_dir = os.path.join(data_dir, abbr)
    git_prelod(abbr)
    counts = {"update": 0, "insert": 0, "total": 0}
    votes = load_standalone_votes(data_dir)
    try:
        categorizer = SubjectCategorizer(abbr)
    except Exception as e:
        # best-effort: carry on without subject categorization
        logger.debug('Proceeding without subject categorizer: %s' % e)
        categorizer = None
    bill_files = glob.glob(os.path.join(data_dir, 'bills', '*.json'))
    for filename in bill_files:
        with open(filename) as fh:
            bill = prepare_obj(json.load(fh))
        counts["total"] += 1
        counts[import_bill(bill, votes, categorizer)] += 1
    logger.info('imported %s bill files' % len(bill_files))
    # anything still in `votes` never found its bill
    for unmatched in votes.keys():
        logger.debug('Failed to match vote %s %s %s' % tuple(
            [part.encode('ascii', 'replace') for part in unmatched]))
    populate_current_fields(abbr)
    git_commit("Import Update")
    ensure_indexes()
    return counts
def import_committees(abbr, data_dir):
    """Import scraped committee JSON for ``abbr``.

    Clears committee membership (rebuilt during import), falls back to
    deriving committees from legislator roles when no standalone
    committee files were scraped, and returns insert/update/total counts.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'committees', '*.json')
    counts = {
        "update": 0,
        "insert": 0,
        "total": 0
    }
    meta = db.metadata.find_one({'_id': abbr})
    # the last term and its last session are the current ones
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]
    paths = glob.glob(pattern)
    # membership is rebuilt from scratch on every import
    for committee in db.committees.find({settings.LEVEL_FIELD: abbr}):
        committee['members'] = []
        db.committees.save(committee, safe=True)
    # import committees from legislator roles, no standalone committees scraped
    if not paths:
        import_committees_from_legislators(current_term, abbr)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        counts["total"] += 1
        ret = import_committee(data, current_session, current_term)
        counts[ret] += 1
    logger.info('imported %s committee files' % len(paths))
    link_parents(abbr)
    ensure_indexes()
    return counts
def import_bills(abbr, data_dir):
    """Import scraped bill JSON for ``abbr`` (early minimal variant)."""
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'bills', '*.json')
    votes = import_votes(data_dir)
    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        import_bill(data, votes)
    # log via the module logger rather than py2 print statements, matching
    # the logging style of the other bill importers
    logger.info('imported %s bill files' % len(paths))
    # anything still in `votes` never matched a bill
    for remaining in votes.keys():
        logger.debug('Failed to match vote %s %s %s' % tuple([
            r.encode('ascii', 'replace') for r in remaining]))
    meta = db.metadata.find_one({'_id': abbr})
    level = meta['level']
    populate_current_fields(level, abbr)
    ensure_indexes()
def import_bills(abbr, data_dir):
    """Import scraped bill JSON files for one jurisdiction."""
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'bills', '*.json')
    votes = import_votes(data_dir)
    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        import_bill(data, votes)
    # replaced py2 print statements with the module logger used by the
    # other importer functions
    logger.info('imported %s bill files' % len(paths))
    # report standalone votes that never matched a bill
    for remaining in votes.keys():
        logger.debug('Failed to match vote %s %s %s' % tuple(
            [r.encode('ascii', 'replace') for r in remaining]))
    meta = db.metadata.find_one({'_id': abbr})
    level = meta['level']
    populate_current_fields(level, abbr)
    ensure_indexes()
def import_bills(state, data_dir):
    """Import scraped bill JSON for ``state`` (full inline variant).

    Matches standalone votes onto bills, resolves sponsor and voter
    names to legislator ids, derives the term from the session, and
    merges version titles into ``alternate_titles``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "bills", "*.json")

    meta = db.metadata.find_one({"_id": state})
    # Build a session to term mapping
    sessions = {}
    for term in meta["terms"]:
        for session in term["sessions"]:
            sessions[session] = term["name"]

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data["bill_id"] = fix_bill_id(data["bill_id"])

        # move subjects to scraped_subjects
        subjects = data.pop("subjects", None)

        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data["scraped_subjects"] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data["chamber"], data["session"],
                                data["bill_id"]), [])
        data["votes"].extend(bill_votes)

        bill = db.bills.find_one(
            {"state": data["state"], "session": data["session"],
             "chamber": data["chamber"], "bill_id": data["bill_id"]}
        )

        # keep vote ids stable across re-imports
        vote_matcher = VoteMatcher(data["state"])
        if bill:
            vote_matcher.learn_vote_ids(bill["votes"])
        vote_matcher.set_vote_ids(data["votes"])

        # match sponsor leg_ids
        for sponsor in data["sponsors"]:
            # NOTE: `id` shadows the builtin; kept as-is (doc-only change)
            id = get_legislator_id(state, data["session"], None,
                                   sponsor["name"])
            sponsor["leg_id"] = id

        for vote in data["votes"]:
            # committee_ids
            if "committee" in vote:
                committee_id = get_committee_id(state, vote["chamber"],
                                                vote["committee"])
                vote["committee_id"] = committee_id
            # vote leg_ids
            for vtype in ("yes_votes", "no_votes", "other_votes"):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data["session"],
                                           vote["chamber"], svote)
                    svlist.append({"name": svote, "leg_id": id})
                vote[vtype] = svlist

        data["_term"] = sessions[data["session"]]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get("alternate_titles", []))
        for version in data["versions"]:
            if "title" in version:
                alt_titles.add(version["title"])
            if "+short_title" in version:
                alt_titles.add(version["+short_title"])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data["title"])
        except KeyError:
            pass
        data["alternate_titles"] = list(alt_titles)

        if not bill:
            data["_keywords"] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data["_keywords"] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print "imported %s bill files" % len(paths)
    # anything still in `votes` never matched a bill
    for remaining in votes.keys():
        print "Failed to match vote %s %s %s" % tuple(
            [r.encode("ascii", "replace") for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
def import_bills(state, data_dir):
    """Import scraped bill JSON for ``state`` (single-quoted inline variant).

    Matches standalone votes onto bills, resolves sponsor and voter
    names to legislator ids, derives the term from the session, and
    merges version titles into ``alternate_titles``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})
    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop(
            (data['chamber'], data['session'], data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({
            'state': data['state'],
            'session': data['session'],
            'chamber': data['chamber'],
            'bill_id': data['bill_id']
        })

        # keep vote ids stable across re-imports
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            # NOTE: `id` shadows the builtin; kept as-is (doc-only change)
            id = get_legislator_id(state, data['session'], None,
                                   sponsor['name'])
            sponsor['leg_id'] = id

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                committee_id = get_committee_id(state, vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id
            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    id = get_legislator_id(state, data['session'],
                                           vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': id})
                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    print 'imported %s bill files' % len(paths)
    # anything still in `votes` never matched a bill
    for remaining in votes.keys():
        print 'Failed to match vote %s %s %s' % tuple(
            [r.encode('ascii', 'replace') for r in remaining])

    populate_current_fields(state)
    ensure_indexes()
def import_bills(state, data_dir):
    """Import scraped bill JSON for ``state``.

    Matches standalone votes onto bills, resolves sponsor and voter
    names to legislator ids, derives the term from the session, and
    merges version titles into ``alternate_titles``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})
    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    votes = import_votes(state, data_dir)

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)
        # NOTE: intentionally doesn't copy blank lists of subjects
        # this avoids the problem where a bill is re-run but we can't
        # get subjects anymore (quite common in fact)
        if subjects:
            data['scraped_subjects'] = subjects

        # add loaded votes to data
        bill_votes = votes.pop((data['chamber'], data['session'],
                                data['bill_id']), [])
        data['votes'].extend(bill_votes)

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        # keep vote ids stable across re-imports
        vote_matcher = VoteMatcher(data['state'])
        if bill:
            vote_matcher.learn_vote_ids(bill['votes'])
        vote_matcher.set_vote_ids(data['votes'])

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            leg_id = get_legislator_id(state, data['session'], None,
                                       sponsor['name'])
            sponsor['leg_id'] = leg_id

        for vote in data['votes']:
            # committee_ids
            if 'committee' in vote:
                committee_id = get_committee_id(state, vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id
            # vote leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': leg_id})
                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    # log via the module logger instead of py2 print statements, matching
    # the other importers' logging convention
    logger.info('imported %s bill files' % len(paths))
    for remaining in votes.keys():
        logger.debug('Failed to match vote %s %s %s' % tuple([
            r.encode('ascii', 'replace') for r in remaining]))

    populate_current_fields(state)
    ensure_indexes()
def import_bills(state, data_dir):
    """Import scraped bill JSON for ``state`` (no standalone-vote matching).

    Resolves sponsor and voter names to legislator ids, derives the term
    from the session, and merges version titles into ``alternate_titles``.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'bills', '*.json')

    meta = db.metadata.find_one({'_id': state})
    # Build a session to term mapping
    sessions = {}
    for term in meta['terms']:
        for session in term['sessions']:
            sessions[session] = term['name']

    paths = glob.glob(pattern)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        # clean up bill_id
        data['bill_id'] = fix_bill_id(data['bill_id'])

        # move subjects to scraped_subjects
        subjects = data.pop('subjects', None)
        if subjects:
            data['scraped_subjects'] = subjects

        bill = db.bills.find_one({'state': data['state'],
                                  'session': data['session'],
                                  'chamber': data['chamber'],
                                  'bill_id': data['bill_id']})

        # match sponsor leg_ids
        for sponsor in data['sponsors']:
            leg_id = get_legislator_id(state, data['session'], None,
                                       sponsor['name'])
            sponsor['leg_id'] = leg_id

        for vote in data['votes']:
            # committee ids
            if 'committee' in vote:
                committee_id = get_committee_id(state, vote['chamber'],
                                                vote['committee'])
                vote['committee_id'] = committee_id
            # voter leg_ids
            for vtype in ('yes_votes', 'no_votes', 'other_votes'):
                svlist = []
                for svote in vote[vtype]:
                    leg_id = get_legislator_id(state, data['session'],
                                               vote['chamber'], svote)
                    svlist.append({'name': svote, 'leg_id': leg_id})
                vote[vtype] = svlist

        data['_term'] = sessions[data['session']]

        # Merge any version titles into the alternate_titles list
        alt_titles = set(data.get('alternate_titles', []))
        for version in data['versions']:
            if 'title' in version:
                alt_titles.add(version['title'])
            if '+short_title' in version:
                alt_titles.add(version['+short_title'])
        try:
            # Make sure the primary title isn't included in the
            # alternate title list
            alt_titles.remove(data['title'])
        except KeyError:
            pass
        data['alternate_titles'] = list(alt_titles)

        if not bill:
            data['_keywords'] = list(bill_keywords(data))
            insert_with_id(data)
        else:
            data['_keywords'] = list(bill_keywords(data))
            update(bill, data, db.bills)

    # use the module logger instead of a py2 print statement
    logger.info('imported %s bill files' % len(paths))

    populate_current_fields(state)
    ensure_indexes()
def import_legislator(data):
    """Import one scraped legislator record, merging with any existing DB
    record for the same person.

    Matching is attempted three times against ``db.legislators`` using the
    scraped name plus a role spec (type/term and, when present,
    district/chamber):
      1. same term in current ``roles``,
      2. same term found in ``old_roles``,
      3. any term in current ``roles`` (term dropped from the spec).

    Returns the string "update" if an existing record was updated,
    "insert" if a new one was created.
    """
    data = prepare_obj(data)
    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')
        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    # The first scraped role drives the matching spec.
    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']
    abbr = data[settings.LEVEL_FIELD]

    spec = {
        settings.LEVEL_FIELD: abbr,
        'type': scraped_role['type'],
        'term': scraped_term
    }
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one({
        settings.LEVEL_FIELD: abbr,
        '_scraped_name': data['_scraped_name'],
        'roles': {
            '$elemMatch': spec
        }
    })

    # legislator with a matching old_role
    if not leg:
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {
                '$elemMatch': spec
            }
        })
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        # 'term' may already have been popped by the branch above.
        spec.pop('term', None)
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'roles': {
                '$elemMatch': spec
            }
        })
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                # scraped data is newer: demote the DB record's roles instead.
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)

    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"
def import_events(abbr, data_dir, import_actions=False):
    """Import scraped event JSON files for ``abbr`` into the events store.

    For each ``events/*.json`` file: resolves each participant to a
    committee or legislator id (None when unresolvable), links each
    related bill to its database ``_id`` (None when the bill can't be
    found), then hands the record to ``import_event``.

    ``import_actions`` is accepted for interface compatibility; it is not
    used here.
    """
    data_dir = os.path.join(data_dir, abbr)
    pattern = os.path.join(data_dir, 'events', '*.json')
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))

        def _resolve_ctty(committee):
            # Committee participants resolve via (level, chamber, name).
            return get_committee_id(data[settings.LEVEL_FIELD],
                                    committee['chamber'],
                                    committee['participant'])

        def _resolve_leg(leg):
            # Only 'upper'/'lower' are real chambers; anything else is
            # treated as chamber-unknown.
            chamber = leg['chamber'] if leg['chamber'] in ['upper', 'lower'] \
                else None
            return get_legislator_id(abbr, data['session'], chamber,
                                     leg['participant'])

        resolvers = {
            "committee": _resolve_ctty,
            "legislator": _resolve_leg
        }

        for entity in data['participants']:
            # Renamed from 'type'/'id' to avoid shadowing builtins.
            participant_type = entity['participant_type']
            entity_id = None
            if participant_type in resolvers:
                entity_id = resolvers[participant_type](entity)
            else:
                logger.warning("I don't know how to resolve a %s" %
                               participant_type)
            entity['id'] = entity_id

        for bill in data['related_bills']:
            bill_id = fix_bill_id(bill['bill_id'])
            db_bill = db.bills.find_one({
                "$or": [
                    {
                        settings.LEVEL_FIELD: abbr,
                        'session': data['session'],
                        'bill_id': bill_id
                    },
                    {
                        settings.LEVEL_FIELD: abbr,
                        'session': data['session'],
                        'alternate_bill_ids': bill_id
                    }
                ]
            })

            if not db_bill:
                logger.warning("Error: Can't find %s" % bill_id)
                # Stub so the link below still succeeds with a null id.
                db_bill = {'_id': None}

            # Events are really hard to pin to a chamber. Some of these are
            # also a committee considering a bill from the other chamber, or
            # something like that.
            bill['id'] = db_bill['_id']
        import_event(data)
    ensure_indexes()
def import_committees(state, data_dir):
    """Import committees for ``state``.

    Two modes:
      * No standalone ``committees/*.json`` files: committees are derived
        from 'committee member' roles on current-term legislators,
        creating committee records on demand.
      * Standalone files present: each file is inserted/updated, its
        members are resolved to legislator ids, and a matching
        'committee member' role is appended to each legislator that
        lacks one.

    In both modes, every existing committee's member list is cleared
    first, so membership reflects only this import run.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')

    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]

    paths = glob.glob(pattern)

    # Wipe all member lists up front; they are rebuilt below.
    for committee in db.committees.find({'state': state}):
        committee['members'] = []
        db.committees.save(committee)

    if not paths:
        # Not standalone committees
        for legislator in db.legislators.find({
            'roles': {'$elemMatch': {'term': current_term,
                                     'state': state}}}):
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    'committee_id' not in role):
                    spec = {'state': role['state'],
                            'chamber': role['chamber'],
                            'committee': role['committee']}
                    if 'subcommittee' in role:
                        spec['subcommittee'] = role['subcommittee']

                    committee = db.committees.find_one(spec)

                    if not committee:
                        # Create a minimal committee record from the spec.
                        committee = spec
                        committee['_type'] = 'committee'
                        committee['members'] = []
                        committee['sources'] = []
                        if 'subcommittee' not in committee:
                            committee['subcommittee'] = None
                        insert_with_id(committee)

                    # for/else: append only if this legislator isn't
                    # already a member.
                    for member in committee['members']:
                        if member['leg_id'] == legislator['leg_id']:
                            break
                    else:
                        committee['members'].append(
                            {'name': legislator['full_name'],
                             'leg_id': legislator['leg_id'],
                             'role': role.get('position') or 'member'})
                        db.committees.save(committee, safe=True)

                    # Back-link the role to the committee record.
                    role['committee_id'] = committee['_id']

            db.legislators.save(legislator, safe=True)

    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))

        spec = {'state': state,
                'chamber': data['chamber'],
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']

        committee = db.committees.find_one(spec)

        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)

        for member in committee['members']:
            if not member['name']:
                continue

            leg_id = get_legislator_id(state, current_session,
                                       data['chamber'], member['name'])

            if not leg_id:
                print "No matches for %s" % member['name'].encode(
                    'ascii', 'ignore')
                member['leg_id'] = None
                continue

            legislator = db.legislators.find_one({'_id': leg_id})

            member['leg_id'] = leg_id

            # for/else: add a 'committee member' role only when the
            # legislator doesn't already have one for this committee/term.
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    role['term'] == current_term and
                    role['committee_id'] == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}

                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
                legislator['updated_at'] = datetime.datetime.utcnow()
                db.legislators.save(legislator, safe=True)

        db.committees.save(committee, safe=True)

    print 'imported %s committee files' % len(paths)

    link_parents(state)
    ensure_indexes()
def import_legislator(data):
    """Import a single scraped legislator, reconciling against the DB.

    Tries to find the same person in ``db.legislators`` by scraped name
    plus role spec, in order of preference: current-term role match,
    ``old_roles`` match for the scraped term, then any-term role match.
    When matched across terms, the older set of roles is shelved into
    ``old_roles`` so current roles are never clobbered by stale data.

    Returns "update" when an existing record was merged, otherwise
    "insert".

    NOTE: a near-identical copy of this function exists earlier in this
    file; presumably one supersedes the other — confirm before editing.
    """
    data = prepare_obj(data)
    if data.get('_scraped_name') is None:
        data['_scraped_name'] = data['full_name']

    # Rename 'role' -> 'type'
    for role in data['roles']:
        if 'role' in role:
            role['type'] = role.pop('role')
        # copy over LEVEL_FIELD into role
        if settings.LEVEL_FIELD in data:
            role[settings.LEVEL_FIELD] = data[settings.LEVEL_FIELD]

    # First scraped role is the representative one used for matching.
    scraped_role = data['roles'][0]
    scraped_term = scraped_role['term']
    abbr = data[settings.LEVEL_FIELD]

    spec = {settings.LEVEL_FIELD: abbr,
            'type': scraped_role['type'],
            'term': scraped_term}
    if 'district' in scraped_role:
        spec['district'] = scraped_role['district']
    if 'chamber' in scraped_role:
        spec['chamber'] = scraped_role['chamber']

    # find matching legislator in current term
    leg = db.legislators.find_one(
        {settings.LEVEL_FIELD: abbr,
         '_scraped_name': data['_scraped_name'],
         'roles': {'$elemMatch': spec}})

    # legislator with a matching old_role
    if not leg:
        spec.pop('term')
        leg = db.legislators.find_one({
            settings.LEVEL_FIELD: abbr,
            '_scraped_name': data['_scraped_name'],
            'old_roles.%s' % scraped_term: {'$elemMatch': spec}
        })
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # put scraped roles into their old_roles
            data['old_roles'][scraped_term] = data['roles']
            data['roles'] = leg['roles']  # don't overwrite their current roles

    # active matching legislator from different term
    if not leg:
        # term may already be gone from spec; pop defensively.
        spec.pop('term', None)
        leg = db.legislators.find_one(
            {settings.LEVEL_FIELD: abbr,
             '_scraped_name': data['_scraped_name'],
             'roles': {'$elemMatch': spec}})
        if leg:
            if 'old_roles' not in data:
                data['old_roles'] = leg.get('old_roles', {})
            # scraped_term < leg's term
            if term_older_than(abbr, scraped_term, leg['roles'][0]['term']):
                # move scraped roles into old_roles
                data['old_roles'][scraped_term] = data['roles']
                data['roles'] = leg['roles']
            else:
                # scraped data is newer: shelve the DB roles instead.
                data['old_roles'][leg['roles'][0]['term']] = leg['roles']

    data = apply_filters(filters, data)

    if leg:
        update(leg, data, db.legislators)
        return "update"
    else:
        insert_with_id(data)
        return "insert"