def summarize_person(self, person):
    """Fold one person record into the running summary counters.

    Updates ``person_count``, ``optional_fields``, ``extra_counts``,
    ``active_legislators``, ``parties``, ``contact_counts``, ``id_counts``
    and ``duplicate_values`` on ``self``.

    Args:
        person: Mapping describing a person. The ``extras``, ``roles``,
            ``party``, ``contact_details``, ``ids`` and
            ``other_identifiers`` keys are all optional.
    """
    self.person_count += 1
    self.optional_fields.update(set(person.keys()) & self.OPTIONAL_FIELD_SET)
    self.extra_counts.update(person.get("extras", {}).keys())

    # The first active role decides the seat; with no active role the
    # person is filed under the (None, None) seat.
    role_type = district = None
    for candidate in person.get("roles", []):
        if not role_is_active(candidate):
            continue
        role_type, district = candidate["type"], candidate.get("district")
        break
    self.active_legislators[role_type][district].append(person)

    for membership in person.get("party", []):
        if role_is_active(membership):
            self.parties[membership["name"]] += 1

    # Contact values are deliberately not recorded in duplicate_values:
    # that check was too aggressive, since legislators apparently share
    # phone numbers and addresses in plenty of valid cases.
    for detail in person.get("contact_details", []):
        for field in detail:
            if field == "note":
                continue
            self.contact_counts[field] += 1

    for scheme, value in person.get("ids", {}).items():
        self.id_counts[scheme] += 1
        self.duplicate_values[scheme][value].append(person)

    for other in person.get("other_identifiers", []):
        scheme = other["scheme"]
        self.id_counts[scheme] += 1
        self.duplicate_values[scheme][other["identifier"]].append(person)
def summarize_person(self, person):
    """Fold one person record into the running summary counters.

    Updates ``person_count``, ``optional_fields``, ``extra_counts``,
    ``active_legislators``, ``parties``, ``contact_counts`` and
    ``id_counts`` on ``self``.

    Args:
        person: Mapping describing a person. The ``extras``, ``roles``,
            ``party``, ``contact_details``, ``ids`` and
            ``other_identifiers`` keys are all optional.
    """
    self.person_count += 1
    self.optional_fields.update(set(person.keys()) & self.OPTIONAL_FIELD_SET)
    self.extra_counts.update(person.get('extras', {}).keys())

    # The first active role decides the seat; with no active role the
    # person is filed under the (None, None) seat.
    role_type = district = None
    for candidate in person.get('roles', []):
        if not role_is_active(candidate):
            continue
        role_type, district = candidate['type'], candidate.get('district')
        break
    self.active_legislators[role_type][district].append(person)

    for membership in person.get('party', []):
        if role_is_active(membership):
            self.parties[membership['name']] += 1

    # Count every contact field except the free-text note.
    for detail in person.get('contact_details', []):
        for field in detail:
            if field == 'note':
                continue
            self.contact_counts[field] += 1

    for scheme in person.get('ids', {}):
        self.id_counts[scheme] += 1

    for other in person.get('other_identifiers', []):
        self.id_counts[other['scheme']] += 1
def summarize(self, person):
    """Fold one person record into the running summary counters.

    Updates ``person_count``, ``optional_fields``, ``extra_counts``,
    ``active_legislators``, ``parties``, ``contact_counts`` and
    ``id_counts`` on ``self``.

    Args:
        person: Mapping describing a person. The ``extras``, ``roles``,
            ``party``, ``contact_details``, ``ids`` and
            ``other_identifiers`` keys are all optional.

    Raises:
        KeyError: If a contact detail has a non-note field but no
            ``note`` key, because the note is part of the counter key.
    """
    self.person_count += 1
    self.optional_fields.update(set(person.keys()) & OPTIONAL_FIELD_SET)
    self.extra_counts.update(person.get("extras", {}).keys())

    # The first active role decides the seat.
    role_type = district = None
    for candidate in person.get("roles", []):
        if not role_is_active(candidate):
            continue
        role_type, district = candidate["type"], candidate.get("district")
        break
    # People without an active (truthy-typed) role are not filed anywhere.
    if role_type:
        self.active_legislators[role_type][district].append(person)

    for membership in person.get("party", []):
        if role_is_active(membership):
            self.parties[membership["name"]] += 1

    # Contact fields are counted per office: the key is "<note> <field>".
    for detail in person.get("contact_details", []):
        for field in detail:
            if field == "note":
                continue
            self.contact_counts[detail["note"] + " " + field] += 1

    for scheme in person.get("ids", {}):
        self.id_counts[scheme] += 1

    # The project's own identifier schemes are left out of the counts.
    for other in person.get("other_identifiers", []):
        if other["scheme"] in ("openstates", "legacy_openstates"):
            continue
        self.id_counts[other["scheme"]] += 1
def validate_person(self, person, filename, person_type, date=None):
    """Validate one person record and store the findings under ``filename``.

    Errors are written to ``self.errors[filename]`` (replacing any earlier
    entry) and warnings are appended to ``self.warnings[filename]``. The
    person's identifiers are recorded in ``self.duplicate_values`` and, for
    legislative people, the seat is recorded in ``self.active_legislators``.

    Args:
        person: Mapping describing the person.
        filename: Name of the file the person was loaded from.
        person_type: A ``PersonType`` member.
        date: Optional date forwarded to the role-activity checks.
    """
    self.errors[filename] = validate_obj(person, PERSON_FIELDS)
    errors = self.errors[filename]

    uid = person["id"].split("/")[1]
    if uid not in filename:
        errors.append(f"id piece {uid} not in filename")

    errors.extend(validate_jurisdictions(person, self.municipalities))
    errors.extend(
        validate_roles(person, "roles", person_type == PersonType.RETIRED, date=date)
    )
    if person_type in (PersonType.LEGISLATIVE, PersonType.EXECUTIVE):
        errors.extend(validate_roles(person, "party"))
    # NOTE(review): office validation is assumed to run for every person
    # type rather than only inside the branch above -- confirm.
    errors.extend(validate_offices(person))

    # active party validation
    active_parties = []
    for party in person.get("party", []):
        party_name = party["name"]
        if party_name not in self.valid_parties:
            errors.append(f"invalid party {party['name']}")
        if role_is_active(party):
            active_parties.append(party_name)

    if len(active_parties) > 1:
        major_count = sum(1 for name in active_parties if name in MAJOR_PARTIES)
        if major_count > 1:
            errors.append(
                f"multiple active major party memberships {active_parties}"
            )
        else:
            self.warnings[filename].append(
                f"multiple active party memberships {active_parties}"
            )

    # TODO: this was too ambitious, disabling this for now
    # self.warnings[filename] = self.check_https(person)
    if person_type == PersonType.RETIRED:
        errors.extend(self.validate_old_district_names(person))

    # check duplicate IDs
    for scheme, value in person.get("ids", {}).items():
        self.duplicate_values[scheme][value].append(filename)
    for other in person.get("other_identifiers", []):
        self.duplicate_values[other["scheme"]][other["identifier"]].append(filename)

    # update active legislators
    if person_type == PersonType.LEGISLATIVE:
        role_type = district = None
        for candidate in person.get("roles", []):
            if not role_is_active(candidate, date=date):
                continue
            role_type, district = candidate["type"], candidate.get("district")
            break
        self.active_legislators[role_type][district].append(filename)
def retire_person(person, end_date):
    """End every active role of ``person`` on ``end_date``.

    The role mappings are modified in place.

    Args:
        person: Mapping with a ``roles`` list.
        end_date: Value stored as ``end_date`` on each active role.

    Returns:
        A ``(person, count)`` tuple, where ``count`` is the number of
        roles that were ended.
    """
    ended = 0
    for role in person['roles']:
        if not role_is_active(role):
            continue
        role['end_date'] = end_date
        ended += 1
    return person, ended
def validate_person(self, person, filename, person_type):
    """Validate one person record and store the findings under ``filename``.

    Errors are written to ``self.errors[filename]`` (replacing any earlier
    entry) and warnings are appended to ``self.warnings[filename]``. The
    person is also registered in ``self.person_mapping``; retired people
    bump ``self.retired_count`` and legislative people are passed on to
    ``self.summarize_person``.

    Args:
        person: Mapping describing the person.
        filename: Name of the file the person was loaded from.
        person_type: A ``PersonType`` member.
    """
    self.errors[filename] = validate_obj(person, PERSON_FIELDS)
    errors = self.errors[filename]

    uid = person["id"].split("/")[1]
    if uid not in filename:
        errors.append(f"id piece {uid} not in filename")

    errors.extend(validate_jurisdictions(person, self.municipalities))
    errors.extend(validate_roles(person, "roles", person_type == PersonType.RETIRED))
    if person_type in (PersonType.LEGISLATIVE, PersonType.EXECUTIVE):
        errors.extend(validate_roles(person, "party"))

    # Check each party name and collect the currently active ones.
    active_parties = []
    for party in person.get("party", []):
        party_name = party["name"]
        if party_name not in self.valid_parties:
            errors.append(f"invalid party {party['name']}")
        if role_is_active(party):
            active_parties.append(party_name)

    # Two active major parties is an error; any other overlap is a warning.
    if len(active_parties) > 1:
        major_count = sum(1 for name in active_parties if name in MAJOR_PARTIES)
        if major_count > 1:
            errors.append(
                f"multiple active major party memberships {active_parties}"
            )
        else:
            self.warnings[filename].append(
                f"multiple active party memberships {active_parties}"
            )

    # TODO: this was too ambitious, disabling this for now
    # self.warnings[filename] = self.check_https(person)
    self.person_mapping[person["id"]] = person["name"]

    if person_type == PersonType.RETIRED:
        self.retired_count += 1
        errors.extend(self.validate_old_district_names(person))
    elif person_type == PersonType.LEGISLATIVE:
        self.summarize_person(person)
def retire_from_committee(committee, person_id, end_date):
    """End the active memberships of ``person_id`` in ``committee``.

    The membership mappings are modified in place.

    Args:
        committee: Mapping with a ``memberships`` list.
        person_id: Compared against each membership's ``id`` value.
        end_date: Value stored as ``end_date`` on each matching membership.

    Returns:
        A ``(committee, count)`` tuple, where ``count`` is the number of
        memberships that were ended.
    """
    ended = 0
    for membership in committee['memberships']:
        # Only the given person's still-active memberships are touched.
        if membership.get('id') != person_id or not role_is_active(membership):
            continue
        membership['end_date'] = end_date
        ended += 1
    return committee, ended
def generate_template_csv(abbreviations, filename, missing_id=None):
    """Write a CSV template with one row per person and their active seat.

    Args:
        abbreviations: Iterable of abbreviations passed to ``iter_objects``.
        filename: Path of the CSV file to create (overwritten if present).
        missing_id: Optional identifier scheme. People who already have an
            ``other_identifiers`` entry with this scheme are left out.

    Raises:
        Exception: If a person that should be written has no active role.
    """
    fields = ("id", "name", "chamber", "district", "jurisdiction")
    # The csv module expects files opened with newline="" so that it can
    # control line endings itself (avoids blank rows on some platforms).
    with open(filename, "w", newline="") as outfile:
        out = csv.DictWriter(outfile, fields)
        out.writeheader()
        for abbr in abbreviations:
            # person_filename is kept distinct from the output ``filename``.
            for person, person_filename in iter_objects(abbr, "people"):
                if missing_id and any(
                    oid["scheme"] == missing_id
                    for oid in person.get("other_identifiers", [])
                ):
                    continue

                # The first active role supplies the seat columns.
                for role in person["roles"]:
                    if role_is_active(role):
                        break
                else:
                    raise Exception(
                        f"no active role for {person['id']} ({person_filename})"
                    )

                out.writerow({
                    "id": person["id"],
                    "name": person["name"],
                    "chamber": role["type"],
                    "district": role["district"],
                    "jurisdiction": role["jurisdiction"],
                })
def retire_from_committee(committee, person_id, end_date):
    """End the active memberships of ``person_id`` in ``committee``.

    The membership mappings are modified in place.

    Args:
        committee: Mapping with a ``memberships`` list.
        person_id: Compared against each membership's ``id`` value.
        end_date: Value stored as ``end_date`` on each matching membership.

    Returns:
        A ``(committee, count)`` tuple, where ``count`` is the number of
        memberships that were ended.
    """
    ended = 0
    for membership in committee["memberships"]:
        # Only the given person's still-active memberships are touched.
        if membership.get("id") != person_id or not role_is_active(membership):
            continue
        membership["end_date"] = end_date
        ended += 1
    return committee, ended
def generate_template_csv(abbreviations, filename, missing_id=None):
    """Write a CSV template with one row per person and their active seat.

    Args:
        abbreviations: Iterable of abbreviations passed to ``iter_objects``.
        filename: Path of the CSV file to create (overwritten if present).
        missing_id: Optional identifier scheme. People who already have an
            ``other_identifiers`` entry with this scheme are left out.

    Raises:
        Exception: If a person that should be written has no active role.
    """
    fields = ('id', 'name', 'chamber', 'district', 'jurisdiction')
    # The csv module expects files opened with newline='' so that it can
    # control line endings itself (avoids blank rows on some platforms).
    with open(filename, 'w', newline='') as outfile:
        out = csv.DictWriter(outfile, fields)
        out.writeheader()
        for abbr in abbreviations:
            # person_filename is kept distinct from the output ``filename``.
            for person, person_filename in iter_objects(abbr, 'people'):
                if missing_id and any(
                    oid['scheme'] == missing_id
                    for oid in person.get('other_identifiers', [])
                ):
                    continue

                # The first active role supplies the seat columns.
                for role in person['roles']:
                    if role_is_active(role):
                        break
                else:
                    raise Exception(
                        f"no active role for {person['id']} ({person_filename})"
                    )

                out.writerow({
                    'id': person['id'],
                    'name': person['name'],
                    'chamber': role['type'],
                    'district': role['district'],
                    'jurisdiction': role['jurisdiction'],
                })
def validate_roles(person, roles_key, retired=False, date=None):
    """Check how many entries under ``person[roles_key]`` are active.

    Args:
        person: Mapping that contains ``roles_key``.
        roles_key: Key of the list to inspect (for example ``"roles"`` or
            ``"party"``).
        retired: Whether the person is expected to have no active entries.
        date: Optional date forwarded to ``role_is_active``. It is only
            passed along when given, so calls without it behave as before.

    Returns:
        A list with at most one error message; empty when nothing is wrong.
    """
    activity_kwargs = {} if date is None else {"date": date}
    active = [
        role for role in person[roles_key] if role_is_active(role, **activity_kwargs)
    ]

    if not active and not retired:
        return [f"no active {roles_key}"]
    # The remaining checks only apply to the "roles" list itself.
    if roles_key == "roles":
        if retired and active:
            return [f"{len(active)} active roles on retired person"]
        if len(active) > 1:
            return [f"{len(active)} active roles"]
    return []
def retire_person(person, end_date, reason=None, death=False):
    """End every active role of ``person`` on ``end_date``.

    The person and its role mappings are modified in place.

    Args:
        person: Mapping with a ``roles`` list.
        end_date: Value stored as ``end_date`` on each active role.
        reason: If truthy, stored as ``end_reason`` on each ended role.
        death: If truthy, ``end_date`` is also stored as the person's
            ``death_date``.

    Returns:
        A ``(person, count)`` tuple, where ``count`` is the number of
        roles that were ended.
    """
    ended = 0
    for role in person["roles"]:
        if not role_is_active(role):
            continue
        role["end_date"] = end_date
        if reason:
            role["end_reason"] = reason
        ended += 1

    if death:
        person["death_date"] = end_date
    return person, ended
def summarize_org(self, org):
    """Fold one organization record into the running summary counters.

    Updates ``org_count``, ``parent_types``, ``missing_person_id`` and
    ``role_types`` on ``self``.

    Args:
        org: Mapping with a ``parent`` string and a ``memberships`` list.
    """
    self.org_count += 1

    # Parents that are themselves organizations are counted as subcommittees.
    parent = org["parent"]
    if parent.startswith("ocd-organization"):
        parent_label = "subcommittee of " + parent
    else:
        parent_label = parent
    self.parent_types[parent_label] += 1

    for membership in org["memberships"]:
        if not membership.get("id"):
            self.missing_person_id += 1
        if role_is_active(membership):
            # Memberships without an explicit role count as "member".
            self.role_types[membership.get("role", "member")] += 1
def retire_person(person, end_date, reason=None, death=False):
    """End every active role of ``person`` on ``end_date``.

    The person and its role mappings are modified in place.

    Args:
        person: Mapping with a ``roles`` list.
        end_date: Value stored as ``end_date`` on each active role.
        reason: If truthy, stored as ``end_reason`` on each ended role.
        death: If truthy, ``end_date`` is also stored as the person's
            ``death_date``.

    Returns:
        A ``(person, count)`` tuple, where ``count`` is the number of
        roles that were ended.
    """
    ended = 0
    for role in person['roles']:
        if not role_is_active(role):
            continue
        role['end_date'] = end_date
        if reason:
            role['end_reason'] = reason
        ended += 1

    if death:
        person['death_date'] = end_date
    return person, ended
def summarize_org(self, org):
    """Fold one organization record into the running summary counters.

    Updates ``org_count``, ``parent_types``, ``missing_person_id`` and
    ``role_types`` on ``self``.

    Args:
        org: Mapping with a ``parent`` string and a ``memberships`` list.
    """
    self.org_count += 1

    # Parents that are themselves organizations are counted as subcommittees.
    parent = org['parent']
    if parent.startswith('ocd-organization'):
        parent_label = 'subcommittee of ' + parent
    else:
        parent_label = parent
    self.parent_types[parent_label] += 1

    for membership in org['memberships']:
        if not membership.get('id'):
            self.missing_person_id += 1
        if role_is_active(membership):
            # Memberships without an explicit role count as 'member'.
            self.role_types[membership.get('role', 'member')] += 1
def retire_person(person, end_date, reason=None, death=False):
    """End every active role of ``person`` and drop their contact details.

    The person and its role mappings are modified in place.

    Args:
        person: Mapping with a ``roles`` list.
        end_date: Value stored as ``end_date`` on each active role.
        reason: If truthy, stored as ``end_reason`` on each ended role.
        death: If truthy, ``end_date`` is also stored as the person's
            ``death_date``.

    Returns:
        A ``(person, count)`` tuple, where ``count`` is the number of
        roles that were ended.
    """
    ended = 0
    for role in person["roles"]:
        if not role_is_active(role):
            continue
        role["end_date"] = end_date
        if reason:
            role["end_reason"] = reason
        ended += 1

    if death:
        person["death_date"] = end_date

    # remove old contact details
    person.pop("contact_details", None)
    return person, ended
def update(self, existing, new):
    """Update ``existing`` from ``new``, ending roles held in another seat.

    Every active role of ``existing`` whose ``(type, district)`` seat
    differs from ``new.seat`` gets ``self.end_date`` as its ``end_date``.
    A status line is printed, and when ``self.save`` is truthy ``new`` is
    merged into ``existing`` and the result is saved.

    Args:
        existing: Object with ``data``, ``seat``, ``name``, ``merge`` and
            ``save``.
        new: Object with a ``seat`` attribute; passed to ``existing.merge``.
    """
    moving = ""
    # end any active roles
    for role in existing.data['roles']:
        district = role['district']
        # Districts may be stored as int or str; go through str() so that
        # an int district does not fail on the missing .isdigit() method.
        district_text = str(district)
        seat = (
            role['type'],
            int(district_text) if district_text.isdigit() else district,
        )
        if role_is_active(role) and seat != new.seat:
            role['end_date'] = self.end_date
            moving = f" and moving to {new.seat}"

    click.secho(
        f"In {existing.seat} updating "
        f"{existing.name}{moving}.", fg='yellow')
    if self.save:
        existing.merge(new)
        existing.save()
def load_person(data):
    """Create or update the database ``Person`` described by ``data``.

    Besides the person row, this syncs other names, links, sources,
    identifiers, contact details and the party/role memberships, then
    refreshes the computed ``current_role_division_id``, ``primary_party``
    and ``current_state`` fields.

    Args:
        data: Mapping describing the person; ``id`` and ``name`` are
            required, the other keys are optional.

    Returns:
        A ``(created, updated)`` tuple as reported by
        ``get_update_or_create`` and ``update_subobjects``.

    Raises:
        CancelTransaction: If a party, organization or (non-legacy) post
            cannot be found.
        ValueError: If two major parties are active at once, or a role has
            an unsupported type.
    """
    # import has to be here so that Django is set up
    from openstates.data.models import Person, Organization, Post

    person_fields = {
        "id": data["id"],
        "name": data["name"],
        "given_name": data.get("given_name", ""),
        "family_name": data.get("family_name", ""),
        "gender": data.get("gender", ""),
        "biography": data.get("biography", ""),
        "birth_date": data.get("birth_date", ""),
        "death_date": data.get("death_date", ""),
        "image": data.get("image", ""),
        "extras": data.get("extras", {}),
    }
    person, created, updated = get_update_or_create(Person, person_fields, ["id"])

    for relation in ("other_names", "links", "sources"):
        updated |= update_subobjects(person, relation, data.get(relation, []))

    # "ids" entries come first, followed by the other identifiers as given.
    identifiers = [
        {"scheme": scheme, "identifier": value}
        for scheme, value in data.get("ids", {}).items()
    ]
    identifiers.extend(data.get("other_identifiers", []))
    updated |= update_subobjects(person, "identifiers", identifiers)

    # One entry per non-empty contact field of each office.
    contact_details = [
        {"note": cd.get("note", ""), "type": kind, "value": cd[kind]}
        for cd in data.get("contact_details", [])
        for kind in ("address", "email", "voice", "fax")
        if cd.get(kind)
    ]
    updated |= update_subobjects(person, "contact_details", contact_details)

    memberships = []
    primary_party = ""
    active_division_id = ""
    current_state = ""

    for party in data.get("party", []):
        party_name = party["name"]
        try:
            org = cached_lookup(Organization, classification="party", name=party_name)
        except Organization.DoesNotExist:
            click.secho(f"no such party {party['name']}", fg="red")
            raise CancelTransaction()
        memberships.append({
            "organization": org,
            "start_date": party.get("start_date", ""),
            "end_date": party.get("end_date", ""),
        })

        if not role_is_active(party):
            continue
        if primary_party in MAJOR_PARTIES:
            if party_name in MAJOR_PARTIES:
                raise ValueError(
                    f"two primary parties for ({data['name']} {data['id']})")
            # already set correct primary party, so do nothing
        else:
            primary_party = party_name

    for role in data.get("roles", []):
        chamber = role["type"]
        if chamber not in ("upper", "lower", "legislature"):
            raise ValueError("unsupported role type")
        jurisdiction_id = role["jurisdiction"]

        try:
            org = cached_lookup(
                Organization,
                classification=chamber,
                jurisdiction_id=jurisdiction_id,
            )
            post = org.posts.get(label=role["district"])
        except Organization.DoesNotExist:
            click.secho(
                f"{person} no such organization {role['jurisdiction']} {role['type']}",
                fg="red",
            )
            raise CancelTransaction()
        except Post.DoesNotExist:
            # if this is a legacy district, be quiet
            lds = legacy_districts(jurisdiction_id=jurisdiction_id)
            if role["district"] not in lds[chamber]:
                click.secho(f"no such post {role}", fg="red")
                raise CancelTransaction()
            post = None

        if role_is_active(role):
            state_metadata = metadata.lookup(jurisdiction_id=jurisdiction_id)
            district = state_metadata.lookup_district(
                name=str(role["district"]), chamber=chamber)
            assert district
            active_division_id = district.division_id
            current_state = state_metadata.abbr.upper()
        elif not current_state:
            # set current_state to *something* -- since legislators
            # are only going to ever appear in one state this is fine
            # it may become necessary to make this smarter if legislators start
            # crossing state lines, but we don't have any examples of this
            state_metadata = metadata.lookup(jurisdiction_id=jurisdiction_id)
            current_state = state_metadata.abbr.upper()

        memberships.append({
            "organization": org,
            "post": post,
            "start_date": role.get("start_date", ""),
            "end_date": role.get("end_date", ""),
        })

    # note that we don't manage committee memberships here
    non_committee = person.memberships.exclude(
        organization__classification="committee")
    updated |= update_subobjects(
        person, "memberships", memberships, read_manager=non_committee)

    # set computed fields (avoid extra save)
    computed = {
        "current_role_division_id": active_division_id,
        "primary_party": primary_party,
        "current_state": current_state,
    }
    if any(getattr(person, name) != value for name, value in computed.items()):
        for name, value in computed.items():
            setattr(person, name, value)
        person.save()

    return created, updated
import glob

from utils import load_yaml, dump_obj, role_is_active

# For every California legislature file, append a "Contact Form" link for
# each active role and write the file back in place.
for path in glob.glob("data/ca/legislature/*.yml"):
    with open(path) as inf:
        data = load_yaml(inf)

    for role in data["roles"]:
        if not role_is_active(role):
            continue
        # "A" marks the lower chamber; every other role type gets "S".
        letter = "A" if role["type"] == "lower" else "S"
        district = int(role["district"])
        url = f"https://lcmspubcontact.lc.ca.gov/PublicLCMS/ContactPopup.php?district={letter}D{district:02d}&inframe=N"
        contact_link = {"url": url, "note": "Contact Form"}
        data["links"].append(contact_link)

    dump_obj(data, filename=path)
def load_person(data):
    """Create or update the database ``Person`` described by ``data``.

    Besides the person row, this syncs other names, links, sources,
    identifiers, contact details and the party/role memberships, then
    refreshes the computed ``primary_party``, ``current_role`` and
    ``current_jurisdiction_id`` fields.

    Args:
        data: Mapping describing the person; ``id`` and ``name`` are
            required, the other keys are optional.

    Returns:
        A ``(created, updated)`` tuple as reported by
        ``get_update_or_create`` and ``update_subobjects``.

    Raises:
        CancelTransaction: If a party, organization or (non-legacy) post
            cannot be found.
        ValueError: If two major parties are active at once, or a role has
            an unsupported type.
    """
    # import has to be here so that Django is set up
    from openstates.data.models import Person, Organization, Post

    person_fields = {
        "id": data["id"],
        "name": data["name"],
        "given_name": data.get("given_name", ""),
        "family_name": data.get("family_name", ""),
        "gender": data.get("gender", ""),
        "biography": data.get("biography", ""),
        "birth_date": data.get("birth_date", ""),
        "death_date": data.get("death_date", ""),
        "image": data.get("image", ""),
        "extras": data.get("extras", {}),
    }
    person, created, updated = get_update_or_create(Person, person_fields, ["id"])

    for relation in ("other_names", "links", "sources"):
        updated |= update_subobjects(person, relation, data.get(relation, []))

    # "ids" entries come first, followed by the other identifiers as given.
    identifiers = [
        {"scheme": scheme, "identifier": value}
        for scheme, value in data.get("ids", {}).items()
    ]
    identifiers.extend(data.get("other_identifiers", []))
    updated |= update_subobjects(person, "identifiers", identifiers)

    # One entry per non-empty contact field of each office.
    contact_details = [
        {"note": cd.get("note", ""), "type": kind, "value": cd[kind]}
        for cd in data.get("contact_details", [])
        for kind in ("address", "email", "voice", "fax")
        if cd.get(kind)
    ]
    updated |= update_subobjects(person, "contact_details", contact_details)

    memberships = []
    primary_party = ""
    current_jurisdiction_id = None
    current_role = None

    for party in data.get("party", []):
        party_name = party["name"]
        try:
            org = cached_lookup(Organization, classification="party", name=party_name)
        except Organization.DoesNotExist:
            click.secho(f"no such party {party['name']}", fg="red")
            raise CancelTransaction()
        memberships.append(
            {
                "organization": org,
                "start_date": party.get("start_date", ""),
                "end_date": party.get("end_date", ""),
            }
        )

        if not role_is_active(party):
            continue
        if primary_party in MAJOR_PARTIES:
            if party_name in MAJOR_PARTIES:
                raise ValueError(f"two primary parties for ({data['name']} {data['id']})")
            # already set correct primary party, so do nothing
        else:
            primary_party = party_name

    for role in data.get("roles", []):
        # Map the role type to an organization classification and decide
        # whether the role is tied to a district post.
        if role["type"] in ("mayor",):
            role_name = "Mayor"
            org_type = "government"
            use_district = False
        elif role["type"] == "governor":
            # DC's chief executive is titled Mayor rather than Governor.
            if role["jurisdiction"] == "ocd-jurisdiction/country:us/district:dc/government":
                role_name = "Mayor"
            else:
                role_name = "Governor"
            org_type = "executive"
            use_district = False
        elif role["type"] in ("upper", "lower", "legislature"):
            org_type = role["type"]
            use_district = True
        else:
            raise ValueError("unsupported role type")

        try:
            org = cached_lookup(
                Organization, classification=org_type, jurisdiction_id=role["jurisdiction"]
            )
            post = org.posts.get(label=role["district"]) if use_district else None
        except Organization.DoesNotExist:
            click.secho(
                f"{person} no such organization {role['jurisdiction']} {org_type}", fg="red"
            )
            raise CancelTransaction()
        except Post.DoesNotExist:
            # if this is a legacy district, be quiet
            lds = legacy_districts(jurisdiction_id=role["jurisdiction"])
            if role["district"] not in lds[role["type"]]:
                click.secho(f"no such post {role}", fg="red")
                raise CancelTransaction()
            post = None

        if role_is_active(role):
            current_jurisdiction_id = role["jurisdiction"]
            current_role = {"org_classification": org_type, "district": None, "division_id": None}
            if not use_district:
                current_role["title"] = role_name
            else:
                state_metadata = metadata.lookup(jurisdiction_id=role["jurisdiction"])
                district = state_metadata.lookup_district(
                    name=str(role["district"]), chamber=role["type"]
                )
                assert district
                current_role["division_id"] = district.division_id
                current_role["title"] = getattr(state_metadata, role["type"]).title
                # try to force district to an int for sorting, but allow strings for non-numeric districts
                try:
                    current_role["district"] = int(role["district"])
                except ValueError:
                    current_role["district"] = str(role["district"])
        elif not current_jurisdiction_id:
            # No active role seen so far: fall back to this role's jurisdiction.
            current_jurisdiction_id = role["jurisdiction"]

        membership = {
            "organization": org,
            "post": post,
            "start_date": role.get("start_date", ""),
            "end_date": role.get("end_date", ""),
        }
        if not use_district:
            membership["role"] = role_name
        memberships.append(membership)

    # note that we don't manage committee memberships here
    non_committee = person.memberships.exclude(organization__classification="committee")
    updated |= update_subobjects(
        person, "memberships", memberships, read_manager=non_committee
    )

    # set computed fields (avoid extra save)
    computed = {
        "primary_party": primary_party,
        "current_role": current_role,
        "current_jurisdiction_id": current_jurisdiction_id,
    }
    if any(getattr(person, name) != value for name, value in computed.items()):
        for name, value in computed.items():
            setattr(person, name, value)
        person.save()

    return created, updated
def get_chamber_and_district(person):
    """Return the ``(type, district)`` pair of the first active role.

    Args:
        person: Mapping with a ``roles`` list.

    Returns:
        A ``(type, district)`` tuple, or ``None`` when no role is active.
    """
    for candidate in person["roles"]:
        if not role_is_active(candidate):
            continue
        chamber, district = candidate["type"], candidate["district"]
        return chamber, district
    return None
def write_csv(files, jurisdiction_id, output_filename):
    """Export the people stored in ``files`` to a single CSV file.

    Args:
        files: Sequence of YAML file paths, one person per file.
        jurisdiction_id: Accepted for interface compatibility; not used.
        output_filename: Path of the CSV file to create (overwritten if
            present).

    A person without an active party or role gets empty party, chamber
    and district columns.
    """
    fieldnames = (
        "id",
        "name",
        "current_party",
        "current_district",
        "current_chamber",
        "given_name",
        "family_name",
        "gender",
        "biography",
        "birth_date",
        "death_date",
        "image",
        "links",
        "sources",
        "capitol_address",
        "capitol_email",
        "capitol_voice",
        "capitol_fax",
        "district_address",
        "district_email",
        "district_voice",
        "district_fax",
        "twitter",
        "youtube",
        "instagram",
        "facebook",
    )
    contact_kinds = ("address", "email", "voice", "fax")

    # The csv module expects files opened with newline="" so that it can
    # control line endings itself (avoids blank rows on some platforms).
    with open(output_filename, "w", newline="") as outf:
        out = csv.DictWriter(outf, fieldnames)
        out.writeheader()

        for filename in files:
            with open(filename) as f:
                data = load_yaml(f)

            # Reset per file so that a person without an active party or
            # role never inherits the previous person's values.
            current_party = current_chamber = current_district = None

            # current party: the first active entry wins
            for role in data["party"]:
                if role_is_active(role):
                    current_party = role["name"]
                    break

            # current district: no break, so the last active role wins
            for role in data["roles"]:
                if role_is_active(role):
                    current_chamber = role["type"]
                    current_district = role["district"]

            ids = data.get("ids", {})
            obj = {
                "id": data["id"],
                "name": data["name"],
                "current_party": current_party,
                "current_district": current_district,
                "current_chamber": current_chamber,
                "given_name": data.get("given_name"),
                "family_name": data.get("family_name"),
                "gender": data.get("gender"),
                "biography": data.get("biography"),
                "birth_date": data.get("birth_date"),
                "death_date": data.get("death_date"),
                "image": data.get("image"),
                "twitter": ids.get("twitter"),
                "youtube": ids.get("youtube"),
                "instagram": ids.get("instagram"),
                "facebook": ids.get("facebook"),
                "links": ";".join(link["url"] for link in data.get("links", [])),
                "sources": ";".join(src["url"] for src in data.get("sources", [])),
            }
            for office in ("district", "capitol"):
                for kind in contact_kinds:
                    obj[f"{office}_{kind}"] = None

            # Offices are matched on their note; "district" is tested
            # first, and a later match overwrites all four columns.
            for cd in data.get("contact_details", []):
                note = cd["note"].lower()
                if "district" in note:
                    office = "district"
                elif "capitol" in note:
                    office = "capitol"
                else:
                    click.secho("unknown office: " + note, fg="red")
                    continue
                for kind in contact_kinds:
                    obj[f"{office}_{kind}"] = cd.get(kind)

            out.writerow(obj)

    click.secho(f'processed {len(files)} files', fg='green')
def is_inactive(person, date=None):
    """Return ``True`` when none of the person's roles is active.

    Args:
        person: Mapping whose optional ``roles`` key holds the roles.
        date: Forwarded to ``role_is_active`` as its ``date`` argument.
    """
    # Every role is checked (no short-circuit), as in a plain filter.
    activity = [
        role_is_active(role, date=date) for role in person.get("roles", [])
    ]
    return not any(activity)
def dir_to_mongo(abbr, create, clear_old_roles, verbose):
    """Sync the people files for ``abbr`` into the legacy MongoDB database.

    Each person is converted to the legacy legislator document format and
    saved when it differs from the stored document. Legislators of this
    state whose ID was not seen are then passed to ``retire_person``.

    Args:
        abbr: State abbreviation; also the ``_id`` of the metadata document.
        create: If true, people without a ``legacy_openstates`` identifier
            get a new ID, which is written back to their file. Otherwise
            the program exits with status 1.
        clear_old_roles: If true, the stored ``old_roles`` are not kept.
        verbose: If true, unchanged people are reported as well.
    """
    db = pymongo.MongoClient(os.environ.get('BILLY_MONGO_HOST',
                                            'localhost'))['fiftystates']
    metadata = db.metadata.find({'_id': abbr})[0]
    latest_term = metadata['terms'][-1]['name']

    active_ids = []

    for person, filename in iter_objects(abbr, 'people'):
        legacy_ids = [
            oid['identifier']
            for oid in person.get('other_identifiers', [])
            if oid['scheme'] == 'legacy_openstates'
        ]
        if not legacy_ids:
            if not create:
                click.secho(
                    f'{filename} does not have legacy ID, run with --create',
                    fg='red')
                sys.exit(1)
            # get next ID
            new_id = get_next_id(db, abbr)
            legacy_ids = [new_id]
            person.setdefault('other_identifiers', []).append({
                'scheme': 'legacy_openstates',
                'identifier': new_id
            })
            dump_obj(person, filename=filename)

        active_ids.append(legacy_ids[0])

        # handle name
        prefix, first_name, last_name, suffixes = name_tools.split(
            person['name'])

        # Reset per person so that someone without an active role or party
        # never inherits the previous person's values.
        chamber = district = party = None

        # get chamber, district, party
        for role in person['roles']:
            if role_is_active(role):
                chamber = role['type']
                district = role['district']
                break
        # no break here, so the last active party wins
        for role in person['party']:
            if role_is_active(role):
                party = role['name']

        url = person['links'][0]['url']
        email = ''

        offices = []
        for cd in person.get('contact_details', []):
            office = {
                'fax': cd.get('fax'),
                'phone': cd.get('voice'),
                'address': cd.get('address'),
                'email': cd.get('email'),
                'name': cd['note'],
                'type': 'capitol' if 'capitol' in cd['note'].lower() else 'district'
            }
            offices.append(office)
            # the first office that has an email supplies the top-level one
            if office['email'] and not email:
                email = office['email']

        # NE & DC
        if chamber == 'legislature':
            chamber = 'upper'

        # get some old data to keep around
        created_at = datetime.datetime.utcnow()
        old_roles = {}
        old_person = None
        try:
            old_person = db.legislators.find({'_id': legacy_ids[0]})[0]
            created_at = old_person['created_at']
            if not clear_old_roles:
                old_roles = old_person.get('old_roles', {})
        except IndexError:
            # no stored document yet for this ID
            pass

        mongo_person = {
            '_id': legacy_ids[0],
            'leg_id': legacy_ids[0],
            '_all_ids': legacy_ids,
            '_type': 'person',
            'active': True,
            'full_name': person['name'],
            '_scraped_name': person['name'],
            'photo_url': person.get('image'),
            'state': abbr,
            'district': district,
            'chamber': chamber,
            'party': party,
            'email': email,
            'url': url,
            'offices': offices,
            'created_at': created_at,
            'first_name': first_name,
            'middle_name': '',
            'last_name': last_name,
            'suffixes': suffixes,
            'sources': person['sources'],
            'old_roles': old_roles,
            'roles': [
                {
                    'term': latest_term,
                    'district': district,
                    'chamber': chamber,
                    'state': abbr,
                    'party': party,
                    'type': 'member',
                    'start_date': None,
                    'end_date': None
                },
            ],
        }
        # TODO: committee info
        # { "term" : "2017-2018", "committee_id" : "NCC000233", "chamber" : "lower",
        # "state" : "nc", "subcommittee" : null, "committee" : "State and Local Government II",
        # "position" : "member", "type" : "committee member" },

        # compare: updated_at is dropped so that it alone never counts as
        # a difference
        if old_person:
            old_person.pop('updated_at', None)
        if old_person == mongo_person:
            if verbose:
                click.secho(f'no updates to {mongo_person["_id"]}')
        else:
            click.secho(f'updating {mongo_person["_id"]}', fg='green')
            mongo_person['updated_at'] = datetime.datetime.utcnow()
            try:
                db.legislators.save(mongo_person)
            except Exception as e:
                # best effort: report the failure and move on
                print(e)
                continue

    to_retire = db.legislators.find({
        '_id': {
            '$nin': active_ids
        },
        'state': abbr
    })
    click.secho(f'going to try to retire {to_retire.count()}')
    for leg in to_retire:
        retire_person(db, leg)
def test_role_is_active(role, expected):
    """Check that ``role_is_active(role)`` equals ``expected``.

    NOTE(review): ``role`` and ``expected`` are presumably supplied by a
    parametrize decorator or fixtures that are not visible here -- confirm.
    """
    actual = role_is_active(role)
    assert actual == expected
def get_chamber_and_district(person):
    """Return the ``(type, district)`` pair of the first active role.

    Args:
        person: Mapping with a ``roles`` list.

    Returns:
        A ``(type, district)`` tuple, or ``None`` when no role is active.
    """
    for candidate in person['roles']:
        if not role_is_active(candidate):
            continue
        chamber, district = candidate['type'], candidate['district']
        return chamber, district
    return None