def to_dict(self):
    """Return this person as an OrderedDict ready for serialization.

    Builds the OCD-style person record (id, name, party, roles, links,
    sources), adds optional name/image fields only when set, and appends
    any non-empty district/capitol office contact details.
    """
    # map the scraped party string through PARTIES, falling back to the raw value
    party = PARTIES.get(self.party.lower(), self.party)
    d = OrderedDict(
        {
            "id": f"ocd-person/{uuid.uuid4()}",
            "name": self.name,
            "party": [{"name": party}],
            "roles": [
                {
                    "district": self.district,
                    "type": self.chamber,
                    "jurisdiction": get_jurisdiction_id(self.state),
                }
            ],
            "links": self.links,
            "sources": self.sources,
        }
    )
    if self.given_name:
        d["given_name"] = self.given_name
    if self.family_name:
        d["family_name"] = self.family_name
    if self.image:
        d["image"] = self.image

    # contact details: build each office dict once (the original called
    # to_dict() twice per office — once for the truthiness test, once to append)
    d["contact_details"] = []
    for office in (self.district_office, self.capitol_office):
        office_dict = office.to_dict()
        if office_dict:
            d["contact_details"].append(office_dict)
    return d
def create_person(fname, lname, name, state, district, party, rtype, url, image, start_date):
    """Write a new person YAML file into the state's ``people`` directory.

    If ``name`` is empty, it is synthesized from the given/family names.
    """
    display_name = name if name else f"{fname} {lname}"
    person = OrderedDict(
        {
            "id": ocd_uuid("person"),
            "name": display_name,
            "given_name": fname,
            "family_name": lname,
            "image": image,
            "party": [{"name": party}],
            "roles": [
                {
                    "type": rtype,
                    "district": district,
                    "jurisdiction": get_jurisdiction_id(state),
                    "start_date": start_date,
                }
            ],
            "links": [{"url": url}],
            "sources": [{"url": url}],
        }
    )
    dump_obj(person, output_dir=os.path.join(get_data_dir(state), "people"))
def to_yaml(input_dir):
    """
    Convert scraped JSON in INPUT_DIR to YAML files for this repo.

    Will put data into incoming/ directory for usage with merge.py's
    --incoming option.
    """
    # the state abbreviation is the last non-empty path component
    pieces = [p for p in input_dir.split("/") if p]
    abbr = pieces[-1] if pieces else None

    output_dir = get_data_dir(abbr)
    jurisdiction_id = get_jurisdiction_id(abbr)
    # redirect output from data/ to incoming/ and make sure it worked
    output_dir = output_dir.replace("data", "incoming")
    assert "incoming" in output_dir

    leg_dir = os.path.join(output_dir, "legislature")
    try:
        os.makedirs(leg_dir)
    except FileExistsError:
        # directory already exists: clear out any stale YAML files
        for stale in glob.glob(os.path.join(leg_dir, "*.yml")):
            os.remove(stale)
    process_dir(input_dir, output_dir, jurisdiction_id)
def to_csv(abbreviations, upload):
    """
    Generate a current-legislators CSV per state from the YAML files.

    With ``upload`` set, each CSV is also pushed to the public
    data.openstates.org S3 bucket under ``people/current/``.
    (The previous docstring — "Sync YAML files to DB." — described a
    different command.)
    """
    if not abbreviations:
        abbreviations = get_all_abbreviations()
    if upload:
        s3 = boto3.client("s3")
    for abbr in abbreviations:
        click.secho("==== {} ====".format(abbr), bold=True)
        directory = get_data_dir(abbr)
        jurisdiction_id = get_jurisdiction_id(abbr)
        # sorted() keeps row order deterministic across runs
        person_files = sorted(glob.glob(os.path.join(directory, "legislature/*.yml")))
        fname = f"{abbr}.csv"
        write_csv(person_files, jurisdiction_id, fname)
        if upload:
            s3.upload_file(
                fname,
                "data.openstates.org",
                f"people/current/{abbr}.csv",
                # public-read so the CSVs are directly downloadable
                ExtraArgs={"ContentType": "text/csv", "ACL": "public-read"},
            )
            click.secho(f"uploaded to data.openstates.org/people/current/{abbr}.csv", fg="green")
def archive_leg_to_csv(state_abbr=None):
    """Write ``unmatched_<state_abbr>.csv`` listing unmatched legislator names.

    Each row has the name, the state, the sessions the name appeared in,
    and the counts of unmatched votes and sponsorships.
    """
    output_filename = f"unmatched_{state_abbr}.csv"
    jurisdiction_id = get_jurisdiction_id(state_abbr)

    # name -> total unmatched count; name -> set of sessions seen in
    missing_votes = Counter()
    missing_sponsors = Counter()
    sessions_for_name = defaultdict(set)

    voters, bill_sponsors = get_unmatched(jurisdiction_id)
    for voter in voters:
        missing_votes[voter["name"]] += voter["n"]
        sessions_for_name[voter["name"]].add(voter["session"])
    for bill_sponsor in bill_sponsors:
        missing_sponsors[bill_sponsor["name"]] += bill_sponsor["n"]
        sessions_for_name[bill_sponsor["name"]].add(bill_sponsor["session"])

    all_names = sorted(sessions_for_name.keys())

    # newline="" is required for files handed to the csv module; without it
    # spurious blank lines appear on Windows (see csv docs).
    with open(output_filename, "w", newline="") as outf:
        out = csv.DictWriter(
            outf, ("name", "jurisdiction", "sessions", "votes", "sponsorships")
        )
        out.writeheader()
        for name in all_names:
            out.writerow(
                {
                    "name": name,
                    "jurisdiction": state_abbr,
                    "sessions": "; ".join(sorted(sessions_for_name[name])),
                    "votes": missing_votes[name],
                    "sponsorships": missing_sponsors[name],
                }
            )
def to_yaml(input_dir):
    """
    Convert pupa scraped JSON in INPUT_DIR to YAML files for this repo.

    Will put data into incoming/ directory for usage with merge.py's
    --incoming option.
    """
    # state abbreviation = last non-empty path component
    abbr = next((p for p in reversed(input_dir.split('/')) if p), None)

    output_dir = get_data_dir(abbr)
    jurisdiction_id = get_jurisdiction_id(abbr)
    # write under incoming/ instead of data/; the assert guards the replace
    output_dir = output_dir.replace('data', 'incoming')
    assert 'incoming' in output_dir

    for subdir in ('people', 'organizations'):
        target = os.path.join(output_dir, subdir)
        try:
            os.makedirs(target)
        except FileExistsError:
            # already present: remove stale YAML before regenerating
            for stale in glob.glob(os.path.join(target, '*.yml')):
                os.remove(stale)
    process_dir(input_dir, output_dir, jurisdiction_id)
def to_database(abbreviations, purge, safe):
    """
    Sync YAML files to DB.

    In safe mode everything runs inside a transaction that is then
    deliberately rolled back, so no changes are committed.
    """
    init_django()
    if not abbreviations:
        abbreviations = get_all_abbreviations()
    settings = get_settings()
    for abbr in abbreviations:
        click.secho(f"==== {abbr} ====", bold=True)
        directory = get_data_dir(abbr)
        jurisdiction_id = get_jurisdiction_id(abbr)

        # active and retired people are loaded together
        person_files = glob.glob(os.path.join(directory, 'people/*.yml'))
        person_files += glob.glob(os.path.join(directory, 'retired/*.yml'))
        committee_files = glob.glob(os.path.join(directory, 'organizations/*.yml'))

        if safe:
            click.secho('running in safe mode, no changes will be made', fg='magenta')
        state_settings = settings[abbr]
        try:
            with transaction.atomic():
                create_posts(jurisdiction_id, state_settings)
                load_directory(person_files, 'person', jurisdiction_id, purge=purge)
                load_directory(committee_files, 'organization', jurisdiction_id, purge=purge)
                if safe:
                    click.secho('ran in safe mode, no changes were made', fg='magenta')
                    # raising aborts the atomic block, rolling everything back
                    raise CancelTransaction()
        except CancelTransaction:
            pass
def to_csv(abbreviations):
    """
    Generate a legislators CSV per state from the YAML files.

    (The previous docstring — "Sync YAML files to DB." — described a
    different command; this one only writes CSVs.)
    """
    if not abbreviations:
        abbreviations = get_all_abbreviations()
    for abbr in abbreviations:
        click.secho('==== {} ===='.format(abbr), bold=True)
        directory = get_data_dir(abbr)
        jurisdiction_id = get_jurisdiction_id(abbr)
        # sorted() keeps row order deterministic across runs
        person_files = sorted(glob.glob(os.path.join(directory, 'people/*.yml')))
        write_csv(person_files, jurisdiction_id, f"csv/{abbr}_legislators.csv")
def to_database(abbreviations, purge, safe):
    """
    Sync YAML files to DB.

    Municipalities are committed first in their own transaction; people
    and organizations load inside a second transaction that safe mode
    rolls back (and then exits non-zero).
    """
    init_django()
    create_parties()
    if not abbreviations:
        abbreviations = get_all_abbreviations()
    for abbr in abbreviations:
        click.secho(f"==== {abbr} ====", bold=True)
        directory = get_data_dir(abbr)
        jurisdiction_id = get_jurisdiction_id(abbr)

        municipalities = load_municipalities(abbr)
        with transaction.atomic():
            create_municipalities(municipalities)

        # person files accumulate in this fixed subdirectory order
        person_files = []
        for subdir in ("legislature", "executive", "municipalities", "retired"):
            person_files.extend(glob.glob(os.path.join(directory, subdir, "*.yml")))
        committee_files = glob.glob(os.path.join(directory, "organizations/*.yml"))

        if safe:
            click.secho("running in safe mode, no changes will be made", fg="magenta")
        try:
            with transaction.atomic():
                load_directory(person_files, "person", jurisdiction_id, purge=purge)
                load_directory(committee_files, "organization", jurisdiction_id, purge=purge)
                if safe:
                    click.secho("ran in safe mode, no changes were made", fg="magenta")
                    # abort the atomic block so nothing is committed
                    raise CancelTransaction()
        except CancelTransaction:
            sys.exit(1)
def to_database(abbr, verbose, summary, purge, safe):
    """Load one state's people and organization YAML files into the database.

    Safe mode runs everything inside a transaction and rolls it back.
    """
    init_django()
    directory = get_data_dir(abbr)
    jurisdiction_id = get_jurisdiction_id(abbr)

    # active and retired people are loaded together
    person_files = []
    for subdir in ('people', 'retired'):
        person_files += glob.glob(os.path.join(directory, subdir, '*.yml'))
    committee_files = glob.glob(os.path.join(directory, 'organizations/*.yml'))

    if safe:
        click.secho('running in safe mode, no changes will be made', fg='magenta')
    try:
        with transaction.atomic():
            load_directory(person_files, 'person', jurisdiction_id, purge=purge)
            load_directory(committee_files, 'organization', jurisdiction_id, purge=purge)
            if safe:
                click.secho('ran in safe mode, no changes were made', fg='magenta')
                # raising aborts the atomic block, discarding all changes
                raise CancelTransaction()
    except CancelTransaction:
        pass
def to_yaml(input_dir, reset):
    """Convert scraped JSON in ``input_dir`` into YAML files in the data dir.

    When ``reset`` is true, pre-existing YAML in the target directories is
    removed first.
    """
    # TODO: remove reset option once we're in prod
    # state abbreviation = last non-empty path component
    abbr = next((p for p in reversed(input_dir.split('/')) if p), None)
    output_dir = get_data_dir(abbr)
    jurisdiction_id = get_jurisdiction_id(abbr)
    for subdir in ('people', 'organizations'):
        target = os.path.join(output_dir, subdir)
        try:
            os.makedirs(target)
        except FileExistsError:
            if reset:
                for stale in glob.glob(os.path.join(target, '*.yml')):
                    os.remove(stale)
    process_dir(input_dir, output_dir, jurisdiction_id)
def create_person(fname, lname, name, state, district, party, rtype, url, image, email, start_date):
    """Write a new person YAML file, picking the output directory from ``rtype``.

    Legislators keep their district; mayors and executives drop it.
    Raises ValueError for an unrecognized role type.
    """
    role = {
        "type": rtype,
        "district": district,
        "jurisdiction": get_jurisdiction_id(state),
        "start_date": start_date,
    }
    if rtype in ("upper", "lower", "legislature"):
        directory = "legislature"
    elif rtype == "mayor":
        directory = "municipalities"
        del role["district"]
    elif rtype in ("governor", "lt_governor"):
        directory = "executive"
        del role["district"]
    else:
        raise ValueError(f"unknown role type {rtype}")

    person = OrderedDict(
        {
            "id": ocd_uuid("person"),
            "name": name if name else f"{fname} {lname}",
            "given_name": fname,
            "family_name": lname,
            "image": image,
            "email": email,
            "party": [{"name": party}],
            "roles": [role],
            "links": [{"url": url}],
            "sources": [{"url": url}],
        }
    )
    dump_obj(person, output_dir=os.path.join(get_data_dir(state), directory))
def create_person(fname, lname, name, state, district, party, rtype, url, image, start_date):
    """Create a person YAML file in the state's people directory."""
    person = OrderedDict()
    person['id'] = ocd_uuid('person')
    # fall back to "given family" when no display name was provided
    person['name'] = name or f'{fname} {lname}'
    person['given_name'] = fname
    person['family_name'] = lname
    person['image'] = image
    person['party'] = [{'name': party}]
    person['roles'] = [
        {
            'type': rtype,
            'district': district,
            'jurisdiction': get_jurisdiction_id(state),
            'start_date': start_date,
        }
    ]
    person['links'] = [{'url': url}]
    person['sources'] = [{'url': url}]
    output_dir = get_data_dir(state)
    dump_obj(person, output_dir=os.path.join(output_dir, 'people'))
def create_committee(*, name, state, parent, url):
    """Interactively build a committee record and dump it to YAML.

    Prompts for member names until the literal string ``done`` is entered.
    """
    members = []
    # The loop only stops on the literal 'done' (click.prompt re-prompts on
    # empty input), so the instruction must say so — the old message claimed
    # a blank member would stop it, which was wrong.
    click.echo("Enter members, type 'done' to stop.")
    while True:
        mname = click.prompt("Member name ('done' to stop)")
        if mname == "done":
            break
        members.append({"name": mname})
    com = OrderedDict(
        {
            "id": ocd_uuid("organization"),
            "name": name,
            "classification": "committee",
            "jurisdiction": get_jurisdiction_id(state),
            "parent": parent,
            "sources": [{"url": url}],
            "links": [{"url": url}],
            "memberships": members,
        }
    )
    output_dir = get_data_dir(state)
    dump_obj(com, output_dir=os.path.join(output_dir, "organizations"))
def check_historical_matches(abbr, dry=True):
    """Run update_objects over a state's unmatched vote and sponsorship records.

    ``dry`` is passed straight through to update_objects.
    """
    jid = get_jurisdiction_id(abbr)
    unmatched_voters, unmatched_sponsorships = get_unmatched(jid)
    # votes first, then sponsorships — same order as before
    for objects, obj_type in (
        (unmatched_voters, "vote"),
        (unmatched_sponsorships, "sponsorship"),
    ):
        update_objects(jid, objects, obj_type, dry)
def process_old_file(filename, metadata):
    """Convert one legacy Open States legislator JSON file into a retired-person YAML file.

    Loads the JSON at *filename*, strips fields that are no longer used,
    rebuilds the person record (names, parties, roles derived from
    ``old_roles`` via terms_to_roles with ``metadata["terms"]``, legacy ids,
    image/links/sources) and dumps it into <data_dir>/retired.

    Raises a bare Exception on any sanity-check failure: mismatched ids,
    unexpected ``roles``, or leftover unconsumed fields at the end.
    """
    data = json.load(open(filename))
    # sanity check: the two legacy id fields must agree
    if data["leg_id"] != data["_id"]:
        raise Exception()
    if data.get("active"):
        # active legislators are skipped, not archived
        print(data)
        return
        # NOTE(review): unreachable — dead code after the return, likely a
        # leftover from debugging; confirm before removing
        raise Exception()
    if data.get("roles", []):
        raise Exception()

    # remove unused fields (ignored if absent)
    for k in (
        "_yearly_contributions",
        "nimsp_candidate_id",
        "votesmart_id",
        "_contributions_start_year",
        "_scraped_name",
        "_total_contributions",
        "transparencydata_id",
        "_locked_fields",
        "level",
        "nimsp_id",
        "_type",
        "country",
        "updated_at",
        "_id",
        "active",
        "roles",
        "offices",
        "notice",
        "nickname",
        "district",
        "party",
        "chamber",
        "csrfmiddlewaretoken",
        "email",
        "created_at",
        "office_address",
        "office_phone",
        "occupation",
        "_guid",
        "_code",
        "all_ids",
        "2008-2011",
    ):
        data.pop(k, None)

    # remove "plus" fields (legacy extras prefixed with "+")
    for k in [k for k in data.keys() if k.startswith("+")]:
        data.pop(k)

    leg_obj = OrderedDict({"id": ocd_uuid("person")})
    leg_obj["name"] = data.pop("full_name")
    first_name = data.pop("first_name")
    middle_name = data.pop("middle_name")
    last_name = data.pop("last_name")
    suffixes = data.pop("suffixes", "")
    suffix = data.pop("suffix", "")
    # only non-empty name parts are written to the output record
    if first_name:
        leg_obj["given_name"] = first_name
    if last_name:
        leg_obj["family_name"] = last_name
    if middle_name:
        leg_obj["middle_name"] = middle_name
    if suffix:
        # NOTE(review): gated on `suffix` but prefers `suffixes` — a record
        # with only `suffixes` set is silently dropped; confirm intended
        leg_obj["suffix"] = suffixes or suffix
    state = data.pop("state")
    jurisdiction_id = get_jurisdiction_id(state)

    # pull useful fields from the legacy per-session role data,
    # skipping leadership titles and committee memberships
    old_roles = data.pop("old_roles", {})
    parties = set()
    new_roles = []
    for session, roles in old_roles.items():
        for role in roles:
            if role["type"] in (
                "committee member",
                "Minority Floor Leader",
                "Majority Floor Leader",
                "Majority Caucus Chair",
                "Minority Caucus Chair",
                "Speaker Pro Tem",
                "President Pro Tem",
                "Senate President",
                "Speaker of the House",
                "Minority Whip",
                "Majority Whip",
                "Lt. Governor",
            ) or role.get("committee"):
                continue
            parties.add(role["party"])
            new_roles.append({
                "term": role["term"],
                "chamber": role["chamber"],
                "district": role["district"]
            })
    leg_obj["party"] = [{"name": party} for party in parties]

    # convert term-based roles to dated (chamber, district, start, end) roles
    roles = terms_to_roles(new_roles, metadata["terms"])
    formatted_roles = []
    for chamber, district, start, end in roles:
        # terms are mapped to full calendar years
        formatted_roles.append(
            OrderedDict({
                "district": district,
                "jurisdiction": jurisdiction_id,
                "type": chamber,
                "start_date": f"{start}-01-01",
                "end_date": f"{end}-12-31",
            }))
    leg_obj["roles"] = formatted_roles

    # make sure the primary legacy id is included with the alternates
    all_ids = data.pop("_all_ids")
    leg_id = data.pop("leg_id")
    if leg_id not in all_ids:
        all_ids.append(leg_id)
    image = data.pop("photo_url", "")
    if image:
        leg_obj["image"] = image
    url = data.pop("url", "")
    if url:
        leg_obj["links"] = [{"url": url}]
    leg_obj["sources"] = data.pop("sources")
    # keep all legacy ids so old references can still be resolved
    leg_obj["other_identifiers"] = [{
        "identifier": id_,
        "scheme": "legacy_openstates"
    } for id_ in all_ids]

    # anything left in data means an unhandled field — fail loudly
    if data:
        print(data)
        raise Exception()

    output_dir = get_data_dir(state)
    dump_obj(leg_obj, output_dir=os.path.join(output_dir, "retired"))