def raw_elements_to_cdf(
    session,
    juris: jm.Jurisdiction,
    mu: jm.Munger,
    raw: pd.DataFrame,
    count_cols: List[str],
    err: dict,
    constants: dict,
) -> dict:
    """Load data from <raw> into the database.

    Munges/melts <raw> per the munger <mu>, resolves internal ids for all
    elements (Contest, ReportingUnit, CountItemType, Selection, ...), then
    inserts the resulting rows into the VoteCount table.

    Returns the error dictionary <err>, augmented with any errors or
    warnings encountered. Fatal errors cause an early return.
    """
    working = raw.copy()

    try:
        working, new_err = munge_and_melt(mu, working, count_cols, err)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception during munge_and_melt: {exc}",
        )
        return err

    # enter elements from sources outside raw data, including creating id column(s)
    for k in constants.keys():
        working = add_constant_column(working, k, constants[k])

    # add Contest_Id (unless it was passed in ids)
    if "Contest_Id" not in working.columns:
        try:
            working, err = add_contest_id(working, juris, err, session)
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Unexpected exception while adding Contest_Id: {exc}",
            )
            return err
        if ui.fatal_error(err):
            return err

    # get ids for remaining info sourced from rows and columns (except Selection_Id)
    element_list = [
        t
        for t in mu.cdf_elements.index
        if (
            t[-7:] != "Contest"
            and (t[-9:] != "Selection")
            and f"{t}_Id" not in constants.keys()
        )
    ]
    for t in element_list:
        try:
            # capture id from db in new column and erase any now-redundant cols
            df = pd.read_sql_table(t, session.bind)
            name_field = db.get_name_field(t)
            # set drop_unmatched = True for fields necessary to BallotMeasure rows,
            # drop_unmatched = False otherwise to prevent losing BallotMeasureContests for BM-inessential fields
            if t == "ReportingUnit" or t == "CountItemType":
                drop = True
            else:
                drop = False
            if t == "CountItemType":
                # munge raw to internal CountItemType
                r_i = pd.read_csv(
                    os.path.join(juris.path_to_juris_dir, "dictionary.txt"),
                    sep="\t",
                )
                r_i = r_i[r_i.cdf_element == "CountItemType"]
                recognized = r_i.raw_identifier_value.unique()
                matched = working.CountItemType_raw.isin(recognized)
                if not matched.all():
                    unmatched = "\n".join(
                        (working[~matched]["CountItemType_raw"]).unique()
                    )
                    # BUGFIX: capture the updated error dict; previously the
                    # return value was discarded, so this warning was lost
                    err = ui.add_new_error(
                        err,
                        "warn-jurisdiction",
                        juris.short_name,
                        f"Some unmatched CountItemTypes:\n{unmatched}",
                    )
                working = working.merge(
                    r_i,
                    how="left",
                    left_on="CountItemType_raw",
                    right_on="raw_identifier_value",
                ).rename(columns={"cdf_internal_name": "CountItemType"})

                # join CountItemType_Id and OtherCountItemType
                cit = pd.read_sql_table("CountItemType", session.bind)
                working = enum_col_to_id_othertext(working, "CountItemType", cit)
                working, err_df = clean_ids(working, ["CountItemType_Id"])
                working = clean_strings(working, ["OtherCountItemType"])
                working = working.drop(
                    ["raw_identifier_value", "cdf_element", "CountItemType_raw"],
                    axis=1,
                )
            else:
                none_or_unknown_id = db.name_to_id(session, t, "none or unknown")
                working, new_err = replace_raw_with_internal_ids(
                    working,
                    juris,
                    df,
                    t,
                    name_field,
                    err,
                    drop_unmatched=drop,
                    unmatched_id=none_or_unknown_id,
                )
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return err
                working.drop(t, axis=1, inplace=True)
        except KeyError as exc:
            # non-fatal: note the problem and continue with the next element
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"KeyError ({exc}) while adding internal ids for {t}.",
            )
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Exception ({exc}) while adding internal ids for {t}.",
            )
            return err

    # add Selection_Id (combines info from BallotMeasureSelection and CandidateContestSelection)
    try:
        working, err = add_selection_id(working, session.bind, juris, err)
        working, err_df = clean_ids(working, ["Selection_Id"])
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception while adding Selection_Id:\n{exc}",
        )
        return err
    if working.empty:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris.short_name,
            "No contests found, or no selections found for contests.",
        )
        return err

    # restrict to just the VoteCount columns (so that groupby.sum will work)
    vc_cols = [
        "Count",
        "CountItemType_Id",
        "OtherCountItemType",
        "ReportingUnit_Id",
        "Contest_Id",
        "Selection_Id",
        "Election_Id",
        "_datafile_Id",
    ]
    working = working[vc_cols]
    working, e = clean_count_cols(working, ["Count"])

    # TODO there are edge cases where this might include dupes
    # that should be omitted. E.g., if data mistakenly read twice
    # Sum any rows that were disambiguated (otherwise dupes will be dropped
    # when VoteCount is filled)
    group_cols = [c for c in working.columns if c != "Count"]
    working = working.groupby(group_cols).sum().reset_index()

    # TODO clean before inserting? All should be already clean, no?

    # Fill VoteCount
    try:
        e = db.insert_to_cdf_db(session.bind, working, "VoteCount")
        if e:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"database insertion error {e}",
            )
            return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Error filling VoteCount:\n{exc}",
        )
        return err
    # BUGFIX: the success path previously fell off the end and implicitly
    # returned None, contradicting the declared -> dict and every other exit
    return err
def load_juris_dframe_into_cdf(session, element, juris_path, error) -> dict:
    """Load the jurisdiction file <element>.txt into the corresponding
    cdf table in the database.

    Resolves enumerations (plain text -> id/othertext) and foreign keys
    before inserting. Returns the updated error dictionary.
    """
    project_root = Path(__file__).parents[1].absolute()
    cdf_schema_def_dir = os.path.join(
        project_root,
        "CDF_schema_def_info",
    )
    element_fpath = os.path.join(juris_path, f"{element}.txt")
    if not os.path.exists(element_fpath):
        error = ui.add_new_error(
            error,
            "jurisdiction",
            Path(juris_path).name,
            f"File {element}.txt not found",
        )
        return error
    df = pd.read_csv(
        element_fpath, sep="\t", encoding="iso-8859-1", quoting=csv.QUOTE_MINIMAL
    ).fillna("none or unknown")
    # TODO check that df has the right format

    # add 'none or unknown' record
    df = add_none_or_unknown(df)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        error = ui.add_new_error(
            error,
            "warn-jurisdiction",
            Path(juris_path).name,
            f"Duplicates were found in {element}.txt",
        )

    # replace plain text enumerations from file system with id/othertext from db
    enum_file = os.path.join(
        cdf_schema_def_dir, "elements", element, "enumerations.txt"
    )
    if os.path.isfile(enum_file):  # (if not, there are no enums for this element)
        enums = pd.read_csv(enum_file, sep="\t")
        # get all relevant enumeration tables
        for e in enums["enumeration"]:  # e.g., e = "ReportingUnitType"
            cdf_e = pd.read_sql_table(e, session.bind)
            # for every instance of the enumeration in the current table, add id and othertype columns to the dataframe
            if e in df.columns:
                df = m.enum_col_to_id_othertext(df, e, cdf_e)

    # get Ids for any foreign key (or similar) in the table, e.g., Party_Id, etc.
    fk_file_path = os.path.join(
        cdf_schema_def_dir, "elements", element, "foreign_keys.txt"
    )
    if os.path.isfile(fk_file_path):
        foreign_keys = pd.read_csv(fk_file_path, sep="\t", index_col="fieldname")
        for fn in foreign_keys.index:
            # NB: juris elements have no multiple referents (as joins may)
            ref = foreign_keys.loc[fn, "refers_to"]
            col_map = {fn[:-3]: db.get_name_field(ref)}
            df = db.append_id_to_dframe(
                session.bind, df, ref, col_map=col_map
            ).rename(columns={f"{ref}_Id": fn})

    # commit info in df to corresponding cdf table to db
    err_string = db.insert_to_cdf_db(session.bind, df, element)
    if err_string:
        # BUGFIX: report the actual insertion error. The old message
        # interpolated the stale enum-loop variable `e`, which raised
        # NameError whenever <element> had no enumerations.txt, and showed
        # the wrong value otherwise.
        error = ui.add_new_error(
            error,
            "system",
            "juris_and_munger.load_juris_dframe_into_cdf",
            f"Error loading {element} to database: {err_string}",
        )
    return error
def add_selection_id(
    df: pd.DataFrame, engine, jurisdiction: jm.Jurisdiction, err: dict
) -> (pd.DataFrame, dict):
    """Assumes <df> has contest_type, BallotMeasureSelection_raw, Candidate_Id column.
    Loads CandidateSelection table. Appends & fills Selection_Id columns"""
    # split df by contest type
    w = dict()
    for ct in ["BallotMeasure", "Candidate"]:
        w[ct] = df[df.contest_type == ct].copy()

    # append BallotMeasureSelection_Id as Selection_Id to w['BallotMeasure']
    if not w["BallotMeasure"].empty:
        bms = pd.read_sql_table("BallotMeasureSelection", engine)
        w["BallotMeasure"], err = replace_raw_with_internal_ids(
            w["BallotMeasure"],
            jurisdiction,
            bms,
            "BallotMeasureSelection",
            "Name",
            err,
            drop_unmatched=True,
            drop_all_ok=True,
        )
        w["BallotMeasure"].rename(
            columns={"BallotMeasureSelection_Id": "Selection_Id"}, inplace=True
        )
        w["BallotMeasure"].drop(
            ["BallotMeasureSelection", "Candidate_Id"], axis=1, inplace=True
        )

    # prepare to append CandidateSelection_Id as Selection_Id
    if not w["Candidate"].empty:
        c_df = w["Candidate"][["Candidate_Id", "Party_Id"]].drop_duplicates()
        # clean Ids and drop any that were null (i.e., 0 after cleaning)
        c_df, err_df = clean_ids(c_df, ["Candidate_Id", "Party_Id"])
        c_df = c_df[c_df.Candidate_Id != 0]

        # pull any existing Ids into a new CandidateSelection_Id column
        col_map = {c: c for c in ["Party_Id", "Candidate_Id"]}
        c_df = db.append_id_to_dframe(
            engine, c_df, "CandidateSelection", col_map=col_map
        )

        # find unmatched records
        # BUGFIX: use isnull/isin rather than chained elementwise `==`
        # comparisons, which raised "FutureWarning: elementwise comparison
        # failed" when CandidateSelection_Id could not be compared to ""
        c_df_unmatched = c_df[
            c_df.CandidateSelection_Id.isnull()
            | c_df.CandidateSelection_Id.isin([0, ""])
        ].copy()

        if not c_df_unmatched.empty:
            # Load CandidateSelections to Selection table (for unmatched)
            id_list = db.add_records_to_selection_table(
                engine, c_df_unmatched.shape[0]
            )

            # Load unmatched records into CandidateSelection table
            c_df_unmatched["Id"] = pd.Series(id_list, index=c_df_unmatched.index)
            db.insert_to_cdf_db(engine, c_df_unmatched, "CandidateSelection")

            # update CandidateSelection_Id column for previously unmatched, merging on Candidate_Id and Party_Id
            c_df.loc[c_df_unmatched.index, "CandidateSelection_Id"] = c_df_unmatched[
                "Id"
            ]
        # recast Candidate_Id and Party_Id to int in w['Candidate'];
        # Note that neither should have nulls, but rather the 'none or unknown' Id
        # NB: c_df had this recasting done in the append_id_to_dframe routine
        w["Candidate"], err_df = clean_ids(w["Candidate"], ["Candidate_Id", "Party_Id"])
        if not err_df.empty:
            # show all columns of dataframe with problem in Party_Id or Candidate_Id
            pd.set_option("max_columns", None)
            err = ui.add_new_error(
                err,
                "system",
                "munge.add_selection_id",
                f"Problem with Candidate_Id or Party_Id in some rows:\n{err_df}",
            )
            pd.reset_option("max_columns")

        # append CandidateSelection_Id to w['Candidate']
        w["Candidate"] = w["Candidate"].merge(
            c_df, how="left", on=["Candidate_Id", "Party_Id"]
        )

        # rename to Selection_Id
        w["Candidate"] = w["Candidate"].rename(
            columns={"CandidateSelection_Id": "Selection_Id"}
        )
        # and drop extraneous
        to_drop = [
            x
            for x in w["Candidate"].columns
            if x in ["Candidate_Id", "BallotMeasureSelection_raw"]
        ]
        w["Candidate"].drop(to_drop, axis=1, inplace=True)

    working = pd.concat([w["BallotMeasure"], w["Candidate"]])
    return working, err
def load_contests(self, engine, contest_type: str, error: dict) -> dict:
    """Read <contest_type>Contest.txt from the jurisdiction folder and load
    its contents into the Contest and <contest_type>Contest db tables.
    Returns the updated error dictionary.
    """
    # read <contest_type>Contests from jurisdiction folder
    element_fpath = os.path.join(
        self.path_to_juris_dir, f"{contest_type}Contest.txt"
    )
    if not os.path.exists(element_fpath):
        error[f"{contest_type}Contest.txt"] = "file not found"
        return error
    df = pd.read_csv(
        element_fpath, sep="\t", encoding="iso-8859-1", quoting=csv.QUOTE_MINIMAL
    ).fillna("none or unknown")

    # add contest_type column
    df = m.add_constant_column(df, "contest_type", contest_type)

    # add 'none or unknown' record
    df = add_none_or_unknown(df, contest_type=contest_type)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        print(
            f"WARNING: duplicates removed from dataframe, may indicate a problem.\n"
        )
        if f"{contest_type}Contest" not in error:
            error[f"{contest_type}Contest"] = {}
        error[f"{contest_type}Contest"]["found_duplicates"] = True

    # insert into in Contest table
    e = db.insert_to_cdf_db(engine, df[["Name", "contest_type"]], "Contest")
    # BUGFIX: record any error from the Contest insert; previously the
    # return value `e` was silently discarded
    if e:
        if f"{contest_type}Contest" not in error:
            error[f"{contest_type}Contest"] = {}
        error[f"{contest_type}Contest"]["database"] = e

    # append Contest_Id
    col_map = {"Name": "Name", "contest_type": "contest_type"}
    df = db.append_id_to_dframe(engine, df, "Contest", col_map=col_map)

    if contest_type == "BallotMeasure":
        # append ElectionDistrict_Id, Election_Id
        for fk, ref in [
            ("ElectionDistrict", "ReportingUnit"),
            ("Election", "Election"),
        ]:
            col_map = {fk: "Name"}
            df = (
                db.append_id_to_dframe(engine, df, ref, col_map=col_map)
                .rename(columns={f"{ref}_Id": f"{fk}_Id"})
                .drop(fk, axis=1)
            )
    else:
        # append Office_Id, PrimaryParty_Id
        for fk, ref in [("Office", "Office"), ("PrimaryParty", "Party")]:
            col_map = {fk: "Name"}
            df = db.append_id_to_dframe(engine, df, ref, col_map=col_map).rename(
                columns={f"{ref}_Id": f"{fk}_Id"}
            )

    # create entries in <contest_type>Contest table
    # commit info in df to <contest_type>Contest table to db
    err = db.insert_to_cdf_db(
        engine, df.rename(columns={"Contest_Id": "Id"}), f"{contest_type}Contest"
    )
    if err:
        if f"{contest_type}Contest" not in error:
            error[f"{contest_type}Contest"] = {}
        error[f"{contest_type}Contest"]["database"] = err
    return error