def check_results_munger_compatibility(
    mu: Munger, df: pd.DataFrame, file_name, error: dict
) -> dict:
    # check that count columns exist
    missing = [i for i in mu.options["count_columns"] if i >= df.shape[1]]
    if missing:
        error = ui.add_new_error(
            error,
            "munger",
            mu.name,
            f"Only {df.shape[1]} columns read from results file {file_name}. "
            f"Check file_type in format.config",
        )
    else:
        # check that count columns are numeric
        for i in mu.options["count_columns"]:
            if not is_numeric_dtype(df.iloc[:, i]):
                try:
                    df.iloc[:, i] = df.iloc[:, i].astype(int)
                except ValueError as ve:
                    error = ui.add_new_error(
                        error,
                        "munger",
                        mu.name,
                        f"Column {i} ({df.columns[i]}) cannot be parsed as an integer.\n{ve}",
                    )
    return error

def cast_cols_as_int(
    df: pd.DataFrame,
    col_list: list,
    mode="name",
    error_msg="",
    munger_name="unknown",
) -> (pd.DataFrame, dict):
    """Recast columns as integer where possible, leaving columns with
    text entries as non-numeric."""
    err = None
    if mode == "index":
        num_columns = [df.columns[idx] for idx in col_list]
    elif mode == "name":
        num_columns = [c for c in df.columns if c in col_list]
    else:
        err = ui.add_new_error(
            err,
            "system",
            "munge.cast_cols_as_int",
            f"Mode {mode} not recognized",
        )
        return df, err
    for c in num_columns:
        try:
            df[c] = df[c].astype("int64", errors="raise")
        except ValueError as e:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"{error_msg}\nColumn {c} cannot be cast as integer:\n{e}",
            )
    return df, err

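# Minimal usage sketch for cast_cols_as_int (illustrative, not part of the
# original module). The "Votes" column casts cleanly to int64; "Notes" raises
# a ValueError, so a warn-munger entry is recorded and the column stays text.
# The column names and munger name are hypothetical.
#
#   df = pd.DataFrame({"Votes": ["1", "2"], "Notes": ["a", "b"]})
#   df, err = cast_cols_as_int(
#       df, ["Votes", "Notes"], mode="name", munger_name="example_munger"
#   )
#   # df["Votes"].dtype == "int64"; err holds a warning about "Notes"
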
def get_aux_data(self, aux_data_path, err) -> (dict, dict):
    """Creates dictionary of dataframes, one for each auxiliary datafile.
    DataFrames returned are (multi-)indexed by the primary key(s)."""
    aux_data_dict = {}  # will hold dataframe for each abbreviated file name

    field_list = list(set([x[0] for x in self.auxiliary_fields()]))
    for abbrev in field_list:
        # get munger for the auxiliary file
        munger_path = os.path.join(self.path_to_munger_dir, abbrev)
        aux_mu, mu_err = check_and_init_munger(munger_path)
        if ui.fatal_error(mu_err):
            err = ui.consolidate_errors([err, mu_err])
            return dict(), err

        # find file in aux_data_path whose name contains the string <abbrev>
        aux_filename_list = [x for x in os.listdir(aux_data_path) if abbrev in x]
        if len(aux_filename_list) == 0:
            # TODO check this error
            err = ui.add_new_error(
                err,
                "file",
                aux_data_path,
                f"No file found with name containing {abbrev}",
            )
        elif len(aux_filename_list) > 1:
            # TODO check this error
            err = ui.add_new_error(
                err,
                "file",
                aux_data_path,
                f"Too many files found with name containing {abbrev}",
            )
        else:
            aux_path = os.path.join(aux_data_path, aux_filename_list[0])

            # read and clean the auxiliary data file
            df, err = ui.read_single_datafile(aux_mu, aux_path, err)

            # cast primary key(s) as int if possible, and set as (multi-)index
            primary_keys = self.aux_meta.loc[abbrev, "primary_key"].split(",")
            df, new_err = m.cast_cols_as_int(
                df,
                primary_keys,
                error_msg=f"In dataframe for {abbrev}",
                munger_name=aux_mu.name,
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return aux_data_dict, err

            df.set_index(primary_keys, inplace=True)
            aux_data_dict[abbrev] = df

    return aux_data_dict, err

def check_munger_files(munger_path: str) -> dict:
    """Check that the munger files are complete and consistent with one another.
    Assumes the munger directory exists. Assumes dictionary.txt is in the
    template directory. <munger_path> is the path to the directory of the
    particular munger.
    """
    err = None
    project_root = Path(__file__).parents[1].absolute()
    munger_name = Path(munger_path).name

    # check whether directory exists
    if not os.path.isdir(munger_path):
        err = ui.add_new_error(
            err, "munger", munger_name, f"Directory does not exist: {munger_path}"
        )
        return err

    # check whether all files exist
    templates = os.path.join(project_root, "juris_and_munger", "munger_templates")
    template_with_extension_list = os.listdir(templates)
    for munger_file in template_with_extension_list:
        # TODO create optional template for aux_meta.txt
        cf_path = os.path.join(munger_path, munger_file)
        # if the file does not exist in the munger directory, throw an error;
        # if it exists, check its format against the template and then its contents
        file_exists = os.path.isfile(cf_path)
        if file_exists:
            err = check_munger_file_format(munger_path, munger_file, templates, err)

            # if no errors found so far, check contents
            if not ui.fatal_error(
                err, error_type_list=["munger"], name_key_list=[munger_file]
            ):
                err = check_munger_file_contents(munger_path, munger_file, err)
        else:
            err = ui.add_new_error(err, "munger", munger_name, "File does not exist")

    # if the munger requires auxiliary data
    if os.path.isfile(os.path.join(munger_path, "aux_meta.txt")):
        # TODO check that each abbrev in aux_meta.txt has an associated sub_munger
        # check sub-mungers (in sub-directories of munger)
        sub_mungers = os.listdir(munger_path)
        for f in sub_mungers:
            sub_path = os.path.join(munger_path, f)
            if os.path.isdir(sub_path):
                new_err = check_munger_files(sub_path)
                if new_err:
                    err = ui.consolidate_errors([err, new_err])
    return err

def write_element(
    juris_path: str, element: str, df: pd.DataFrame, file_name=None
) -> dict:
    """<juris_path> is the path to the jurisdiction directory. Info is taken
    from the <element>.txt file in that directory; <element>.txt is
    overwritten with the info in <df>."""
    err = None
    if not file_name:
        file_name = f"{element}.txt"
    dupes_df, deduped = ui.find_dupes(df)
    if element == "dictionary":
        deduped = remove_empty_lines(deduped, element)
    try:
        deduped.drop_duplicates().fillna("").to_csv(
            os.path.join(juris_path, file_name),
            index=False,
            sep="\t",
        )
    except Exception as e:
        err = ui.add_new_error(
            err,
            "system",
            "preparation.write_element",
            f"Unexpected exception writing to file: {e}",
        )
    return err

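# Usage sketch (illustrative only; the path and the Party.txt column names
# are hypothetical): overwrite Party.txt in a jurisdiction directory with a
# deduplicated, tab-separated version of <df>.
#
#   new_parties = pd.DataFrame(
#       {"Name": ["Democratic Party", "Republican Party"]}
#   )
#   err = write_element("/path/to/jurisdiction", "Party", new_parties)
#   # err is None on success; writes /path/to/jurisdiction/Party.txt
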
def munge_clean(
    raw: pd.DataFrame, munger: jm.Munger, count_columns_by_name: List[str]
) -> (pd.DataFrame, dict):
    """Drop unnecessary columns. Append the '_SOURCE' suffix to raw column
    names to avoid conflicts."""
    err = None
    working = raw.copy()
    working, count_columns_by_name, e = clean_column_names(
        working, count_cols=count_columns_by_name
    )
    try:
        # define columns named in munger formulas (both plain from 'row'-sourced
        # info and 'variable_j' from column-sourced info)
        munger_formula_row_sourced = [
            x for x in munger.field_list if x in working.columns
        ]
        munger_formula_column_sourced = [
            f"variable_{j}"
            for j in munger.field_list
            if f"variable_{j}" in working.columns
        ]

        # keep columns named in munger formulas; keep count columns; drop all else
        working = working[
            munger_formula_row_sourced
            + munger_formula_column_sourced
            + count_columns_by_name
        ]

        # add suffix '_SOURCE' to certain columns to avoid any conflict with
        # db table names (since no db table name ends with _SOURCE)
        renamer = {x: f"{x}_SOURCE" for x in munger_formula_row_sourced}
        working.rename(columns=renamer, inplace=True)
    except Exception as exc:
        err = ui.add_new_error(
            err, "system", "munge.munge_clean", f"Unspecified error: {exc}"
        )
    return working, err

def read_xml(
    f_path: str,
    munger: jm.Munger,
    err: Optional[Dict],
) -> (pd.DataFrame, Optional[Dict]):
    """Create dataframe from the xml file, with column names matching the
    fields in the raw_identifier formulas. Skip nodes whose tags are
    unrecognized."""
    # read data from file
    try:
        tree = et.parse(f_path)
    except FileNotFoundError:
        err = ui.add_new_error(err, "file", Path(f_path).name, "File not found")
        return pd.DataFrame(), err

    # identify tags with counts or other raw data (the info we want)
    # and list data to be pulled from each tag
    # TODO tech debt: simplify
    fields = set(munger.options["count_columns_by_name"]).union(munger.field_list)
    tags = {f.split(".")[0] for f in fields}
    # if munger has nesting tags in format.config
    if munger.options["nesting_tags"] is not None:
        tags.update(munger.options["nesting_tags"])
    attributes = {
        t: [x.split(".")[1] for x in fields if x.split(".")[0] == t] for t in tags
    }

    try:
        root = tree.getroot()
        results_list = results_below(root, tags, attributes)
        raw_results = pd.DataFrame(results_list)
        for c in munger.options["count_columns_by_name"]:
            raw_results[c] = pd.to_numeric(raw_results[c], errors="coerce")
        raw_results, err_df = m.clean_count_cols(
            raw_results,
            munger.options["count_columns_by_name"],
        )
        if not err_df.empty:
            err = ui.add_err_df(err, err_df, munger, f_path)
    except Exception as e:
        err = ui.add_new_error(err, "munger", munger.name, f"Error reading xml: {e}")
        raw_results = pd.DataFrame()
    return raw_results, err

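# Illustration (not part of the original module) of how <tags> and
# <attributes> are derived above. Munger fields are written "tag.attribute";
# the field names below are hypothetical examples.
#
#   fields = {"Contest.name", "Candidate.name", "VoteCount.votes"}
#   tags = {f.split(".")[0] for f in fields}
#   # tags == {"Contest", "Candidate", "VoteCount"}
#   attributes = {
#       t: [x.split(".")[1] for x in fields if x.split(".")[0] == t] for t in tags
#   }
#   # attributes == {"Contest": ["name"], "Candidate": ["name"],
#   #                "VoteCount": ["votes"]}
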
def add_munged_column(
    raw: pd.DataFrame,
    munger: jm.Munger,
    element: str,
    err: Optional[dict],
    mode: str = "row",
    inplace: bool = True,
) -> (pd.DataFrame, dict):
    """Alters dataframe <raw>, adding or redefining the <element>_raw column
    via the munger's raw_identifier formula.
    Assumes "_SOURCE" has been appended to all columns of <raw>.
    Does not alter the row count."""
    if raw.empty:
        return raw, err
    if inplace:
        working = raw
    else:
        working = raw.copy()

    try:
        formula = munger.cdf_elements.loc[element, "raw_identifier_formula"]
        if mode == "row":
            for field in munger.field_list:
                formula = formula.replace(f"<{field}>", f"<{field}_SOURCE>")
        elif mode == "column":
            for i in range(munger.options["header_row_count"]):
                formula = formula.replace(f"<{i}>", f"<variable_{i}>")

        working, new_err = add_column_from_formula(
            working, formula, f"{element}_raw", err, munger.name
        )
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return working, err

        # correct any disambiguated names back to the original
        if element in munger.alt.keys():
            working.replace({f"{element}_raw": munger.alt[element]}, inplace=True)
    except Exception as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error interpreting formula for {element} in cdf_elements.txt. {e}",
        )
        return working, err

    # compress whitespace for <element>_raw
    working.loc[:, f"{element}_raw"] = working[f"{element}_raw"].apply(
        compress_whitespace
    )
    return working, err

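# Sketch (illustrative only) of the "row"-mode formula rewriting above: a
# formula referencing raw columns is rewritten to match the '_SOURCE'-suffixed
# column names produced by munge_clean. "County" and "Precinct" are
# hypothetical field names.
#
#   formula = "<County>;<Precinct>"
#   for field in ["County", "Precinct"]:  # stand-in for munger.field_list
#       formula = formula.replace(f"<{field}>", f"<{field}_SOURCE>")
#   # formula == "<County_SOURCE>;<Precinct_SOURCE>"
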
def read_nested_json(
    f_path: str, munger: jm.Munger, err: Optional[Dict]
) -> (pd.DataFrame, Optional[Dict]):
    """Create dataframe from a nested json file by traversing the json
    dictionary recursively, similar to the xml case."""
    # read data from file
    try:
        with open(f_path, "r") as f:
            j = json.load(f)
    except FileNotFoundError:
        traceback.print_exc()
        err = ui.add_new_error(err, "file", Path(f_path).name, "File not found")
        return pd.DataFrame(), err

    # identify keys for counts and other raw data (attributes) we want
    count_keys = set(munger.options["count_columns_by_name"])
    attribute_keys = set(munger.field_list)
    try:
        current_values = {}
        results_list = json_results_below(
            j, count_keys, attribute_keys, current_values
        )
        raw_results = pd.DataFrame(results_list)
        for c in munger.options["count_columns_by_name"]:
            raw_results[c] = pd.to_numeric(raw_results[c], errors="coerce")
        raw_results, err_df = m.clean_count_cols(
            raw_results,
            munger.options["count_columns_by_name"],
        )
        if not err_df.empty:
            err = ui.add_err_df(err, err_df, munger, f_path)
    except Exception as e:
        traceback.print_exc()
        err = ui.add_new_error(
            err, "munger", munger.name, f"Error reading nested json: {e}"
        )
        raw_results = pd.DataFrame()
    return raw_results, err

def check_munger_file_format(
    munger_path: str, munger_file: str, templates: str, err: dict
) -> dict:
    if munger_file[-4:] == ".txt":
        cf_df = pd.read_csv(
            os.path.join(munger_path, munger_file), sep="\t", encoding="iso-8859-1"
        )
        temp = pd.read_csv(
            os.path.join(templates, munger_file), sep="\t", encoding="iso-8859-1"
        )

        # check column names are correct
        if set(cf_df.columns) != set(temp.columns):
            err = ui.add_new_error(
                err,
                "munger",
                munger_path,
                f"Columns in {munger_file} do not match template:\n"
                f"Columns of {munger_file}: {cf_df.columns}\n"
                f"Columns of template: {temp.columns}",
            )
    elif munger_file == "format.config":
        d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, munger_file),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )
    else:
        err = ui.add_new_error(
            err,
            "munger",
            munger_path,
            f"Unrecognized file in munger: {munger_file}",
        )
    return err

def read_alternate_munger(
    file_type: str, f_path: str, munger: jm.Munger, err: Optional[dict]
) -> (pd.DataFrame, dict):
    if file_type in ["concatenated-blocks"]:
        raw_results, err = read_concatenated_blocks(f_path, munger, err)
    elif file_type in ["xls-multi"]:
        raw_results, err = read_multi_sheet_excel(f_path, munger, err)
    elif file_type in ["xml"]:
        raw_results, err = read_xml(f_path, munger, err)
    elif file_type in ["json-nested"]:
        raw_results, err = read_nested_json(f_path, munger, err)
    else:
        err = ui.add_new_error(
            err, "munger", munger.name, f"file type not recognized: {file_type}"
        )
        raw_results = pd.DataFrame()

    # clean the raw results
    raw_results, err_df = m.clean_count_cols(raw_results, ["count"])
    if not err_df.empty:
        err = ui.add_new_error(
            err, "warn-file", Path(f_path).name, "Some counts not read, set to 0"
        )
    str_cols = [c for c in raw_results.columns if c != "count"]
    raw_results = m.clean_strings(raw_results, str_cols)
    return raw_results, err

def get_ids_for_foreign_keys(
    session, df1, element, foreign_key, refs, load_refs, error
):
    """TODO. <foreign_key> is the name of the foreign key column
    (ending in '_Id')."""
    df = df1.copy()
    # append the Id corresponding to <foreign_key> from the db
    foreign_elt = f"{foreign_key[:-3]}"
    interim = f"{foreign_elt}_Name"

    target_list = []
    for r in refs:
        ref_name_field = db.get_name_field(r)
        r_target = pd.read_sql_table(r, session.bind)[["Id", ref_name_field]]
        r_target.rename(
            columns={"Id": foreign_key, ref_name_field: interim}, inplace=True
        )
        target_list.append(r_target)

    target = pd.concat(target_list)
    df = df.merge(target, how="left", left_on=foreign_elt, right_on=interim)

    # TODO might have to check for '' or 0 as well as nulls
    missing = df[(df[foreign_elt].notnull()) & (df[interim].isnull())]
    if missing.empty:
        df = df.drop([interim], axis=1)
    else:
        if load_refs:
            # raise so the caller can load the missing referents and retry
            raise ForeignKeyException(
                f"For some {element} records, {foreign_elt} was not found"
            )
        else:
            if element not in error:
                error = ui.add_new_error(
                    error,
                    "system",
                    "juris_and_munger.get_ids_for_foreign_keys",
                    f"For some {element} records, {foreign_elt} was not found",
                )
    return df

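# Minimal illustration (hypothetical data) of the merge above: the referent
# table contributes its Id under the foreign-key name, keyed on its name field.
#
#   target = pd.DataFrame(
#       {"Office_Id": [1, 2], "Office_Name": ["Governor", "Senator"]}
#   )
#   df = pd.DataFrame({"Office": ["Governor", "Mayor"]})
#   merged = df.merge(target, how="left", left_on="Office", right_on="Office_Name")
#   # "Governor" picks up Office_Id 1; "Mayor" has no match, so Office_Name is
#   # null for that row and it would appear in <missing>.
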
def check_munger_file_format(
    munger_path: str, munger_file: str, templates: str, err: dict
) -> dict:
    if munger_file == "cdf_elements.txt":
        pass  # nothing to check now that entries may vary
    elif munger_file == "format.config":
        d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, munger_file),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )
    else:
        err = ui.add_new_error(
            err,
            "munger",
            munger_path,
            f"Unrecognized file in munger: {munger_file}",
        )
    return err

def ensure_juris_files(juris_path, ignore_empty=False) -> dict:
    """Check that the jurisdiction files are complete and consistent with one
    another. Check for extraneous files in the jurisdiction directory.
    Assumes the jurisdiction directory exists. Assumes dictionary.txt is in
    the template directory."""
    # package possible errors from this function into a dictionary and return them
    err = None
    juris_name = Path(juris_path).name
    project_root = Path(__file__).parents[1].absolute()

    templates_dir = os.path.join(
        project_root, "juris_and_munger", "jurisdiction_templates"
    )
    # notify user of any extraneous files
    extraneous = [
        f
        for f in os.listdir(juris_path)
        if f not in os.listdir(templates_dir) and f[0] != "."
    ]
    if extraneous:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris_name,
            f"extraneous_files_in_juris_directory {extraneous}",
        )

    template_list = [x[:-4] for x in os.listdir(templates_dir)]

    # reorder template_list, so that first things are created first
    ordered_list = ["dictionary", "ReportingUnit", "Office", "CandidateContest"]
    template_list = ordered_list + [x for x in template_list if x not in ordered_list]

    # ensure all necessary files exist
    for juris_file in template_list:
        cf_path = os.path.join(juris_path, f"{juris_file}.txt")
        created = False
        # if the file does not already exist in the jurisdiction directory,
        # create it from the template and invite the user to fill it in
        try:
            temp = pd.read_csv(
                os.path.join(templates_dir, f"{juris_file}.txt"),
                sep="\t",
                encoding="iso-8859-1",
            )
        except pd.errors.EmptyDataError:
            if not ignore_empty:
                err = ui.add_new_error(
                    err,
                    "system",
                    "juris_and_munger.ensure_juris_files",
                    f"Template file {juris_file}.txt has no contents",
                )
            temp = pd.DataFrame()

        # if file does not exist, create it from the template
        if not os.path.isfile(cf_path):
            temp.to_csv(cf_path, sep="\t", index=False)
            created = True

        # if file exists, check format against template
        if not created:
            cf_df = pd.read_csv(
                os.path.join(juris_path, f"{juris_file}.txt"),
                sep="\t",
                encoding="iso-8859-1",
                quoting=csv.QUOTE_MINIMAL,
            )
            if set(cf_df.columns) != set(temp.columns):
                cols = "\t".join(temp.columns.to_list())
                err = ui.add_new_error(
                    err,
                    "jurisdiction",
                    juris_name,
                    f"Columns of {juris_file}.txt need to be (tab-separated):\n {cols}\n",
                )

            # dedupe the file (including dictionary)
            dedupe(cf_path)

            # check for problematic null entries
            null_columns = check_nulls(juris_file, cf_path, project_root)
            if null_columns:
                err = ui.add_new_error(
                    err,
                    "jurisdiction",
                    juris_name,
                    f"Null entries in {juris_file} in columns {null_columns}",
                )

    # check dependencies
    for juris_file in [x for x in template_list if x != "remark" and x != "dictionary"]:
        d, new_err = check_dependencies(juris_path, juris_file)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
    return err

def check_munger_file_contents(munger_path, munger_file, err):
    """Check whether munger files are internally consistent."""
    munger_name = Path(munger_path).name
    if munger_file == "cdf_elements.txt":
        # read cdf_elements from file
        cdf_elements = pd.read_csv(
            os.path.join(munger_path, "cdf_elements.txt"),
            sep="\t",
            encoding="iso-8859-1",
        ).fillna("")

        # every source in cdf_elements.txt should be either 'row' or 'column'
        bad_source = [x for x in cdf_elements.source if x not in ["row", "column"]]
        if bad_source:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"Source(s) in cdf_elements.txt not recognized: {bad_source}",
            )

        # formulas have good syntax
        bad_formula = [
            x
            for x in cdf_elements.raw_identifier_formula.unique()
            if not m.good_syntax(x)
        ]
        if bad_formula:
            f_str = ",".join(bad_formula)
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"At least one formula in cdf_elements.txt has bad syntax: {f_str}",
            )

        # for each column-source record in cdf_elements.txt, the contents of
        # the brackets must be numbers (referring to rows in the header)
        p_not_just_digits = re.compile(r"<.*\D.*>")
        p_catch_digits = re.compile(r"<(\d+)>")
        bad_column_formula = set()
        # TODO check: can this error out now?
        for i, r in cdf_elements[cdf_elements.source == "column"].iterrows():
            if p_not_just_digits.search(r["raw_identifier_formula"]):
                bad_column_formula.add(r["raw_identifier_formula"])
        if bad_column_formula:
            err = ui.add_new_error(
                err,
                "munger",
                munger_name,
                f"At least one column-source formula in cdf_elements.txt has bad syntax: "
                f"{bad_column_formula}",
            )
    elif munger_file == "format.config":
        format_d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, "format.config"),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )
        # stop and return error if fatal
        if ui.fatal_error(err):
            return err

        # warn if encoding is missing or not recognized
        if "encoding" not in format_d.keys():
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                "No encoding specified; iso-8859-1 will be used",
            )
        elif not format_d["encoding"] in ui.recognized_encodings:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                (
                    f"Encoding {format_d['encoding']} in format file is not recognized; "
                    f"iso-8859-1 will be used"
                ),
            )

        # check all parameters for flat files
        if format_d["file_type"] in ["txt", "csv", "xls"]:
            # either field_name_row is a number,
            # or field_names_if_no_field_name_row is a non-empty list
            if (not format_d["field_name_row"]) or (
                not format_d["field_name_row"].isnumeric()
            ):
                if (not format_d["field_names_if_no_field_name_row"]) or (
                    len(format_d["field_names_if_no_field_name_row"]) == 0
                ):
                    err = ui.add_new_error(
                        err,
                        "munger",
                        munger_name,
                        (
                            f"field_name_row is not an integer, "
                            f"but no field names are given in field_names_if_no_field_name_row."
                        ),
                    )

            # other entries in format.config are of correct type
            try:
                int(format_d["header_row_count"])
            except (TypeError, ValueError):
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f'header_row_count is not an integer: {format_d["header_row_count"]}',
                )

        # check all parameters for concatenated blocks (e.g., Georgia ExpressVote output)
        elif format_d["file_type"] in ["concatenated-blocks"]:
            for key in [
                "count_of_top_lines_to_skip",
                "last_header_column_count",
                "column_width",
            ]:
                try:
                    int(format_d[key])
                except (ValueError, TypeError):
                    err = ui.add_new_error(
                        err,
                        "munger",
                        munger_name,
                        f"{key} is not an integer: {format_d[key]}",
                    )
    else:
        err = ui.add_new_error(
            err,
            "system",
            "juris_and_munger.check_munger_file_contents",
            f"Munger template file not recognized: {munger_file}",
        )
    return err

def read_multi_sheet_excel(
    f_path: str,
    munger: jm.Munger,
    err: dict,
) -> (pd.DataFrame, dict):
    # get munger parameters
    sheets_to_skip = munger.options["sheets_to_skip"]
    count_of_top_lines_to_skip = munger.options["count_of_top_lines_to_skip"]
    constant_line_count = munger.options["constant_line_count"]
    constant_column_count = munger.options["constant_column_count"]
    header_row_count = munger.options["header_row_count"]
    columns_to_skip = munger.options["columns_to_skip"]

    try:
        df = pd.read_excel(f_path, sheet_name=None, header=None)
    except Exception as e:
        new_err = ui.add_new_error(
            err, "file", Path(f_path).name, f"Error reading file: {e}"
        )
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return pd.DataFrame(), err

    sheets_to_read = [k for k in df.keys() if k not in sheets_to_skip]

    raw_results = pd.DataFrame()
    for sh in sheets_to_read:
        try:
            data = df[sh].copy()

            # remove lines designated ignorable
            data.drop(data.index[:count_of_top_lines_to_skip], inplace=True)

            # remove any all-null rows
            data.dropna(how="all", inplace=True)

            # read constant_line info from first non-null entries of
            # constant-header rows, then drop those rows
            if constant_line_count > 0:
                constant_lines = (
                    data.iloc[:constant_line_count]
                    .fillna(method="bfill", axis=1)
                    .iloc[:, 0]
                )
                data.drop(data.index[:constant_line_count], inplace=True)

            # read constant_column info from first non-null entries of
            # constant columns, then drop those columns
            if constant_column_count > 0:
                constant_columns = (
                    data.T.iloc[:constant_column_count]
                    .fillna(method="bfill", axis=1)
                    .iloc[:, 0]
                )
                data.drop(data.columns[:constant_column_count], axis=1, inplace=True)

            # add multi-index for actual header rows
            header_variable_names = [f"header_{j}" for j in range(header_row_count)]
            col_multi_index = pd.MultiIndex.from_frame(
                data.iloc[range(header_row_count), :]
                .transpose()
                .fillna(method="ffill"),
                names=header_variable_names,
            )
            data.columns = col_multi_index

            # remove header rows from data
            data.drop(data.index[:header_row_count], inplace=True)

            # drop extraneous columns per munger, and columns without data
            data.drop(data.columns[columns_to_skip], axis=1, inplace=True)
            data.dropna(axis=1, how="all", inplace=True)

            # make first column into an index
            data.set_index(keys=data.columns[0], inplace=True)

            # move header info to columns
            data = pd.melt(
                data,
                ignore_index=False,
                value_name="count",
                var_name=header_variable_names,
            )

            # add column(s) for constant info
            for j in range(constant_line_count):
                data = m.add_constant_column(
                    data, f"constant_line_{j}", constant_lines.iloc[j]
                )
            for j in range(constant_column_count):
                data = m.add_constant_column(
                    data, f"constant_column_{j}", constant_columns.iloc[j]
                )

            # make row index (from first column of blocks) into a column
            # called 'first_column'
            data.reset_index(inplace=True)
            data.rename(columns={data.columns[0]: "first_column"}, inplace=True)

            raw_results = pd.concat([raw_results, data])
        except Exception as e:
            err = ui.add_new_error(
                err,
                "system",
                "special_formats.read_multi_sheet_excel",
                f"Unexpected exception while processing sheet {sh}: {e}",
            )
    return raw_results, err

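# Minimal illustration (hypothetical data, mirroring the melt call above) of
# moving header info to columns: with a two-level column multi-index named
# header_0/header_1, pd.melt yields one row per (index, header_0, header_1)
# with the count as a value.
#
#   cols = pd.MultiIndex.from_tuples(
#       [("Governor", "Smith"), ("Governor", "Jones")],
#       names=["header_0", "header_1"],
#   )
#   data = pd.DataFrame([[10, 20]], index=["Precinct 1"], columns=cols)
#   long = pd.melt(
#       data, ignore_index=False, value_name="count",
#       var_name=["header_0", "header_1"],
#   )
#   # long has columns header_0, header_1, count; the precinct stays in the index
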
def read_concatenated_blocks(
    f_path: str, munger: jm.Munger, err: dict
) -> (pd.DataFrame, dict):
    """Assumes the first column of each block is ReportingUnit and the last
    column is the contest total."""
    try:
        with open(f_path, "r") as f:
            data = f.readlines()
    except Exception as exc:
        err = ui.add_new_error(err, "file", f_path, f"Datafile not read:\n{exc}\n")
        return pd.DataFrame(), err

    # get munger parameters
    w = munger.options["column_width"]
    tlts = munger.options["count_of_top_lines_to_skip"]
    v_t_cc = munger.options["last_header_column_count"]
    skip_cols = munger.options["columns_to_skip"]

    df = dict()

    # skip lines at top
    data = data[tlts:]

    try:
        while len(data) > 3:
            # TODO allow number & interpretation of headers to vary?
            # get rid of blank lines
            while data[0] == "\n":
                data.pop(0)

            # get the header lines
            header_0 = data.pop(0).strip()
            header_1 = data.pop(0)
            header_line = data.pop(0)

            # get info from header line
            field_list = extract_items(header_line, w)

            # add back county header in case of Iowa:
            if header_line.startswith(" " * w):
                field_list = [""] + field_list

            # remove first column header and headers of any columns to be skipped
            last_header = remove_by_index(field_list, [0] + skip_cols)

            # check that the size of the side-to-side repeated block is consistent
            if len(last_header) % v_t_cc != 0:
                e = (
                    f"Count of last header (per munger) ({v_t_cc}) "
                    f"does not evenly divide the number of count columns in the results file "
                    f"({len(last_header)})"
                )
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger.name,
                    e,
                )
                return pd.DataFrame(), err

            # get list from next header row and disambiguate
            # TODO tech debt: disambiguation assumes Candidate formula is <header_1>
            header_1_list, alts = disambiguate(extract_items(header_1, w * v_t_cc))

            # add disambiguated entries to munger's dictionary of alternatives
            if alts:
                if "Candidate" in munger.alt.keys():
                    munger.alt["Candidate"].update(alts)
                else:
                    munger.alt["Candidate"] = alts

            # create df from next batch of lines, with that multi-index;
            # find idx of next empty line (or end of data)
            try:
                next_empty = next(idx for idx in range(len(data)) if data[idx] == "\n")
            except StopIteration:
                next_empty = len(data)

            # create io
            vote_count_block = io.StringIO()
            vote_count_block.write("".join(data[:next_empty]))
            vote_count_block.seek(0)

            df[header_0] = pd.read_fwf(
                vote_count_block, colspecs="infer", index=False, header=None
            )

            # drop extraneous columns (per munger); negative numbers count
            # from the right side
            df[header_0].drop(df[header_0].columns[skip_cols], axis=1, inplace=True)

            # make first column into an index
            df[header_0].set_index(keys=[0], inplace=True)

            # add multi-index with header_1 and header_2 info
            index_array = [
                [y for z in [[cand] * v_t_cc for cand in header_1_list] for y in z],
                last_header,
            ]

            # create map from integer columns to (header_1, header_2) values
            header_map = {}
            for i, col in enumerate(df[header_0].columns):
                header_map[col] = (index_array[0][i], index_array[1][i])

            # move header to columns
            df[header_0] = pd.melt(
                df[header_0],
                ignore_index=False,
                value_vars=df[header_0].columns.tolist(),
                value_name="count",
                var_name="header_tmp",
            )

            # gather values for header_1 and header_2 columns
            header_1_col = [header_map[i][0] for i in df[header_0]["header_tmp"]]
            header_2_col = [header_map[i][1] for i in df[header_0]["header_tmp"]]

            # add header_1 and header_2 columns, and remove header_tmp
            df[header_0]["header_1"] = header_1_col
            df[header_0]["header_2"] = header_2_col
            df[header_0] = df[header_0].drop(columns="header_tmp")

            # add column for header_0
            df[header_0] = m.add_constant_column(df[header_0], "header_0", header_0)

            # remove processed lines from data
            data = data[next_empty:]
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger.name,
            f"unparsed lines at bottom of file ({Path(f_path).name}):\n{data}\n",
        )

    # consolidate all into one dataframe
    try:
        raw_results = pd.concat(list(df.values()))
    except ValueError as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error concatenating data from blocks: {e}",
        )
        return pd.DataFrame(), err

    # make row index (from first column of blocks) into a column called 'first_column'
    raw_results.reset_index(inplace=True)
    # TODO tech debt: is the next line still necessary?
    raw_results.rename(columns={0: "first_column"}, inplace=True)

    return raw_results, err

def raw_elements_to_cdf(
    session,
    juris: jm.Jurisdiction,
    mu: jm.Munger,
    raw: pd.DataFrame,
    count_cols: List[str],
    err: dict,
    constants: dict,
) -> dict:
    """Load data from <raw> into the database."""
    working = raw.copy()

    try:
        working, new_err = munge_and_melt(mu, working, count_cols, err)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception during munge_and_melt: {exc}",
        )
        return err

    # enter elements from sources outside raw data, including creating id column(s)
    for k in constants.keys():
        working = add_constant_column(working, k, constants[k])

    # add Contest_Id (unless it was passed in constants)
    if "Contest_Id" not in working.columns:
        try:
            working, err = add_contest_id(working, juris, err, session)
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Unexpected exception while adding Contest_Id: {exc}",
            )
            return err
        if ui.fatal_error(err):
            return err

    # get ids for remaining info sourced from rows and columns (except Selection_Id)
    element_list = [
        t
        for t in mu.cdf_elements.index
        if (
            t[-7:] != "Contest"
            and t[-9:] != "Selection"
            and f"{t}_Id" not in constants.keys()
        )
    ]
    for t in element_list:
        try:
            # capture id from db in new column and erase any now-redundant cols
            df = pd.read_sql_table(t, session.bind)
            name_field = db.get_name_field(t)
            # set drop_unmatched = True for fields necessary to BallotMeasure rows,
            # drop_unmatched = False otherwise to prevent losing
            # BallotMeasureContests for BM-inessential fields
            if t == "ReportingUnit" or t == "CountItemType":
                drop = True
            else:
                drop = False
            if t == "CountItemType":
                # munge raw to internal CountItemType
                r_i = pd.read_csv(
                    os.path.join(juris.path_to_juris_dir, "dictionary.txt"), sep="\t"
                )
                r_i = r_i[r_i.cdf_element == "CountItemType"]
                recognized = r_i.raw_identifier_value.unique()
                matched = working.CountItemType_raw.isin(recognized)
                if not matched.all():
                    unmatched = "\n".join(
                        (working[~matched]["CountItemType_raw"]).unique()
                    )
                    err = ui.add_new_error(
                        err,
                        "warn-jurisdiction",
                        juris.short_name,
                        f"Some unmatched CountItemTypes:\n{unmatched}",
                    )
                working = working.merge(
                    r_i,
                    how="left",
                    left_on="CountItemType_raw",
                    right_on="raw_identifier_value",
                ).rename(columns={"cdf_internal_name": "CountItemType"})

                # join CountItemType_Id and OtherCountItemType
                cit = pd.read_sql_table("CountItemType", session.bind)
                working = enum_col_to_id_othertext(working, "CountItemType", cit)
                working, err_df = clean_ids(working, ["CountItemType_Id"])
                working = clean_strings(working, ["OtherCountItemType"])
                working = working.drop(
                    ["raw_identifier_value", "cdf_element", "CountItemType_raw"],
                    axis=1,
                )
            else:
                none_or_unknown_id = db.name_to_id(session, t, "none or unknown")
                working, new_err = replace_raw_with_internal_ids(
                    working,
                    juris,
                    df,
                    t,
                    name_field,
                    err,
                    drop_unmatched=drop,
                    unmatched_id=none_or_unknown_id,
                )
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return err
                working.drop(t, axis=1, inplace=True)
        except KeyError as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"KeyError ({exc}) while adding internal ids for {t}.",
            )
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Exception ({exc}) while adding internal ids for {t}.",
            )
            return err

    # add Selection_Id (combines info from BallotMeasureSelection and
    # CandidateContestSelection)
    try:
        working, err = add_selection_id(working, session.bind, juris, err)
        working, err_df = clean_ids(working, ["Selection_Id"])
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception while adding Selection_Id:\n{exc}",
        )
        return err
    if working.empty:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris.short_name,
            "No contests found, or no selections found for contests.",
        )
        return err

    # restrict to just the VoteCount columns (so that groupby.sum will work)
    vc_cols = [
        "Count",
        "CountItemType_Id",
        "OtherCountItemType",
        "ReportingUnit_Id",
        "Contest_Id",
        "Selection_Id",
        "Election_Id",
        "_datafile_Id",
    ]
    working = working[vc_cols]
    working, e = clean_count_cols(working, ["Count"])

    # TODO there are edge cases where this might include dupes that should be
    #  omitted, e.g., if data was mistakenly read twice
    # sum any rows that were disambiguated (otherwise dupes will be dropped
    # when VoteCount is filled)
    group_cols = [c for c in working.columns if c != "Count"]
    working = working.groupby(group_cols).sum().reset_index()

    # TODO clean before inserting? All should be already clean, no?

    # fill VoteCount
    try:
        e = db.insert_to_cdf_db(session.bind, working, "VoteCount")
        if e:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"database insertion error {e}",
            )
            return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Error filling VoteCount:\n{exc}",
        )
    return err

def add_selection_id(
    df: pd.DataFrame, engine, jurisdiction: jm.Jurisdiction, err: dict
) -> (pd.DataFrame, dict):
    """Assumes <df> has contest_type, BallotMeasureSelection_raw and
    Candidate_Id columns. Loads the CandidateSelection table. Appends and
    fills the Selection_Id column."""
    # split df by contest type
    w = dict()
    for ct in ["BallotMeasure", "Candidate"]:
        w[ct] = df[df.contest_type == ct].copy()

    # append BallotMeasureSelection_Id as Selection_Id to w['BallotMeasure']
    if not w["BallotMeasure"].empty:
        bms = pd.read_sql_table("BallotMeasureSelection", engine)
        w["BallotMeasure"], err = replace_raw_with_internal_ids(
            w["BallotMeasure"],
            jurisdiction,
            bms,
            "BallotMeasureSelection",
            "Name",
            err,
            drop_unmatched=True,
            drop_all_ok=True,
        )
        w["BallotMeasure"].rename(
            columns={"BallotMeasureSelection_Id": "Selection_Id"}, inplace=True
        )
        w["BallotMeasure"].drop(
            ["BallotMeasureSelection", "Candidate_Id"], axis=1, inplace=True
        )

    # prepare to append CandidateSelection_Id as Selection_Id
    if not w["Candidate"].empty:
        c_df = w["Candidate"][["Candidate_Id", "Party_Id"]].drop_duplicates()

        # clean Ids and drop any that were null (i.e., 0 after cleaning)
        c_df, err_df = clean_ids(c_df, ["Candidate_Id", "Party_Id"])
        c_df = c_df[c_df.Candidate_Id != 0]

        # pull any existing Ids into a new CandidateSelection_Id column
        col_map = {c: c for c in ["Party_Id", "Candidate_Id"]}
        c_df = db.append_id_to_dframe(
            engine, c_df, "CandidateSelection", col_map=col_map
        )

        # find unmatched records
        # TODO this throws an error (FutureWarning: elementwise comparison failed),
        #  maybe because CandidateSelection_Id cannot be compared to ""?
        c_df_unmatched = c_df[
            (c_df.CandidateSelection_Id == 0)
            | (c_df.CandidateSelection_Id == "")
            | (c_df.CandidateSelection_Id.isnull())
        ].copy()

        if not c_df_unmatched.empty:
            # load CandidateSelections to Selection table (for unmatched)
            id_list = db.add_records_to_selection_table(
                engine, c_df_unmatched.shape[0]
            )

            # load unmatched records into CandidateSelection table
            c_df_unmatched["Id"] = pd.Series(id_list, index=c_df_unmatched.index)
            db.insert_to_cdf_db(engine, c_df_unmatched, "CandidateSelection")

            # update CandidateSelection_Id column for previously unmatched,
            # merging on Candidate_Id and Party_Id
            c_df.loc[c_df_unmatched.index, "CandidateSelection_Id"] = c_df_unmatched[
                "Id"
            ]

        # recast Candidate_Id and Party_Id to int in w['Candidate'];
        # note that neither should have nulls, but rather the 'none or unknown' Id
        # NB: c_df had this recasting done in the append_id_to_dframe routine
        w["Candidate"], err_df = clean_ids(
            w["Candidate"], ["Candidate_Id", "Party_Id"]
        )
        if not err_df.empty:
            # show all columns of dataframe with problem in Party_Id or Candidate_Id
            pd.set_option("max_columns", None)
            err = ui.add_new_error(
                err,
                "system",
                "munge.add_selection_id",
                f"Problem with Candidate_Id or Party_Id in some rows:\n{err_df}",
            )
            pd.reset_option("max_columns")

        # append CandidateSelection_Id to w['Candidate']
        w["Candidate"] = w["Candidate"].merge(
            c_df, how="left", on=["Candidate_Id", "Party_Id"]
        )

        # rename to Selection_Id
        w["Candidate"] = w["Candidate"].rename(
            columns={"CandidateSelection_Id": "Selection_Id"}
        )
        # and drop extraneous columns
        to_drop = [
            x
            for x in w["Candidate"].columns
            if x in ["Candidate_Id", "BallotMeasureSelection_raw"]
        ]
        w["Candidate"].drop(to_drop, axis=1, inplace=True)

    working = pd.concat([w["BallotMeasure"], w["Candidate"]])
    return working, err

def add_contest_id(
    df: pd.DataFrame, juris: jm.Jurisdiction, err: dict, session: Session
) -> (pd.DataFrame, dict):
    """Append Contest_Id and contest_type columns, filling contest_type
    correctly. Drop rows which match neither a BallotMeasure contest nor a
    Candidate contest."""
    working = df.copy()

    # add Contest_Id and contest_type
    df_for_type = dict()
    w_for_type = dict()
    df_contest = pd.read_sql_table("Contest", session.bind)
    for c_type in ["BallotMeasure", "Candidate"]:
        if f"{c_type}Contest_raw" in working.columns:
            # restrict df_contest to the contest_type <c_type> and get the
            # <c_type>Contest_Id
            df_for_type[c_type] = df_contest[df_contest.contest_type == c_type]
            none_or_unknown_id = db.name_to_id(
                session, f"{c_type}Contest", "none or unknown"
            )
            working, new_err = replace_raw_with_internal_ids(
                working,
                juris,
                df_for_type[c_type],
                f"{c_type}Contest",
                "Name",
                err,
                drop_unmatched=False,
                unmatched_id=none_or_unknown_id,
                drop_all_ok=True,
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])

            # restrict working to the contest_type <c_type>, add contest_type column
            w_for_type[c_type] = working[
                working[f"{c_type}Contest"] != "none or unknown"
            ]
            w_for_type[c_type] = add_constant_column(
                w_for_type[c_type], "contest_type", c_type
            ).rename(columns={f"{c_type}Contest_Id": "Contest_Id"})

            # drop text column
            w_for_type[c_type] = w_for_type[c_type].drop(f"{c_type}Contest", axis=1)
        else:
            w_for_type[c_type] = pd.DataFrame()

    # FIXME: check somewhere that no name (other than 'none or unknown') is
    #  shared by BallotMeasureContests and CandidateContests
    # TODO check this also when juris files are loaded, to save time for the user

    # drop obsolete columns
    if w_for_type["BallotMeasure"].empty:
        working_temp = w_for_type["Candidate"]
    elif w_for_type["Candidate"].empty:
        working_temp = w_for_type["BallotMeasure"]
    else:
        common_cols = [
            c
            for c in w_for_type["BallotMeasure"].columns
            if c in w_for_type["Candidate"].columns
        ]
        for c_type in ["BallotMeasure", "Candidate"]:
            w_for_type[c_type] = w_for_type[c_type][common_cols]

        # assemble working from the two pieces
        working_temp = pd.concat(
            [w_for_type[ct] for ct in ["BallotMeasure", "Candidate"]]
        )

    # fail if no contests were recognized
    if working_temp.empty:
        err = ui.add_new_error(
            err, "jurisdiction", juris.short_name, "No contests recognized."
        )
    else:
        working = working_temp

    return working, err

def replace_raw_with_internal_ids(
    df: pd.DataFrame,
    juris: jm.Jurisdiction,
    table_df: pd.DataFrame,
    element: str,
    internal_name_column: str,
    error: dict,
    drop_unmatched: bool = False,
    drop_extraneous: bool = True,
    mode: str = "row",
    unmatched_id: int = 0,
    drop_all_ok: bool = False,
) -> (pd.DataFrame, dict):
    """Replace columns in <df> containing raw_identifier values by columns
    with internal names and Ids from <table_df>, which has the structure of a
    db table for <element>. <unmatched_id> is the id to assign to unmatched
    records. If <drop_extraneous> = True and the dictionary matches a
    raw_identifier to 'row should be dropped', drop that row EVEN IF
    <drop_unmatched> = False.
    """
    working = df.copy()
    # join the 'cdf_internal_name' from the raw_identifier table -- this is the
    # internal name field value, no matter what the name field is called in the
    # internal element table (e.g., 'Name', 'BallotName' or 'Selection')

    # use dictionary.txt from jurisdiction
    raw_identifiers = pd.read_csv(
        os.path.join(juris.path_to_juris_dir, "dictionary.txt"), sep="\t"
    )

    # restrict to the element at hand
    raw_ids_for_element = raw_identifiers[
        raw_identifiers["cdf_element"] == element
    ].copy()

    if element == "Candidate":
        # remove any lines with nulls
        raw_ids_for_element = raw_ids_for_element[
            raw_ids_for_element.notnull().all(axis=1)
        ]

        # regularize candidate names (to match what's done during upload of
        # candidates to the Candidate table in the db)
        raw_ids_for_element["cdf_internal_name"] = regularize_candidate_names(
            raw_ids_for_element["cdf_internal_name"]
        )
        raw_ids_for_element.drop_duplicates(inplace=True)

    working = working.merge(
        raw_ids_for_element,
        how="left",
        left_on=f"{element}_raw",
        right_on="raw_identifier_value",
        suffixes=["", f"_{element}_ei"],
    )

    # identify unmatched
    unmatched = working[working["cdf_internal_name"].isnull()]
    unmatched_raw = sorted(unmatched[f"{element}_raw"].unique(), reverse=True)
    if len(unmatched_raw) > 0 and element != "BallotMeasureContest":
        unmatched_str = "\n".join(unmatched_raw)
        e = f"\n{element}s not found in dictionary.txt:\n{unmatched_str}"
        error = ui.add_new_error(error, "warn-jurisdiction", juris.short_name, e)

    if drop_unmatched:
        working = working[working["cdf_internal_name"].notnull()]

    if drop_extraneous:
        # TODO tech debt - note change of case for Candidate above which, if
        #  changed, might affect this in unexpected ways
        # drop extraneous rows identified in dictionary
        working = working[working["cdf_internal_name"] != "row should be dropped"]

    if working.empty:
        e = (
            f"No true raw {element} in 'dictionary.txt' matched any raw {element} "
            f"derived from the result file"
        )
        if drop_unmatched and not drop_all_ok:
            error = ui.add_new_error(
                error,
                "jurisdiction",
                juris.short_name,
                e,
            )
        else:
            error = ui.add_new_error(
                error, "warn-jurisdiction", juris.short_name, e
            )

        # give working the proper columns and return
        new_cols = [
            c
            for c in working.columns
            if (
                c
                not in [
                    "raw_identifier_value",
                    "cdf_element",
                    f"_{element}_ei",
                    "cdf_internal_name",
                ]
            )
        ] + [f"{element}_Id", element]
        working = pd.DataFrame(columns=new_cols)

        return working, error
    else:
        if mode == "column":
            # drop rows that melted from unrecognized columns, EVEN IF
            # drop_unmatched=False. These rows are ALWAYS extraneous: drop rows
            # where raw_identifier is not null but no cdf_internal_name was
            # found (pd.merge yields nulls)
            # working = working[(working["raw_identifier_value"].isnull()) | (working["cdf_internal_name"].notnull())]
            if drop_extraneous:
                working = working[
                    working["cdf_internal_name"] != "row should be dropped"
                ]
            # TODO tech debt: more efficient to drop these earlier, before melting

    # unmatched elements get nan in fields from dictionary table;
    # change these to "none or unknown"
    if not drop_unmatched:
        working["cdf_internal_name"] = working["cdf_internal_name"].fillna(
            "none or unknown"
        )

    # drop extraneous cols from mu.raw_identifier
    working = working.drop(["raw_identifier_value", "cdf_element"], axis=1)

    # ensure that there is a column in working named by the element,
    # containing the internal name of the element
    if f"_{element}_ei" in working.columns:
        working.rename(columns={f"_{element}_ei": element}, inplace=True)
    else:
        working.rename(columns={"cdf_internal_name": element}, inplace=True)

    # join the element table Id and name columns.
    # This will create two columns with the internal name field,
    # whose names will be <element> (from above)
    # and either internal_name_column or internal_name_column_table_name
    working = working.merge(
        table_df[["Id", internal_name_column]],
        how="left",
        left_on=element,
        right_on=internal_name_column,
    )

    # error/warning for unmatched elements
    working_unmatched = working[(working.Id.isnull()) & (working[element].notnull())]
    if not working_unmatched.empty and element != "BallotMeasureContest":
        unmatched_pairs = [
            f'({r[f"{element}_raw"]},{r[element]})'
            for i, r in working_unmatched[[f"{element}_raw", element]]
            .drop_duplicates()
            .iterrows()
        ]
        unmatched_str = "\n\t".join(unmatched_pairs)
        e = (
            f"Warning: Results for {working_unmatched.shape[0]} rows with unmatched {element}s "
            f"will not be loaded to database. These records (raw name, internal name) were found "
            f"in dictionary.txt, but no corresponding record was found in the {element} table "
            f"in the database: \n\t{unmatched_str}"
        )
        error = ui.add_new_error(
            error,
            "warn-jurisdiction",
            juris.short_name,
            e,
        )

    if drop_unmatched:
        # if all are unmatched
        if working_unmatched.shape[0] == working.shape[0]:
            error = ui.add_new_error(
                error,
                "jurisdiction",
                juris.short_name,
                (
                    f"No {element} was matched. Either raw values are not in "
                    f"dictionary.txt, or the corresponding cdf_internal_names "
                    f"are missing from {element}.txt"
                ),
            )
            return working.drop(working.index), error
        # if only some are unmatched
        else:
            # drop the unmatched ones
            working.drop(labels=working_unmatched.index, inplace=True)
    else:
        # change name of unmatched to 'none or unknown' and assign <unmatched_id> as Id
        working.loc[working.Id.isnull(), internal_name_column] = "none or unknown"
        working["Id"].fillna(unmatched_id, inplace=True)

    working = working.drop([internal_name_column, f"{element}_raw"], axis=1)
    working.rename(columns={"Id": f"{element}_Id"}, inplace=True)
    return working, error

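# Illustration (hypothetical rows) of the dictionary.txt structure this
# function reads. The merge above keys <element>_raw against
# raw_identifier_value to pick up cdf_internal_name:
#
#   cdf_element     cdf_internal_name    raw_identifier_value
#   Party           Democratic Party     DEM
#   Party           Republican Party     REP
#   CountItemType   absentee             Absentee Votes
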
def check_dependencies(juris_dir, element) -> (list, dict):
    """Looks in <juris_dir> to check that every dependent column in
    <element>.txt is listed in the corresponding jurisdiction file.
    Note: <juris_dir> is assumed to exist.
    """
    err = None
    juris_name = Path(juris_dir).name
    d = juris_dependency_dictionary()
    f_path = os.path.join(juris_dir, f"{element}.txt")
    try:
        element_df = pd.read_csv(
            f_path,
            sep="\t",
            index_col=None,
            encoding="iso-8859-1",
            quoting=csv.QUOTE_MINIMAL,
        )
    except FileNotFoundError:
        err = ui.add_new_error(
            err,
            "system",
            "juris_and_munger.check_dependencies",
            f"file doesn't exist: {f_path}",
        )
        return set(), err

    # find all dependent columns
    dependent = [c for c in element_df if c in d.keys()]
    changed_elements = set()
    for c in dependent:
        target = d[c]
        ed = (
            pd.read_csv(
                os.path.join(juris_dir, f"{element}.txt"),
                sep="\t",
                header=0,
                encoding="iso-8859-1",
                quoting=csv.QUOTE_MINIMAL,
            )
            .fillna("")
            .loc[:, c]
            .unique()
        )

        # create list of elements, removing any nulls
        ru = list(
            pd.read_csv(
                os.path.join(juris_dir, f"{target}.txt"),
                sep="\t",
                encoding="iso-8859-1",
                quoting=csv.QUOTE_MINIMAL,
            )
            .fillna("")
            .loc[:, db.get_name_field(target)]
        )
        try:
            ru.remove(np.nan)
        except ValueError:
            pass

        missing = [x for x in ed if x not in ru]
        # if the only missing value is null or blank
        if len(missing) == 1 and missing == [""]:
            # exclude PrimaryParty, which isn't required to be not-null
            if c != "PrimaryParty":
                err = ui.add_new_error(
                    err, "jurisdiction", juris_name, f"Some {c} are null."
                )
        elif missing:
            changed_elements.add(element)
            changed_elements.add(target)
            m_str = "\n".join(missing)
            err = ui.add_new_error(
                err,
                "jurisdiction",
                juris_name,
                f"Every {c} in {element}.txt must be in {target}.txt. Offenders are:\n{m_str}",
            )

    return changed_elements, err

def add_column_from_formula(
    working: pd.DataFrame,
    formula: str,
    new_col: str,
    err: Optional[dict],
    munger_name: str,
    suffix=None,
) -> (pd.DataFrame, Optional[dict]):
    """If <suffix> is given, add it to each field in the formula.
    If a field in the formula is enclosed in braces, parse the first entry
    as the field and the second as a regex (with one parenthesized group)
    as a recipe for pulling the value via regex analysis.
    """
    w = working.copy()
    temp_cols = []
    # for each {} pair in the formula, create a new column
    # (assuming formula is well-formed)
    brace_pattern = re.compile(r"{<([^,]*)>,([^{}]*|[^{}]*{[^{}]*}[^{}]*)}")

    try:
        for x in brace_pattern.finditer(formula):
            # create a new column with the extracted info
            old_col, pattern_str = x.groups()
            temp_col = f"extracted_from_{old_col}"
            w, new_err = add_regex_column(w, old_col, temp_col, pattern_str)
            # change the formula to use the temp column
            formula = formula.replace(
                f"{{<{old_col}>,{pattern_str}}}", f"<{temp_col}>"
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return w, err
            temp_cols.append(temp_col)

        # once all {} pairs are gone, use concatenation to build the column to be returned
        text_field_list, last_text = text_fragments_and_fields(formula)

        # add suffix, if required
        if suffix:
            text_field_list = [(t, f"{f}{suffix}") for (t, f) in text_field_list]

        # add column to <working> dataframe via the concatenation formula
        if last_text:
            w.loc[:, new_col] = last_text[0]
        else:
            w.loc[:, new_col] = ""
        text_field_list.reverse()
        for t, f in text_field_list:
            try:
                w.loc[:, new_col] = (
                    w.loc[:, f].apply(lambda x: f"{t}{x}") + w.loc[:, new_col]
                )
            except KeyError:
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f"Expected transformed column '{f}' not found, "
                    f"perhaps because of a mismatch between munger and results file.",
                )
                return w, err
    except Exception as e:
        err = ui.add_new_error(
            err, "system", "munge.add_column_from_formula", f"Unexpected error: {e}"
        )

    # delete temporary columns
    w.drop(temp_cols, axis=1, inplace=True)
    return w, err

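# Illustration (hypothetical formula) of the brace syntax handled above: the
# formula "{<County>,(\w+) County} - <Office>" asks for a regex extraction
# from the County field before concatenation. "County" and "Office" are
# stand-in field names.
#
#   import re
#   brace_pattern = re.compile(r"{<([^,]*)>,([^{}]*|[^{}]*{[^{}]*}[^{}]*)}")
#   match = brace_pattern.search(r"{<County>,(\w+) County} - <Office>")
#   # match.groups() == ("County", r"(\w+) County"), so a temp column
#   # "extracted_from_County" is created and the formula is rewritten to
#   # "<extracted_from_County> - <Office>"
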
def load_juris_dframe_into_cdf(session, element, juris_path, error) -> dict:
    """TODO"""
    project_root = Path(__file__).parents[1].absolute()
    cdf_schema_def_dir = os.path.join(
        project_root,
        "CDF_schema_def_info",
    )
    element_fpath = os.path.join(juris_path, f"{element}.txt")
    if not os.path.exists(element_fpath):
        error = ui.add_new_error(
            error,
            "jurisdiction",
            Path(juris_path).name,
            f"File {element}.txt not found",
        )
        return error
    df = pd.read_csv(
        element_fpath, sep="\t", encoding="iso-8859-1", quoting=csv.QUOTE_MINIMAL
    ).fillna("none or unknown")
    # TODO check that df has the right format

    # add 'none or unknown' record
    df = add_none_or_unknown(df)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        error = ui.add_new_error(
            error,
            "warn-jurisdiction",
            Path(juris_path).name,
            f"Duplicates were found in {element}.txt",
        )

    # replace plain text enumerations from file system with id/othertext from db
    enum_file = os.path.join(
        cdf_schema_def_dir, "elements", element, "enumerations.txt"
    )
    if os.path.isfile(enum_file):  # (if not, there are no enums for this element)
        enums = pd.read_csv(enum_file, sep="\t")
        # get all relevant enumeration tables
        for e in enums["enumeration"]:  # e.g., e = "ReportingUnitType"
            cdf_e = pd.read_sql_table(e, session.bind)
            # for every instance of the enumeration in the current table,
            # add id and othertype columns to the dataframe
            if e in df.columns:
                df = m.enum_col_to_id_othertext(df, e, cdf_e)

    # get Ids for any foreign key (or similar) in the table, e.g., Party_Id, etc.
    fk_file_path = os.path.join(
        cdf_schema_def_dir, "elements", element, "foreign_keys.txt"
    )
    if os.path.isfile(fk_file_path):
        foreign_keys = pd.read_csv(fk_file_path, sep="\t", index_col="fieldname")

        for fn in foreign_keys.index:
            # NB: juris elements have no multiple referents (as joins may)
            ref = foreign_keys.loc[fn, "refers_to"]
            col_map = {fn[:-3]: db.get_name_field(ref)}
            df = db.append_id_to_dframe(
                session.bind, df, ref, col_map=col_map
            ).rename(columns={f"{ref}_Id": fn})

    # commit info in df to corresponding cdf table in db
    err_string = db.insert_to_cdf_db(session.bind, df, element)
    if err_string:
        error = ui.add_new_error(
            error,
            "system",
            "juris_and_munger.load_juris_dframe_into_cdf",
            f"Error loading {element} to database: {err_string}",
        )
    return error

def run2(
    load_data: bool = True,
    dbname: Optional[str] = None,
    test_dir: Optional[str] = None,
    election_jurisdiction_list: Optional[list] = None,
) -> Optional[dict]:
    dl = None  # to keep syntax-checker happy
    err = None
    db_removed = False

    if not test_dir:
        # set the test_dir to the directory containing this file
        test_dir = Path(__file__).parent.absolute()

    # name the db
    if dbname is None:
        # create unique name for test database
        ts = datetime.datetime.now().strftime("%m%d_%H%M")
        dbname = f"test_{ts}"

    if load_data:
        get_testing_data(
            url="https://github.com/ElectionDataAnalysis/TestingData.git",
            results_dir="TestingData",
        )

    # restrict elections and jurisdictions to those given (if given);
    # otherwise use all in TestingData
    if not election_jurisdiction_list:
        election_jurisdiction_list = ui.election_juris_list("TestingData")

    if load_data:
        try:
            # load the data
            dl = eda.DataLoader()
            dl.change_db(dbname)
            dl.change_dir("results_dir", "TestingData")
            err, success = dl.load_all(
                move_files=False,
                election_jurisdiction_list=election_jurisdiction_list,
            )
            if not success:
                print("At least one file did not load correctly.")
                err, db_removed = optional_remove(dl, "TestingData")
        except Exception as exc:
            print(f"Exception occurred: {exc}")
            if dl:
                optional_remove(dl, "TestingData")
            err = ui.add_new_error(
                err,
                "file",
                "TestingData",
                f"Exception during data loading: {exc}",
            )
            return err

        if ui.fatal_error(err):
            optional_remove(dl, "TestingData")
            return err

    if not db_removed:
        result = ui.run_tests(
            test_dir, dbname, election_jurisdiction_list=election_jurisdiction_list
        )

        # remove all .ini files
        par_files = [x for x in os.listdir("TestingData") if x[-4:] == ".ini"]
        for f in par_files:
            os.remove(os.path.join("TestingData", f))

        if load_data:
            err, db_removed = optional_remove(dl, "TestingData")

    return err