Example #1
    def get_aux_data(self, aux_data_path, err) -> (dict, dict):
        """Creates a dictionary of DataFrames, one per auxiliary data file.
        Each DataFrame is (multi-)indexed by its primary key(s)."""
        aux_data_dict = {}  # will hold a DataFrame for each abbreviated file name

        field_list = list(set([x[0] for x in self.auxiliary_fields()]))
        for abbrev in field_list:
            # get munger for the auxiliary file
            munger_path = os.path.join(self.path_to_munger_dir, abbrev)
            aux_mu, mu_err = check_and_init_munger(munger_path)
            if ui.fatal_error(mu_err):
                err = ui.consolidate_errors([err, mu_err])
                return dict(), err

            # find the file in aux_data_path whose name contains the string <abbrev>
            aux_filename_list = [
                x for x in os.listdir(aux_data_path) if abbrev in x
            ]
            if len(aux_filename_list) == 0:
                # TODO check this error
                err = ui.add_new_error(
                    err,
                    "file",
                    aux_data_path,
                    f"No file found with name containing {abbrev}",
                )
                continue  # without a file, there is nothing to read for <abbrev>
            elif len(aux_filename_list) > 1:
                # TODO check this error
                err = ui.add_new_error(
                    err,
                    "file",
                    aux_data_path,
                    f"Too many files found with name containing {abbrev}",
                )
                continue  # ambiguous match; skip rather than guess
            aux_path = os.path.join(aux_data_path, aux_filename_list[0])

            # read and clean the auxiliary data file, including setting primary key columns as int
            df, err = ui.read_single_datafile(aux_mu, aux_path, err)

            # cast primary key(s) as int if possible, and set as (multi-)index
            primary_keys = self.aux_meta.loc[abbrev, "primary_key"].split(",")
            df, new_err = m.cast_cols_as_int(
                df,
                primary_keys,
                error_msg=f"In dataframe for {abbrev}",
                munger_name=aux_mu.name,
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return aux_data_dict, err

            df.set_index(primary_keys, inplace=True)

            aux_data_dict[abbrev] = df

        return aux_data_dict, err
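
A minimal usage sketch (hedged: `mu` is assumed to be an already-initialized
munger instance carrying aux metadata, and the path is a placeholder):

# mu and the aux-data path below are assumptions for illustration only
aux_dict, err = mu.get_aux_data("/path/to/aux_data", err=None)
if not ui.fatal_error(err):
    for abbrev, frame in aux_dict.items():
        print(abbrev, frame.index.names)  # primary key(s) form the index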
Example #2
def check_and_init_munger(munger_path: str,
                          aux_data_path: Optional[str] = None) -> (Munger, dict):
    """Validate the munger files at <munger_path>; return an initialized
    Munger (or None on fatal error) together with any errors found."""
    err = check_munger_files(munger_path)
    if ui.fatal_error(err):
        munger = None
    else:
        munger = Munger(munger_path, aux_data_path=aux_data_path)
    return munger, err
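
A hedged sketch of the calling pattern (the munger path is a placeholder):

# the path below is an assumption for illustration only
munger, err = check_and_init_munger("/path/to/mungers/my_munger")
if munger is None:
    print(err)  # fatal problems reported by check_munger_files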
Example #3
def add_munged_column(
    raw: pd.DataFrame,
    munger: jm.Munger,
    element: str,
    err: Optional[dict],
    mode: str = "row",
    inplace: bool = True,
) -> (pd.DataFrame, dict):
    """Alters dataframe <raw>, adding or redefining <element>_raw column
    via the <formula>. Assumes "_SOURCE" has been appended to all columns of raw
    Does not alter row count."""
    if raw.empty:
        return raw, err
    if inplace:
        working = raw
    else:
        working = raw.copy()

    try:
        formula = munger.cdf_elements.loc[element, "raw_identifier_formula"]
        if mode == "row":
            for field in munger.field_list:
                formula = formula.replace(f"<{field}>", f"<{field}_SOURCE>")
        elif mode == "column":
            for i in range(munger.options["header_row_count"]):
                formula = formula.replace(f"<{i}>", f"<variable_{i}>")

        working, new_err = add_column_from_formula(working, formula,
                                                   f"{element}_raw", err,
                                                   munger.name)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return working, err

        # correct any disambiguated names back to the original
        if element in munger.alt.keys():
            working.replace({f"{element}_raw": munger.alt[element]},
                            inplace=True)

    except Exception as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error interpreting formula for {element} in cdf_element.txt. {e}",
        )
        return working, err

    # compress whitespace for <element>_raw
    working.loc[:, f"{element}_raw"] = working[f"{element}_raw"].apply(
        compress_whitespace)
    return working, err
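
To make the row-mode substitution concrete, a toy illustration (the field
names are invented, not taken from any real munger):

# toy illustration only; field names are invented
formula = "<County>;<Precinct>"
for field in ["County", "Precinct"]:
    formula = formula.replace(f"<{field}>", f"<{field}_SOURCE>")
print(formula)  # <County_SOURCE>;<Precinct_SOURCE>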
Example #4
def check_munger_files(munger_path: str) -> Optional[dict]:
    """Check that the munger files are complete and consistent with one another.
    Assumes munger directory exists. Assumes dictionary.txt is in the template file.
    <munger_path> is the path to the directory of the particular munger
    """

    err = None
    project_root = Path(__file__).parents[1].absolute()
    munger_name = Path(munger_path).name

    # check whether directory exists
    if not os.path.isdir(munger_path):
        err = ui.add_new_error(err, "munger", munger_name,
                               f"Directory does not exist: {munger_path}")
        return err

    # check whether all files exist
    templates = os.path.join(project_root, "juris_and_munger",
                             "munger_templates")
    template_with_extension_list = os.listdir(templates)
    for munger_file in template_with_extension_list:
        # TODO create optional template for aux_meta.txt
        cf_path = os.path.join(munger_path, munger_file)
        # if file does not already exist in munger dir, throw error
        file_exists = os.path.isfile(cf_path)

        # if file exists, check format against template and then contents
        if file_exists:
            err = check_munger_file_format(munger_path, munger_file, templates,
                                           err)

            # if no errors found so far, check contents
            if not ui.fatal_error(err,
                                  error_type_list=["munger"],
                                  name_key_list=[munger_file]):
                err = check_munger_file_contents(munger_path, munger_file, err)
        else:
            err = ui.add_new_error(err, "munger", munger_name,
                                   "File does not exist")

    # if the munger requires auxiliary data
    if os.path.isfile(os.path.join(munger_path, "aux_meta.txt")):
        # TODO check that each abbrev in aux_meta.txt has an associated sub_munger
        # check sub-mungers (in sub-directories of munger)
        sub_mungers = os.listdir(munger_path)
        for f in sub_mungers:
            sub_path = os.path.join(munger_path, f)
            if os.path.isdir(sub_path):
                new_err = check_munger_files(sub_path)
                if new_err:
                    err = ui.consolidate_errors([err, new_err])
    return err
Example #5
def read_multi_sheet_excel(
    f_path: str,
    munger: jm.Munger,
    err: dict,
) -> (pd.DataFrame, dict):
    # get munger parameters
    sheets_to_skip = munger.options["sheets_to_skip"]
    count_of_top_lines_to_skip = munger.options["count_of_top_lines_to_skip"]
    constant_line_count = munger.options["constant_line_count"]
    constant_column_count = munger.options["constant_column_count"]
    header_row_count = munger.options["header_row_count"]
    columns_to_skip = munger.options["columns_to_skip"]

    try:
        df = pd.read_excel(f_path, sheet_name=None, header=None)
    except Exception as e:
        # a failed read is fatal here: without <df> there is nothing to process
        err = ui.add_new_error(err, "file",
                               Path(f_path).name,
                               f"Error reading file: {e}")
        return pd.DataFrame(), err

    sheets_to_read = [k for k in df.keys() if k not in sheets_to_skip]

    raw_results = pd.DataFrame()
    for sh in sheets_to_read:
        try:
            data = df[sh].copy()

            # remove lines designated ignorable
            data.drop(data.index[:count_of_top_lines_to_skip], inplace=True)

            # remove any all-null rows
            data.dropna(how="all", inplace=True)

            # read constant_line info from first non-null entries of constant-header rows
            # then drop those rows
            if constant_line_count > 0:
                constant_lines = (data.iloc[:constant_line_count].fillna(
                    method="bfill", axis=1).iloc[:, 0])
                data.drop(data.index[:constant_line_count], inplace=True)

            # read constant_column info from first non-null entries of constant columns
            # and drop those columns
            if constant_column_count > 0:
                constant_columns = (data.T.iloc[:constant_column_count].fillna(
                    method="bfill", axis=1).iloc[:, 0])
                data.drop(data.columns[:constant_column_count],
                          axis=1,
                          inplace=True)

            # add multi-index for actual header rows
            header_variable_names = [
                f"header_{j}" for j in range(header_row_count)
            ]

            col_multi_index = pd.MultiIndex.from_frame(
                data.iloc[range(header_row_count), :].transpose().fillna(
                    method="ffill"),
                names=header_variable_names,
            )
            data.columns = col_multi_index

            # remove header rows from data
            data.drop(data.index[:header_row_count], inplace=True)

            # Drop extraneous columns per munger, and columns without data
            data.drop(data.columns[columns_to_skip], axis=1, inplace=True)
            data.dropna(axis=1, how="all", inplace=True)

            # make first column into an index
            data.set_index(keys=data.columns[0], inplace=True)

            # move header info to columns
            data = pd.melt(
                data,
                ignore_index=False,
                value_name="count",
                var_name=header_variable_names,
            )

            # add column(s) for constant info
            for j in range(constant_line_count):
                data = m.add_constant_column(data, f"constant_line_{j}",
                                             constant_lines.iloc[j])
            for j in range(constant_column_count):
                data = m.add_constant_column(data, f"constant_column_{j}",
                                             constant_columns.iloc[j])

            # Make row index (from first column of blocks) into a column called 'first_column'
            data.reset_index(inplace=True)
            data.rename(columns={data.columns[0]: "first_column"},
                        inplace=True)

            raw_results = pd.concat([raw_results, data])
        except Exception as e:
            err = ui.add_new_error(
                err,
                "system",
                "special_formats.read_multi_sheet_excel",
                f"Unexpected exception while processing sheet {sh}: {e}",
            )
    return raw_results, err
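
A toy illustration (invented data, not project data) of the central reshaping:
header rows become a column MultiIndex, then melt moves the header info into
ordinary columns alongside the counts:

import pandas as pd

data = pd.DataFrame([
    ["Gov", "Gov", "Senate"],   # header row 0: contest
    ["Smith", "Jones", "Lee"],  # header row 1: candidate
    [10, 20, 30],               # counts for one reporting unit
])
header_row_count = 2
names = [f"header_{j}" for j in range(header_row_count)]
data.columns = pd.MultiIndex.from_frame(
    data.iloc[:header_row_count].transpose(), names=names)
data = data.iloc[header_row_count:]
print(pd.melt(data, value_name="count", var_name=names))
# columns: header_0, header_1, count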
Example #6
def raw_elements_to_cdf(
    session,
    juris: jm.Jurisdiction,
    mu: jm.Munger,
    raw: pd.DataFrame,
    count_cols: List[str],
    err: dict,
    constants: dict,
) -> dict:
    """load data from <raw> into the database."""
    working = raw.copy()

    try:
        working, new_err = munge_and_melt(mu, working, count_cols, err)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception during munge_and_melt: {exc}",
        )
        return err

    # enter elements from sources outside raw data, including creating id column(s)
    for k in constants.keys():
        working = add_constant_column(working, k, constants[k])

    # add Contest_Id (unless it was passed in ids)
    if "Contest_Id" not in working.columns:
        try:
            working, err = add_contest_id(working, juris, err, session)
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Unexpected exception while adding Contest_Id: {exc}",
            )
            return err
        if ui.fatal_error(err):
            return err

    # get ids for remaining info sourced from rows and columns (except Selection_Id)
    element_list = [
        t for t in mu.cdf_elements.index
        if t[-7:] != "Contest" and t[-9:] != "Selection"
        and f"{t}_Id" not in constants.keys()
    ]
    for t in element_list:
        try:
            # capture id from db in new column and erase any now-redundant cols
            df = pd.read_sql_table(t, session.bind)
            name_field = db.get_name_field(t)
            # set drop_unmatched = True for fields necessary to BallotMeasure rows,
            #  drop_unmatched = False otherwise to prevent losing BallotMeasureContests for BM-inessential fields
            if t == "ReportingUnit" or t == "CountItemType":
                drop = True
            else:
                drop = False
            if t == "CountItemType":
                # munge raw to internal CountItemType
                r_i = pd.read_csv(os.path.join(juris.path_to_juris_dir,
                                               "dictionary.txt"),
                                  sep="\t")
                r_i = r_i[r_i.cdf_element == "CountItemType"]
                recognized = r_i.raw_identifier_value.unique()
                matched = working.CountItemType_raw.isin(recognized)
                if not matched.all():
                    unmatched = "\n".join(
                        (working[~matched]["CountItemType_raw"]).unique())
                    err = ui.add_new_error(
                        err,
                        "warn-jurisdiction",
                        juris.short_name,
                        f"Some unmatched CountItemTypes:\n{unmatched}",
                    )
                working = working.merge(
                    r_i,
                    how="left",
                    left_on="CountItemType_raw",
                    right_on="raw_identifier_value",
                ).rename(columns={"cdf_internal_name": "CountItemType"})

                # join CountItemType_Id and OtherCountItemType
                cit = pd.read_sql_table("CountItemType", session.bind)
                working = enum_col_to_id_othertext(working, "CountItemType",
                                                   cit)
                working, err_df = clean_ids(working, ["CountItemType_Id"])
                working = clean_strings(working, ["OtherCountItemType"])
                working = working.drop([
                    "raw_identifier_value", "cdf_element", "CountItemType_raw"
                ],
                                       axis=1)
            else:
                none_or_unknown_id = db.name_to_id(session, t,
                                                   "none or unknown")
                working, new_err = replace_raw_with_internal_ids(
                    working,
                    juris,
                    df,
                    t,
                    name_field,
                    err,
                    drop_unmatched=drop,
                    unmatched_id=none_or_unknown_id,
                )
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return err
                working.drop(t, axis=1, inplace=True)
        except KeyError as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"KeyError ({exc}) while adding internal ids for {t}.",
            )
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Exception ({exc}) while adding internal ids for {t}.",
            )
            return err

    # add Selection_Id (combines info from BallotMeasureSelection and CandidateContestSelection)
    try:
        working, err = add_selection_id(working, session.bind, juris, err)
        working, err_df = clean_ids(working, ["Selection_Id"])
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception while adding Selection_Id:\n{exc}",
        )
        return err
    if working.empty:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris.short_name,
            "No contests found, or no selections found for contests.",
        )
        return err

    # restrict to just the VoteCount columns (so that groupby.sum will work)
    vc_cols = [
        "Count",
        "CountItemType_Id",
        "OtherCountItemType",
        "ReportingUnit_Id",
        "Contest_Id",
        "Selection_Id",
        "Election_Id",
        "_datafile_Id",
    ]
    working = working[vc_cols]
    working, e = clean_count_cols(working, ["Count"])

    # TODO there are edge cases where this might include dupes
    #  that should be omitted. E.g., if data mistakenly read twice
    # Sum any rows that were disambiguated (otherwise dupes will be dropped
    #  when VoteCount is filled)
    group_cols = [c for c in working.columns if c != "Count"]
    working = working.groupby(group_cols).sum().reset_index()
    # TODO clean before inserting? All should be already clean, no?

    # Fill VoteCount
    try:
        e = db.insert_to_cdf_db(session.bind, working, "VoteCount")
        if e:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"database insertion error {e}",
            )
            return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Error filling VoteCount:\n{exc}",
        )

    return err
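
A toy illustration (invented data) of the groupby-sum step above: rows that
agree on every key column are merged by summing their counts:

import pandas as pd

vc = pd.DataFrame({"Selection_Id": [1, 1, 2], "Count": [10, 5, 7]})
group_cols = [c for c in vc.columns if c != "Count"]
print(vc.groupby(group_cols).sum().reset_index())
# Selection_Id 1 -> Count 15; Selection_Id 2 -> Count 7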
Example #7
def add_contest_id(df: pd.DataFrame, juris: jm.Jurisdiction, err: dict,
                   session: Session) -> (pd.DataFrame, dict):
    """Append Contest_Id and contest_type columns, filling contest_type
    correctly. Drop rows which match neither a BallotMeasure nor a Candidate
    contest."""
    working = df.copy()

    # add Contest_Id and contest_type
    df_for_type = dict()
    w_for_type = dict()
    df_contest = pd.read_sql_table("Contest", session.bind)
    for c_type in ["BallotMeasure", "Candidate"]:
        if f"{c_type}Contest_raw" in working.columns:
            # restrict df_contest to the contest_type <c_type> and get the <c_type>Contest_Id
            df_for_type[c_type] = df_contest[df_contest.contest_type == c_type]
            none_or_unknown_id = db.name_to_id(session, f"{c_type}Contest",
                                               "none or unknown")
            working, new_err = replace_raw_with_internal_ids(
                working,
                juris,
                df_for_type[c_type],
                f"{c_type}Contest",
                "Name",
                err,
                drop_unmatched=False,
                unmatched_id=none_or_unknown_id,
                drop_all_ok=True,
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])
            # restrict working to the contest_type <c_type>, add contest_type column
            w_for_type[c_type] = working[
                working[f"{c_type}Contest"] != "none or unknown"]
            w_for_type[c_type] = add_constant_column(
                w_for_type[c_type], "contest_type",
                c_type).rename(columns={f"{c_type}Contest_Id": "Contest_Id"})

            # drop text column
            w_for_type[c_type] = w_for_type[c_type].drop(f"{c_type}Contest",
                                                         axis=1)
        else:
            w_for_type[c_type] = pd.DataFrame()

    # FIXME: check somewhere that no name (other than 'none or unknown') is shared by BMContests and CandidateContests
    # TODO check this also when juris files loaded, to save time for user

    # drop obsolete columns
    if w_for_type["BallotMeasure"].empty:
        working_temp = w_for_type["Candidate"]
    elif w_for_type["Candidate"].empty:
        working_temp = w_for_type["BallotMeasure"]
    else:
        common_cols = [
            c for c in w_for_type["BallotMeasure"].columns
            if c in w_for_type["Candidate"].columns
        ]
        for c_type in ["BallotMeasure", "Candidate"]:
            w_for_type[c_type] = w_for_type[c_type][common_cols]

        # assemble working from the two pieces
        working_temp = pd.concat(
            [w_for_type[ct] for ct in ["BallotMeasure", "Candidate"]])

    # fail if no contests were recognized
    if working_temp.empty:
        err = ui.add_new_error(err, "jurisdiction", juris.short_name,
                               "No contests recognized.")
    else:
        working = working_temp
    return working, err
Example #8
def munge_and_melt(mu: jm.Munger, raw: pd.DataFrame, count_cols: List[str],
                   err: Optional[dict]) -> (pd.DataFrame, Optional[dict]):
    """Does not alter raw; returns transformation of raw:
     all row- and column-sourced mungeable info into columns (but doesn't translate via dictionary)
    new column names are, e.g., ReportingUnit_raw, Candidate_raw, etc.
    """
    working = raw.copy()

    # melt all column (multi-) index info into columns
    non_count_cols = [x for x in working.columns if x not in count_cols]
    working = working.melt(id_vars=non_count_cols)

    # ensure all columns have string names
    # (i.e., get rid of any tuples from column multi-index)
    new_col_index = [
        c[mu.options["field_name_row"]] if isinstance(c, tuple) else c
        for c in working.columns
    ]
    working.columns = new_col_index

    #  if only one header row, rename variable to variable_0 for consistency
    working.rename(columns={"variable": "variable_0"}, inplace=True)

    # clean and append "_SOURCE" to each original non-count column name
    working, new_err = munge_clean(working, mu, ["value"])
    if new_err:
        err = ui.consolidate_errors([err, new_err])
        if ui.fatal_error(new_err):
            return working, err

    # NB: if there is just one numerical column, melt still creates dummy variable col
    #  in which each value is 'value'

    # rename value to Count
    #  NB: any unnecessary numerical cols (e.g., Contest Group ID) will not matter,
    #  as they will be missing from dictionary.txt and hence will be ignored.
    working.rename(columns={"value": "Count"}, inplace=True)

    # apply munging formula from row sources (after renaming fields in raw formula as necessary)
    for t in mu.cdf_elements[mu.cdf_elements.source == "row"].index:
        working, new_err = add_munged_column(working, mu, t, None, mode="row")
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return working, err

    # remove original row-munge columns
    munged = [x for x in working.columns if x[-7:] == "_SOURCE"]
    working.drop(munged, axis=1, inplace=True)

    # apply munge formulas for column sources
    for t in mu.cdf_elements[mu.cdf_elements.source == "column"].index:
        working, new_err = add_munged_column(working,
                                             mu,
                                             t,
                                             None,
                                             mode="column")
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return working, err

    # remove unnecessary columns
    not_needed = [c for c in working.columns if c[:9] == "variable_"]
    working.drop(not_needed, axis=1, inplace=True)

    return working, err
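
A toy illustration (invented column names) of the tuple-flattening step that
reduces melted multi-index column names to plain strings:

# assumes field_name_row selects the header level holding the field names
field_name_row = 1
cols = [("Gov", "County"), ("Gov", "Precinct"), "value"]
flat = [c[field_name_row] if isinstance(c, tuple) else c for c in cols]
print(flat)  # ['County', 'Precinct', 'value']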
Example #9
def add_column_from_formula(
    working: pd.DataFrame,
    formula: str,
    new_col: str,
    err: Optional[dict],
    munger_name: str,
    suffix=None,
) -> (pd.DataFrame, Optional[dict]):
    """If <suffix> is given, add it to each field in the formula
    If formula is enclosed in braces, parse first entry as formula, second as a
    regex (with one parenthesized group) as a recipe for pulling the value via regex analysis
    """
    w = working.copy()
    #  for each {} pair in the formula, create a new column
    # (assuming formula is well-formed)
    brace_pattern = re.compile(r"{<([^,]*)>,([^{}]*|[^{}]*{[^{}]*}[^{}]*)}")

    try:
        temp_cols = []
        for x in brace_pattern.finditer(formula):
            # create a new column with the extracted info
            old_col, pattern_str = x.groups()
            temp_col = f"extracted_from_{old_col}"
            w, new_err = add_regex_column(w, old_col, temp_col, pattern_str)
            # change the formula to use the temp column
            formula = formula.replace(f"{{<{old_col}>,{pattern_str}}}",
                                      f"<{temp_col}>")
            if new_err:
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return w, err
            temp_cols.append(temp_col)
        # once all {} pairs are gone, use concatenation to build the column to be returned
        text_field_list, last_text = text_fragments_and_fields(formula)

        # add suffix, if required
        if suffix:
            text_field_list = [(t, f"{f}{suffix}")
                               for (t, f) in text_field_list]

        # add column to <working> dataframe via the concatenation formula
        if last_text:
            w.loc[:, new_col] = last_text[0]
        else:
            w.loc[:, new_col] = ""
        text_field_list.reverse()
        for t, f in text_field_list:
            try:
                w.loc[:, new_col] = (w.loc[:, f].apply(lambda x: f"{t}{x}") +
                                     w.loc[:, new_col])
            except KeyError as ke:
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f"Expected transformed column '{f}' not found, "
                    f"perhaps because of mismatch between munger and results file.",
                )
                return w, err

    except Exception as e:
        err = ui.add_new_error(err, "system", "munge.add_column_from_formula",
                               f"Unexpected error: {e}")

    # delete temporary columns
    w.drop(temp_cols, axis=1, inplace=True)
    return w, err
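
A toy illustration (invented formula) of how the brace pattern splits each
{<field>,regex} pair into a source column and a one-group regex:

import re

brace_pattern = re.compile(r"{<([^,]*)>,([^{}]*|[^{}]*{[^{}]*}[^{}]*)}")
formula = r"{<County>,^(\d+) .*$}-<Precinct>"
for match in brace_pattern.finditer(formula):
    old_col, pattern_str = match.groups()
    print(old_col, "->", pattern_str)  # County -> ^(\d+) .*$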
Example #10
def run2(
    load_data: bool = True,
    dbname: Optional[str] = None,
    test_dir: Optional[str] = None,
    election_jurisdiction_list: Optional[list] = None,
) -> Optional[dict]:
    dl = None  # to keep syntax-checker happy

    err = None
    db_removed = False
    if not test_dir:
        # set the test_dir to the directory containing this file
        test_dir = Path(__file__).parent.absolute()

    # name the db
    if dbname is None:
        # create unique name for test database
        ts = datetime.datetime.now().strftime("%m%d_%H%M")
        dbname = f"test_{ts}"

    if load_data:
        get_testing_data(
            url="https://github.com/ElectionDataAnalysis/TestingData.git",
            results_dir="TestingData",
        )

    # restrict elections and jurisdictions to those given (if given)
    # otherwise use all in TestingData
    if not election_jurisdiction_list:
        election_jurisdiction_list = ui.election_juris_list("TestingData")

    if load_data:
        try:
            # Load the data
            dl = eda.DataLoader()
            dl.change_db(dbname)

            dl.change_dir("results_dir", "TestingData")
            err, success = dl.load_all(
                move_files=False,
                election_jurisdiction_list=election_jurisdiction_list)
            if not success:
                print("At least one file did not load correctly.")
                err, db_removed = optional_remove(dl, "TestingData")
        except Exception as exc:
            print(f"Exception occurred: {exc}")
            if dl:
                optional_remove(dl, "TestingData")
            err = ui.add_new_error(
                err,
                "file",
                "TestingData",
                f"Exception during data loading: {exc}",
            )
            return err

        if ui.fatal_error(err):
            optional_remove(dl, "TestingData")
            return err

    if not db_removed:
        result = ui.run_tests(
            test_dir,
            dbname,
            election_jurisdiction_list=election_jurisdiction_list)

        # remove all .ini files from TestingData, if the directory exists
        if os.path.isdir("TestingData"):
            par_files = [x for x in os.listdir("TestingData") if x[-4:] == ".ini"]
            for f in par_files:
                os.remove(os.path.join("TestingData", f))

        if load_data:
            err, db_removed = optional_remove(dl, "TestingData")
    return err
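
A hedged sketch of a typical invocation (the database name is a placeholder;
leaving election_jurisdiction_list as None uses everything in TestingData):

err = run2(load_data=True, dbname="test_local")
if err:
    print(err)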
Example #11
def check_munger_file_contents(munger_path: str, munger_file: str,
                               err: Optional[dict]) -> Optional[dict]:
    """Check whether the munger files are internally consistent."""
    munger_name = Path(munger_path).name
    if munger_file == "cdf_elements.txt":
        # read cdf_elements and format from files
        cdf_elements = pd.read_csv(
            os.path.join(munger_path, "cdf_elements.txt"),
            sep="\t",
            encoding="iso-8859-1",
        ).fillna("")

        # every source in cdf_elements must be either row or column
        bad_source = [
            x for x in cdf_elements.source if x not in ["row", "column"]
        ]
        if bad_source:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"Source(s) in cdf_elements.txt not recognized: {bad_source}",
            )

        # formulas have good syntax
        bad_formula = [
            x for x in cdf_elements.raw_identifier_formula.unique()
            if not m.good_syntax(x)
        ]
        if bad_formula:
            f_str = ",".join(bad_formula)
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"At least one formula in cdf_elements.txt has bad syntax: {f_str}",
            )

        # for each column-source record in cdf_elements.txt, the contents of
        # each bracket pair must be a number referencing a header row
        # (the character class keeps the check within a single <...> pair)
        p_not_just_digits = re.compile(r"<[^<>]*\D[^<>]*>")
        bad_column_formula = set()

        # TODO check: can this error out now?
        for i, r in cdf_elements[cdf_elements.source == "column"].iterrows():
            if p_not_just_digits.search(r["raw_identifier_formula"]):
                bad_column_formula.add(r["raw_identifier_formula"])
        if bad_column_formula:
            err = ui.add_new_error(
                err,
                "munger",
                munger_name,
                f"At least one column-source formula in cdf_elements.txt has bad syntax: {bad_column_formula}",
            )

    elif munger_file == "format.config":
        format_d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, "format.config"),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )

        # stop and return error if fatal
        if ui.fatal_error(err):
            return err

        # warn if encoding missing or is not recognized
        if "encoding" not in format_d.keys():
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"No encoding specified; iso-8859-1 will be used",
            )
        elif not format_d["encoding"] in ui.recognized_encodings:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                (f"Encoding {format_d['encoding']} in format file is not recognized;"
                 f"iso-8859-1 will be used"),
            )

        # check all parameters for flat files
        if format_d["file_type"] in ["txt", "csv", "xls"]:
            # Either field_name_row is a number, or field_names_if_no_field_name_row is a non-empty list
            if (not format_d["field_name_row"]) or (
                    not format_d["field_name_row"].isnumeric()):
                if (not format_d["field_names_if_no_field_name_row"]) or (len(
                        format_d["field_names_if_no_field_name_row"]) == 0):
                    err = ui.add_new_error(
                        err,
                        "munger",
                        munger_name,
                        (f"field_name_row is not an integer, "
                         f"but no field names are given in field_names_if_no_field_name_row."
                         ),
                    )

            # other entries in format.config are of correct type
            try:
                int(format_d["header_row_count"])
            except (TypeError, ValueError):
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f'header_row_count is not an integer:  {format_d["header_row_count"]}',
                )

        # check all parameters for concatenated blocks (e.g., Georgia ExpressVote output)
        elif format_d["file_type"] in ["concatenated-blocks"]:
            for key in [
                    "count_of_top_lines_to_skip",
                    "last_header_column_count",
                    "column_width",
            ]:
                try:
                    int(format_d[key])
                except (ValueError, TypeError):
                    err = ui.add_new_error(
                        err,
                        "munger",
                        munger_name,
                        f"{key} is not an integer:  {format_d[key]}",
                    )
    else:
        err = ui.add_new_error(
            err,
            "system",
            "juris_and_munger.check_munger_file_contents",
            f"Munger template file not recognized: {munger_file}",
        )

    return err
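
A toy illustration (invented formulas) of the column-formula check:

import re

p_not_just_digits = re.compile(r"<[^<>]*\D[^<>]*>")
assert not p_not_just_digits.search("<2>")        # digits only: passes
assert p_not_just_digits.search("<header_name>")  # non-digits: flagged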