Exemple #1
0
def grab_ini_files(results_dir, path_to_repo):
    """Copy stored .ini files into `results_dir`, then prune unusable ones.

    For every subdirectory name found in `results_dir`, the directory
    `<path_to_repo>/src/ini_files_for_results/<name>` (if it exists) is
    copied into `results_dir` itself. Afterwards every `*.ini` file sitting
    directly in `results_dir` is read, and any whose `results_file`
    parameter points at a file that does not exist in `results_dir` is
    deleted, with a message printed to stdout.

    An .ini file from which no `results_file` parameter could be read is
    reported and left in place (previously this raised `KeyError`).

    Args:
        results_dir: directory holding one subdirectory per jurisdiction
            and receiving the copied .ini files.
        path_to_repo: root of the repository containing
            `src/ini_files_for_results`.

    Returns:
        None
    """
    jurisdictions = [
        name for name in os.listdir(results_dir)
        if os.path.isdir(os.path.join(results_dir, name))
    ]
    path_to_ini = os.path.join(path_to_repo, "src", "ini_files_for_results")
    for j in jurisdictions:
        copy_path = os.path.join(path_to_ini, j)
        # only jurisdictions with stored .ini files have anything to copy
        if os.path.isdir(copy_path):
            copy_tree(copy_path, results_dir)

    par_files = [f for f in os.listdir(results_dir) if f.endswith(".ini")]

    # if the results file is not found, delete the .ini file & warn user
    for par_file in par_files:
        par_path = os.path.join(results_dir, par_file)
        d, _err = ui.get_runtime_parameters(
            required_keys=["results_file"],
            header="election_data_analysis",
            param_file=par_path,
        )

        # the parameter may be absent if the .ini file could not be parsed
        # or lacks the key; there is then no results file to look for
        results_file = d.get("results_file")
        if results_file is None:
            print(
                f"No results_file parameter found in .ini file (file left in place): {par_file}"
            )
            continue

        # delete any .ini files whose results file is not found
        if not os.path.isfile(os.path.join(results_dir, results_file)):
            print(
                f"File referenced in .ini file, but not found: {results_file}"
            )
            os.remove(par_path)
    return
Exemple #2
0
def read_munger_info_from_files(dir_path):
    """Read a munger's definition files from `dir_path`.

    Files read:
      * `aux_meta.txt` (optional): tab-separated, indexed by the column
        `abbreviated_file_name`. If absent, a placeholder dataframe built
        from `[[]]` is used instead.
      * `cdf_elements.txt`: tab-separated, indexed by the column `name`,
        with missing values replaced by empty strings. A `fields` column is
        added holding, for each row, the list of second items of the pairs
        returned (as first result) by `m.text_fragments_and_fields` applied
        to that row's `raw_identifier_formula`.
      * `format.config`: section `format`, read via
        `ui.get_runtime_parameters` and passed through `recast_options`.

    Args:
        dir_path: directory containing the munger files.

    Returns:
        list: `[cdf_elements, file_type, encoding, thousands_separator,
        aux_meta, options]`, where `encoding` defaults to "iso-8859-1" and
        `thousands_separator` is None when unspecified, "" or "None".
    """
    # create auxiliary dataframe
    if "aux_meta.txt" in os.listdir(dir_path):
        # if some elements are reported in separate files per auxiliary.txt file, read from file
        aux_meta = pd.read_csv(
            os.path.join(dir_path, "aux_meta.txt"),
            sep="\t",
            index_col="abbreviated_file_name",
        )
    else:
        # set auxiliary dataframe to empty
        aux_meta = pd.DataFrame([[]])

    # read cdf_element info
    cdf_elements = pd.read_csv(
        os.path.join(dir_path, "cdf_elements.txt"),
        sep="\t",
        index_col="name",
        encoding="iso-8859-1",
        quoting=csv.QUOTE_MINIMAL,
    ).fillna("")

    # add column for list of fields used in formulas; the whole column is
    # built in one pass so that no list is shared between rows and no
    # per-cell assignment of a list value is needed
    fields_by_row = []
    for formula in cdf_elements["raw_identifier_formula"]:
        text_field_list, _last_text = m.text_fragments_and_fields(formula)
        fields_by_row.append([f for _t, f in text_field_list])
    cdf_elements["fields"] = fields_by_row

    # read formatting info
    # NOTE(review): the error/missing-parameter information returned as the
    # second item is not inspected here -- confirm callers validate the
    # munger beforehand.
    options, _missing_required_params = ui.get_runtime_parameters(
        required_keys=munger_pars_req,
        param_file=os.path.join(dir_path, "format.config"),
        header="format",
        optional_keys=list(munger_pars_opt.keys()),
    )
    options = recast_options(options, munger_pars_opt)

    file_type = options["file_type"]
    encoding = options.get("encoding", "iso-8859-1")

    # an absent, empty or literal "None" separator all mean "no separator"
    thousands_separator = options.get("thousands_separator")
    if thousands_separator in ("", "None"):
        thousands_separator = None

    # TODO have options hold all optional parameters (and maybe even all parameters)
    #  and remove explicit attributes entirely?
    return [
        cdf_elements, file_type, encoding, thousands_separator, aux_meta,
        options
    ]
Exemple #3
0
def check_munger_file_format(munger_path: str, munger_file: str,
                             templates: str, err: dict) -> dict:
    """Check the format of one munger file and return the error record.

    * `cdf_elements.txt` is accepted without any check.
    * `format.config` is read through `ui.get_runtime_parameters` (section
      `format`), which returns the updated error record.
    * Any other file name is recorded as a "munger" error.

    `templates` is accepted for interface compatibility but not used here.
    """
    # nothing to check for cdf_elements.txt now that entries may vary
    if munger_file == "cdf_elements.txt":
        return err

    if munger_file != "format.config":
        return ui.add_new_error(
            err,
            "munger",
            munger_path,
            f"Unrecognized file in munger: {munger_file}",
        )

    # only the error record is of interest; the parsed parameters are dropped
    _params, checked_err = ui.get_runtime_parameters(
        required_keys=munger_pars_req,
        param_file=os.path.join(munger_path, munger_file),
        header="format",
        err=err,
        optional_keys=list(munger_pars_opt.keys()),
    )
    return checked_err
Exemple #4
0
def check_munger_file_format(munger_path: str, munger_file: str,
                             templates: str, err: dict) -> dict:
    """Check one munger file against its expected format.

    * A `.txt` file is read as tab-separated text from both `munger_path`
      and `templates`; a "munger" error is recorded if the two sets of
      column names differ.
    * `format.config` is read through `ui.get_runtime_parameters` (section
      `format`), which returns the updated error record.
    * Any other file name is recorded as a "munger" error.

    Returns the (possibly updated) error record.
    """
    if munger_file.endswith(".txt"):
        read_opts = {"sep": "\t", "encoding": "iso-8859-1"}
        munger_df = pd.read_csv(
            os.path.join(munger_path, munger_file), **read_opts)
        template_df = pd.read_csv(
            os.path.join(templates, munger_file), **read_opts)

        # column order is irrelevant; only the set of names must agree
        if set(munger_df.columns) == set(template_df.columns):
            return err
        message = (
            f"Columns in {munger_file} do not match template.:\n"
            f"Columns of {munger_file}: {munger_df.columns}\n"
            f"Columns of template: {template_df.columns}"
        )
        return ui.add_new_error(err, "munger", munger_path, message)

    if munger_file == "format.config":
        # only the error record is of interest; the parameters are dropped
        _params, checked_err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, munger_file),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )
        return checked_err

    return ui.add_new_error(
        err,
        "munger",
        munger_path,
        f"Unrecognized file in munger: {munger_file}",
    )
Exemple #5
0
def check_munger_file_contents(munger_path, munger_file, err):
    """Check whether a munger file is internally consistent.

    Args:
        munger_path: directory of the munger; its final path component is
            used as the munger name in error records.
        munger_file: "cdf_elements.txt" or "format.config"; any other name
            is recorded as a "system" error.
        err: error record, passed to and returned from the `ui` helpers.

    Returns:
        The (possibly updated) error record.
    """
    munger_name = Path(munger_path).name
    if munger_file == "cdf_elements.txt":
        return _check_cdf_elements_contents(munger_path, munger_name, err)
    if munger_file == "format.config":
        return _check_format_config_contents(munger_path, munger_name, err)
    return ui.add_new_error(
        err,
        "system",
        "juris_and_munger.check_munger_file_contents",
        f"Munger template file not recognized: {munger_file}",
    )


def _check_cdf_elements_contents(munger_path, munger_name, err):
    """Check the sources and formulas listed in the munger's cdf_elements.txt."""
    cdf_elements = pd.read_csv(
        os.path.join(munger_path, "cdf_elements.txt"),
        sep="\t",
        encoding="iso-8859-1",
    ).fillna("")

    # every source in cdf_elements must be either row or column
    # NOTE(review): an earlier comment also mentioned "other" as a valid
    # source, but only these two values have ever been accepted here.
    bad_source = [
        x for x in cdf_elements.source if x not in ("row", "column")
    ]
    if bad_source:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger_name,
            f"Source(s) in cdf_elements.txt not recognized: {bad_source}",
        )

    # formulas have good syntax
    bad_formula = [
        x for x in cdf_elements.raw_identifier_formula.unique()
        if not m.good_syntax(x)
    ]
    if bad_formula:
        f_str = ",".join(str(x) for x in bad_formula)
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger_name,
            f"At least one formula in cdf_elements.txt has bad syntax: {f_str}",
        )

    # for each column-source record in cdf_element, contents of bracket
    # must be digits only; flag any <...> containing a non-digit
    p_not_just_digits = re.compile(r"<.*\D.*>")
    column_formulas = cdf_elements.loc[
        cdf_elements.source == "column", "raw_identifier_formula"
    ]
    # formulas are coerced to str for the search so that a column parsed
    # as numeric cannot make the regex call raise
    bad_column_formula = {
        formula for formula in column_formulas
        if p_not_just_digits.search(str(formula))
    }
    if bad_column_formula:
        err = ui.add_new_error(
            err,
            "munger",
            munger_name,
            f"At least one column-source formula in cdf_elements.txt has bad syntax: {bad_column_formula}",
        )
    return err


def _check_format_config_contents(munger_path, munger_name, err):
    """Check the parameters in the munger's format.config for presence and type."""
    format_d, err = ui.get_runtime_parameters(
        required_keys=munger_pars_req,
        param_file=os.path.join(munger_path, "format.config"),
        header="format",
        err=err,
        optional_keys=list(munger_pars_opt.keys()),
    )

    # stop and return error if fatal
    if ui.fatal_error(err):
        return err

    # warn if encoding missing or is not recognized
    if "encoding" not in format_d:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger_name,
            "No encoding specified; iso-8859-1 will be used",
        )
    elif format_d["encoding"] not in ui.recognized_encodings:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger_name,
            (f"Encoding {format_d['encoding']} in format file is not recognized; "
             f"iso-8859-1 will be used"),
        )

    # Parameters below are read with .get(): an absent parameter is then
    # reported through the error record instead of raising KeyError.
    file_type = format_d.get("file_type")

    # check all parameters for flat files
    if file_type in ["txt", "csv", "xls"]:
        # Either field_name_row is a number, or field_names_if_no_field_name_row is non-empty
        field_name_row = format_d.get("field_name_row")
        if (not field_name_row) or (not str(field_name_row).isnumeric()):
            if not format_d.get("field_names_if_no_field_name_row"):
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    (f"field_name_row is not an integer, "
                     f"but no field names are given in field_names_if_no_field_name_row."
                     ),
                )

        # other entries in format.config are of correct type
        header_row_count = format_d.get("header_row_count")
        try:
            int(header_row_count)
        except (TypeError, ValueError):
            err = ui.add_new_error(
                err,
                "munger",
                munger_name,
                f"header_row_count is not an integer:  {header_row_count}",
            )

    # check all parameters for concatenated blocks (e.g., Georgia ExpressVote output)
    elif file_type in ["concatenated-blocks"]:
        for key in [
                "count_of_top_lines_to_skip",
                "last_header_column_count",
                "column_width",
        ]:
            value = format_d.get(key)
            try:
                int(value)
            except (ValueError, TypeError):
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f"{key} is not an integer:  {value}",
                )
    return err