Example #1
def check_results_munger_compatibility(mu: Munger, df: pd.DataFrame, file_name,
                                       error: dict) -> dict:
    # check that count columns exist
    missing = [i for i in mu.options["count_columns"] if i >= df.shape[1]]
    if missing:
        error = ui.add_new_error(
            error,
            "munger",
            mu.name,
            f"Only {df.shape[1]} columns read from results file {file_name}. Check file_type in format.config",
        )
    else:
        # check that count cols are numeric
        for i in mu.options["count_columns"]:
            if not is_numeric_dtype(df.iloc[:, i]):
                try:
                    df.iloc[:, i] = df.iloc[:, i].astype(int)
                except ValueError as ve:
                    error = ui.add_new_error(
                        error,
                        "munger",
                        mu.name,
                        f"Column {i} ({df.columns[i]}) cannot be parsed as an integer.\n{ve}",
                    )
    return error
Example #2
def cast_cols_as_int(
    df: pd.DataFrame,
    col_list: list,
    mode="name",
    error_msg="",
    munger_name="unknown",
) -> (pd.DataFrame, dict):
    """recast columns as integer where possible, leaving columns with text entries as non-numeric)"""
    err = None
    if mode == "index":
        num_columns = [df.columns[idx] for idx in col_list]
    elif mode == "name":
        num_columns = [c for c in df.columns if c in col_list]
    else:
        err = ui.add_new_error(
            err,
            "system",
            "munge.cast_cols_as_int",
            f"Mode {mode} not recognized",
        )
        return df, err
    for c in num_columns:
        try:
            df[c] = df[c].astype("int64", errors="raise")
        except ValueError as e:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"{error_msg}\nColumn {c} cannot be cast as integer:\n{e}",
            )
    return df, err
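
# --- Added usage sketch (illustrative; not from the original source). It shows
# the cast-with-fallback pattern above in isolation, without the project's
# ui error plumbing.
import pandas as pd

demo = pd.DataFrame({"precinct": ["p1", "p2"], "votes": ["10", "x"]})
for c in ["votes"]:
    try:
        demo[c] = demo[c].astype("int64", errors="raise")
    except ValueError:
        pass  # mirrors the warn-munger branch: the column keeps its text entries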
Example #3
    def get_aux_data(self, aux_data_path, err) -> (dict, dict):
        """creates dictionary of dataframes, one for each auxiliary datafile.
        DataFrames returned are (multi-)indexed by the primary key(s)"""
        aux_data_dict = {}  # will hold a dataframe for each abbreviated file name

        field_list = list(set([x[0] for x in self.auxiliary_fields()]))
        for abbrev in field_list:
            # get munger for the auxiliary file
            munger_path = os.path.join(self.path_to_munger_dir, abbrev)
            aux_mu, mu_err = check_and_init_munger(munger_path)
            if ui.fatal_error(mu_err):
                err = ui.consolidate_errors([err, mu_err])
                return dict(), err

            # find the file in aux_data_path whose name contains the string <abbrev>
            aux_filename_list = [
                x for x in os.listdir(aux_data_path) if abbrev in x
            ]
            if len(aux_filename_list) == 0:
                # TODO check this error
                err = ui.add_new_error(
                    err,
                    "file",
                    aux_data_path,
                    f"No file found with name containing {abbrev}",
                )
                continue
            elif len(aux_filename_list) > 1:
                # TODO check this error
                err = ui.add_new_error(
                    err,
                    "file",
                    aux_data_path,
                    f"Too many files found with name containing {abbrev}",
                )
                continue
            aux_path = os.path.join(aux_data_path, aux_filename_list[0])

            # read and clean the auxiliary data file, including setting primary key columns as int
            df, err = ui.read_single_datafile(aux_mu, aux_path, err)

            # cast primary key(s) as int if possible, and set as (multi-)index
            primary_keys = self.aux_meta.loc[abbrev, "primary_key"].split(",")
            df, new_err = m.cast_cols_as_int(
                df,
                primary_keys,
                error_msg=f"In dataframe for {abbrev}",
                munger_name=aux_mu.name,
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return aux_data_dict, err

            df.set_index(primary_keys, inplace=True)

            aux_data_dict[abbrev] = df

        return aux_data_dict, err
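
# --- Added sketch (illustrative): how a comma-separated primary_key entry from
# aux_meta becomes a (multi-)index, using only pandas. The column names are
# hypothetical.
import pandas as pd

aux = pd.DataFrame({"county_id": [1, 1], "precinct_id": [7, 8], "name": ["a", "b"]})
primary_keys = "county_id,precinct_id".split(",")
aux.set_index(primary_keys, inplace=True)  # MultiIndex when more than one key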
Example #4
def check_munger_files(munger_path: str) -> dict:
    """Check that the munger files are complete and consistent with one another.
    Assumes munger directory exists. Assumes dictionary.txt is in the template file.
    <munger_path> is the path to the directory of the particular munger
    """

    err = None
    project_root = Path(__file__).parents[1].absolute()
    munger_name = Path(munger_path).name

    # check whether directory exists
    if not os.path.isdir(munger_path):
        err = ui.add_new_error(err, "munger", munger_name,
                               f"Directory does not exist: {munger_path}")
        return err

    # check whether all files exist
    templates = os.path.join(project_root, "juris_and_munger",
                             "munger_templates")
    template_with_extension_list = os.listdir(templates)
    for munger_file in template_with_extension_list:
        # TODO create optional template for aux_meta.txt
        cf_path = os.path.join(munger_path, munger_file)
        # if file does not already exist in munger dir, throw error
        file_exists = os.path.isfile(cf_path)

        # if file exists, check format against template and then contents
        if file_exists:
            err = check_munger_file_format(munger_path, munger_file, templates,
                                           err)

            # if no errors found so far, check contents
            if not ui.fatal_error(err,
                                  error_type_list=["munger"],
                                  name_key_list=[munger_file]):
                err = check_munger_file_contents(munger_path, munger_file, err)
        else:
            err = ui.add_new_error(err, "munger", munger_name,
                                   f"File does not exist: {munger_file}")

    # if the munger requires auxiliary data
    if os.path.isfile(os.path.join(munger_path, "aux_meta.txt")):
        # TODO check that each abbrev in aux_meta.txt has an associated sub_munger
        # check sub-mungers (in sub-directories of munger)
        sub_mungers = os.listdir(munger_path)
        for f in sub_mungers:
            sub_path = os.path.join(munger_path, f)
            if os.path.isdir(sub_path):
                new_err = check_munger_files(sub_path)
                if new_err:
                    err = ui.consolidate_errors([err, new_err])
    return err
Example #5
def write_element(juris_path: str,
                  element: str,
                  df: pd.DataFrame,
                  file_name=None) -> dict:
    """<juris> is path to jurisdiction directory. Info taken
    from <element>.txt file in that directory.
    <element>.txt is overwritten with info in <df>"""
    err = None
    if not file_name:
        file_name = f"{element}.txt"
    dupes_df, deduped = ui.find_dupes(df)
    if element == "dictionary":
        deduped = remove_empty_lines(deduped, element)
    try:
        deduped.drop_duplicates().fillna("").to_csv(
            os.path.join(juris_path, file_name),
            index=False,
            sep="\t",
        )
    except Exception as e:
        err = ui.add_new_error(
            err,
            "system",
            "preparation.write_element",
            f"Unexpected exception writing to file: {e}",
        )
    return err
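
# --- Added sketch (illustrative): the dedupe-and-overwrite write in isolation;
# ui.find_dupes and remove_empty_lines are project-specific and omitted here.
# The file name is hypothetical.
import pandas as pd

element_df = pd.DataFrame({"Name": ["X", "X", "Y"], "Type": ["a", "a", None]})
element_df.drop_duplicates().fillna("").to_csv(
    "Office.txt",
    index=False,
    sep="\t",
)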
Example #6
def munge_clean(raw: pd.DataFrame, munger: jm.Munger,
                count_columns_by_name: List[str]) -> (pd.DataFrame, dict):
    """Drop unnecessary columns.
    Append '_SOURCE' suffix to raw column names to avoid conflicts"""
    err = None
    working = raw.copy()
    working, count_columns_by_name, e = clean_column_names(
        working, count_cols=count_columns_by_name)
    try:
        #  define columns named in munger formulas (both plain from 'row' sourced info and
        #  'variable_j' from column-sourced)
        munger_formula_row_sourced = [
            x for x in munger.field_list if x in working.columns
        ]
        munger_formula_column_sourced = [
            f"variable_{j}" for j in range(munger.options["header_row_count"])
            if f"variable_{j}" in working.columns
        ]

        # keep columns named in munger formulas; keep count columns; drop all else.
        working = working[munger_formula_row_sourced +
                          munger_formula_column_sourced +
                          count_columns_by_name]

        # add suffix '_SOURCE' to certain columns to avoid any conflict with db table names
        # (since no db table name ends with _SOURCE)

        renamer = {x: f"{x}_SOURCE" for x in munger_formula_row_sourced}
        working.rename(columns=renamer, inplace=True)
    except Exception as e:
        err = ui.add_new_error(err, "system", "munge.munge_clean",
                               f"Unexpected error: {e}")
    return working, err
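
# --- Added sketch (illustrative): the '_SOURCE' suffixing in isolation, with a
# hypothetical formula-named column.
import pandas as pd

work = pd.DataFrame({"county": ["Adams"], "votes": [10]})
renamer = {x: f"{x}_SOURCE" for x in ["county"]}  # formula-named columns only
work.rename(columns=renamer, inplace=True)  # count columns keep their names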
Example #7
def read_xml(
    f_path: str,
    munger: jm.Munger,
    err: Optional[Dict],
) -> (pd.DataFrame, Optional[Dict]):
    """Create dataframe from the xml file, with column names matching the fields in the raw_identifier formulas.
    Skip nodes whose tags are unrecognized"""

    # read data from file
    try:
        tree = et.parse(f_path)
    except FileNotFoundError:
        err = ui.add_new_error(err, "file",
                               Path(f_path).name, "File not found")
        return pd.DataFrame(), err
    except et.ParseError as pe:
        err = ui.add_new_error(err, "file",
                               Path(f_path).name, f"XML parse error: {pe}")
        return pd.DataFrame(), err

    # identify tags with counts or other raw data (the info we want)
    # and list data to be pulled from each tag
    # TODO tech debt: simplify
    fields = set(munger.options["count_columns_by_name"]).union(
        munger.field_list)
    tags = {f.split(".")[0] for f in fields}
    # if munger has nesting tags in format.config
    if munger.options["nesting_tags"] is not None:
        tags.update(munger.options["nesting_tags"])
    attributes = {
        t: [x.split(".")[1] for x in fields if x.split(".")[0] == t]
        for t in tags
    }

    try:
        root = tree.getroot()
        results_list = results_below(root, tags, attributes)
        raw_results = pd.DataFrame(results_list)
        for c in munger.options["count_columns_by_name"]:
            raw_results[c] = pd.to_numeric(raw_results[c], errors="coerce")
        raw_results, err_df = m.clean_count_cols(
            raw_results,
            munger.options["count_columns_by_name"],
        )
        if not err_df.empty:
            err = ui.add_err_df(err, err_df, munger, f_path)
    except Exception as e:
        err = ui.add_new_error(err, "munger", munger.name,
                               f"Error reading xml: {e}")
        raw_results = pd.DataFrame()
    return raw_results, err
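
# --- Added sketch (illustrative): results_below is not shown above, so this is
# a minimal, hypothetical recursion in the same spirit -- carry recognized
# attributes down the tree and emit one flat record per leaf.
import xml.etree.ElementTree as et

def walk(node, tags, attributes, inherited, out):
    current = dict(inherited)
    if node.tag in tags:
        for a in attributes[node.tag]:
            if a in node.attrib:
                current[f"{node.tag}.{a}"] = node.attrib[a]
    children = list(node)
    if children:
        for child in children:
            walk(child, tags, attributes, current, out)
    else:
        out.append(current)

root = et.fromstring('<county name="C"><precinct name="P" votes="3"/></county>')
rows = []
walk(root, {"county", "precinct"},
     {"county": ["name"], "precinct": ["name", "votes"]}, {}, rows)
# rows == [{'county.name': 'C', 'precinct.name': 'P', 'precinct.votes': '3'}]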
Example #8
def add_munged_column(
    raw: pd.DataFrame,
    munger: jm.Munger,
    element: str,
    err: Optional[dict],
    mode: str = "row",
    inplace: bool = True,
) -> (pd.DataFrame, dict):
    """Alters dataframe <raw>, adding or redefining <element>_raw column
    via the <formula>. Assumes "_SOURCE" has been appended to all columns of raw
    Does not alter row count."""
    if raw.empty:
        return raw, err
    if inplace:
        working = raw
    else:
        working = raw.copy()

    try:
        formula = munger.cdf_elements.loc[element, "raw_identifier_formula"]
        if mode == "row":
            for field in munger.field_list:
                formula = formula.replace(f"<{field}>", f"<{field}_SOURCE>")
        elif mode == "column":
            for i in range(munger.options["header_row_count"]):
                formula = formula.replace(f"<{i}>", f"<variable_{i}>")

        working, new_err = add_column_from_formula(working, formula,
                                                   f"{element}_raw", err,
                                                   munger.name)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return working, err

        # correct any disambiguated names back to the original
        if element in munger.alt.keys():
            working.replace({f"{element}_raw": munger.alt[element]},
                            inplace=True)

    except Exception as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error interpreting formula for {element} in cdf_element.txt. {e}",
        )
        return working, err

    # compress whitespace for <element>_raw
    working.loc[:, f"{element}_raw"] = working[f"{element}_raw"].apply(
        compress_whitespace)
    return working, err
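
# --- Added sketch (illustrative): the row-mode formula rewrite in isolation,
# with a hypothetical raw_identifier_formula and field list.
formula = "<county>;<precinct>"
for field in ["county", "precinct"]:  # stands in for munger.field_list
    formula = formula.replace(f"<{field}>", f"<{field}_SOURCE>")
assert formula == "<county_SOURCE>;<precinct_SOURCE>"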
Example #9
def read_nested_json(f_path: str, munger: jm.Munger,
                     err: Optional[Dict]) -> (pd.DataFrame, Optional[Dict]):
    """
    Create dataframe from a nested json file, by traversing the json dictionary
    recursively, similar to the case of xml.
    """

    # read data from file
    try:
        with open(f_path, 'r') as f:
            j = json.load(f)
    except FileNotFoundError:
        traceback.print_exc()
        err = ui.add_new_error(err, "file",
                               Path(f_path).name, "File not found")
        return pd.DataFrame(), err

    # Identify keys for counts and other raw data (attributes) we want
    count_keys = set(munger.options["count_columns_by_name"])
    attribute_keys = set(munger.field_list)

    try:
        current_values = {}
        results_list = json_results_below(j, count_keys, attribute_keys,
                                          current_values)
        raw_results = pd.DataFrame(results_list)
        for c in munger.options["count_columns_by_name"]:
            raw_results[c] = pd.to_numeric(raw_results[c], errors="coerce")
        raw_results, err_df = m.clean_count_cols(
            raw_results,
            munger.options["count_columns_by_name"],
        )
        if not err_df.empty:
            err = ui.add_err_df(err, err_df, munger, f_path)
    except Exception as e:
        traceback.print_exc()
        err = ui.add_new_error(err, "munger", munger.name,
                               f"Error reading xml: {e}")
        raw_results = pd.DataFrame()
    return raw_results, err
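
# --- Added sketch (illustrative): json_results_below is not shown above, so
# this is a minimal, hypothetical recursion in the same spirit.
def walk_json(node, count_keys, attribute_keys, inherited, out):
    if isinstance(node, dict):
        current = dict(inherited)
        current.update({k: v for k, v in node.items() if k in attribute_keys})
        if any(k in node for k in count_keys):
            row = dict(current)
            row.update({k: node[k] for k in count_keys if k in node})
            out.append(row)
        for v in node.values():
            walk_json(v, count_keys, attribute_keys, current, out)
    elif isinstance(node, list):
        for item in node:
            walk_json(item, count_keys, attribute_keys, inherited, out)

rows = []
walk_json(
    {"contest": "Senate", "results": [{"candidate": "A", "votes": 10}]},
    {"votes"}, {"contest", "candidate"}, {}, rows,
)
# rows == [{'contest': 'Senate', 'candidate': 'A', 'votes': 10}]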
Example #10
def check_munger_file_format(munger_path: str, munger_file: str,
                             templates: str, err: dict) -> dict:

    if munger_file[-4:] == ".txt":
        cf_df = pd.read_csv(os.path.join(munger_path, munger_file),
                            sep="\t",
                            encoding="iso-8859-1")
        temp = pd.read_csv(os.path.join(templates, munger_file),
                           sep="\t",
                           encoding="iso-8859-1")

        # check column names are correct
        if set(cf_df.columns) != set(temp.columns):
            err = ui.add_new_error(
                err,
                "munger",
                munger_path,
                f"Columns in {munger_file} do not match template.:\n"
                f"Columns of {munger_file}: {cf_df.columns}\n"
                f"Columns of template: {temp.columns}",
            )

    elif munger_file == "format.config":
        d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, munger_file),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )
    else:
        err = ui.add_new_error(
            err,
            "munger",
            munger_path,
            f"Unrecognized file in munger: {munger_file}",
        )
    return err
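
# --- Added sketch (illustrative): the column-set comparison against a template,
# with toy frames standing in for the munger file and its template.
import pandas as pd

cf_df = pd.DataFrame(columns=["name", "source"])
temp = pd.DataFrame(columns=["name", "source", "raw_identifier_formula"])
if set(cf_df.columns) != set(temp.columns):
    print("columns do not match template")  # the real code records a munger error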
Example #11
def read_alternate_munger(file_type: str, f_path: str, munger: jm.Munger,
                          err: Optional[dict]) -> (pd.DataFrame, dict):
    if file_type in ["concatenated-blocks"]:
        raw_results, err = read_concatenated_blocks(f_path, munger, err)
    elif file_type in ["xls-multi"]:
        raw_results, err = read_multi_sheet_excel(f_path, munger, err)
    elif file_type in ["xml"]:
        raw_results, err = read_xml(f_path, munger, err)
    elif file_type in ["json-nested"]:
        raw_results, err = read_nested_json(f_path, munger, err)
    else:
        err = ui.add_new_error(err, "munger", munger.name,
                               f"file type not recognized: {file_type}")
        raw_results = pd.DataFrame()

    # clean the raw results
    raw_results, err_df = m.clean_count_cols(raw_results, ["count"])
    if not err_df.empty:
        err = ui.add_new_error(err, "warn-file",
                               Path(f_path).name,
                               f"Some counts not read, set to 0")
    str_cols = [c for c in raw_results.columns if c != "count"]
    raw_results = m.clean_strings(raw_results, str_cols)
    return raw_results, err
Example #12
def get_ids_for_foreign_keys(session, df1, element, foreign_key, refs,
                             load_refs, error):
    """ TODO <fn> is foreign key"""
    df = df1.copy()
    # append the Id corresponding to <fn> from the db
    foreign_elt = f"{foreign_key[:-3]}"
    interim = f"{foreign_elt}_Name"

    target_list = []
    for r in refs:
        ref_name_field = db.get_name_field(r)

        r_target = pd.read_sql_table(r, session.bind)[["Id", ref_name_field]]
        r_target.rename(columns={
            "Id": foreign_key,
            ref_name_field: interim
        },
                        inplace=True)

        target_list.append(r_target)

    target = pd.concat(target_list)

    df = df.merge(target, how="left", left_on=foreign_elt, right_on=interim)

    # TODO might have to check for '' or 0 as well as nulls
    missing = df[(df[foreign_elt].notnull()) & (df[interim].isnull())]
    if missing.empty:
        df = df.drop([interim], axis=1)
    else:
        if load_refs:
            # signal the caller that the missing referenced records still need to be loaded
            raise ForeignKeyException(
                f"For some {element} records, {foreign_elt} was not found")
        else:
            if element not in error:
                error = ui.add_new_error(
                    error,
                    "system",
                    "juris_and_munger.get_ids_for_foreign_keys",
                    f"For some {element} records, {foreign_elt} was not found",
                )
    return df
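
# --- Added sketch (illustrative): the merge-then-detect-missing pattern above,
# with a toy frame standing in for the db table read.
import pandas as pd

records = pd.DataFrame({"Office": ["Senate", "Mayor"]})
target = pd.DataFrame({"Office_Id": [1], "Office_Name": ["Senate"]})
merged = records.merge(target, how="left", left_on="Office", right_on="Office_Name")
missing = merged[merged["Office"].notnull() & merged["Office_Name"].isnull()]
# missing holds the "Mayor" row: present in the data, absent from the reference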
Example #13
def check_munger_file_format(munger_path: str, munger_file: str,
                             templates: str, err: dict) -> dict:

    if munger_file == "cdf_elements.txt":
        pass  # nothing to check now that entries may vary
    elif munger_file == "format.config":
        d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, munger_file),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )
    else:
        err = ui.add_new_error(
            err,
            "munger",
            munger_path,
            f"Unrecognized file in munger: {munger_file}",
        )
    return err
Example #14
def ensure_juris_files(juris_path, ignore_empty=False) -> dict:
    """Check that the jurisdiction files are complete and consistent with one another.
    Check for extraneous files in Jurisdiction directory.
    Assumes Jurisdiction directory exists. Assumes dictionary.txt is in the template file"""

    # package possible errors from this function into a dictionary and return them
    err = None
    juris_name = Path(juris_path).name

    project_root = Path(__file__).parents[1].absolute()
    templates_dir = os.path.join(project_root, "juris_and_munger",
                                 "jurisdiction_templates")
    # notify user of any extraneous files
    extraneous = [
        f for f in os.listdir(juris_path)
        if f not in os.listdir(templates_dir) and f[0] != "."
    ]
    if extraneous:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris_name,
            f"extraneous_files_in_juris_directory {extraneous}",
        )

    template_list = [x[:-4] for x in os.listdir(templates_dir)]

    # reorder template_list, so that first things are created first
    ordered_list = [
        "dictionary", "ReportingUnit", "Office", "CandidateContest"
    ]
    template_list = ordered_list + [
        x for x in template_list if x not in ordered_list
    ]

    # ensure all necessary files exist
    for juris_file in template_list:
        # a list of file empty errors
        cf_path = os.path.join(juris_path, f"{juris_file}.txt")
        created = False
        # if file does not already exist in jurisdiction directory, create from template and invite user to fill
        try:
            temp = pd.read_csv(
                os.path.join(templates_dir, f"{juris_file}.txt"),
                sep="\t",
                encoding="iso-8859-1",
            )
        except pd.errors.EmptyDataError:
            if not ignore_empty:
                err = ui.add_new_error(
                    err,
                    "system",
                    "juris_and_munger.ensure_juris_files",
                    "Template file {" + juris_file + "}.txt has no contents",
                )
            temp = pd.DataFrame()
        # if file does not exist
        if not os.path.isfile(cf_path):
            # create the file
            temp.to_csv(cf_path, sep="\t", index=False)
            created = True

        # if file exists, check format against template
        if not created:
            cf_df = pd.read_csv(
                os.path.join(juris_path, f"{juris_file}.txt"),
                sep="\t",
                encoding="iso=8859-1",
                quoting=csv.QUOTE_MINIMAL,
            )
            if set(cf_df.columns) != set(temp.columns):
                cols = "\t".join(temp.columns.to_list())
                err = ui.add_new_error(
                    err,
                    "jurisdiction",
                    juris_name,
                    f"Columns of {juris_file}.txt need to be (tab-separated):\n {cols}\n",
                )

            if juris_file == "dictionary":
                # dedupe the dictionary
                dedupe(cf_path)
            else:
                # dedupe the file
                dedupe(cf_path)
                # check for problematic null entries
                null_columns = check_nulls(juris_file, cf_path, project_root)
                if null_columns:
                    err = ui.add_new_error(
                        err,
                        "jurisdiction",
                        juris_name,
                        f"Null entries in {juris_file} in columns {null_columns}",
                    )

    # check dependencies
    for juris_file in [
            x for x in template_list if x != "remark" and x != "dictionary"
    ]:
        d, new_err = check_dependencies(juris_path, juris_file)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
    return err
Example #15
def check_munger_file_contents(munger_path, munger_file, err):
    """check whether munger files are internally consistent"""
    munger_name = Path(munger_path).name
    if munger_file == "cdf_elements.txt":
        # read cdf_elements and format from files
        cdf_elements = pd.read_csv(
            os.path.join(munger_path, "cdf_elements.txt"),
            sep="\t",
            encoding="iso-8859-1",
        ).fillna("")

        # every source in cdf_elements should be either 'row' or 'column'
        bad_source = [
            x for x in cdf_elements.source if x not in ["row", "column"]
        ]
        if bad_source:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"Source(s) in cdf_elements.txt not recognized: {bad_source}",
            )

        # formulas have good syntax
        bad_formula = [
            x for x in cdf_elements.raw_identifier_formula.unique()
            if not m.good_syntax(x)
        ]
        if bad_formula:
            f_str = ",".join(bad_formula)
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"At least one formula in cdf_elements.txt has bad syntax: {f_str}",
            )

        # for each column-source record in cdf_element, contents of bracket are numbers in the header_rows
        p_not_just_digits = re.compile(r"<.*\D.*>")
        bad_column_formula = set()

        # TODO check: can this error out now?
        for i, r in cdf_elements[cdf_elements.source == "column"].iterrows():
            if p_not_just_digits.search(r["raw_identifier_formula"]):
                bad_column_formula.add(r["raw_identifier_formula"])
        if bad_column_formula:
            err = ui.add_new_error(
                err,
                "munger",
                munger_name,
                f"At least one column-source formula in cdf_elements.txt has bad syntax: {bad_column_formula}",
            )

    elif munger_file == "format.config":
        format_d, err = ui.get_runtime_parameters(
            required_keys=munger_pars_req,
            param_file=os.path.join(munger_path, "format.config"),
            header="format",
            err=err,
            optional_keys=list(munger_pars_opt.keys()),
        )

        # stop and return error if fatal
        if ui.fatal_error(err):
            return err

        # warn if encoding missing or is not recognized
        if "encoding" not in format_d.keys():
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                f"No encoding specified; iso-8859-1 will be used",
            )
        elif not format_d["encoding"] in ui.recognized_encodings:
            err = ui.add_new_error(
                err,
                "warn-munger",
                munger_name,
                (f"Encoding {format_d['encoding']} in format file is not recognized;"
                 f"iso-8859-1 will be used"),
            )

        # check all parameters for flat files
        if format_d["file_type"] in ["txt", "csv", "xls"]:
            # Either field_name_row is a number, or field_names_if_no_field_name_row is a non-empty list
            if (not format_d["field_name_row"]) or (
                    not format_d["field_name_row"].isnumeric()):
                if (not format_d["field_names_if_no_field_name_row"]) or (len(
                        format_d["field_names_if_no_field_name_row"]) == 0):
                    err = ui.add_new_error(
                        err,
                        "munger",
                        munger_name,
                        (f"field_name_row is not an integer, "
                         f"but no field names are given in field_names_if_no_field_name_row."
                         ),
                    )

            # other entries in format.config are of correct type
            try:
                int(format_d["header_row_count"])
            except (TypeError, ValueError):
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f'header_row_count is not an integer:  {format_d["header_row_count"]}',
                )

        # check all parameters for concatenated blocks (e.g., Georgia ExpressVote output)
        elif format_d["file_type"] in ["concatenated-blocks"]:
            for key in [
                    "count_of_top_lines_to_skip",
                    "last_header_column_count",
                    "column_width",
            ]:
                try:
                    int(format_d[key])
                except (ValueError, TypeError):
                    err = ui.add_new_error(
                        err,
                        "munger",
                        munger_name,
                        f"{key} is not an integer:  {format_d[key]}",
                    )
    else:
        err = ui.add_new_error(
            err,
            "system",
            "juris_and_munger.check_munger_file_contents",
            f"Munger template file not recognized: {munger_file}",
        )

    return err
Example #16
def read_multi_sheet_excel(
    f_path: str,
    munger: jm.Munger,
    err: dict,
) -> (pd.DataFrame, dict):
    # get munger parameters
    sheets_to_skip = munger.options["sheets_to_skip"]
    count_of_top_lines_to_skip = munger.options["count_of_top_lines_to_skip"]
    constant_line_count = munger.options["constant_line_count"]
    constant_column_count = munger.options["constant_column_count"]
    header_row_count = munger.options["header_row_count"]
    columns_to_skip = munger.options["columns_to_skip"]

    try:
        df = pd.read_excel(f_path, sheet_name=None, header=None)
    except Exception as e:
        err = ui.add_new_error(err, "file",
                               Path(f_path).name,
                               f"Error reading file: {e}")
        # nothing to process if the file could not be read
        return pd.DataFrame(), err

    sheets_to_read = [k for k in df.keys() if k not in sheets_to_skip]

    raw_results = pd.DataFrame()
    for sh in sheets_to_read:
        try:
            data = df[sh].copy()

            # remove lines designated ignorable
            data.drop(data.index[:count_of_top_lines_to_skip], inplace=True)

            # remove any all-null rows
            data.dropna(how="all", inplace=True)

            # read constant_line info from first non-null entries of constant-header rows
            # then drop those rows
            if constant_line_count > 0:
                constant_lines = (data.iloc[:constant_line_count].fillna(
                    method="bfill", axis=1).iloc[:, 0])
                data.drop(data.index[:constant_line_count], inplace=True)

            # read constant_column info from first non-null entries of constant columns
            # and drop those columns
            if constant_column_count > 0:
                constant_columns = (data.T.iloc[:constant_column_count].fillna(
                    method="bfill", axis=1).iloc[:, 0])
                data.drop(data.columns[:constant_column_count],
                          axis=1,
                          inplace=True)

            # add multi-index for actual header rows
            header_variable_names = [
                f"header_{j}" for j in range(header_row_count)
            ]

            col_multi_index = pd.MultiIndex.from_frame(
                data.iloc[range(header_row_count), :].transpose().fillna(
                    method="ffill"),
                names=header_variable_names,
            )
            data.columns = col_multi_index

            # remove header rows from data
            data.drop(data.index[:header_row_count], inplace=True)

            # Drop extraneous columns per munger, and columns without data
            data.drop(data.columns[columns_to_skip], axis=1, inplace=True)
            data.dropna(axis=1, how="all", inplace=True)

            # make first column into an index
            data.set_index(keys=data.columns[0], inplace=True)

            # move header info to columns
            data = pd.melt(
                data,
                ignore_index=False,
                value_name="count",
                var_name=header_variable_names,
            )

            # add column(s) for constant info
            for j in range(constant_line_count):
                data = m.add_constant_column(data, f"constant_line_{j}",
                                             constant_lines.iloc[j])
            for j in range(constant_column_count):
                data = m.add_constant_column(data, f"constant_column_{j}",
                                             constant_columns.iloc[j])

            # Make row index (from first column of blocks) into a column called 'first_column'
            data.reset_index(inplace=True)
            data.rename(columns={data.columns[0]: "first_column"},
                        inplace=True)

            raw_results = pd.concat([raw_results, data])
        except Exception as e:
            err = ui.add_new_error(
                err,
                "system",
                "special_formats.read_multi_sheet_excel",
                f"Unexpected exception while processing sheet {sh}: {e}",
            )
    return raw_results, err
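
# --- Added sketch (illustrative): melting multi-row headers into columns, the
# core reshaping step above, with a toy two-level header.
import pandas as pd

data = pd.DataFrame(
    [[1, 2], [3, 4]],
    index=["p1", "p2"],
    columns=pd.MultiIndex.from_tuples(
        [("Senate", "A"), ("Senate", "B")], names=["header_0", "header_1"]),
)
long = pd.melt(data, ignore_index=False, value_name="count",
               var_name=["header_0", "header_1"])
# one row per (precinct, header_0, header_1) combination, with its count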
Example #17
def read_concatenated_blocks(f_path: str, munger: jm.Munger,
                             err: dict) -> (pd.DataFrame, dict):
    """Assumes first column of each block is ReportingUnit, last column is contest total"""
    try:
        with open(f_path, "r") as f:
            data = f.readlines()
    except Exception as exc:
        err = ui.add_new_error(err, "file", f_path,
                               f"Datafile not read:\n{exc}\n")
        return pd.DataFrame(), err

    # get  munger parameters
    w = munger.options["column_width"]
    tlts = munger.options["count_of_top_lines_to_skip"]
    v_t_cc = munger.options["last_header_column_count"]
    skip_cols = munger.options["columns_to_skip"]

    df = dict()

    # skip lines at top
    data = data[tlts:]

    try:
        while len(data) > 3:
            # TODO allow number & interps of headers to vary?
            # get rid of blank lines
            while data[0] == "\n":
                data.pop(0)

            # get the header lines
            header_0 = data.pop(0).strip()
            header_1 = data.pop(0)
            header_line = data.pop(0)

            # get info from header line
            field_list = extract_items(header_line, w)

            # Add back county header in case of Iowa:
            if header_line.startswith(" " * w):
                field_list = [""] + field_list

            # remove first column header and headers of any columns to be skipped
            last_header = remove_by_index(field_list, [0] + skip_cols)

            # check that the size of the side-to-side repeated block is consistent
            if len(last_header) % v_t_cc != 0:
                e = (
                    f"Count of last header (per munger) ({v_t_cc}) "
                    f"does not evenly divide the number of count columns in the results file "
                    f"({len(last_header)})")
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger.name,
                    e,
                )
                return pd.DataFrame(), err

            # get list from next header row and disambiguate
            # TODO tech debt: disambiguation assumes Candidate formula is <header_1>
            header_1_list, alts = disambiguate(
                extract_items(header_1, w * v_t_cc))

            #  add disambiguated entries to munger's dictionary of alternatives
            if alts:
                if "Candidate" in munger.alt.keys():
                    munger.alt["Candidate"].update(alts)
                else:
                    munger.alt["Candidate"] = alts

            # create df from next batch of lines, with that multi-index
            # find idx of next empty line (or end of data)
            try:
                next_empty = next(idx for idx in range(len(data))
                                  if data[idx] == "\n")
            except StopIteration:
                next_empty = len(data)
            # create io
            vote_count_block = io.StringIO()
            vote_count_block.write("".join(data[:next_empty]))
            vote_count_block.seek(0)

            df[header_0] = pd.read_fwf(vote_count_block,
                                       colspecs="infer",
                                       header=None)

            # Drop extraneous columns (per munger). Negative numbers count from right side
            df[header_0].drop(df[header_0].columns[skip_cols],
                              axis=1,
                              inplace=True)

            # make first column into an index
            df[header_0].set_index(keys=[0], inplace=True)

            # add multi-index with header_1 and header_2 info
            index_array = [
                [
                    y for z in [[cand] * v_t_cc for cand in header_1_list]
                    for y in z
                ],
                last_header,
            ]

            # Create map from integer columns to (header_1, header_2) values
            header_map = {}
            for i, col in enumerate(df[header_0].columns):
                header_map[col] = (index_array[0][i], index_array[1][i])

            # Move header to columns
            df[header_0] = pd.melt(
                df[header_0],
                ignore_index=False,
                value_vars=df[header_0].columns.tolist(),
                value_name="count",
                var_name="header_tmp",
            )

            # Gather values for header_1 and header_2 columns.
            header_1_col = [
                header_map[i][0] for i in df[header_0]["header_tmp"]
            ]
            header_2_col = [
                header_map[i][1] for i in df[header_0]["header_tmp"]
            ]

            # Add header_1 and header_2 columns, and remove header_tmp.
            df[header_0]["header_1"] = header_1_col
            df[header_0]["header_2"] = header_2_col
            df[header_0] = df[header_0].drop(columns="header_tmp")

            # Add columns for header_0
            df[header_0] = m.add_constant_column(df[header_0], "header_0",
                                                 header_0)

            # remove processed lines from data
            data = data[next_empty:]
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "warn-munger",
            munger.name,
            f"unparsed lines at bottom of file ({Path(f_path).name}):\n{data}\n",
        )

    # consolidate all into one dataframe
    try:
        raw_results = pd.concat(list(df.values()))
    except ValueError as e:
        err = ui.add_new_error(
            err,
            "munger",
            munger.name,
            f"Error concatenating data from blocks: {e}",
        )
        return pd.DataFrame(), err

    # Make row index (from first column of blocks) into a column called 'first_column'
    raw_results.reset_index(inplace=True)
    # TODO tech debt is next line still necessary?
    raw_results.rename(columns={0: "first_column"}, inplace=True)

    return raw_results, err
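
# --- Added sketch (illustrative): parsing one fixed-width block via StringIO,
# as done above for each batch of lines between blank lines.
import io
import pandas as pd

block = io.StringIO("precinct1   10   20\nprecinct2   30   40\n")
df_block = pd.read_fwf(block, colspecs="infer", header=None)
# column 0 is the ReportingUnit; the remaining columns are counts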
Example #18
def raw_elements_to_cdf(
    session,
    juris: jm.Jurisdiction,
    mu: jm.Munger,
    raw: pd.DataFrame,
    count_cols: List[str],
    err: dict,
    constants: dict,
) -> dict:
    """load data from <raw> into the database."""
    working = raw.copy()

    try:
        working, new_err = munge_and_melt(mu, working, count_cols, err)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception during munge_and_melt: {exc}",
        )
        return err

    # enter elements from sources outside raw data, including creating id column(s)
    for k in constants.keys():
        working = add_constant_column(working, k, constants[k])

    # add Contest_Id (unless it was passed in ids)
    if "Contest_Id" not in working.columns:
        try:
            working, err = add_contest_id(working, juris, err, session)
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Unexpected exception while adding Contest_Id: {exc}",
            )
            return err
        if ui.fatal_error(err):
            return err

    # get ids for remaining info sourced from rows and columns (except Selection_Id)
    element_list = [
        t for t in mu.cdf_elements.index if (t[-7:] != "Contest" and (
            t[-9:] != "Selection") and f"{t}_Id" not in constants.keys())
    ]
    for t in element_list:
        try:
            # capture id from db in new column and erase any now-redundant cols
            df = pd.read_sql_table(t, session.bind)
            name_field = db.get_name_field(t)
            # set drop_unmatched = True for fields necessary to BallotMeasure rows,
            #  drop_unmatched = False otherwise to prevent losing BallotMeasureContests for BM-inessential fields
            if t == "ReportingUnit" or t == "CountItemType":
                drop = True
            else:
                drop = False
            if t == "CountItemType":
                # munge raw to internal CountItemType
                r_i = pd.read_csv(os.path.join(juris.path_to_juris_dir,
                                               "dictionary.txt"),
                                  sep="\t")
                r_i = r_i[r_i.cdf_element == "CountItemType"]
                recognized = r_i.raw_identifier_value.unique()
                matched = (working.CountItemType_raw.isin(recognized))
                if not matched.all():
                    unmatched = "\n".join(
                        (working[~matched]["CountItemType_raw"]).unique())
                    err = ui.add_new_error(
                        err,
                        "warn-jurisdiction",
                        juris.short_name,
                        f"Some unmatched CountItemTypes:\n{unmatched}",
                    )
                working = working.merge(
                    r_i,
                    how="left",
                    left_on="CountItemType_raw",
                    right_on="raw_identifier_value",
                ).rename(columns={"cdf_internal_name": "CountItemType"})

                # join CountItemType_Id and OtherCountItemType
                cit = pd.read_sql_table("CountItemType", session.bind)
                working = enum_col_to_id_othertext(working, "CountItemType",
                                                   cit)
                working, err_df = clean_ids(working, ["CountItemType_Id"])
                working = clean_strings(working, ["OtherCountItemType"])
                working = working.drop([
                    "raw_identifier_value", "cdf_element", "CountItemType_raw"
                ],
                                       axis=1)
            else:
                none_or_unknown_id = db.name_to_id(session, t,
                                                   "none or unknown")
                working, new_err = replace_raw_with_internal_ids(
                    working,
                    juris,
                    df,
                    t,
                    name_field,
                    err,
                    drop_unmatched=drop,
                    unmatched_id=none_or_unknown_id,
                )
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return err
                working.drop(t, axis=1, inplace=True)
        except KeyError as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"KeyError ({exc}) while adding internal ids for {t}.",
            )
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Exception ({exc}) while adding internal ids for {t}.",
            )

            return err

    # add Selection_Id (combines info from BallotMeasureSelection and CandidateContestSelection)
    try:
        working, err = add_selection_id(working, session.bind, juris, err)
        working, err_df = clean_ids(working, ["Selection_Id"])
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception while adding Selection_Id:\n{exc}",
        )
        return err
    if working.empty:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris.short_name,
            "No contests found, or no selections found for contests.",
        )
        return err

    # restrict to just the VoteCount columns (so that groupby.sum will work)
    vc_cols = [
        "Count",
        "CountItemType_Id",
        "OtherCountItemType",
        "ReportingUnit_Id",
        "Contest_Id",
        "Selection_Id",
        "Election_Id",
        "_datafile_Id",
    ]
    working = working[vc_cols]
    working, e = clean_count_cols(working, ["Count"])

    # TODO there are edge cases where this might include dupes
    #  that should be omitted. E.g., if data mistakenly read twice
    # Sum any rows that were disambiguated (otherwise dupes will be dropped
    #  when VoteCount is filled)
    group_cols = [c for c in working.columns if c != "Count"]
    working = working.groupby(group_cols).sum().reset_index()
    # TODO clean before inserting? All should be already clean, no?

    # Fill VoteCount
    try:
        e = db.insert_to_cdf_db(session.bind, working, "VoteCount")
        if e:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"database insertion error {e}",
            )
            return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Error filling VoteCount:\n{exc}",
        )

    return err
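
# --- Added sketch (illustrative): summing rows that became duplicates after
# disambiguation, grouping on every column except Count.
import pandas as pd

vc = pd.DataFrame({"Contest_Id": [1, 1], "Selection_Id": [2, 2], "Count": [10, 5]})
group_cols = [c for c in vc.columns if c != "Count"]
vc = vc.groupby(group_cols).sum().reset_index()  # one row with Count == 15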
Example #19
def add_selection_id(df: pd.DataFrame, engine, jurisdiction: jm.Jurisdiction,
                     err: dict) -> (pd.DataFrame, dict):
    """Assumes <df> has contest_type, BallotMeasureSelection_raw, Candidate_Id column.
    Loads CandidateSelection table.
    Appends & fills Selection_Id columns"""

    # split df by contest type
    w = dict()
    for ct in ["BallotMeasure", "Candidate"]:
        w[ct] = df[df.contest_type == ct].copy()

    # append BallotMeasureSelection_Id as Selection_Id to w['BallotMeasure']
    if not w["BallotMeasure"].empty:
        bms = pd.read_sql_table("BallotMeasureSelection", engine)
        w["BallotMeasure"], err = replace_raw_with_internal_ids(
            w["BallotMeasure"],
            jurisdiction,
            bms,
            "BallotMeasureSelection",
            "Name",
            err,
            drop_unmatched=True,
            drop_all_ok=True,
        )
        w["BallotMeasure"].rename(
            columns={"BallotMeasureSelection_Id": "Selection_Id"},
            inplace=True)
        w["BallotMeasure"].drop(["BallotMeasureSelection", "Candidate_Id"],
                                axis=1,
                                inplace=True)

    # prepare to append CandidateSelection_Id as Selection_Id
    if not w["Candidate"].empty:
        c_df = w["Candidate"][["Candidate_Id", "Party_Id"]].drop_duplicates()

        # clean Ids and drop any that were null (i.e., 0 after cleaning)
        c_df, err_df = clean_ids(c_df, ["Candidate_Id", "Party_Id"])
        c_df = c_df[c_df.Candidate_Id != 0]

        # pull any existing Ids into a new CandidateSelection_Id column
        col_map = {c: c for c in ["Party_Id", "Candidate_Id"]}
        c_df = db.append_id_to_dframe(engine,
                                      c_df,
                                      "CandidateSelection",
                                      col_map=col_map)

        # find unmatched records
        # TODO this throws error (FutureWarning: elementwise comparison failed),
        #  maybe because CandidateSelection_Id cannot be compared to ""?
        c_df_unmatched = c_df[(c_df.CandidateSelection_Id == 0)
                              | (c_df.CandidateSelection_Id == "")
                              | (c_df.CandidateSelection_Id.isnull())].copy()

        if not c_df_unmatched.empty:
            #  Load CandidateSelections to Selection table (for unmatched)
            id_list = db.add_records_to_selection_table(
                engine, c_df_unmatched.shape[0])

            # Load unmatched records into CandidateSelection table
            c_df_unmatched["Id"] = pd.Series(id_list,
                                             index=c_df_unmatched.index)
            db.insert_to_cdf_db(engine, c_df_unmatched, "CandidateSelection")

            # update CandidateSelection_Id column for previously unmatched, merging on Candidate_Id and Party_Id
            c_df.loc[c_df_unmatched.index,
                     "CandidateSelection_Id"] = c_df_unmatched["Id"]
        # recast Candidate_Id and Party_Id to int in w['Candidate'];
        # Note that neither should have nulls, but rather the 'none or unknown' Id
        #  NB: c_df had this recasting done in the append_id_to_dframe routine
        w["Candidate"], err_df = clean_ids(w["Candidate"],
                                           ["Candidate_Id", "Party_Id"])
        if not err_df.empty:
            # show all columns of dataframe with problem in Party_Id or Candidate_Id
            pd.set_option("max_columns", None)
            err = ui.add_new_error(
                err,
                "system",
                "munge.add_selection_id",
                f"Problem with Candidate_Id or Party_Id in some rows:\n{err_df}",
            )
            pd.reset_option("max_columns")

        # append CandidateSelection_Id to w['Candidate']
        w["Candidate"] = w["Candidate"].merge(c_df,
                                              how="left",
                                              on=["Candidate_Id", "Party_Id"])

        # rename to Selection_Id
        w["Candidate"] = w["Candidate"].rename(
            columns={"CandidateSelection_Id": "Selection_Id"})
        # and drop extraneous
        to_drop = [
            x for x in w["Candidate"].columns
            if x in ["Candidate_Id", "BallotMeasureSelection_raw"]
        ]
        w["Candidate"].drop(to_drop, axis=1, inplace=True)

    working = pd.concat([w["BallotMeasure"], w["Candidate"]])

    return working, err
Example #20
def add_contest_id(df: pd.DataFrame, juris: jm.Jurisdiction, err: dict,
                   session: Session) -> (pd.DataFrame, dict):
    """Append Contest_Id and contest_type columns, filling contest_type correctly.
    Drop rows which match neither a BallotMeasure nor a Candidate contest"""
    working = df.copy()

    # add Contest_Id and contest_type
    df_for_type = dict()
    w_for_type = dict()
    df_contest = pd.read_sql_table("Contest", session.bind)
    for c_type in ["BallotMeasure", "Candidate"]:
        if f"{c_type}Contest_raw" in working.columns:
            # restrict df_contest to the contest_type <c_type> and get the <c_type>Contest_Id
            df_for_type[c_type] = df_contest[df_contest.contest_type == c_type]
            none_or_unknown_id = db.name_to_id(session, f"{c_type}Contest",
                                               "none or unknown")
            working, new_err = replace_raw_with_internal_ids(
                working,
                juris,
                df_for_type[c_type],
                f"{c_type}Contest",
                "Name",
                err,
                drop_unmatched=False,
                unmatched_id=none_or_unknown_id,
                drop_all_ok=True,
            )
            if new_err:
                err = ui.consolidate_errors([err, new_err])
            # restrict working to the contest_type <c_type>, add contest_type column
            w_for_type[c_type] = working[
                working[f"{c_type}Contest"] != "none or unknown"]
            w_for_type[c_type] = add_constant_column(
                w_for_type[c_type], "contest_type",
                c_type).rename(columns={f"{c_type}Contest_Id": "Contest_Id"})

            # drop text column
            w_for_type[c_type] = w_for_type[c_type].drop(f"{c_type}Contest",
                                                         axis=1)
        else:
            w_for_type[c_type] = pd.DataFrame()

    # FIXME: check somewhere that no name (other than 'none or unknown') is shared by BMContests and CandidateContests
    # TODO check this also when juris files loaded, to save time for user

    # drop obsolete columns
    if w_for_type["BallotMeasure"].empty:
        working_temp = w_for_type["Candidate"]
    elif w_for_type["Candidate"].empty:
        working_temp = w_for_type["BallotMeasure"]
    else:
        common_cols = [
            c for c in w_for_type["BallotMeasure"].columns
            if c in w_for_type["Candidate"].columns
        ]
        for c_type in ["BallotMeasure", "Candidate"]:
            w_for_type[c_type] = w_for_type[c_type][common_cols]

        # assemble working from the two pieces
        working_temp = pd.concat(
            [w_for_type[ct] for ct in ["BallotMeasure", "Candidate"]])

    # fail if no contests were recognized or fatal errors occurred
    if working_temp.empty:
        err = ui.add_new_error(err, "jurisdiction", juris.short_name,
                               "No contests recognized.")
    else:
        working = working_temp
    if ui.fatal_error(err):
        return working, err

    return working, err
Example #21
def replace_raw_with_internal_ids(
    df: pd.DataFrame,
    juris: jm.Jurisdiction,
    table_df: pd.DataFrame,
    element: str,
    internal_name_column: str,
    error: dict,
    drop_unmatched: bool = False,
    drop_extraneous: bool = True,
    mode: str = "row",
    unmatched_id: int = 0,
    drop_all_ok: bool = False,
) -> (pd.DataFrame, dict):
    """replace columns in <working> with raw_identifier values by columns with internal names and Ids
    from <table_df>, which has structure of a db table for <element>.
    <unmatched_id> is the id to assign to unmatched records.
    If <drop_extraneous> = True and dictionary matches raw_identifier to "row should be dropped",
    drop that row EVEN IF <drop_unmatched> = False.
    """
    working = df.copy()
    # join the 'cdf_internal_name' from the raw_identifier table -- this is the internal name field value,
    # no matter what the name field name is in the internal element table (e.g. 'Name', 'BallotName' or 'Selection')
    # use dictionary.txt from jurisdiction

    raw_identifiers = pd.read_csv(os.path.join(juris.path_to_juris_dir,
                                               "dictionary.txt"),
                                  sep="\t")

    # restrict to the element at hand
    raw_ids_for_element = raw_identifiers[raw_identifiers["cdf_element"] ==
                                          element].copy()

    if element == "Candidate":
        # remove any lines with nulls
        raw_ids_for_element = raw_ids_for_element[
            raw_ids_for_element.notnull().all(axis=1)]

        # Regularize candidate names (to match what's done during upload of candidates to Candidate
        #  table in db)
        raw_ids_for_element["cdf_internal_name"] = regularize_candidate_names(
            raw_ids_for_element["cdf_internal_name"])
        raw_ids_for_element.drop_duplicates(inplace=True)

    working = working.merge(
        raw_ids_for_element,
        how="left",
        left_on=f"{element}_raw",
        right_on="raw_identifier_value",
        suffixes=["", f"_{element}_ei"],
    )

    # identify unmatched
    unmatched = working[working["cdf_internal_name"].isnull()]
    unmatched_raw = sorted(unmatched[f"{element}_raw"].unique(), reverse=True)
    if len(unmatched_raw) > 0 and element != "BallotMeasureContest":
        unmatched_str = "\n".join(unmatched_raw)
        e = f"\n{element}s not found in dictionary.txt:\n{unmatched_str}"
        error = ui.add_new_error(error, "warn-jurisdiction", juris.short_name,
                                 e)

    if drop_unmatched:
        working = working[working["cdf_internal_name"].notnull()]

    if drop_extraneous:
        # TODO tech debt - note change of case for Candidate above which, if
        #  changed, might affect this in unexpected ways
        # drop extraneous rows identified in dictionary
        working = working[
            working["cdf_internal_name"] != "row should be dropped"]

    if working.empty:
        e = f"No true raw {element} in 'dictionary.txt' matched any raw {element} derived from the result file"
        if drop_unmatched and not drop_all_ok:
            error = ui.add_new_error(
                error,
                "jurisdiction",
                juris.short_name,
                e,
            )
        else:
            error = ui.add_new_error(error, "warn-jurisdiction",
                                     juris.short_name, e)
        # give working the proper columns and return
        new_cols = [
            c for c in working.columns if (c not in [
                "raw_identifier_value",
                "cdf_element",
                f"_{element}_ei",
                "cdf_internal_name",
            ])
        ] + [f"{element}_Id", element]
        working = pd.DataFrame(columns=new_cols)

        return working, error
    else:
        if mode == "column":
            # drop rows that melted from unrecognized columns, EVEN IF drop_unmatched=False;
            #  these rows are ALWAYS extraneous. Drop rows where raw_identifier_value is
            #  not null but no cdf_internal_name was found (pd.merge yields nulls there).
            working = working[(working["raw_identifier_value"].isnull())
                              | (working["cdf_internal_name"].notnull())]
            if drop_extraneous:
                working = working[
                    working["cdf_internal_name"] != "row should be dropped"]
            # TODO tech debt more efficient to drop these earlier, before melting

    # unmatched elements get nan in fields from dictionary table. Change these to "none or unknown"
    if not drop_unmatched:
        working["cdf_internal_name"] = working["cdf_internal_name"].fillna(
            "none or unknown")

    # drop extraneous cols from mu.raw_identifier
    working = working.drop(["raw_identifier_value", "cdf_element"], axis=1)

    # ensure that there is a column in working called by the element
    # containing the internal name of the element
    if f"_{element}_ei" in working.columns:
        working.rename(columns={f"_{element}_ei": element}, inplace=True)
    else:
        working.rename(columns={"cdf_internal_name": element}, inplace=True)

    # join the element table Id and name columns.
    # This will create two columns with the internal name field,
    # whose names will be <element> (from above)
    # and either internal_name_column or internal_name_column_table_name
    working = working.merge(
        table_df[["Id", internal_name_column]],
        how="left",
        left_on=element,
        right_on=internal_name_column,
    )

    # error/warning for unmatched elements
    working_unmatched = working[(working.Id.isnull())
                                & (working[element].notnull())]
    if not working_unmatched.empty and element != "BallotMeasureContest":
        unmatched_pairs = [
            f'({r[f"{element}_raw"]},{r[element]})'
            for i, r in working_unmatched[[f"{element}_raw", element
                                           ]].drop_duplicates().iterrows()
        ]
        unmatched_str = "\n\t".join(unmatched_pairs)
        e = (
            f"Warning: Results for {working_unmatched.shape[0]} rows with unmatched {element}s "
            f"will not be loaded to database. These records (raw name, internal name) were found in dictionary.txt, but "
            f"no corresponding record was found in the {element} table in the database: \n\t{unmatched_str}"
        )
        error = ui.add_new_error(
            error,
            "warn-jurisdiction",
            juris.short_name,
            e,
        )

    if drop_unmatched:
        # if all are unmatched
        if working_unmatched.shape[0] == working.shape[0]:
            error = ui.add_new_error(
                error,
                "jurisdiction",
                juris.short_name,
                (f"No {element} was matched. Either raw values are not in dictionary.txt, or "
                 f"the corresponding cdf_internal_names are missing from {element}.txt"
                 ),
            )
            return working.drop(working.index), error
        # if only some are unmatched
        else:
            # drop the unmatched ones
            working.drop(labels=working_unmatched.index, inplace=True)

    else:
        # change name of unmatched to 'none or unknown' and assign <unmatched_id> as Id
        working.loc[working.Id.isnull(),
                    internal_name_column] = "none or unknown"
        working["Id"].fillna(unmatched_id, inplace=True)

    working = working.drop([internal_name_column, f"{element}_raw"], axis=1)
    working.rename(columns={"Id": f"{element}_Id"}, inplace=True)
    return working, error
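At its core, replace_raw_with_internal_ids is two left merges: raw value to internal name via dictionary.txt, then internal name to database Id via <table_df>. A minimal sketch of that two-step join with toy frames (all names and values hypothetical; this shows the drop_unmatched=False path):

import pandas as pd

results = pd.DataFrame({"Party_raw": ["DEM", "REP", "XYZ"],
                        "Count": [10, 20, 5]})
dictionary = pd.DataFrame({"raw_identifier_value": ["DEM", "REP"],
                           "cdf_internal_name": ["Democratic Party",
                                                 "Republican Party"]})
table_df = pd.DataFrame({"Id": [1, 2],
                         "Name": ["Democratic Party", "Republican Party"]})

# step 1: raw value -> internal name, via the dictionary
working = results.merge(dictionary, how="left",
                        left_on="Party_raw", right_on="raw_identifier_value")
# unmatched raw values get the 'none or unknown' placeholder
working["cdf_internal_name"] = working["cdf_internal_name"].fillna(
    "none or unknown")
# step 2: internal name -> database Id, via the element table
working = working.merge(table_df, how="left",
                        left_on="cdf_internal_name", right_on="Name")
working["Id"] = working["Id"].fillna(0).astype(int)  # unmatched_id = 0
print(working[["Party_raw", "cdf_internal_name", "Id", "Count"]])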
Example #22
def check_dependencies(juris_dir, element) -> (list, dict):
    """Looks in <juris_dir> to check that every dependent column in <element>.txt
    is listed in the corresponding jurisdiction file. Note: <juris_dir> assumed to exist.
    """
    err = None
    juris_name = Path(juris_dir).name
    d = juris_dependency_dictionary()
    f_path = os.path.join(juris_dir, f"{element}.txt")
    try:
        element_df = pd.read_csv(
            f_path,
            sep="\t",
            index_col=None,
            encoding="iso-8859-1",
            quoting=csv.QUOTE_MINIMAL,
        )
    except FileNotFoundError:
        err = ui.add_new_error(
            err,
            "system",
            "juris_and_munger.check_dependencies",
            f"file doesn't exist: {f_path}",
        )
        # nothing to check without the element file
        return set(), err

    # Find all dependent columns
    dependent = [c for c in element_df if c in d.keys()]
    changed_elements = set()
    for c in dependent:
        target = d[c]
        # <element>.txt was already read into element_df above; reuse it
        ed = element_df.fillna("").loc[:, c].unique()

        # create list of allowed values from the target file, nulls replaced by ""
        ru = list(
            pd.read_csv(
                os.path.join(juris_dir, f"{target}.txt"),
                sep="\t",
                encoding="iso-8859-1",
                quoting=csv.QUOTE_MINIMAL,
            ).fillna("").loc[:, db.get_name_field(target)])

        missing = [x for x in ed if x not in ru]
        # if the only missing value is the empty string, some entries are null
        if missing == [""]:
            # exclude PrimaryParty, which isn't required to be not-null
            if c != "PrimaryParty":
                err = ui.add_new_error(err, "jurisdiction", juris_name,
                                       f"Some {c} are null.")
        elif missing:
            changed_elements.add(element)
            changed_elements.add(target)
            m_str = "\n".join(missing)
            err = ui.add_new_error(
                err,
                "jurisdiction",
                juris_name,
                f"Every {c} in {element}.txt must be in {target}.txt. Offenders are:\n{m_str}",
            )

    return changed_elements, err
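The dependency check reduces to a set difference between the values used in <element>.txt and the names defined in the target file. A self-contained sketch, assuming (as juris_dependency_dictionary plausibly encodes) that Office.ElectionDistrict must reference a ReportingUnit.Name:

import pandas as pd

office = pd.DataFrame({"Name": ["Governor", "Senator"],
                       "ElectionDistrict": ["North Carolina", "Atlantis"]})
reporting_unit = pd.DataFrame({"Name": ["North Carolina"]})

# values used in Office.txt vs. names defined in ReportingUnit.txt
ed = office.fillna("").loc[:, "ElectionDistrict"].unique()
ru = list(reporting_unit.fillna("").loc[:, "Name"])
missing = [x for x in ed if x not in ru]
print(missing)  # ['Atlantis'] -> would become a jurisdiction error above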
Example #23
def add_column_from_formula(
    working: pd.DataFrame,
    formula: str,
    new_col: str,
    err: Optional[dict],
    munger_name: str,
    suffix=None,
) -> (pd.DataFrame, Optional[dict]):
    """If <suffix> is given, add it to each field in the formula
    If formula is enclosed in braces, parse first entry as formula, second as a
    regex (with one parenthesized group) as a recipe for pulling the value via regex analysis
    """
    w = working.copy()
    #  for each {} pair in the formula, create a new column
    # (assuming formula is well-formed)
    brace_pattern = re.compile(r"{<([^,]*)>,([^{}]*|[^{}]*{[^{}]*}[^{}]*)}")

    try:
        temp_cols = []
        for x in brace_pattern.finditer(formula):
            # create a new column with the extracted info
            old_col, pattern_str = x.groups()
            temp_col = f"extracted_from_{old_col}"
            w, new_err = add_regex_column(w, old_col, temp_col, pattern_str)
            # change the formula to use the temp column
            formula = formula.replace(f"{{<{old_col}>,{pattern_str}}}",
                                      f"<{temp_col}>")
            if new_err:
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return w, err
            temp_cols.append(temp_col)
        # once all {} pairs are gone, use concatenation to build the column to be returned
        text_field_list, last_text = text_fragments_and_fields(formula)

        # add suffix, if required
        if suffix:
            text_field_list = [(t, f"{f}{suffix}")
                               for (t, f) in text_field_list]

        # add column to <working> dataframe via the concatenation formula
        if last_text:
            w.loc[:, new_col] = last_text[0]
        else:
            w.loc[:, new_col] = ""
        text_field_list.reverse()
        for t, f in text_field_list:
            try:
                w.loc[:, new_col] = (w.loc[:, f].apply(lambda x: f"{t}{x}") +
                                     w.loc[:, new_col])
            except KeyError as ke:
                err = ui.add_new_error(
                    err,
                    "munger",
                    munger_name,
                    f"Expected transformed column '{f}' not found ({ke}), "
                    f"perhaps because of a mismatch between munger and results file.",
                )
                return w, err

    except Exception as e:
        err = ui.add_new_error(err, "system", "munge.add_column_from_formula",
                               f"Unexpected error: {e}")

    # delete temporary columns
    w.drop(temp_cols, axis=1, inplace=True)
    return w, err
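The helpers text_fragments_and_fields and add_regex_column are not shown here, but the concatenation step can be illustrated on its own: a formula alternates literal text with <field> references, and each reference is replaced by the corresponding column's values. A minimal sketch under that assumption (formula and column names hypothetical):

import re
import pandas as pd

df = pd.DataFrame({"County": ["Alpha"], "Office": ["Governor"]})
formula = "<County>;<Office>"

# re.split with a capturing group alternates literal text and field names:
# here it yields ['', 'County', ';', 'Office', '']
parts = re.split(r"<([^<>]+)>", formula)
out = pd.Series([""] * len(df))
for i, piece in enumerate(parts):
    if i % 2 == 0:      # even positions are literal text
        out = out + piece
    else:               # odd positions are column references
        out = out + df[piece].astype(str)
df["Contest_raw"] = out
print(df["Contest_raw"].iloc[0])  # Alpha;Governor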
Example #24
def load_juris_dframe_into_cdf(session, element, juris_path, error) -> dict:
    """TODO"""
    project_root = Path(__file__).parents[1].absolute()
    cdf_schema_def_dir = os.path.join(
        project_root,
        "CDF_schema_def_info",
    )
    element_fpath = os.path.join(juris_path, f"{element}.txt")
    if not os.path.exists(element_fpath):
        error = ui.add_new_error(error, "jurisdiction",
                                 Path(juris_path).name,
                                 f"File {element}.txt not found")
        return error
    df = pd.read_csv(element_fpath,
                     sep="\t",
                     encoding="iso-8859-1",
                     quoting=csv.QUOTE_MINIMAL).fillna("none or unknown")
    # TODO check that df has the right format

    # add 'none or unknown' record
    df = add_none_or_unknown(df)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        error = ui.add_new_error(error, "warn-jurisdiction",
                                 Path(juris_path).name,
                                 f"Duplicates were found in {element}.txt")

    # replace plain text enumerations from file system with id/othertext from db
    enum_file = os.path.join(cdf_schema_def_dir, "elements", element,
                             "enumerations.txt")
    # if the enumerations file is missing, there are no enums for this element
    if os.path.isfile(enum_file):
        enums = pd.read_csv(enum_file, sep="\t")
        # get all relevant enumeration tables
        for e in enums["enumeration"]:  # e.g., e = "ReportingUnitType"
            cdf_e = pd.read_sql_table(e, session.bind)
            # for every instance of the enumeration in the current table, add id and othertype columns to the dataframe
            if e in df.columns:
                df = m.enum_col_to_id_othertext(df, e, cdf_e)

    # get Ids for any foreign key (or similar) in the table, e.g., Party_Id, etc.
    fk_file_path = os.path.join(cdf_schema_def_dir, "elements", element,
                                "foreign_keys.txt")
    if os.path.isfile(fk_file_path):
        foreign_keys = pd.read_csv(fk_file_path,
                                   sep="\t",
                                   index_col="fieldname")

        for fn in foreign_keys.index:
            ref = foreign_keys.loc[
                fn,
                "refers_to"]  # NB: juris elements have no multiple referents (as joins may)
            col_map = {fn[:-3]: db.get_name_field(ref)}
            df = db.append_id_to_dframe(
                session.bind, df, ref,
                col_map=col_map).rename(columns={f"{ref}_Id": fn})

    # commit info in df to corresponding cdf table to db
    err_string = db.insert_to_cdf_db(session.bind, df, element)
    if err_string:
        error = ui.add_new_error(
            error, "system", "juris_and_munger.load_juris_dframe_into_cdf",
            f"Error loading {element} to database: {err_string}")
    return error
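m.enum_col_to_id_othertext is not shown above; the sketch below illustrates the plausible id/othertext convention it implements: a recognized plain-text value becomes its enumeration Id, and anything unrecognized maps to the 'other' Id with the raw text preserved in an Other<Enum> column (table contents hypothetical):

import pandas as pd

df = pd.DataFrame({"Name": ["North Carolina", "Ward 3"],
                   "ReportingUnitType": ["state", "city-council"]})
enum_table = pd.DataFrame({"Id": [1, 2], "Txt": ["state", "other"]})

known = dict(zip(enum_table["Txt"], enum_table["Id"]))
other_id = known["other"]
# recognized values get their own enumeration Id; everything else gets the
# 'other' Id, with the raw text kept in OtherReportingUnitType
df["ReportingUnitType_Id"] = df["ReportingUnitType"].map(
    lambda t: known.get(t, other_id))
df["OtherReportingUnitType"] = df["ReportingUnitType"].where(
    ~df["ReportingUnitType"].isin(known.keys()), "")
df = df.drop("ReportingUnitType", axis=1)
print(df)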
Example #25
def run2(
    load_data: bool = True,
    dbname: Optional[str] = None,
    test_dir: Optional[str] = None,
    election_jurisdiction_list: Optional[list] = None,
) -> Optional[dict]:
    dl = None  # to keep syntax-checker happy

    err = None
    db_removed = False
    if not test_dir:
        # set the test_dir to the directory containing this file
        test_dir = Path(__file__).parent.absolute()

    # name the db
    if dbname is None:
        # create unique name for test database
        ts = datetime.datetime.now().strftime("%m%d_%H%M")
        dbname = f"test_{ts}"

    if load_data:
        get_testing_data(
            url="https://github.com/ElectionDataAnalysis/TestingData.git",
            results_dir="TestingData",
        )

    # restrict elections and jurisdictions to those given (if given)
    # otherwise use all in TestingData
    if not election_jurisdiction_list:
        election_jurisdiction_list = ui.election_juris_list("TestingData")

    if load_data:
        try:
            # Load the data
            dl = eda.DataLoader()
            dl.change_db(dbname)

            dl.change_dir("results_dir", "TestingData")
            err, success = dl.load_all(
                move_files=False,
                election_jurisdiction_list=election_jurisdiction_list)
            if not success:
                print("At least one file did not load correctly.")
                err, db_removed = optional_remove(dl, "TestingData")
        except Exception as exc:
            print(f"Exception occurred: {exc}")
            if dl:
                optional_remove(dl, "TestingData")
            err = ui.add_new_error(
                err,
                "file",
                "TestingData",
                f"Exception during data loading: {exc}",
            )
            return err

        if ui.fatal_error(err):
            optional_remove(dl, "TestingData")
            return err

    if not db_removed:
        result = ui.run_tests(
            test_dir,
            dbname,
            election_jurisdiction_list=election_jurisdiction_list)

        # remove all .ini files
        par_files = [x for x in os.listdir("TestingData") if x.endswith(".ini")]
        for f in par_files:
            os.remove(os.path.join("TestingData", f))

        if load_data:
            err, db_removed = optional_remove(dl, "TestingData")
    return err
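A hedged usage sketch for run2; the exact shape of election_jurisdiction_list is not shown above, so we pass None and let the function derive the list from TestingData:

if __name__ == "__main__":
    # dbname=None -> run2 generates a unique test_<timestamp> database name
    errors = run2(load_data=True, election_jurisdiction_list=None)
    if errors:
        print(errors)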