Example #1
0
def raw_elements_to_cdf(
    session,
    juris: jm.Jurisdiction,
    mu: jm.Munger,
    raw: pd.DataFrame,
    count_cols: List[str],
    err: dict,
    constants: dict,
) -> dict:
    """Munge data from <raw> into the common data format and load the
    resulting vote counts into the database.

    Args:
        session: sqlalchemy session bound to the target database.
        juris: jurisdiction whose dictionary.txt maps raw identifiers
            to internal names.
        mu: munger describing how cdf elements appear in <raw>.
        raw: dataframe of raw results as read from the datafile.
        count_cols: names of the columns in <raw> holding vote counts.
        err: accumulating error dictionary (see ui.add_new_error).
        constants: values known outside the raw data, keyed by column
            name (e.g. Election_Id); columns named f"{element}_Id" here
            are taken as already-resolved ids.

    Returns:
        The (possibly updated) error dictionary. A fatal error causes an
        early return; callers must check with ui.fatal_error.
    """
    working = raw.copy()

    try:
        working, new_err = munge_and_melt(mu, working, count_cols, err)
        if new_err:
            err = ui.consolidate_errors([err, new_err])
            if ui.fatal_error(new_err):
                return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception during munge_and_melt: {exc}",
        )
        return err

    # enter elements from sources outside raw data, including creating id column(s)
    for k in constants.keys():
        working = add_constant_column(working, k, constants[k])

    # add Contest_Id (unless it was passed in ids)
    if "Contest_Id" not in working.columns:
        try:
            working, err = add_contest_id(working, juris, err, session)
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Unexpected exception while adding Contest_Id: {exc}",
            )
            return err
        if ui.fatal_error(err):
            return err

    # get ids for remaining info sourced from rows and columns (except Selection_Id).
    # Contest/Selection elements are handled separately; elements whose ids were
    # passed in <constants> need no lookup.
    element_list = [
        t for t in mu.cdf_elements.index if (t[-7:] != "Contest" and (
            t[-9:] != "Selection") and f"{t}_Id" not in constants.keys())
    ]
    for t in element_list:
        try:
            # capture id from db in new column and erase any now-redundant cols
            df = pd.read_sql_table(t, session.bind)
            name_field = db.get_name_field(t)
            # set drop_unmatched = True for fields necessary to BallotMeasure rows,
            #  drop_unmatched = False otherwise to prevent losing BallotMeasureContests for BM-inessential fields
            if t == "ReportingUnit" or t == "CountItemType":
                drop = True
            else:
                drop = False
            if t == "CountItemType":
                # munge raw to internal CountItemType via dictionary.txt
                r_i = pd.read_csv(os.path.join(juris.path_to_juris_dir,
                                               "dictionary.txt"),
                                  sep="\t")
                r_i = r_i[r_i.cdf_element == "CountItemType"]
                recognized = r_i.raw_identifier_value.unique()
                matched = (working.CountItemType_raw.isin(recognized))
                if not matched.all():
                    unmatched = "\n".join(
                        (working[~matched]["CountItemType_raw"]).unique())
                    # NOTE: reassign err — add_new_error returns the updated
                    # error dictionary (the original discarded this result,
                    # losing the warning).
                    err = ui.add_new_error(
                        err,
                        "warn-jurisdiction",
                        juris.short_name,
                        f"Some unmatched CountItemTypes:\n{unmatched}",
                    )
                working = working.merge(
                    r_i,
                    how="left",
                    left_on="CountItemType_raw",
                    right_on="raw_identifier_value",
                ).rename(columns={"cdf_internal_name": "CountItemType"})

                # join CountItemType_Id and OtherCountItemType
                cit = pd.read_sql_table("CountItemType", session.bind)
                working = enum_col_to_id_othertext(working, "CountItemType",
                                                   cit)
                # err_df rows (if any) are rows with bad ids; best-effort here
                working, err_df = clean_ids(working, ["CountItemType_Id"])
                working = clean_strings(working, ["OtherCountItemType"])
                working = working.drop([
                    "raw_identifier_value", "cdf_element", "CountItemType_raw"
                ],
                                       axis=1)
            else:
                # rows that fail to match map to the 'none or unknown' record
                none_or_unknown_id = db.name_to_id(session, t,
                                                   "none or unknown")
                working, new_err = replace_raw_with_internal_ids(
                    working,
                    juris,
                    df,
                    t,
                    name_field,
                    err,
                    drop_unmatched=drop,
                    unmatched_id=none_or_unknown_id,
                )
                err = ui.consolidate_errors([err, new_err])
                if ui.fatal_error(new_err):
                    return err
                working.drop(t, axis=1, inplace=True)
        except KeyError as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"KeyError ({exc}) while adding internal ids for {t}.",
            )
        except Exception as exc:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"Exception ({exc}) while adding internal ids for {t}.",
            )
            return err

    # add Selection_Id (combines info from BallotMeasureSelection and CandidateContestSelection)
    try:
        working, err = add_selection_id(working, session.bind, juris, err)
        working, err_df = clean_ids(working, ["Selection_Id"])
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Unexpected exception while adding Selection_Id:\n{exc}",
        )
        return err
    if working.empty:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris.short_name,
            "No contests found, or no selections found for contests.",
        )
        return err

    # restrict to just the VoteCount columns (so that groupby.sum will work)
    vc_cols = [
        "Count",
        "CountItemType_Id",
        "OtherCountItemType",
        "ReportingUnit_Id",
        "Contest_Id",
        "Selection_Id",
        "Election_Id",
        "_datafile_Id",
    ]
    working = working[vc_cols]
    working, e = clean_count_cols(working, ["Count"])

    # TODO there are edge cases where this might include dupes
    #  that should be omitted. E.g., if data mistakenly read twice
    # Sum any rows that were disambiguated (otherwise dupes will be dropped
    #  when VoteCount is filled)
    group_cols = [c for c in working.columns if c != "Count"]
    working = working.groupby(group_cols).sum().reset_index()
    # TODO clean before inserting? All should be already clean, no?

    # Fill VoteCount
    try:
        e = db.insert_to_cdf_db(session.bind, working, "VoteCount")
        if e:
            err = ui.add_new_error(
                err,
                "system",
                "munge.raw_elements_to_cdf",
                f"database insertion error {e}",
            )
            return err
    except Exception as exc:
        err = ui.add_new_error(
            err,
            "system",
            "munge.raw_elements_to_cdf",
            f"Error filling VoteCount:\n{exc}",
        )

    return err
Example #2
0
def load_juris_dframe_into_cdf(session, element, juris_path, error) -> dict:
    """Load the jurisdiction file <element>.txt into the corresponding
    common-data-format table in the database.

    Args:
        session: sqlalchemy session bound to the target database.
        element: name of the cdf element (e.g. "Party"); <element>.txt
            must exist in <juris_path>.
        juris_path: path to the jurisdiction directory.
        error: accumulating error dictionary (see ui.add_new_error).

    Returns:
        The (possibly updated) error dictionary.
    """
    project_root = Path(__file__).parents[1].absolute()
    cdf_schema_def_dir = os.path.join(
        project_root,
        "CDF_schema_def_info",
    )
    element_fpath = os.path.join(juris_path, f"{element}.txt")
    if not os.path.exists(element_fpath):
        error = ui.add_new_error(error, "jurisdiction",
                                 Path(juris_path).name,
                                 f"File {element}.txt not found")
        return error
    df = pd.read_csv(element_fpath,
                     sep="\t",
                     encoding="iso-8859-1",
                     quoting=csv.QUOTE_MINIMAL).fillna("none or unknown")
    # TODO check that df has the right format

    # add 'none or unknown' record
    df = add_none_or_unknown(df)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        error = ui.add_new_error(error, "warn-jurisdiction",
                                 Path(juris_path).name,
                                 f"Duplicates were found in {element}.txt")

    # replace plain text enumerations from file system with id/othertext from db
    enum_file = os.path.join(cdf_schema_def_dir, "elements", element,
                             "enumerations.txt")
    if os.path.isfile(
            enum_file):  # (if not, there are no enums for this element)
        enums = pd.read_csv(enum_file, sep="\t")
        # get all relevant enumeration tables
        for e in enums["enumeration"]:  # e.g., e = "ReportingUnitType"
            cdf_e = pd.read_sql_table(e, session.bind)
            # for every instance of the enumeration in the current table, add id and othertype columns to the dataframe
            if e in df.columns:
                df = m.enum_col_to_id_othertext(df, e, cdf_e)

    # get Ids for any foreign key (or similar) in the table, e.g., Party_Id, etc.
    fk_file_path = os.path.join(cdf_schema_def_dir, "elements", element,
                                "foreign_keys.txt")
    if os.path.isfile(fk_file_path):
        foreign_keys = pd.read_csv(fk_file_path,
                                   sep="\t",
                                   index_col="fieldname")

        for fn in foreign_keys.index:
            ref = foreign_keys.loc[
                fn,
                "refers_to"]  # NB: juris elements have no multiple referents (as joins may)
            # fn is e.g. "Party_Id"; fn[:-3] strips the "_Id" suffix
            col_map = {fn[:-3]: db.get_name_field(ref)}
            df = db.append_id_to_dframe(
                session.bind, df, ref,
                col_map=col_map).rename(columns={f"{ref}_Id": fn})

    # commit info in df to corresponding cdf table to db
    err_string = db.insert_to_cdf_db(session.bind, df, element)
    if err_string:
        # NOTE: original interpolated the stale enum loop variable `e` here
        # (a NameError when no enumerations file exists); report err_string.
        error = ui.add_new_error(
            error, "system", "juris_and_munger.load_juris_dframe_into_cdf",
            f"Error loading {element} to database: {err_string}")
    return error
Example #3
0
def add_selection_id(df: pd.DataFrame, engine, jurisdiction: jm.Jurisdiction,
                     err: dict) -> (pd.DataFrame, dict):
    """Append a filled Selection_Id column to <df>.

    Assumes <df> has contest_type, BallotMeasureSelection_raw, Candidate_Id
    columns. Rows are handled per contest_type: BallotMeasure rows get their
    Selection_Id from the BallotMeasureSelection table; Candidate rows get
    theirs from the CandidateSelection table, inserting any
    (Candidate_Id, Party_Id) pairs not already present.
    Loads CandidateSelection table (a side effect on the database).

    Returns the combined dataframe and the (possibly updated) error dict."""

    # split df by contest type
    w = dict()
    for ct in ["BallotMeasure", "Candidate"]:
        w[ct] = df[df.contest_type == ct].copy()

    # append BallotMeasureSelection_Id as Selection_Id to w['BallotMeasure']
    if not w["BallotMeasure"].empty:
        bms = pd.read_sql_table(f"BallotMeasureSelection", engine)
        # drop_all_ok=True: it is acceptable for no BallotMeasure rows to match
        w["BallotMeasure"], err = replace_raw_with_internal_ids(
            w["BallotMeasure"],
            jurisdiction,
            bms,
            "BallotMeasureSelection",
            "Name",
            err,
            drop_unmatched=True,
            drop_all_ok=True,
        )
        w["BallotMeasure"].rename(
            columns={"BallotMeasureSelection_Id": "Selection_Id"},
            inplace=True)
        # Candidate_Id is meaningless for BallotMeasure rows
        w["BallotMeasure"].drop(["BallotMeasureSelection", "Candidate_Id"],
                                axis=1,
                                inplace=True)

    # prepare to append CandidateSelection_Id as Selection_Id
    if not w["Candidate"].empty:
        # one row per distinct (Candidate_Id, Party_Id) pair
        c_df = w["Candidate"][["Candidate_Id", "Party_Id"]].drop_duplicates()

        # clean Ids and drop any that were null (i.e., 0 after cleaning)
        c_df, err_df = clean_ids(c_df, ["Candidate_Id", "Party_Id"])
        c_df = c_df[c_df.Candidate_Id != 0]

        # pull any existing Ids into a new CandidateSelection_Id column
        col_map = {c: c for c in ["Party_Id", "Candidate_Id"]}
        c_df = db.append_id_to_dframe(engine,
                                      c_df,
                                      "CandidateSelection",
                                      col_map=col_map)

        # find unmatched records
        # TODO this throws error (FutureWarning: elementwise comparison failed),
        #  maybe because CandidateSelection_Id cannot be compared to ""?
        c_df_unmatched = c_df[(c_df.CandidateSelection_Id == 0)
                              | (c_df.CandidateSelection_Id == "")
                              | (c_df.CandidateSelection_Id.isnull())].copy()

        if not c_df_unmatched.empty:
            #  Load CandidateSelections to Selection table (for unmatched)
            # id_list holds the new Selection Ids, in order of the unmatched rows
            id_list = db.add_records_to_selection_table(
                engine, c_df_unmatched.shape[0])

            # Load unmatched records into CandidateSelection table
            c_df_unmatched["Id"] = pd.Series(id_list,
                                             index=c_df_unmatched.index)
            db.insert_to_cdf_db(engine, c_df_unmatched, "CandidateSelection")

            # update CandidateSelection_Id column for previously unmatched, merging on Candidate_Id and Party_Id
            c_df.loc[c_df_unmatched.index,
                     "CandidateSelection_Id"] = c_df_unmatched["Id"]
        # recast Candidate_Id and Party_Id to int in w['Candidate'];
        # Note that neither should have nulls, but rather the 'none or unknown' Id
        #  NB: c_df had this recasting done in the append_id_to_dframe routine
        w["Candidate"], err_df = clean_ids(w["Candidate"],
                                           ["Candidate_Id", "Party_Id"])
        if not err_df.empty:
            # show all columns of dataframe with problem in Party_Id or Candidate_Id
            pd.set_option("max_columns", None)
            err = ui.add_new_error(
                err,
                "system",
                "munge.add_selection_id",
                f"Problem with Candidate_Id or Party_Id in some rows:\n{err_df}",
            )
            pd.reset_option("max_columns")

        # append CandidateSelection_Id to w['Candidate']
        w["Candidate"] = w["Candidate"].merge(c_df,
                                              how="left",
                                              on=["Candidate_Id", "Party_Id"])

        # rename to Selection_Id
        w["Candidate"] = w["Candidate"].rename(
            columns={"CandidateSelection_Id": "Selection_Id"})
        # and drop extraneous
        to_drop = [
            x for x in w["Candidate"].columns
            if x in ["Candidate_Id", "BallotMeasureSelection_raw"]
        ]
        w["Candidate"].drop(to_drop, axis=1, inplace=True)

    # recombine the two contest types into a single dataframe
    working = pd.concat([w["BallotMeasure"], w["Candidate"]])

    return working, err
Example #4
0
    def load_contests(self, engine, contest_type: str, error: dict) -> dict:
        """Load <contest_type>Contest.txt from the jurisdiction folder into
        the Contest and <contest_type>Contest tables of the database.

        Args:
            engine: sqlalchemy engine bound to the target database.
            contest_type: "BallotMeasure" or "Candidate".
            error: accumulating error dictionary, keyed by file/table name.

        Returns:
            The (possibly updated) error dictionary.
        """
        # read <contest_type>Contests from jurisdiction folder
        element_fpath = os.path.join(self.path_to_juris_dir,
                                     f"{contest_type}Contest.txt")
        if not os.path.exists(element_fpath):
            error[f"{contest_type}Contest.txt"] = "file not found"
            return error
        df = pd.read_csv(element_fpath,
                         sep="\t",
                         encoding="iso-8859-1",
                         quoting=csv.QUOTE_MINIMAL).fillna("none or unknown")

        # add contest_type column
        df = m.add_constant_column(df, "contest_type", contest_type)

        # add 'none or unknown' record
        df = add_none_or_unknown(df, contest_type=contest_type)

        # dedupe df
        dupes, df = ui.find_dupes(df)
        if not dupes.empty:
            print(
                f"WARNING: duplicates removed from dataframe, may indicate a problem.\n"
            )
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["found_duplicates"] = True

        # insert into in Contest table
        e = db.insert_to_cdf_db(engine, df[["Name", "contest_type"]],
                                "Contest")
        # NOTE: original silently discarded this error string; record it and
        # stop, since appending Contest_Id cannot succeed after a failed insert
        if e:
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["database"] = e
            return error

        # append Contest_Id
        col_map = {"Name": "Name", "contest_type": "contest_type"}
        df = db.append_id_to_dframe(engine, df, "Contest", col_map=col_map)

        if contest_type == "BallotMeasure":
            # append ElectionDistrict_Id, Election_Id
            for fk, ref in [
                ("ElectionDistrict", "ReportingUnit"),
                ("Election", "Election"),
            ]:
                col_map = {fk: "Name"}
                df = (db.append_id_to_dframe(
                    engine, df, ref,
                    col_map=col_map).rename(columns={
                        f"{ref}_Id": f"{fk}_Id"
                    }).drop(fk, axis=1))

        else:
            # append Office_Id, PrimaryParty_Id
            for fk, ref in [("Office", "Office"), ("PrimaryParty", "Party")]:
                col_map = {fk: "Name"}
                df = db.append_id_to_dframe(
                    engine, df, ref,
                    col_map=col_map).rename(columns={f"{ref}_Id": f"{fk}_Id"})

        # create entries in <contest_type>Contest table
        # commit info in df to <contest_type>Contest table to db
        err = db.insert_to_cdf_db(engine,
                                  df.rename(columns={"Contest_Id": "Id"}),
                                  f"{contest_type}Contest")
        if err:
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["database"] = err
        return error