Python append_id_to_dframe Exemples, election_data_analysis.database.append_id_to_dframe Python Exemples

Exemple #1

0

Afficher le fichier

def load_juris_dframe_into_cdf(session, element, juris_path, error) -> dict:
    """TODO"""
    project_root = Path(__file__).parents[1].absolute()
    cdf_schema_def_dir = os.path.join(
        project_root,
        "CDF_schema_def_info",
    )
    element_fpath = os.path.join(juris_path, f"{element}.txt")
    if not os.path.exists(element_fpath):
        error = ui.add_new_error(error, "jurisdiction",
                                 Path(juris_path).name,
                                 f"File {element}.txt not found")
        return error
    df = pd.read_csv(element_fpath,
                     sep="\t",
                     encoding="iso-8859-1",
                     quoting=csv.QUOTE_MINIMAL).fillna("none or unknown")
    # TODO check that df has the right format

    # add 'none or unknown' record
    df = add_none_or_unknown(df)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        error = ui.add_new_error(error, "warn-jurisdiction",
                                 Path(juris_path).name,
                                 f"Duplicates were found in {element}.txt")

    # replace plain text enumerations from file system with id/othertext from db
    enum_file = os.path.join(cdf_schema_def_dir, "elements", element,
                             "enumerations.txt")
    if os.path.isfile(
            enum_file):  # (if not, there are no enums for this element)
        enums = pd.read_csv(enum_file, sep="\t")
        # get all relevant enumeration tables
        for e in enums["enumeration"]:  # e.g., e = "ReportingUnitType"
            cdf_e = pd.read_sql_table(e, session.bind)
            # for every instance of the enumeration in the current table, add id and othertype columns to the dataframe
            if e in df.columns:
                df = m.enum_col_to_id_othertext(df, e, cdf_e)

    # get Ids for any foreign key (or similar) in the table, e.g., Party_Id, etc.
    fk_file_path = os.path.join(cdf_schema_def_dir, "elements", element,
                                "foreign_keys.txt")
    if os.path.isfile(fk_file_path):
        foreign_keys = pd.read_csv(fk_file_path,
                                   sep="\t",
                                   index_col="fieldname")

        for fn in foreign_keys.index:
            ref = foreign_keys.loc[
                fn,
                "refers_to"]  # NB: juris elements have no multiple referents (as joins may)
            col_map = {fn[:-3]: db.get_name_field(ref)}
            df = db.append_id_to_dframe(
                session.bind, df, ref,
                col_map=col_map).rename(columns={f"{ref}_Id": fn})

    # commit info in df to corresponding cdf table to db
    err_string = db.insert_to_cdf_db(session.bind, df, element)
    if err_string:
        error = ui.add_new_error(
            error, "system", "juris_and_munger.load_juris_dframe_into_cdf",
            f"Error loading {element} to database: {e}")
    return error

Exemple #2

0

Afficher le fichier

def add_selection_id(df: pd.DataFrame, engine, jurisdiction: jm.Jurisdiction,
                     err: dict) -> (pd.DataFrame, dict):
    """Assumes <df> has contest_type, BallotMeasureSelection_raw, Candidate_Id column.
    Loads CandidateSelection table.
    Appends & fills Selection_Id columns"""

    # split df by contest type
    w = dict()
    for ct in ["BallotMeasure", "Candidate"]:
        w[ct] = df[df.contest_type == ct].copy()

    # append BallotMeasureSelection_Id as Selection_Id to w['BallotMeasure']
    if not w["BallotMeasure"].empty:
        bms = pd.read_sql_table(f"BallotMeasureSelection", engine)
        w["BallotMeasure"], err = replace_raw_with_internal_ids(
            w["BallotMeasure"],
            jurisdiction,
            bms,
            "BallotMeasureSelection",
            "Name",
            err,
            drop_unmatched=True,
            drop_all_ok=True,
        )
        w["BallotMeasure"].rename(
            columns={"BallotMeasureSelection_Id": "Selection_Id"},
            inplace=True)
        w["BallotMeasure"].drop(["BallotMeasureSelection", "Candidate_Id"],
                                axis=1,
                                inplace=True)

    # prepare to append CandidateSelection_Id as Selection_Id
    if not w["Candidate"].empty:
        c_df = w["Candidate"][["Candidate_Id", "Party_Id"]].drop_duplicates()

        # clean Ids and drop any that were null (i.e., 0 after cleaning)
        c_df, err_df = clean_ids(c_df, ["Candidate_Id", "Party_Id"])
        c_df = c_df[c_df.Candidate_Id != 0]

        # pull any existing Ids into a new CandidateSelection_Id column
        col_map = {c: c for c in ["Party_Id", "Candidate_Id"]}
        c_df = db.append_id_to_dframe(engine,
                                      c_df,
                                      "CandidateSelection",
                                      col_map=col_map)

        # find unmatched records
        # TODO this throws error (FutureWarning: elementwise comparison failed),
        #  maybe because CandidateSelection_Id cannot be compared to ""?
        c_df_unmatched = c_df[(c_df.CandidateSelection_Id == 0)
                              | (c_df.CandidateSelection_Id == "")
                              | (c_df.CandidateSelection_Id.isnull())].copy()

        if not c_df_unmatched.empty:
            #  Load CandidateSelections to Selection table (for unmatched)
            id_list = db.add_records_to_selection_table(
                engine, c_df_unmatched.shape[0])

            # Load unmatched records into CandidateSelection table
            c_df_unmatched["Id"] = pd.Series(id_list,
                                             index=c_df_unmatched.index)
            db.insert_to_cdf_db(engine, c_df_unmatched, "CandidateSelection")

            # update CandidateSelection_Id column for previously unmatched, merging on Candidate_Id and Party_Id
            c_df.loc[c_df_unmatched.index,
                     "CandidateSelection_Id"] = c_df_unmatched["Id"]
        # recast Candidate_Id and Party_Id to int in w['Candidate'];
        # Note that neither should have nulls, but rather the 'none or unknown' Id
        #  NB: c_df had this recasting done in the append_id_to_dframe routine
        w["Candidate"], err_df = clean_ids(w["Candidate"],
                                           ["Candidate_Id", "Party_Id"])
        if not err_df.empty:
            # show all columns of dataframe with problem in Party_Id or Candidate_Id
            pd.set_option("max_columns", None)
            err = ui.add_new_error(
                err,
                "system",
                "munge.add_selection_id",
                f"Problem with Candidate_Id or Party_Id in some rows:\n{err_df}",
            )
            pd.reset_option("max_columns")

        # append CandidateSelection_Id to w['Candidate']
        w["Candidate"] = w["Candidate"].merge(c_df,
                                              how="left",
                                              on=["Candidate_Id", "Party_Id"])

        # rename to Selection_Id
        w["Candidate"] = w["Candidate"].rename(
            columns={"CandidateSelection_Id": "Selection_Id"})
        # and drop extraneous
        to_drop = [
            x for x in w["Candidate"].columns
            if x in ["Candidate_Id", "BallotMeasureSelection_raw"]
        ]
        w["Candidate"].drop(to_drop, axis=1, inplace=True)

    working = pd.concat([w["BallotMeasure"], w["Candidate"]])

    return working, err

Exemple #3

0

Afficher le fichier

    def load_contests(self, engine, contest_type: str, error: dict) -> dict:
        # read <contest_type>Contests from jurisdiction folder
        element_fpath = os.path.join(self.path_to_juris_dir,
                                     f"{contest_type}Contest.txt")
        if not os.path.exists(element_fpath):
            error[f"{contest_type}Contest.txt"] = "file not found"
            return error
        df = pd.read_csv(element_fpath,
                         sep="\t",
                         encoding="iso-8859-1",
                         quoting=csv.QUOTE_MINIMAL).fillna("none or unknown")

        # add contest_type column
        df = m.add_constant_column(df, "contest_type", contest_type)

        # add 'none or unknown' record
        df = add_none_or_unknown(df, contest_type=contest_type)

        # dedupe df
        dupes, df = ui.find_dupes(df)
        if not dupes.empty:
            print(
                f"WARNING: duplicates removed from dataframe, may indicate a problem.\n"
            )
            if not f"{contest_type}Contest" in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["found_duplicates"] = True

        # insert into in Contest table
        e = db.insert_to_cdf_db(engine, df[["Name", "contest_type"]],
                                "Contest")

        # append Contest_Id
        col_map = {"Name": "Name", "contest_type": "contest_type"}
        df = db.append_id_to_dframe(engine, df, "Contest", col_map=col_map)

        if contest_type == "BallotMeasure":
            # append ElectionDistrict_Id, Election_Id
            for fk, ref in [
                ("ElectionDistrict", "ReportingUnit"),
                ("Election", "Election"),
            ]:
                col_map = {fk: "Name"}
                df = (db.append_id_to_dframe(
                    engine, df, ref,
                    col_map=col_map).rename(columns={
                        f"{ref}_Id": f"{fk}_Id"
                    }).drop(fk, axis=1))

        else:
            # append Office_Id, PrimaryParty_Id
            for fk, ref in [("Office", "Office"), ("PrimaryParty", "Party")]:
                col_map = {fk: "Name"}
                df = db.append_id_to_dframe(
                    engine, df, ref,
                    col_map=col_map).rename(columns={f"{ref}_Id": f"{fk}_Id"})

        # create entries in <contest_type>Contest table
        # commit info in df to <contest_type>Contest table to db
        err = db.insert_to_cdf_db(engine,
                                  df.rename(columns={"Contest_Id": "Id"}),
                                  f"{contest_type}Contest")
        if err:
            if f"{contest_type}Contest" not in error:
                error[f"{contest_type}Contest"] = {}
            error[f"{contest_type}Contest"]["database"] = err
        return error