Example #1
def convert_types(df: pd.DataFrame, type_mapping: dict) -> tuple:
    """
    Uses the passed type_mapping dictionary to convert the indicated
    columns into the paired type object. Failed conversions are
    silent, so be sure to check the resulting types and explore the
    data again to find any values that failed to convert so you can
    give them additional attention.

    Args:
        df: A DataFrame.
        type_mapping: A dictionary containing column names as keys and
            python objects as values. Objects must be accepted by
            util.gconvert.

    Returns: The DataFrame, with the passed columns converted to the
        desired types, as well as a metadata dictionary.

    """
    md = u.gen_empty_md_df(df.columns)
    for col, type_ in type_mapping.items():
        result = df[col].apply(u.gconvert, args=(type_, ))
        md[col] = (result.apply(type) != df[col].apply(type)).sum()
        df[col] = result

    return df, {"metadata": md}
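For illustration only, a minimal standalone sketch of the same per-column conversion idea, using a stand-in converter since util.gconvert is not shown here; the column name and target type are hypothetical:

import pandas as pd

def _to_int(value):
    # Stand-in for util.gconvert: return the converted value, or the
    # original value unchanged if conversion fails (silent failure).
    try:
        return int(value)
    except (TypeError, ValueError):
        return value

df = pd.DataFrame({"id": ["1", "2", "x"]})
converted = df["id"].apply(_to_int)
# Count values whose python type actually changed:
changed_ct = (converted.apply(type) != df["id"].apply(type)).sum()  # 2
df["id"] = converted  # "x" is silently left as a str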
Example #2
def cleanse_redundancies(df: pd.DataFrame, redundancy_map: dict) -> tuple:
    """
    For each row in the DataFrame, if a key in redundancy_map contains
    the same value as the column(s) in the paired value, replaces the
    column(s)' value with nan, removing the redundant data.

    Args:
        df: A DataFrame.
        redundancy_map: A dictionary with master column names as keys
            (the columns that *should* contain the data) and, as
            values, one or more other columns that may redundantly
            contain the master column's value in some rows.

    Returns: The DataFrame, with redundant data removed from rows
        where it is appropriate, as well as a metadata dictionary.

    """
    for k, v in redundancy_map.items():
        redundancy_map[k] = u.tuplify(v)

    md = u.gen_empty_md_df(df.columns)
    for master, extras in redundancy_map.items():
        for e in extras:
            result = df.apply(lambda row: nan
                              if row[master] == row[e] else row[e],
                              axis=1)
            md[e] = df[e].count() - result.count()
            df[e] = result
    return df, {"metadata": md}
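A minimal sketch of the core operation with plain pandas, using hypothetical phone columns where alt_phone sometimes repeats phone:

import pandas as pd
from numpy import nan

df = pd.DataFrame({"phone": ["555-1234", "555-9999"],
                   "alt_phone": ["555-1234", "555-0000"]})
# Null out alt_phone wherever it merely repeats the master column:
result = df.apply(
    lambda row: nan if row["phone"] == row["alt_phone"] else row["alt_phone"],
    axis=1)
removed_ct = df["alt_phone"].count() - result.count()  # 1
df["alt_phone"] = result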
Example #3
def cleanse_typos(df: pd.DataFrame, cleaning_guides: dict):
    """
    Corrects typos in the passed DataFrame based on the passed
    cleaning_guides, where each key is a column name and each value is
    a dictionary of simple mappings or a CleaningGuide object.

    Args:
        df: A DataFrame.
        cleaning_guides: A dict where each key is a column name and
            each value is a dict or gd.CleaningGuide object.

    Returns: The df, with the specified columns cleaned of typos, and a
        metadata dictionary.

    """
    results = u.gen_empty_md_df(df.columns)
    for k, v in cleaning_guides.items():
        cleaning_guides[k] = gd.CleaningGuide.convert(v)

    for k, cl_guide in cleaning_guides.items():
        new = df[k].apply(cl_guide)
        # nan != nan always evaluates to True, so need to subtract the
        # number of nans from the differing values:
        results[k] = (df[k] != new).sum() - df[k].isna().sum()
        df[k] = new

    return df, {"metadata": results}
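A minimal sketch of the same idea with a plain dict standing in for a CleaningGuide object (hypothetical column and mapping):

import pandas as pd

df = pd.DataFrame({"state": ["WI", "Wis", "IL"]})
mapping = {"Wis": "WI"}
new = df["state"].apply(lambda v: mapping.get(v, v))
# Count changed values, excluding nans (nan != nan is always True):
changed_ct = (df["state"] != new).sum() - df["state"].isna().sum()  # 1
df["state"] = new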
def id_nullable_violations(
    df: pd.DataFrame, not_nullable: (list, tuple)) -> tuple:
    """
    Checks if each column in not_nullable contains no nulls.

    Args:
        df: A DataFrame.
        not_nullable: A list of columns in df that shouldn't contain
            nan values.

    Returns: The DataFrame, and a metadata dictionary.

    """
    result = u.gen_empty_md_df(df.columns, False)
    nulls = pd.DataFrame(df.isna().sum()).T
    for col in not_nullable:
        result[col] = nulls[col] > 0
    return df, {"metadata": result}
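A minimal standalone sketch of the null check itself (hypothetical columns):

import pandas as pd
from numpy import nan

df = pd.DataFrame({"id": [1, 2, nan], "note": [nan, "x", "y"]})
nulls = pd.DataFrame(df.isna().sum()).T
id_violates = (nulls["id"] > 0).iloc[0]  # True: "id" contains a null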
def count_uniques(df: pd.DataFrame):
    """
    Counts the unique values in each column in the passed DataFrame.
    Null values are not counted.

    Args:
        df: A DataFrame.

    Returns: The DataFrame, and a metadata dictionary.

    """
    md = u.gen_empty_md_df(df.columns)
    # Avoid calling nunique on raw data, which can raise errors if the
    # data contains unhashable types like lists; converting to string
    # with gconvert before counting uniques sidesteps that.
    for c in df.columns:
        md[c] = df[c].apply(u.gconvert, target_type=str).nunique()
    return df, {"metadata": md}
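A minimal sketch of why the string conversion matters, using a hypothetical column of lists, which nunique cannot hash directly:

import pandas as pd

df = pd.DataFrame({"tags": [["a"], ["a"], ["b"]]})
# df["tags"].nunique() would raise TypeError: unhashable type: 'list';
# casting to str first makes the values hashable:
unique_ct = df["tags"].apply(str).nunique()  # 2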
def normalize_whitespace(df: pd.DataFrame) -> tuple:
    """
    Simple function that applies util.clean_whitespace to every cell
    in a DataFrame.

    Args:
        df: A DataFrame.

    Returns: The DataFrame, with any string values cleaned of excess
        whitespace, as well as a metadata dictionary.

    """
    md_df = u.gen_empty_md_df(df.columns)
    for c in df.columns:
        result = df[c].apply(u.clean_whitespace)
        # Pass the index in case the DataFrame is being chunked on read:
        result = pd.DataFrame(result.to_list(), index=df.index)
        df[c] = result[1]
        md_df[c] = result[0].sum()
    return df, {"metadata": md_df}
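A minimal sketch with a stand-in for util.clean_whitespace, assuming it returns a (changed, cleaned_value) pair as the indexing above implies:

import pandas as pd

def _clean_ws(value):
    # Stand-in returning (changed, cleaned_value):
    if isinstance(value, str):
        cleaned = " ".join(value.split())
        return cleaned != value, cleaned
    return False, value

df = pd.DataFrame({"name": ["  a   b ", "c"]})
result = pd.DataFrame(df["name"].apply(_clean_ws).to_list(), index=df.index)
df["name"] = result[1]
changed_ct = result[0].sum()  # 1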
Example #7
def redistribute(df: pd.DataFrame, redistribution_guides: dict) -> tuple:
    """
    Uses the passed redistribution_guides to find matching values in
    the specified columns and move them to the destination columns.

    Args:
        df: A DataFrame.
        redistribution_guides: A dictionary with source columns as keys
            and RedistributionGuide objects as values. Tuples of
            RedistributionGuides as values are also acceptable.

    Returns: The transformed DataFrame, as well as a metadata
        dictionary.

    """
    md = u.gen_empty_md_df(df.columns)
    redistribution_guides = u.tuplify_iterable(redistribution_guides)
    for k, rd_guides in redistribution_guides.items():
        for rd_guide in rd_guides:
            result = df[k].apply(rd_guide)
            c = rd_guide.destination
            if rd_guide.mode == "overwrite":
                rd_val_ct = result.count()
                df[c] = result.fillna(df[c])
            elif rd_guide.mode == "append":
                # To properly append, need both result and destination
                # to be strings:
                df[c] = df[c].apply(u.gconvert, target_type=str)
                result = result.apply(u.gconvert, target_type=str)
                rd_val_ct = result.count()
                spaces = result.notna().replace([True, False], [" ", ""])
                df[c] = df[c] + spaces + result.fillna("")
                df[c] = df[c].fillna(result)
            else:
                df[c] = df[c].fillna(result)
                rd_val_ct = (result == df[c]).sum()
            # Replace moved values with nan:
            df.loc[result[result.notna()].index, k] = nan
            md[k] += result.count()
            md[c] += rd_val_ct
    return df, {"metadata": md}
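A minimal standalone sketch of the fill-if-empty mode with plain pandas, moving hypothetical color values out of a notes column (the matching rule stands in for a RedistributionGuide):

import pandas as pd
from numpy import nan

df = pd.DataFrame({"notes": ["blue", "ship asap", nan],
                   "color": [nan, nan, "red"]})
# Values in notes that belong in color:
matches = df["notes"].where(df["notes"].isin(["blue", "red"]))
df["color"] = df["color"].fillna(matches)
# Replace moved values with nan in the source column:
df.loc[matches.notna(), "notes"] = nan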
def fill_defaults(df: pd.DataFrame, defaults_mapping: dict) -> tuple:
    """
    Fills each column specified in defaults_mapping with the values
    contained therein.

    Args:
        df: A DataFrame.
        defaults_mapping: A dictionary with columns from df as keys
            and, as values, the value to fill that column's nan cells
            with.

    Returns: The passed DataFrame with null values filled in the
        columns specified with the values specified. Also a metadata
        dictionary.

    """
    md = u.gen_empty_md_df(df.columns)
    for k, v in defaults_mapping.items():
        md[k] = df[k].isna().sum()
        df[k] = df[k].fillna(v)
    return df, {"metadata": md}
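A minimal sketch of the fill itself with hypothetical columns and defaults:

import pandas as pd
from numpy import nan

df = pd.DataFrame({"qty": [1, nan], "status": [nan, "open"]})
defaults = {"qty": 0, "status": "unknown"}
for col, default in defaults.items():
    filled_ct = df[col].isna().sum()  # number of cells that will be filled
    df[col] = df[col].fillna(default)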
def id_type_violations(df: pd.DataFrame, required_types: dict) -> tuple:
    """
    Checks whether each value in the columns specified in the passed
    dict is an object of the paired type. Note that nan values always
    count as matching the passed type; see id_nullable_violations to
    find erroneous nulls.

    Args:
        df: A DataFrame.
        required_types: A dictionary containing keys corresponding to
            columns in df, and values corresponding to the python type
            you want each value in that column to be.

    Returns: The DataFrame, and a metadata dictionary.

    """
    result = u.gen_empty_md_df(df.columns, False)
    types = df.applymap(u.gtype)
    for col, type_ in required_types.items():
        types[col] = types[col].fillna(type_)
        result[col] = (types[col] != type_).sum() > 0
    return df, {"metadata": result}
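A minimal standalone sketch of a per-value type check that, like the function above, treats nan as a match (hypothetical column and required type):

import pandas as pd
from numpy import nan

df = pd.DataFrame({"qty": [1, "2", nan]})
types = df["qty"].apply(lambda v: type(v))
# Flag the column if any non-null value is not an int:
violates = ((types != int) & df["qty"].notna()).any()  # True, because of "2"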
Example #10
def collect_data_types(df: pd.DataFrame):
    """
    Collects the unique python data types in the passed DataFrame's
    columns and assembles a string of each unique type with the percent
    of values that type represents in that column.

    Args:
        df: A DataFrame.

    Returns: The DataFrame, and a metadata dictionary.

    """
    dtypes = df.applymap(u.get_class_name)
    orig_cols = list(dtypes.columns)
    # Counter column so the groupby sum yields the number of rows per type:
    dtypes["ctr"] = 1
    result = u.gen_empty_md_df(df.columns)
    for c in orig_cols:
        # Share of the column's values represented by each type name:
        c_pcts = (dtypes.groupby([c]).sum() / dtypes[c].count()).round(2)
        c_pcts = c_pcts.reset_index()
        # Assemble a string like "int(0.75),str(0.25)":
        result[c] = ",".join(
            c_pcts.apply(lambda s: f"{s[c]}({s.ctr})", axis=1).tolist())
    return df, {"metadata": result}
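A minimal sketch of the same summary for a single hypothetical column, using type names directly instead of util.get_class_name:

import pandas as pd

df = pd.DataFrame({"val": [1, "a", 2, 3]})
names = df["val"].apply(lambda v: type(v).__name__)
shares = (names.value_counts() / names.count()).round(2)
summary = ",".join(f"{t}({p})" for t, p in shares.items())  # "int(0.75),str(0.25)"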
Example #11
def accrete(
        df: pd.DataFrame,
        accrete_group_by: list,
        accretion_cols: (str, tuple),
        accretion_sep: str = " ",
) -> tuple:
    """
    Groups the DataFrame by the passed accrete_group_by columns and
    then combines text values in the accretion columns within each
    group.

    Args:
        df: A DataFrame.
        accrete_group_by: A list of columns to group by.
        accretion_cols: The columns you want to accrete on within
            groups created by accrete_group_by.
        accretion_sep: A string indicating how you want the combined
            string values to be separated.

    Returns: The transformed DataFrame, and a metadata dictionary.

    """
    accretion_cols = u.tuplify(accretion_cols)
    md = u.gen_empty_md_df(df.columns)
    for c in accretion_cols:
        df[c] = df[c].fillna("")
        df[c] = df[c].astype(str)
        result = df.groupby(accrete_group_by)[c].apply(
            accretion_sep.join).reset_index()
        df = df.merge(result, on=accrete_group_by, suffixes=("", "_x"))
        cx = c + "_x"
        md[c] = (df[c] != df[cx]).sum()
        df[c] = df[cx]
        df.drop(columns=cx, inplace=True)
        df[c] = df[c].str.strip()
        df[c] = df[c].apply(
            lambda x: x if len(x) > 0 and x[-1] != accretion_sep else x[:-1])
        df[c] = df[c].replace("", nan)
    return df, {"metadata": md}
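A minimal sketch of the groupby-and-join step with a hypothetical orders DataFrame:

import pandas as pd

df = pd.DataFrame({"order": [1, 1, 2], "note": ["red", "rush", "blue"]})
joined = df.groupby("order")["note"].apply(" ".join).reset_index()
df = df.merge(joined, on="order", suffixes=("", "_x"))
df["note"] = df["note_x"]  # every row in order 1 now reads "red rush"
df = df.drop(columns="note_x")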
Example #12
def complete_clusters(df: pd.DataFrame, clustered_columns: Sequence) -> tuple:
    """
    Forward propagates values in the given columns into nan values that
    follow non-nan values. Useful for report-like datasets where rows
    are clustered into groups and the grouped-on columns aren't
    repeated when the value is the same as the row above.

    Args:
        df: A DataFrame.
        clustered_columns: The columns in the DataFrame to fill nan
            values with the last valid value.

    Returns: The DataFrame, with the passed columns forward filled with
        valid values instead of nans. Also a metadata dictionary.

    """
    md_df = u.gen_empty_md_df(clustered_columns)
    for c in clustered_columns:
        before_ct = df[c].count()
        df[c] = df[c].fillna(method="ffill")
        after_ct = df[c].count()
        md_df[c] = after_ct - before_ct
    return df, {"metadata": md_df}
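A minimal sketch of the forward fill with a hypothetical report-style column:

import pandas as pd
from numpy import nan

df = pd.DataFrame({"region": ["East", nan, nan, "West", nan],
                   "sales": [1, 2, 3, 4, 5]})
before_ct = df["region"].count()
df["region"] = df["region"].ffill()
filled_ct = df["region"].count() - before_ct  # 3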
def test_gen_empty_md_df():
    expected = pd.DataFrame([dict(a=0, b=0, c=0)])
    pd.testing.assert_frame_equal(u.gen_empty_md_df(["a", "b", "c"]), expected)

    expected = pd.DataFrame([dict(a="x", b="x", c="x")])
    pd.testing.assert_frame_equal(u.gen_empty_md_df(["a", "b", "c"], "x"), expected)
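For reference, a minimal implementation consistent with this test; the real util.gen_empty_md_df (and its parameter name) may differ:

import pandas as pd

def gen_empty_md_df(columns, default=0):
    # One-row metadata DataFrame with each passed column set to default:
    return pd.DataFrame([{c: default for c in columns}])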
Example #14
def id_clustering_violations(df: pd.DataFrame, cluster_group_by: list,
                             cluster_unique_cols: list) -> tuple:
    """
    Clusters are sets of rows that share identical values in one or
    more columns and have another set of columns whose values must be
    unique within the cluster. This function identifies rows that are
    part of a cluster that violates these rules.

    Args:
        df: A DataFrame.
        cluster_group_by: A list of columns in df that, when grouped,
            define a cluster.
        cluster_unique_cols: A list of columns in df that, when the df
            is grouped on cluster_group_by, must be unique within the
            cluster. If you want combinations of columns to be unique,
            pass them as a tuple within cluster_unique_cols.

    Returns: The DataFrame, with each row appended with details about
        whether it violates clustering, and how. Also a metadata
        dictionary.

    """
    md = u.gen_empty_md_df(df.columns)
    df["row_ct"] = 1
    # Preprocess cluster_unique_cols to handle column combinations:
    u_col_names = []
    cols_to_count = []
    for c in cluster_unique_cols:
        if isinstance(c, tuple):
            cols_to_count += [*c]
            name = "_".join(c) + "_x"
            df[name] = ""
            for source_col in c:
                df[name] = df[name] + df[source_col].astype(str)
            u_col_names.append(name)
        else:
            u_col_names.append(c)
            cols_to_count.append(c)
    # Reset_index twice so each cluster has a unique id:
    cluster_row_cts = (df.groupby(cluster_group_by)
                       ["row_ct"].sum().reset_index().reset_index())
    cluster_row_cts.rename(columns={"index": "cluster_id"}, inplace=True)
    # row_ct in core df no longer necessary:
    df = df.drop(columns="row_ct")
    # Get the number of unique values in each non-grouped column per cluster:
    nu = df.groupby(cluster_group_by).nunique().reset_index()
    # Get the number of non-null values in each non-grouped column per cluster:
    ct = df.groupby(cluster_group_by).count().reset_index()
    # Combine row_cts, unique counts, and counts into core df:
    clusters = (df.merge(cluster_row_cts, how="left",
                         on=cluster_group_by).merge(
                             nu[[*cluster_group_by, *u_col_names]],
                             on=cluster_group_by,
                             how="left",
                             suffixes=("", "_nu"),
                         ).merge(
                             ct[[*cluster_group_by, *cols_to_count]],
                             on=cluster_group_by,
                             how="left",
                             suffixes=("", "_ct"),
                         ))
    # Apply a row_number to each row within each cluster:
    clusters["rn"] = clusters.groupby([*cluster_group_by, "cluster_id"
                                       ]).cumcount() + 1
    # Unique columns and column combinations must be unique:
    for c in u_col_names:
        result = clusters[c + "_nu"] != clusters["row_ct"]
        md[c] = result.sum()
        clusters[c + "_invalid"] = result
    # Unique columns must either have no values or be fully not-null:
    for c in cols_to_count:
        result = (clusters["row_ct"] !=
                  clusters[c + "_ct"]) & (clusters[c + "_ct"] != 0)
        md[c] = result.sum()
        clusters[c + "_invalid"] = result
    # Rows that fail any of the above tests are invalid:
    invalid_inds = u.broadcast_suffix(list({*u_col_names, *cols_to_count}),
                                      "_invalid")
    clusters["cluster_invalid"] = clusters[invalid_inds].any(axis=1)
    return clusters, {"metadata": md}
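A minimal standalone sketch of the uniqueness check at the heart of this function, using hypothetical order/line_no columns:

import pandas as pd

df = pd.DataFrame({"order": [1, 1, 2, 2],
                   "line_no": [1, 1, 1, 2]})  # order 1 repeats line_no 1
counts = df.groupby("order").agg(row_ct=("line_no", "size"),
                                 line_no_nu=("line_no", "nunique")).reset_index()
clusters = df.merge(counts, on="order", how="left")
clusters["line_no_invalid"] = clusters["line_no_nu"] != clusters["row_ct"]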