def convert_types(df: pd.DataFrame, type_mapping: dict) -> tuple:
    """
    Uses the passed type_mapping dictionary to convert the indicated
    columns into the paired type object. Errors in type conversion
    will silently fail, so be sure to check types and maybe explore
    again to see if there are any pieces of data that failed to
    convert and give them additional attention.

    Args:
        df: A DataFrame.
        type_mapping: A dictionary containing column names as keys and
            python objects as values. Objects must be accepted by
            util.gconvert.

    Returns:
        The DataFrame, with the passed columns converted to the
        desired types, as well as a metadata dictionary.
    """
    md = u.gen_empty_md_df(df.columns)
    for col, target in type_mapping.items():
        converted = df[col].apply(u.gconvert, args=(target,))
        # A cell counts as changed when its python type differs after
        # conversion:
        changed = converted.apply(type) != df[col].apply(type)
        md[col] = changed.sum()
        df[col] = converted
    return df, {"metadata": md}
def cleanse_redundancies(df: pd.DataFrame, redundancy_map: dict) -> tuple:
    """
    For each row in the DataFrame, if a key in redundancy_map contains
    the same value as the column(s) in the paired value, replaces the
    column(s)' value with nan, removing the redundant data.

    Args:
        df: A DataFrame.
        redundancy_map: A dictionary with master column names as keys
            (the columns that *should* contain the data) and one or
            more other columns that some rows may also contain the
            value in the master column.

    Returns:
        The DataFrame, with redundant data removed from rows where it
        is appropriate, as well as a metadata dictionary.
    """
    # Normalize values into a local dict so the caller's mapping is
    # not mutated as a side effect (previously this wrote tuplified
    # values back into redundancy_map while iterating it):
    normalized = {k: u.tuplify(v) for k, v in redundancy_map.items()}
    md = u.gen_empty_md_df(df.columns)
    for master, extras in normalized.items():
        for e in extras:
            result = df.apply(
                lambda row: nan if row[master] == row[e] else row[e],
                axis=1)
            # count() ignores nans, so the drop in count is the number
            # of redundant values cleared from this column:
            md[e] = df[e].count() - result.count()
            df[e] = result
    return df, {"metadata": md}
def cleanse_typos(df: pd.DataFrame, cleaning_guides: dict) -> tuple:
    """
    Corrects typos in the passed DataFrame based on keyword args where
    the key is the column and the arg is a dictionary of simple
    mappings or a CleaningGuide object.

    Args:
        df: A DataFrame.
        cleaning_guides: A dict where each key is a column name and
            each value is a dict or gd.CleaningGuide object.

    Returns:
        The df, with the specified columns cleaned of typos, and a
        metadata dictionary.
    """
    results = u.gen_empty_md_df(df.columns)
    # Convert into a local dict so the caller's cleaning_guides is not
    # mutated in place (previously converted guides were written back
    # into the passed dict):
    guides = {
        k: gd.CleaningGuide.convert(v) for k, v in cleaning_guides.items()
    }
    for k, cl_guide in guides.items():
        new = df[k].apply(cl_guide)
        # nan != nan always evaluates to True, so need to subtract the
        # number of nans from the differing values:
        results[k] = (df[k] != new).sum() - df[k].isna().sum()
        df[k] = new
    return df, {"metadata": results}
def id_nullable_violations(
        df: pd.DataFrame, not_nullable: (list, tuple)) -> tuple:
    """
    Checks if each column in not_nullable contains no nulls.

    Args:
        df: A DataFrame.
        not_nullable: A list of columns in df that shouldn't contain
            nan values.

    Returns:
        The DataFrame, and a metadata dictionary.
    """
    # Metadata defaults every column to False (no violation):
    result = u.gen_empty_md_df(df.columns, False)
    for col in not_nullable:
        # Flag the column when it holds one or more nan values:
        result[col] = df[col].isna().sum() > 0
    return df, {"metadata": result}
def count_uniques(df: pd.DataFrame):
    """
    Counts the unique values in each column in the passed DataFrame.
    Null values are not counted.

    Args:
        df: A DataFrame.

    Returns:
        The DataFrame, and a metadata dictionary.
    """
    md = u.gen_empty_md_df(df.columns)
    for col in df.columns:
        # Stringify via gconvert before nunique, since running nunique
        # on raw data can raise errors when cells hold unexpected data
        # types like lists:
        as_str = df[col].apply(u.gconvert, target_type=str)
        md[col] = as_str.nunique()
    return df, {"metadata": md}
def normalize_whitespace(df: pd.DataFrame) -> tuple:
    """
    Simple function that applies util.clean_whitespace to every cell
    in a DataFrame.

    Args:
        df: A DataFrame.

    Returns:
        The DataFrame, with any string values cleaned of excess
        whitespace, and a metadata dictionary.
    """
    md_df = u.gen_empty_md_df(df.columns)
    for col in df.columns:
        pairs = df[col].apply(u.clean_whitespace)
        # Each cell result expands into two columns (position 0 is
        # summed for the metadata count, position 1 is the cleaned
        # value). Pass the index in case the DataFrame is being
        # chunked on read:
        expanded = pd.DataFrame(pairs.to_list(), index=df.index)
        md_df[col] = expanded[0].sum()
        df[col] = expanded[1]
    return df, {"metadata": md_df}
def redistribute(df: pd.DataFrame, redistribution_guides: dict) -> tuple:
    """
    Uses the passed redistribution_guides to find matching values in
    the specified columns and move them to the destination columns.

    Args:
        df: A DataFrame.
        redistribution_guides: A dictionary with source columns as
            keys and RedistributionGuide objects as values. Tuples of
            RedistributionGuides as values are also acceptable.

    Returns:
        The transformed DataFrame, as well as a metadata dictionary.
    """
    md = u.gen_empty_md_df(df.columns)
    # Ensure every value is a tuple of guides so the inner loop below
    # can treat single guides and tuples of guides uniformly:
    redistribution_guides = u.tuplify_iterable(redistribution_guides)
    for k, rd_guides in redistribution_guides.items():
        for rd_guide in rd_guides:
            # Applying the guide yields the values to move (nan where
            # the guide did not match — presumably; confirm against
            # RedistributionGuide's implementation):
            result = df[k].apply(rd_guide)
            c = rd_guide.destination
            if rd_guide.mode == "overwrite":
                # Matched values replace the destination; unmatched
                # (nan) slots keep the destination's existing values:
                rd_val_ct = result.count()
                df[c] = result.fillna(df[c])
            elif rd_guide.mode == "append":
                # To properly append, need both result and destination
                # to be strings:
                df[c] = df[c].apply(u.gconvert, target_type=str)
                result = result.apply(u.gconvert, target_type=str)
                rd_val_ct = result.count()
                # Insert a space only where there is a matched value
                # to append:
                spaces = result.notna().replace([True, False], [" ", ""])
                df[c] = df[c] + spaces + result.fillna("")
                # Where the destination was nan, string concatenation
                # yields nan, so fall back to the moved value alone:
                df[c] = df[c].fillna(result)
            else:
                # Default ("fillna"-like) mode: moved values only fill
                # destination cells that are currently nan:
                df[c] = df[c].fillna(result)
                rd_val_ct = (result == df[c]).sum()
            # Replace moved values with nan:
            df.loc[result[result.notna()].index, k] = nan
            # Track values moved out of the source and into the
            # destination:
            md[k] += result.count()
            md[c] += rd_val_ct
    return df, {"metadata": md}
def fill_defaults(df: pd.DataFrame, defaults_mapping: dict) -> tuple:
    """
    Fills each column specified in defaults_mapping with the values
    contained therein.

    Args:
        df: A DataFrame.
        defaults_mapping: A dictionary containing columns from df as
            keys and values being the value to fill nan cells in that
            column with.

    Returns:
        The passed DataFrame with null values filled in the columns
        specified with the values specified. Also a metadata
        dictionary.
    """
    md = u.gen_empty_md_df(df.columns)
    for column, default in defaults_mapping.items():
        # Record how many cells will be filled before overwriting:
        md[column] = df[column].isna().sum()
        df[column] = df[column].fillna(default)
    return df, {"metadata": md}
def id_type_violations(df: pd.DataFrame, required_types: dict) -> tuple:
    """
    Checks if each value in the columns specified in the passed dict
    is an object of the passed type. Note that nan values will always
    count as matching the passed type, see id_nullable_violations to
    find erroneous nulls.

    Args:
        df: A DataFrame.
        required_types: A dictionary containing keys corresponding to
            columns in df, and values corresponding to the python type
            you want each value in that column to be.

    Returns:
        The DataFrame, and a metadata dictionary.
    """
    result = u.gen_empty_md_df(df.columns, False)
    observed = df.applymap(u.gtype)
    for col, expected in required_types.items():
        # Fill nan cells with the expected type before comparing so
        # nulls never register as violations:
        col_types = observed[col].fillna(expected)
        result[col] = (col_types != expected).sum() > 0
    return df, {"metadata": result}
def collect_data_types(df: pd.DataFrame):
    """
    Collects the unique python data types in the passed DataFrame's
    columns and assembles a string of each unique type with the
    percent of values that type represents in that column.

    Args:
        df: A DataFrame.

    Returns:
        The DataFrame, and a metadata dictionary.
    """
    # Per-cell class names (strings), same shape as df:
    dtypes = df.applymap(u.get_class_name)
    orig_cols = list(dtypes.columns)
    # Counter column: summing it within a groupby yields the number of
    # cells of each class name:
    dtypes["ctr"] = 1
    result = u.gen_empty_md_df(df.columns)
    for c in orig_cols:
        # Count of each class name in column c divided by the number
        # of non-null cells, rounded to 2 places. NOTE(review): the
        # sum() also aggregates the other string columns — presumably
        # only the ctr column is consumed below; confirm this holds on
        # the pandas version in use (newer pandas may object to
        # summing string columns):
        c_pcts = (dtypes.groupby([c]).sum() / dtypes[c].count()).round(2)
        c_pcts = c_pcts.reset_index()
        # Assemble e.g. "str(0.75),int(0.25)" for the metadata cell:
        result[c] = ",".join(
            c_pcts.apply(lambda s: f"{s[c]}({s.ctr})", axis=1).tolist())
    return df, {"metadata": result}
def accrete(
        df: pd.DataFrame,
        accrete_group_by: list,
        accretion_cols: (str, tuple),
        accretion_sep: str = " ",
) -> tuple:
    """
    Groups the dataframe by the passed group_by values and then
    combines text values in the accretion columns.

    Args:
        df: A DataFrame.
        accrete_group_by: A list of columns to group by.
        accretion_cols: The columns you want to accrete on within
            groups created by accrete_group_by.
        accretion_sep: A string indicating how you want the combined
            string values to be separated.

    Returns:
        The transformed DataFrame, and a metadata dictionary.
    """
    def _trim_trailing_sep(x: str) -> str:
        # Joining nan-filled ("") values can leave a dangling
        # separator at the end of the accreted string. The previous
        # check (x[-1] != accretion_sep) only worked for
        # single-character separators; endswith handles any length.
        # Guard against an empty separator, for which x[:-0] would
        # wrongly empty the string:
        if accretion_sep and x.endswith(accretion_sep):
            return x[:-len(accretion_sep)]
        return x

    accretion_cols = u.tuplify(accretion_cols)
    md = u.gen_empty_md_df(df.columns)
    for c in accretion_cols:
        # join requires strings, so fill nans and stringify first:
        df[c] = df[c].fillna("")
        df[c] = df[c].astype(str)
        result = df.groupby(accrete_group_by)[c].apply(
            accretion_sep.join).reset_index()
        # Merge the accreted values back in alongside the originals:
        df = df.merge(result, on=accrete_group_by, suffixes=("", "_x"))
        cx = c + "_x"
        # Rows whose value changed after accretion:
        md[c] = (df[c] != df[cx]).sum()
        df[c] = df[cx]
        df.drop(columns=cx, inplace=True)
        df[c] = df[c].str.strip()
        df[c] = df[c].apply(_trim_trailing_sep)
        # Groups that were entirely nan end up as "" — restore nan:
        df[c] = df[c].replace("", nan)
    return df, {"metadata": md}
def complete_clusters(df: pd.DataFrame, clustered_columns: Sequence) -> tuple:
    """
    Forward propagates values in the given columns into nan values
    that follow non-nan values. Useful when you have a report-like
    dataset where the rows are clustered into groups where the columns
    that were grouped on aren't repeated if they're the same value.

    Args:
        df: A DataFrame.
        clustered_columns: The columns in the DataFrame to fill nan
            values with the last valid value.

    Returns:
        The DataFrame, with the passed columns forward filled with
        valid values instead of nans. Also a metadata dictionary.
    """
    md_df = u.gen_empty_md_df(clustered_columns)
    for c in clustered_columns:
        # count() ignores nans, so the before/after difference is the
        # number of cells that got filled:
        before_ct = df[c].count()
        # .ffill() replaces fillna(method="ffill"), which is
        # deprecated as of pandas 2.1 (removed in 3.0); behavior is
        # identical:
        df[c] = df[c].ffill()
        after_ct = df[c].count()
        md_df[c] = after_ct - before_ct
    return df, {"metadata": md_df}
def test_gen_empty_md_df():
    # gen_empty_md_df should build a one-row frame with the given
    # columns, defaulting cells to 0 or to the passed fill value:
    cols = ["a", "b", "c"]
    pd.testing.assert_frame_equal(
        u.gen_empty_md_df(cols),
        pd.DataFrame({c: [0] for c in cols}))
    pd.testing.assert_frame_equal(
        u.gen_empty_md_df(cols, "x"),
        pd.DataFrame({c: ["x"] for c in cols}))
def id_clustering_violations(df: pd.DataFrame, cluster_group_by: list,
                             cluster_unique_cols: list) -> tuple:
    """
    Clusters are sets of rows that share one or more identical columns
    and have another set of columns which must be unique within the
    cluster. This function identifies rows that are part of a cluster
    that violates these rules.

    Args:
        df: A DataFrame.
        cluster_group_by: A list of columns in df that, when grouped,
            define a cluster.
        cluster_unique_cols: A list of columns in df that, when the df
            is grouped on cluster_group_by, must be unique within the
            cluster. If you want combinations of columns to be unique,
            pass them as a tuple within cluster_unique_cols.

    Returns:
        The DataFrame, with each row appended with details about
        whether it violates clustering, and how. Also a metadata
        dictionary.
    """
    md = u.gen_empty_md_df(df.columns)
    # Helper column so a groupby sum yields per-cluster row counts:
    df["row_ct"] = 1
    # Preprocess cluster_unique_cols to handle column combinations:
    u_col_names = []
    cols_to_count = []
    for c in cluster_unique_cols:
        if isinstance(c, tuple):
            # A tuple means the *combination* of columns must be
            # unique; build a synthetic concatenated column for it:
            cols_to_count += [*c]
            name = "_".join(c) + "_x"
            df[name] = ""
            for source_col in c:
                df[name] = df[name] + df[source_col].astype(str)
            u_col_names.append(name)
        else:
            u_col_names.append(c)
            cols_to_count.append(c)
    # Reset_index twice so each cluster has a unique id:
    cluster_row_cts = (df.groupby(cluster_group_by)
                       ["row_ct"].sum().reset_index().reset_index())
    cluster_row_cts.rename(columns={"index": "cluster_id"}, inplace=True)
    # row_ct in core df no longer necessary:
    df = df.drop(columns="row_ct")
    # Get the count of each unique value in the ungrouped columns:
    nu = df.groupby(cluster_group_by).nunique().reset_index()
    # Get the count of each value in the ungrouped columns:
    ct = df.groupby(cluster_group_by).count().reset_index()
    # Combine row_cts, unique counts, and counts into core df:
    clusters = (df.merge(cluster_row_cts, how="left",
                         on=cluster_group_by).merge(
        nu[[*cluster_group_by, *u_col_names]],
        on=cluster_group_by,
        how="left",
        suffixes=("", "_nu"),
    ).merge(
        ct[[*cluster_group_by, *cols_to_count]],
        on=cluster_group_by,
        how="left",
        suffixes=("", "_ct"),
    ))
    # Apply a row_number to each row within each cluster:
    clusters["rn"] = clusters.groupby([*cluster_group_by, "cluster_id"
                                       ]).cumcount() + 1
    # Unique columns and column combinations must be unique (distinct
    # count must equal the cluster's row count):
    for c in u_col_names:
        result = clusters[c + "_nu"] != clusters["row_ct"]
        # NOTE(review): for tuple combinations, c is the synthetic
        # "<a>_<b>_x" name, which is not among md's original columns —
        # this adds a new column to md; confirm that's intended.
        md[c] = result.sum()
        clusters[c + "_invalid"] = result
    # Unique columns must either have no values or be fully not-null:
    for c in cols_to_count:
        result = (clusters["row_ct"] != clusters[c + "_ct"]) & (clusters[c + "_ct"] != 0)
        md[c] = result.sum()
        clusters[c + "_invalid"] = result
    # Rows that fail any of the above tests are invalid (the set
    # de-duplicates columns that appear in both lists):
    invalid_inds = u.broadcast_suffix(list({*u_col_names, *cols_to_count}),
                                      "_invalid")
    clusters["cluster_invalid"] = clusters[invalid_inds].any(axis=1)
    return clusters, {"metadata": md}