def categories_ord_cols(df, ord_cols):
    '''Returns the categories of the ordinal columns in a format that can be displayed in the front-end

    Parameters:
    -----------
    df: pd.DataFrame
    ord_cols: list of str, containing the names of the ordinal columns
    '''
    result = []
    df_new = make_missing_np_nan(df)
    for col in ord_cols:
        col_result = {"name": col, "values": []}
        i = 1
        categories = list(set(df_new[col]))
        # remove the NaN entry (if present) so it is not offered as a category
        if pd.isnull(categories).any():
            idx = np.where(pd.isnull(categories))[0][0]
            del categories[idx]
        for category in categories:
            col_result['values'].append({
                "name": category,
                "id": i,
                "fixed": "false"
            })
            i += 1
        result.append(col_result)
    return result

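# Illustrative sketch (not part of the original module): what the front-end payload
# produced by categories_ord_cols might look like for a small, hypothetical dataframe.
def _example_categories_ord_cols():
    df = pd.DataFrame({"size": ["S", "M", np.nan, "L", "M"]})
    payload = categories_ord_cols(df, ["size"])
    # payload resembles (category order may vary, since categories come from a set):
    # [{"name": "size",
    #   "values": [{"name": "S", "id": 1, "fixed": "false"},
    #              {"name": "M", "id": 2, "fixed": "false"},
    #              {"name": "L", "id": 3, "fixed": "false"}]}]
    return payload
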
def MICE_imputation(df, categorical=False, nr_iter=3):
    '''Returns the dataframes where missing values are imputed using MICE

    Parameters:
    -----------
    df: pd.DataFrame
    categorical: boolean, if set to True, the returned dataframe will contain the original
        category values (as opposed to their integer index)
    nr_iter: int, the number of imputations to be generated

    Returns:
    --------
    result: list of nr_iter pd.DataFrames in which the missing values are imputed using MICE
    '''
    df_new = df.copy()
    df_new = make_missing_np_nan(df_new)
    # columns flagged by the missing/unique heuristic are removed before imputation
    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)
    # only numerical columns are imputed
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]
    columns = df_new.columns

    result = [0] * nr_iter
    for i in range(nr_iter):
        # sample_posterior=True yields a different imputation in every iteration
        imputer = IterativeImputer(sample_posterior=True)
        imputed = imputer.fit_transform(df_new)
        df_imputed = pd.DataFrame(imputed, columns=columns)
        result[i] = df_imputed
    return result

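# Illustrative sketch (an assumption, not part of the original pipeline): the list of
# frames returned by MICE_imputation can be pooled into a single imputed frame, for
# example by taking the element-wise mean across the nr_iter imputations.
def _example_pool_mice(df, nr_iter=3):
    imputations = MICE_imputation(df, nr_iter=nr_iter)
    pooled = sum(imputations) / nr_iter  # element-wise mean of the imputed numerical frames
    return pooled
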
def KNN_imputation(df, k=5):
    '''Imputes the missing values in a dataframe using K-Nearest Neighbors

    Parameters:
    -----------
    df: pd.DataFrame
    k: int, number of neighboring samples to use for imputation

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using KNN
    '''
    df_new = df.copy()
    df_new = make_missing_np_nan(df_new)
    missing, unique = imputation_heuristic_column(df, 1)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]
    columns = df_new.columns

    imputer = KNNImputer(n_neighbors=k)
    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)
    return df_result

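# Illustrative sketch (hypothetical data): typical use of KNN_imputation on a small
# numerical frame with a couple of missing entries.
def _example_knn_imputation():
    df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                       "b": [10.0, np.nan, 30.0, 40.0]})
    return KNN_imputation(df, k=2)
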
def regression_imputation(df):
    '''Returns the dataframe where missing values are imputed using IterativeImputer with
    its default BayesianRidge() estimator; this is a regression imputation method.

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using regression imputation
    '''
    df_new = df.copy()
    df_new = make_missing_np_nan(df_new)
    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]
    columns = df_new.columns

    imputer = IterativeImputer(random_state=0)
    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)
    return df_result

def median_imputation(df):
    '''Imputes the missing values in a dataframe using median imputation

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using the median
    '''
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    df = make_missing_np_nan(df)
    cat_cols, date_cols, num_cols = type_cols(df)
    df_new = df[num_cols]
    columns = df_new.columns

    df_imputed = imp_median.fit_transform(df_new)
    df_imputed = pd.DataFrame(df_imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)
    return df_result

def get_outliers_info(df, outlier_method):
    '''Produces, for the front-end, the detected outliers, their three-sigma values,
    the detection scores, and the corresponding outlier plot

    Parameters:
    ----------
    df: pd.DataFrame
    outlier_method: str, one of the following outlier detection methods:
        ['LOF', 'IF', 'SVM', 'KNN', 'VAE']
    '''
    df_new = make_missing_np_nan(df)
    df_new = df_new.dropna()
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    if outlier_method == "LOF":
        outliers = detect_outliers_LOF(df_new)
    elif outlier_method == "IF":
        outliers = detect_outliers_IF(df_new)
    elif outlier_method == "SVM":
        outliers = detect_outliers_SVM(df_new)
    elif outlier_method == "KNN":
        outliers = detect_outliers_KNN(df_new)
    elif outlier_method == "VAE":
        outliers = detect_outliers_VAE(df_new)
    else:
        raise ValueError("Unknown outlier_method: {}".format(outlier_method))

    return outliers, three_sigma(outliers), detection_scores(outliers), outlier_scores_plot(outliers)

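# Illustrative sketch: unpacking the four values returned by get_outliers_info, here
# with Isolation Forest ("IF"); the dataframe is assumed to contain numerical columns.
def _example_outlier_report(df):
    outliers, sigmas, scores, plot = get_outliers_info(df, outlier_method="IF")
    return {"outliers": outliers, "three_sigma": sigmas, "scores": scores, "plot": plot}
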
def line_plot(df, target=False, color=False, plot_range=None):
    '''Returns a line plot of a prespecified column

    Parameters:
    -----------
    df: pd.DataFrame
    target: str, name of the column; defaults to the last column of the dataframe
        (which is usually the target variable)
    color: str, name of a column that determines the color of the lines
        (in combination with the target)
    plot_range: list with 2 values, indicating the range of the x-axis
    '''
    if not target:
        target = df.columns[-1]  # defaults to the "target" variable
    df = make_missing_np_nan(df, replace_with='nan')

    if color:
        df_small = df[[target, color]]
        fig = go.Figure()
        # one line per value of the color column
        for val in set(df_small[color]):
            df_new = df_small[df_small[color] == val]
            unique_vals, unique_vals_counts = np.unique(df_new[target].tolist(),
                                                        return_counts=True)
            fig.add_trace(
                go.Scatter(x=unique_vals,
                           y=unique_vals_counts,
                           mode='lines',
                           name=val))
        fig.update_layout(xaxis={'range': plot_range, 'title': target},
                          yaxis={'title': 'count'})
        return fig
    else:
        unique_vals, unique_vals_counts = np.unique(df[target].tolist(),
                                                    return_counts=True)
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(x=unique_vals,
                       y=unique_vals_counts,
                       mode='lines',
                       name=target))
        fig.update_layout(xaxis={'range': plot_range, 'title': target},
                          yaxis={'title': 'count'})
        return fig

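# Illustrative sketch (hypothetical column names): plotting the value counts of an
# "age" column, split into one line per "gender" value, with the x-axis limited to 0-100.
def _example_line_plot(df):
    fig = line_plot(df, target="age", color="gender", plot_range=[0, 100])
    fig.show()
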
def nr_rows_missing(df):
    '''Returns the number of rows with missing values

    Parameters:
    -----------
    df: pd.DataFrame

    Returns:
    --------
    int: number of rows with missing values
    '''
    df_all_nan = make_missing_np_nan(df)
    return len(df_all_nan[df_all_nan.isnull().any(axis=1)])

def inference(df, fast=True):
    '''Returns a dictionary with information about each column in a dataframe.
    The information includes:
        - detected data type        -> data_type
        - nr. of missing values     -> nr_missing
        - % of missing values       -> pct_missing
        - nr. of unique values      -> nr_unique
        - % of unique values        -> pct_unique
        - a distribution plot       -> distribution_plot

    Parameters:
    -----------
    df: pd.DataFrame
    fast: bool, indicates whether to run fast (less accurate) or slow (more accurate)
        data type inference
    '''
    df = make_missing_np_nan(df)  # set all missing value encodings to np.nan
    if fast:
        data_types = detect_datatypes(df)
    else:
        data_types = detect_datatypes_ptype(df)

    result_inference = {}
    for col in df.columns:
        unique_vals, unique_vals_counts = np.unique(
            [str(int_element) for int_element in df[col].tolist()], return_counts=True)
        nr_missing = int(df[col].isnull().sum())
        result_inference[col] = {
            'data_type': str(data_types[col]),
            'nr_missing': nr_missing,
            'pct_missing': '{:.1f}%'.format(nr_missing / len(df) * 100),
            'nr_unique': int(len(unique_vals)),
            'pct_unique': '{:.1f}%'.format(len(unique_vals) / len(df) * 100),
            'distribution_plot': distribution_plot(col, df[col], str(data_types[col]),
                                                   unique_vals, unique_vals_counts)
        }
    return result_inference

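# Illustrative sketch: printing a compact per-column summary from inference(), leaving
# out the distribution plots (which are objects meant for the front-end).
def _example_print_inference(df):
    summary = inference(df, fast=True)
    for col, info in summary.items():
        print(col, info['data_type'], info['nr_missing'],
              info['pct_missing'], info['nr_unique'], info['pct_unique'])
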
def RF_imputation(df, fast=True):
    '''Returns the dataframe where missing values are imputed using Random Forest imputation
    (sklearn). By default ExtraTreesRegressor is used for increased speed.

    Parameters:
    -----------
    df: pd.DataFrame
    fast: boolean, if set to True, ExtraTreesRegressor is used instead of RandomForestRegressor

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using Random Forest (MissForest)
    '''
    df_new = df.copy()
    df_new = make_missing_np_nan(df_new)
    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    # categorical and datetime columns cannot be imputed, so they are removed from the
    # imputation dataframe
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]
    columns = df_new.columns

    if fast:
        imputer = IterativeImputer(random_state=0,
                                   estimator=ExtraTreesRegressor(n_estimators=10,
                                                                 random_state=0))
    else:
        imputer = IterativeImputer(random_state=0,
                                   estimator=RandomForestRegressor(n_estimators=10,
                                                                   random_state=0))
    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    # categorical and datetime columns are added back
    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)
    return df_result

def placeholder_imputation(df, col, placeholder):
    '''Returns the dataframe with the missing values of the specified column replaced by a placeholder

    Parameters:
    -----------
    df: pd.DataFrame
    col: str, corresponding to a categorical column in the dataframe
    placeholder: str, placeholder value to replace the missing values with

    Returns:
    --------
    pd.DataFrame: where the missing values of the specified column are replaced by the placeholder
    '''
    df = make_missing_np_nan(df)
    df[col] = df[col].fillna(placeholder)
    return df

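# Illustrative sketch (hypothetical column name): filling the missing values of a
# categorical "city" column with an explicit "Unknown" category.
def _example_placeholder(df):
    return placeholder_imputation(df, col="city", placeholder="Unknown")
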
def DL_imputation(df, categorical=True):
    '''Returns the dataframe where missing values are imputed using DataWig

    Parameters:
    -----------
    df: pd.DataFrame
    categorical: boolean, if set to True, the returned dataframe will contain the original
        category values (as opposed to their integer index)

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using DataWig
    '''
    df_new = df.copy()
    df_new = make_missing_np_nan(df_new)
    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]
    columns = df_new.columns

    # DataWig's SimpleImputer.complete imputes every column with missing values, so no
    # explicit input/output columns need to be specified here
    imputed = simple_imputer.SimpleImputer.complete(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)
    return df_result

def __init__(self, data):
    self.data = make_missing_np_nan(data)