def fill_nan_by_mode(self, df, df_features, feature_name, z_score=None, _add_to_que=True):
    """
    Fill the nan values of a feature with that feature's most common value.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        z_score: float or int
            When truthy, the mode is computed on the result of
            zcore_remove_outliers(df, feature_name, z_score) instead of
            on the raw column.

        _add_to_que: bool
            Forwarded unchanged to fill_nan_with_specfic_value.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().

    Note:
        When the feature has no non-nan values there is no mode; the
        method then returns without touching the dataframe.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if z_score:
        observed = zcore_remove_outliers(df, feature_name, z_score).dropna()
    else:
        observed = df[feature_name].dropna()

    modes = observed.mode()
    if modes.empty:
        # Nothing observed -> no value to fill with; bail out explicitly
        # instead of falling through with an unbound replacement value.
        return

    if not self.__test_cleaning_methods:
        print("Fill nan by mode")

    # mode() may return several ties; the first entry is used.
    self.fill_nan_with_specfic_value(df,
                                     df_features,
                                     feature_name=feature_name,
                                     replace_value=modes.iloc[0],
                                     _add_to_que=_add_to_que)
def fill_nan_with_specfic_value(self, df, df_features, feature_name, replace_value, _add_to_que=True):
    """
    Replace the nan values of a feature with the given value.

    Args:
        df: pd.Dataframe
            Pandas Dataframe; the column is updated on this object.

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        replace_value:
            Value written over the nans. Objects exposing ``.item()``
            (e.g. numpy scalars) are converted through it first.

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().

    Note:
        While self.__test_cleaning_methods is truthy only the validation
        above runs: the dataframe is left untouched and nothing is queued.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    # Unwrap scalar containers; values without .item() are used as given.
    try:
        replace_value = replace_value.item()
    except AttributeError:
        pass

    if not self.__test_cleaning_methods:
        # Assign the filled column back: an in-place fillna on the
        # df[feature_name] selection is chained assignment and is not
        # guaranteed to reach df (it does not under pandas Copy-on-Write).
        df[feature_name] = df[feature_name].fillna(replace_value)

        if _add_to_que:
            # locals() is captured before any extra local is created so the
            # recorded arguments are exactly this call's parameters.
            params_dict = locals()
            parameters = get_parameters(self.fill_nan_with_specfic_value)

            print("Replace nan with {0} on feature: {1}".format(replace_value, feature_name))

            self._DataPipelineSegment__add_function_to_que("fill_nan_with_specfic_value",
                                                           parameters,
                                                           params_dict)
def make_nan_assertions(self, df, df_features, feature_name, _add_to_que=True):
    """
    Fill the nans of a boolean feature when only one value was observed.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().
        UnsatisfiedRequirments: the feature is not a bool feature, or its
            non-nan values are not exactly one distinct value equal to 0 or 1.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if feature_name not in df_features.bool_features():
        raise UnsatisfiedRequirments(f"{feature_name} must be a bool feature.")

    distinct_values = df[feature_name].dropna().unique().tolist()

    # Guard clause: anything but a single observed 0/1 cannot be asserted.
    if len(distinct_values) != 1 or (distinct_values[0] != 1 and distinct_values[0] != 0):
        raise UnsatisfiedRequirments(f"Boolean assertions can't be made with this given feature {feature_name}.")

    # NOTE(review): the fill value equals the single observed value
    # (1 -> 1, 0 -> 0), which makes the column constant; confirm this is
    # intended rather than the complement.
    self.fill_nan_with_specfic_value(df,
                                     df_features,
                                     feature_name=feature_name,
                                     replace_value=int(distinct_values[0] == 1),
                                     _add_to_que=_add_to_que)
def remove_nans(self, df, df_features, feature_name, _add_to_que=True):
    """
    Remove rows of data based on the given feature.

    Every row whose value for ``feature_name`` is nan is dropped from
    ``df`` in place and the index is rebuilt.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().

    Note:
        While self.__test_cleaning_methods is truthy only the validation
        runs: nothing is dropped and nothing is queued.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if not self.__test_cleaning_methods:
        print(f"Remove data from rows where the feature {feature_name} is equal to nan")

        # Drop on the frame itself; calling dropna on the df[feature_name]
        # Series never removes any row from df.
        df.dropna(subset=[feature_name], inplace=True)
        df.reset_index(drop=True, inplace=True)

        # NOTE(review): the column still exists in df after this call, yet
        # it is unregistered from df_features here. Kept as-is for
        # compatibility; confirm whether this is intended.
        df_features.remove_feature(feature_name)

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.remove_nans)
            self._DataPipelineSegment__add_function_to_que("remove_nans",
                                                           parameters,
                                                           params_dict)
def fill_nan_by_average(self, df, df_features, feature_name, z_score=None, _add_to_que=True):
    """
    Fill the nan values of a numerical feature with the feature's mean.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        z_score: float or int
            When truthy, the mean is computed on the result of
            zcore_remove_outliers(df, feature_name, z_score) instead of
            on the raw column.

        _add_to_que: bool
            Forwarded unchanged to fill_nan_with_specfic_value.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().
        UnsatisfiedRequirments: feature_name is not listed in
            df_features.continuous_numerical_features().
        ValueError: z_score is truthy but neither an int nor a float.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if feature_name not in df_features.continuous_numerical_features():
        raise UnsatisfiedRequirments(f"{feature_name} must be a saved as float or integer in df_features")

    if z_score:
        if not isinstance(z_score, (int, float)):
            raise ValueError("Z-Score must be at numerical value.")
        series_obj = zcore_remove_outliers(df, feature_name, z_score).dropna()
    else:
        series_obj = df[feature_name].dropna()

    replace_value = series_obj.mean()

    if not self.__test_cleaning_methods:
        print("Fill nan based on the average of the distribution.")

    # The actual fill (and any queueing) is delegated.
    self.fill_nan_with_specfic_value(df,
                                     df_features,
                                     feature_name=feature_name,
                                     replace_value=replace_value,
                                     _add_to_que=_add_to_que)
def drop_feature(self, df, df_features, feature_name, _add_to_que=True):
    """
    Drop a single feature (column) from the dataframe.

    The column is removed from ``df`` in place, the index is rebuilt and
    the feature is unregistered from ``df_features``.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    # In test mode only the validation above is exercised.
    if self.__test_cleaning_methods:
        return

    print("Droping Feature: ", feature_name)

    df.drop(columns=feature_name, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df_features.remove_feature(feature_name)

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.drop_feature)
        self._DataPipelineSegment__add_function_to_que("drop_feature",
                                                       parameters,
                                                       params_dict)
def remove_features(self, df, df_features, feature_names, _add_to_que=True):
    """
    Drop one or more unwanted features from the dataframe.

    Each name is unregistered from ``df_features`` (when present there)
    and its column is dropped from ``df`` in place. A KeyError raised
    while handling a name is swallowed and the loop moves on to the next
    name, so unknown features are skipped rather than reported.

    Args:
        df:
            Pandas Dataframe to update.

        df_features:
            DataFrameTypes object to update.

        feature_names:
            Features to remove; a single string is treated as a
            one-element list.

        _add_to_que:
            Pushes the function to pipeline segment parent if set to 'True'.
    """
    # Normalise a lone name to a list so the loop handles both shapes.
    if isinstance(feature_names, str):
        feature_names = [feature_names]

    for feature_n in feature_names:
        try:
            if feature_n in df_features.all_features():
                df_features.remove_feature(feature_n)

            check_if_feature_exists(df, feature_n)
            df.drop(columns=[feature_n], inplace=True)
        except KeyError:
            # Best effort: skip names that cannot be resolved.
            pass

    if not _add_to_que:
        return

    params_dict = locals()
    parameters = get_parameters(self.remove_features)
    self._DataPipelineSegment__add_function_to_que("remove_features",
                                                   parameters,
                                                   params_dict)
def fill_nan_by_occurance_percentaile(self, df, df_features, feature_name, percentaile, z_score=None, _add_to_que=True):
    """
    Fill nans with the value whose share of occurrences is closest to a target.

    The share of each distinct non-nan value is computed and the value
    whose share is nearest to ``percentaile / 100`` becomes the fill value.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        percentaile: float or int
            Target occurrence share expressed in percent (divided by 100
            before the comparison).

        z_score: float or int
            When truthy, occurrences are counted on the result of
            zcore_remove_outliers(df, feature_name, z_score) instead of
            on the raw column.

        _add_to_que: bool
            Forwarded unchanged to fill_nan_with_specfic_value.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if z_score:
        series_obj = zcore_remove_outliers(df, feature_name, z_score).dropna()
    else:
        series_obj = df[feature_name].dropna()

    # Count once and reuse for both the ratios and the value lookup.
    counts = series_obj.value_counts()

    # Normalise by the number of observed values of THIS feature. Dividing
    # by the count of rows that are nan-free in every column mixes two
    # populations and can yield shares above 1 (or a division by zero).
    occurrence_ratios = np.asarray(counts / counts.sum())

    idx = np.abs(occurrence_ratios - (percentaile / 100)).argmin()
    replace_value = counts.index[idx]

    if not self.__test_cleaning_methods:
        print("Fill nan by occurance percentaile.")

    # The actual fill (and any queueing) is delegated.
    self.fill_nan_with_specfic_value(df,
                                     df_features,
                                     feature_name=feature_name,
                                     replace_value=replace_value,
                                     _add_to_que=_add_to_que)
def fill_nan_by_distribution(self, df, df_features, feature_name, percentile, z_score=None, _add_to_que=True):
    """
    Fill nan by the distribution of data.

    The fill value is ``np.percentile`` of the feature's non-nan values
    at the requested percentile.

    Args:
        df: pd.Dataframe
            Pandas Dataframe

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        percentile: float or int
            Percentile handed to np.percentile.

        z_score: float or int
            When truthy, the percentile is taken from the result of
            zcore_remove_outliers applied to the sorted feature values
            instead of from the raw column.

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: feature_name is not listed in df_features.all_features().
        ValueError: z_score is truthy but neither an int nor a float.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if z_score:
        if not isinstance(z_score, (int, float)):
            raise ValueError("Z-Score must be at numerical value.")

        # The sorted view is only consumed by the outlier filter, so it is
        # built on this path only instead of being computed and discarded.
        if feature_name in df_features.continuous_numerical_features():
            sorted_series = df[feature_name].sort_values()
        else:
            sorted_series = df.sort_values(
                [feature_name],
                ascending=True).groupby(feature_name).head(float("inf"))[feature_name]

        series_obj = zcore_remove_outliers(sorted_series.to_frame(),
                                           feature_name,
                                           z_score).dropna()
    else:
        series_obj = df[feature_name].dropna()

    replace_value = np.percentile(series_obj, percentile)

    if not self.__test_cleaning_methods:
        print(f"Fill nan on distribution; {percentile}% of {feature_name}")

    # The actual fill (and any queueing) is delegated.
    self.fill_nan_with_specfic_value(df,
                                     df_features,
                                     feature_name=feature_name,
                                     replace_value=replace_value,
                                     _add_to_que=_add_to_que)