def make_values_bool(self, df, df_features, _add_to_que=True):
    """
    Convert string representations of booleans into numeric 1/0 values.

    Only features returned by ``df_features.bool_features()`` whose column
    has object dtype are inspected. A column is rewritten only when the
    string-value check reports a usable true/false pair for it.

    Args:
        df: pd.Dataframe
            Pandas dataframe; its columns are updated in place.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.
    """
    for bool_feature in df_features.bool_features():
        if df[bool_feature].dtype != "O":
            continue

        bool_check, true_val, false_val = self.__bool_string_values_check(
            df[bool_feature].dropna().unique())

        # Replace bool string values with bools. The result is assigned back
        # to the column: calling an inplace method on ``df[col]`` only
        # touches a temporary object when pandas Copy-on-Write is active.
        if bool_check:
            df[bool_feature] = df[bool_feature].replace({true_val: 1,
                                                         false_val: 0})

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.make_values_bool)
        self._DataPipelineSegment__add_function_to_que(
            "make_values_bool", parameters, params_dict)
def fill_nan_with_specfic_value(self,
                                df,
                                df_features,
                                feature_name,
                                replace_value,
                                _add_to_que=True):
    """
    Fill the missing values of a single feature with a fixed value.

    Args:
        df: pd.Dataframe
            Pandas Dataframe; the feature's column is updated in place.

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        replace_value:
            Value written wherever the feature is nan. Objects exposing
            ``.item()`` (e.g. numpy scalars) are unwrapped first.

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: if ``feature_name`` is not listed in
            ``df_features.all_features()``.
    """
    # presumably raises when the column is missing from df; verify against
    # the helper's definition
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    # Unwrap single-value containers so a plain Python value is stored
    try:
        replace_value = replace_value.item()
    except AttributeError:
        pass

    if not self.__test_cleaning_methods:
        # Assign the filled column back: an inplace call on ``df[col]`` only
        # changes a temporary object when pandas Copy-on-Write is active.
        df[feature_name] = df[feature_name].fillna(replace_value)

        # NOTE(review): registration is kept inside the non-test branch so a
        # dry run records nothing -- confirm this matches the intended layout.
        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.fill_nan_with_specfic_value)

            print("Replace nan with {0} on feature: {1}".format(replace_value,
                                                                feature_name))

            self._DataPipelineSegment__add_function_to_que(
                "fill_nan_with_specfic_value", parameters, params_dict)
def apply_value_representation(self, df, df_features, _add_to_que=True):
    """
    Translate features into most understandable/best representation

    Every feature returned by
    ``df_features.get_feature_value_representation()`` has its values
    replaced according to that feature's mapping.

    Args:
        df: pd.Dataframe
            Pandas dataframe; its columns are updated in place.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.

    Raises:
        KeyError: if a feature with a value representation is not a column
            of ``df``.
    """
    feature_value_represention = df_features.get_feature_value_representation()

    # Replace values by each corresponding feature value related dict
    for feature_name in feature_value_represention:
        if feature_name not in df.columns:
            raise KeyError(
                f"Dataframe doesn't have feature name '{feature_name}'.")

        # Assign back instead of an inplace call on ``df[col]``, which only
        # changes a temporary object when pandas Copy-on-Write is active.
        df[feature_name] = df[feature_name].replace(
            feature_value_represention[feature_name])

    if _add_to_que:
        params_dict = locals()
        # Register against this method's own signature (not decode_data's,
        # which carries an extra argument this method does not have).
        parameters = get_parameters(self.apply_value_representation)
        self._DataPipelineSegment__add_function_to_que(
            "apply_value_representation", parameters, params_dict)
def decode_data(self,
                df,
                df_features,
                apply_value_representation=True,
                _add_to_que=True):
    """
    Decode the data into non-numerical values for more descriptive analysis.

    Features missing from ``df`` are skipped in both stages.

    Args:
        df: pd.Dataframe
            Pandas dataframe; its columns are updated in place.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        apply_value_representation: bool
            Translate features into most understandable/best representation

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.
    """
    # Decode data from categorical values to proper strings. Columns that
    # already have object dtype are left untouched.
    decoder_dict = df_features.get_label_decoder()
    for feature_name in decoder_dict:
        if feature_name not in df.columns:
            continue

        if df[feature_name].dtype != "O":
            # Assign back instead of an inplace call on ``df[col]``, which
            # only changes a temporary object under pandas Copy-on-Write.
            df[feature_name] = df[feature_name].replace(
                decoder_dict[feature_name])

    # Apply value representation to feature values
    if apply_value_representation:
        feature_value_represention = \
            df_features.get_feature_value_representation()

        # Replace values by each corresponding feature value related dict;
        # only object-dtype columns are translated.
        for feature_name in feature_value_represention:
            if feature_name not in df.columns:
                continue

            if df[feature_name].dtype == "O":
                df[feature_name] = df[feature_name].replace(
                    feature_value_represention[feature_name])
                df_features.set_feature_to_string(feature_name)

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.decode_data)
        self._DataPipelineSegment__add_function_to_que(
            "decode_data", parameters, params_dict)
def remove_nans(self, df, df_features, feature_name, _add_to_que=True):
    """
    Remove rows of data based on the given feature.

    Rows where ``feature_name`` is nan are dropped from ``df`` and the index
    is reset afterwards.

    Args:
        df: pd.Dataframe
            Pandas Dataframe; modified in place.

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: if ``feature_name`` is not listed in
            ``df_features.all_features()``.
    """
    # presumably raises when the column is missing from df; verify against
    # the helper's definition
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    if not self.__test_cleaning_methods:
        print(f"Remove data from rows where the feature {feature_name} is equal to nan")

        # Drop on the dataframe itself: dropping on ``df[feature_name]`` only
        # shortens a detached Series and leaves every row of df in place.
        df.dropna(subset=[feature_name], inplace=True)
        df.reset_index(drop=True, inplace=True)

        # NOTE(review): the column is still present in df after this call,
        # yet it is removed from df_features -- looks like a carry-over from
        # drop_feature; confirm whether this is intended.
        df_features.remove_feature(feature_name)

        # NOTE(review): registration is kept inside the non-test branch so a
        # dry run records nothing -- confirm this matches the intended layout.
        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.remove_nans)
            self._DataPipelineSegment__add_function_to_que(
                "remove_nans", parameters, params_dict)
def drop_feature(self, df, df_features, feature_name, _add_to_que=True):
    """
    Remove one feature's column from the dataframe.

    The column is dropped from ``df``, the index is reset, and the feature
    is removed from ``df_features``. Nothing is changed while the cleaner is
    in its test mode.

    Args:
        df: pd.Dataframe
            Pandas Dataframe; modified in place.

        df_features: DataFrameType from eflow
            Organizes feature types into groups.

        feature_name: string
            Name of the feature in the datatframe

        _add_to_que: bool
            Pushes the function to pipeline segment parent if set to 'True'.

    Raises:
        KeyError: if ``feature_name`` is not listed in
            ``df_features.all_features()``.
    """
    check_if_feature_exists(df, feature_name)

    if feature_name not in df_features.all_features():
        raise KeyError(
            f"The feature \'{feature_name}\' was not found in the dataframe!"
            + " Please select a valid feature from the df_features.")

    # Validation only while testing cleaning methods.
    # NOTE(review): assumes the pipeline registration below belongs to the
    # non-test branch -- confirm this matches the intended layout.
    if self.__test_cleaning_methods:
        return

    print("Droping Feature: ", feature_name)
    df.drop(columns=feature_name, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df_features.remove_feature(feature_name)

    if not _add_to_que:
        return

    # locals() holds exactly the call arguments at this point
    self._DataPipelineSegment__add_function_to_que(
        "drop_feature",
        get_parameters(self.drop_feature),
        locals())
def remove_features(self, df, df_features, feature_names, _add_to_que=True):
    """
    Drop a set of unwanted features from the dataframe and from
    df_features, then record the call on the pipeline segment when
    _add_to_que is True.

    Features that trigger a KeyError along the way are skipped silently.

    Args:
        df:
            Pandas Dataframe to update.

        df_features:
            DataFrameTypes object to update.

        feature_names:
            Features to remove; a single name may be given as a string.

        _add_to_que:
            Pushes the function to pipeline segment parent if set to 'True'.
    """
    # Accept a lone feature name as well as a collection of names
    feature_names = [feature_names] if isinstance(feature_names, str) \
        else feature_names

    for feature_n in feature_names:
        try:
            # Bookkeeping is updated first, so a feature known only to
            # df_features is still forgotten even if df lacks the column.
            if feature_n in df_features.all_features():
                df_features.remove_feature(feature_n)

            check_if_feature_exists(df, feature_n)
            df.drop(columns=[feature_n], inplace=True)
        except KeyError:
            # Best effort: move on to the next feature
            continue

    if not _add_to_que:
        return

    self._DataPipelineSegment__add_function_to_que(
        "remove_features",
        get_parameters(self.remove_features),
        locals())
def apply_binning(self,
                  df,
                  df_features,
                  binable_features=[],
                  _add_to_que=True):
    """
    Bin numeric features into labelled categories.

    For every selected feature that has a binning definition in
    ``df_features`` (a dict with "bins" and "labels"), the column is coerced
    to numeric, cut into those bins, and the feature is marked categorical.
    Features without a binning definition are left untouched.

    Args:
        df: pd.Dataframe
            Pandas dataframe; its columns are updated in place.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        binable_features: collection of strings
            Feature names to bin. A single name may be given as a string.
            When empty (the default) every column of ``df`` is considered.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.
    """
    # Iterate through all features if no features are selected. Emptiness is
    # tested with len() because truth-testing a pandas Index / numpy array
    # raises ValueError.
    if binable_features is None:
        nothing_selected = True
    else:
        try:
            nothing_selected = len(binable_features) == 0
        except TypeError:
            # Unsized iterables are used as given
            nothing_selected = False

    if nothing_selected:
        binable_features = df.columns
    elif isinstance(binable_features, str):
        binable_features = [binable_features]

    # Apply binning
    for feature_name in binable_features:
        bin_labels_dict = df_features.get_feature_binning(feature_name)
        if not bin_labels_dict:
            continue

        # Convert to category data; values that cannot be parsed as numbers
        # become nan and therefore fall outside every bin.
        df[feature_name] = pd.to_numeric(df[feature_name].dropna(),
                                         errors='coerce')
        df[feature_name] = pd.cut(df[feature_name],
                                  bins=bin_labels_dict["bins"],
                                  labels=bin_labels_dict["labels"])

        # Feature set to categorical
        df_features.set_feature_to_categorical(feature_name)

    if _add_to_que:
        params_dict = locals()
        # Register against this method's own signature (not decode_data's)
        parameters = get_parameters(self.apply_binning)
        self._DataPipelineSegment__add_function_to_que(
            "apply_binning", parameters, params_dict)
def encode_data(self,
                df,
                df_features,
                apply_value_representation=True,
                _add_to_que=True):
    """
    Encode the data into numerical values for machine learning processes.

    Features missing from ``df`` and columns that do not have object dtype
    are skipped in both stages.

    Args:
        df: pd.Dataframe
            Pandas dataframe; its columns are updated in place.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        apply_value_representation: bool
            Undo the value representation first, i.e. map each represented
            value back to the value it stands for.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.
    """
    # Undo the value representation of feature values
    if apply_value_representation:
        # Inverse dict: representation -> original value, per feature
        feature_value_represention = {
            name: {reprs: val for val, reprs in value_map.items()}
            for name, value_map in
            df_features.get_feature_value_representation().items()
        }

        for feature_name in feature_value_represention:
            if feature_name not in df.columns:
                continue

            if df[feature_name].dtype == "O":
                # Assign back instead of an inplace call on ``df[col]``,
                # which only changes a temporary object under pandas
                # Copy-on-Write.
                df[feature_name] = df[feature_name].replace(
                    feature_value_represention[feature_name])

    # Encode data from strings to their categorical values.
    encoder_dict = df_features.get_label_encoder()
    for feature_name in encoder_dict:
        if feature_name not in df.columns:
            continue

        if df[feature_name].dtype == "O":
            df[feature_name] = df[feature_name].replace(
                encoder_dict[feature_name])
            df_features.set_feature_to_categorical(feature_name)

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.encode_data)
        self._DataPipelineSegment__add_function_to_que(
            "encode_data", parameters, params_dict)
def revert_dummies(self,
                   df,
                   df_features,
                   qualitative_features=[],
                   _add_to_que=True):
    """
    Convert dummies features back to the original feature.

    For each feature the dummy columns registered in ``df_features`` are
    collapsed into one column named after the feature. Each row receives the
    dummy column name with the "<feature>_" prefix removed for the dummy
    that equals 1; rows with no dummy set stay nan. The dummy columns are
    then dropped and the feature is registered again as categorical (values
    parse as numbers) or string.

    Args:
        df: pd.Dataframe
            Pandas dataframe; modified in place (index is reset first).

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        qualitative_features: collection of strings
            Feature names to convert the dummy features into original feature
            data. A single name may be given as a string.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.
    """
    df.reset_index(inplace=True, drop=True)

    # Wrap a lone name so it is not iterated character by character
    if isinstance(qualitative_features, str):
        qualitative_features = [qualitative_features]

    for feature_name in qualitative_features:
        dummies_columns = list(
            df_features.get_dummy_encoded_features()[feature_name])

        # Object dtype from the start: the column holds strings and nan
        restored = pd.Series(np.nan, index=df.index, dtype=object)
        prefix_length = len(feature_name) + 1
        for dummy_column in dummies_columns:
            # Later dummy columns win if a row has several dummies set
            restored[df[dummy_column] == 1] = dummy_column[prefix_length:]

        df[feature_name] = restored

        # Remove dummy features
        df.drop(columns=dummies_columns, inplace=True)

        # Remove dummy encoded relationship
        df_features.remove_feature_from_dummy_encoded(feature_name)

        # Add feature back to original set in df_features
        try:
            pd.to_numeric(df[feature_name].dropna())
        except ValueError:
            df_features.add_new_string_feature(feature_name)
        else:
            df_features.add_new_categorical_feature(feature_name)

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.revert_dummies)
        self._DataPipelineSegment__add_function_to_que(
            "revert_dummies", parameters, params_dict)
def make_dummies(self,
                 df,
                 df_features,
                 qualitative_features=[],
                 _feature_values_dict=None,
                 _add_to_que=True):
    """
    Create dummies features of based on qualtative feature data and removes
    the original feature.

    One boolean column named "<feature>_<value>" is added per distinct
    value of the feature; nan rows are False in every dummy column.

    Note:
        _feature_values_dict does not need to be init. Used for backend
        resource.

    Args:
        df: pd.Dataframe
            Pandas dataframe; modified in place.

        df_features: DataFrameTypes from eflow
            DataFrameTypes object.

        qualitative_features: collection of strings
            Feature names to convert the feature data into dummy features.
            A single name may be given as a string.

        _feature_values_dict: dict
            Maps a feature name to the list of values to create dummies
            for. Features missing from it get the sorted non-nan unique
            values of their column.

        _add_to_que: bool
            Hidden variable to determine if the function should be pushed
            to the pipeline segment.

    Raises:
        UnsatisfiedRequirments: if a requested feature is neither a string
            nor a categorical feature of ``df_features``.
    """
    # Convert to the correct types: wrap a lone name so it is not iterated
    # character by character
    if isinstance(qualitative_features, str):
        qualitative_features = [qualitative_features]

    if not _feature_values_dict:
        _feature_values_dict = dict()

    # NOTE(review): this changes a process-wide pandas option and is never
    # restored; kept because callers may rely on it -- confirm it is wanted.
    pd.set_option('mode.chained_assignment', None)

    for cat_feature in qualitative_features:

        if cat_feature not in df_features.string_features(
        ) | df_features.categorical_features():
            raise UnsatisfiedRequirments(
                f"No feature named '{cat_feature}' in categorical or string features."
            )

        if cat_feature not in _feature_values_dict:
            # sorted() on a plain list works whatever array type unique()
            # returns (not every pandas array offers an in-place sort)
            _feature_values_dict[cat_feature] = sorted(
                df[cat_feature].dropna().unique().tolist())

        dummy_features = []
        for feature_value in _feature_values_dict[cat_feature]:
            new_feature = cat_feature + f"_{feature_value}"
            # The comparison already yields a fresh boolean Series
            df[new_feature] = df[cat_feature] == feature_value
            dummy_features.append(new_feature)

        # Remove original feature and record the dummy relationship
        df.drop(columns=[cat_feature], inplace=True)
        df_features.remove_feature(cat_feature)
        df_features.set_feature_to_dummy_encoded(cat_feature,
                                                 dummy_features)

    if _add_to_que:
        params_dict = locals()
        parameters = get_parameters(self.make_dummies)
        self._DataPipelineSegment__add_function_to_que(
            "make_dummies", parameters, params_dict)