Beispiel #1
0
    def make_values_bool(self, df, df_features, _add_to_que=True):
        """Convert string-represented booleans into numeric bool values (0/1).

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.
        """
        for bool_feature in df_features.bool_features():

            # Only object-dtype columns can hold string bool values.
            if df[bool_feature].dtype != "O":
                continue

            bool_check, true_val, false_val = \
                self.__bool_string_values_check(
                    df[bool_feature].dropna().unique())

            if not bool_check:
                continue

            # Swap the detected string pair for numeric bool values.
            df[bool_feature].replace({true_val: 1,
                                      false_val: 0},
                                     inplace=True)

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.make_values_bool)
            self._DataPipelineSegment__add_function_to_que(
                "make_values_bool", parameters, params_dict)
Beispiel #2
0
    def fill_nan_with_specfic_value(self,
                                    df,
                                    df_features,
                                    feature_name,
                                    replace_value,
                                    _add_to_que=True):
        """

            Fill nan values of the given feature with a specific value.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            feature_name: string
                Name of the feature in the dataframe.

            replace_value: object
                Value used to fill the feature's nan entries.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.

        Raises:
            KeyError:
                If the feature isn't tracked by df_features.
        """
        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        # Unwrap numpy scalars so the plain Python value is used/serialized.
        try:
            replace_value = replace_value.item()
        except AttributeError:
            pass

        if not self.__test_cleaning_methods:
            df[feature_name].fillna(replace_value,
                                    inplace=True)

            # NOTE(review): the que-push only happens when not in cleaning
            # test mode — confirm this nesting is intended.
            if _add_to_que:
                params_dict = locals()
                parameters = get_parameters(self.fill_nan_with_specfic_value)

                print("Replace nan with {0} on feature: {1}".format(replace_value,
                                                                    feature_name))

                self._DataPipelineSegment__add_function_to_que("fill_nan_with_specfic_value",
                                                               parameters,
                                                               params_dict)
Beispiel #3
0
    def apply_value_representation(self, df, df_features, _add_to_que=True):
        """

            Translate features into most understandable/best representation

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.

        Raises:
            KeyError:
                If a feature named by the representation dict is missing
                from the dataframe.
        """
        feature_value_represention = df_features.get_feature_value_representation(
        )

        # Replace values by each corresponding feature value related dict
        for feature_name in feature_value_represention:

            if feature_name not in df.columns:
                raise KeyError(
                    f"Dataframe doesn't have feature name '{feature_name}'.")

            df[feature_name].replace(feature_value_represention[feature_name],
                                     inplace=True)

        if _add_to_que:
            params_dict = locals()
            # BUG FIX: previously inspected self.decode_data, which recorded
            # the wrong parameter list for this pipeline step.
            parameters = get_parameters(self.apply_value_representation)
            self._DataPipelineSegment__add_function_to_que(
                "apply_value_representation", parameters, params_dict)
Beispiel #4
0
    def decode_data(self,
                    df,
                    df_features,
                    apply_value_representation=True,
                    _add_to_que=True):
        """

            Decode the data into non-numerical values for more descriptive analysis.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            apply_value_representation: bool
                Translate features into most understandable/best representation/

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.
        """
        # Decode data from categorical values to proper strings.
        decoder_dict = df_features.get_label_decoder()
        for feature_name in decoder_dict.keys():

            if feature_name not in df.columns:
                continue

            if df[feature_name].dtype != "O":
                df[feature_name].replace(decoder_dict[feature_name],
                                         inplace=True)

        # Apply value representation to feature values
        if apply_value_representation:
            feature_value_represention = df_features.get_feature_value_representation(
            )
            # Replace values by each corresponding feature value related dict
            for feature_name in feature_value_represention.keys():
                if feature_name not in df.columns:
                    continue

                if df[feature_name].dtype == "O":
                    df[feature_name].replace(
                        feature_value_represention[feature_name], inplace=True)

                df_features.set_feature_to_string(feature_name)

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.decode_data)
            self._DataPipelineSegment__add_function_to_que(
                "decode_data", parameters, params_dict)
Beispiel #5
0
    def remove_nans(self,
                    df,
                    df_features,
                    feature_name,
                    _add_to_que=True):
        """

            Remove rows of data based on the given feature.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the dataframe.

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.

        Raises:
            KeyError:
                If the feature isn't tracked by df_features.
        """
        check_if_feature_exists(df,
                                feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if not self.__test_cleaning_methods:

            print(f"Remove data from rows where the feature {feature_name} is equal to nan")

            # BUG FIX: 'df[feature_name].dropna(inplace=True)' only mutated a
            # temporary Series, so the dataframe's rows were never removed.
            # Dropping on the frame itself (restricted to this feature's nans)
            # actually deletes the rows.
            df.dropna(subset=[feature_name],
                      inplace=True)
            df.reset_index(drop=True,
                           inplace=True)

            # NOTE(review): removing the feature itself after dropping its nan
            # rows mirrors drop_feature and may be a copy/paste leftover —
            # confirm this is intended before relying on it.
            df_features.remove_feature(feature_name)

            if _add_to_que:

                # Remove any unwanted arguments in params_dict
                params_dict = locals()
                parameters = get_parameters(self.remove_nans)

                self._DataPipelineSegment__add_function_to_que("remove_nans",
                                                               parameters,
                                                               params_dict)
Beispiel #6
0
    def drop_feature(self,
                     df,
                     df_features,
                     feature_name,
                     _add_to_que=True):
        """Drop a single feature from the dataframe and df_features.

        Args:
            df: pd.Dataframe
                Pandas Dataframe

            df_features: DataFrameType from eflow
                Organizes feature types into groups.

            feature_name: string
                Name of the feature in the datatframe

            _add_to_que: bool
                Pushes the function to pipeline segment parent if set to 'True'.
        """
        check_if_feature_exists(df, feature_name)

        if feature_name not in df_features.all_features():
            raise KeyError(
                f"The feature \'{feature_name}\' was not found in the dataframe!"
                + " Please select a valid feature from the df_features.")

        if not self.__test_cleaning_methods:
            print("Droping Feature: ", feature_name)

            # Drop the column, re-index the rows, and forget the feature.
            df.drop(columns=feature_name, inplace=True)
            df.reset_index(drop=True, inplace=True)
            df_features.remove_feature(feature_name)

            if _add_to_que:
                params_dict = locals()
                parameters = get_parameters(self.drop_feature)
                self._DataPipelineSegment__add_function_to_que("drop_feature",
                                                               parameters,
                                                               params_dict)
Beispiel #7
0
    def remove_features(self,
                        df,
                        df_features,
                        feature_names,
                        _add_to_que=True):
        """Remove the given features from both the dataframe and df_features.

        Saves the operation to the pipeline segment structure when
        _add_to_que is set to True.

        Args:
            df:
                Pandas Dataframe to update.

            df_features:
                DataFrameTypes object to update.

            feature_names:
                Features to remove

            _add_to_que:
                Pushes the function to pipeline segment parent if set to 'True'.
        """
        # Allow a single feature name to be passed as a bare string.
        if isinstance(feature_names, str):
            feature_names = [feature_names]

        for feature_n in feature_names:
            try:
                # Best effort: a feature may live in only one of the two.
                if feature_n in df_features.all_features():
                    df_features.remove_feature(feature_n)

                check_if_feature_exists(df, feature_n)
                df.drop(columns=[feature_n],
                        inplace=True)
            except KeyError:
                # Names that aren't present anywhere are simply skipped.
                pass

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.remove_features)
            self._DataPipelineSegment__add_function_to_que(
                "remove_features", parameters, params_dict)
Beispiel #8
0
    def apply_binning(self,
                      df,
                      df_features,
                      binable_features=[],
                      _add_to_que=True):
        """

            Bin numeric feature values into labeled categorical buckets.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            binable_features: collection of strings
                Feature names to attempt to bin; defaults to every column of
                the dataframe when empty.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.
        """
        # Iterate through all features if no features are selected
        if not binable_features:
            binable_features = df.columns

        # Apply binning
        for feature_name in binable_features:
            bin_labels_dict = df_features.get_feature_binning(feature_name)
            if bin_labels_dict:
                # Coerce to numeric first so pd.cut can bucket the values.
                df[feature_name] = pd.to_numeric(df[feature_name].dropna(),
                                                 errors='coerce')
                df[feature_name] = pd.cut(df[feature_name],
                                          bins=bin_labels_dict["bins"],
                                          labels=bin_labels_dict["labels"])

                # Feature set to categorical
                df_features.set_feature_to_categorical(feature_name)

        if _add_to_que:
            params_dict = locals()
            # BUG FIX: previously inspected self.decode_data, recording the
            # wrong parameter list for this pipeline step. The dead
            # params_dict-pruning preamble that used to sit at the top of
            # this method was removed — it was discarded by this locals()
            # call anyway.
            parameters = get_parameters(self.apply_binning)
            self._DataPipelineSegment__add_function_to_que(
                "apply_binning", parameters, params_dict)
Beispiel #9
0
    def encode_data(self,
                    df,
                    df_features,
                    apply_value_representation=True,
                    _add_to_que=True):
        """

            Encode the data into numerical values for machine learning processes.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            apply_value_representation: bool
                Translate features into most understandable/best representation/

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.
        """

        # Apply value representation to feature values
        if apply_value_representation:
            feature_value_represention = df_features.get_feature_value_representation(
            )

            # Inverse dict
            tmp_dict = copy.deepcopy(feature_value_represention)
            for feature_name in feature_value_represention.keys():
                tmp_dict[feature_name] = dict()
                for val, reprs in feature_value_represention[
                        feature_name].items():
                    tmp_dict[feature_name][reprs] = val

            feature_value_represention = tmp_dict

            for feature_name in feature_value_represention.keys():
                if feature_name not in df.columns:
                    continue

                if df[feature_name].dtype == "O":
                    df[feature_name].replace(
                        feature_value_represention[feature_name], inplace=True)

        # Decode data from categorical values to proper strings.
        encoder_dict = df_features.get_label_encoder()
        for feature_name in encoder_dict.keys():

            if feature_name not in df.columns:
                continue

            if df[feature_name].dtype == "O":
                df[feature_name].replace(encoder_dict[feature_name],
                                         inplace=True)

                df_features.set_feature_to_categorical(feature_name)

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.encode_data)
            self._DataPipelineSegment__add_function_to_que(
                "encode_data", parameters, params_dict)
Beispiel #10
0
    def revert_dummies(self,
                       df,
                       df_features,
                       qualitative_features=[],
                       _add_to_que=True):
        """

            Convert dummies features back to the original feature.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            qualitative_features: collection of strings
                Feature names to convert the dummy features into original feature
                data.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.
        """

        df.reset_index(inplace=True, drop=True)

        # BUG FIX: the string case previously assigned the wrapped list to
        # 'feature_name' instead of 'qualitative_features', so a bare string
        # argument was iterated character by character.
        if isinstance(qualitative_features, str):
            qualitative_features = [qualitative_features]

        for feature_name in qualitative_features:
            dummies_df = df[df_features.get_dummy_encoded_features()
                            [feature_name]]
            dummies_columns = dummies_df.columns.to_list()

            # Rows where a dummy is set; level_1 holds the dummy column name.
            tmp_df = dummies_df[dummies_df == 1].stack().reset_index()
            del dummies_df

            # BUG FIX: write through df.loc instead of the old chained
            # df[feature].iloc[...] assignment, which mutated a temporary
            # Series and was not guaranteed to reach the dataframe. The
            # column is created as object dtype so string values can be
            # stored without an upcast.
            df[feature_name] = np.full([len(df)], np.nan, dtype=object)
            df.loc[tmp_df["level_0"],
                   feature_name] = tmp_df["level_1"].values.tolist()

            # Remove dummy features
            df.drop(columns=dummies_columns, inplace=True)

            # Strip the "<feature_name>_" prefix to recover original values.
            df[feature_name] = df[feature_name].str[len(feature_name) + 1:]

            # Remove dummy encoded relationship
            df_features.remove_feature_from_dummy_encoded(feature_name)

            # Add feature back to original set in df_features
            try:
                pd.to_numeric(df[feature_name].dropna())
                df_features.add_new_categorical_feature(feature_name)

            except ValueError:
                df_features.add_new_string_feature(feature_name)

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.revert_dummies)
            self._DataPipelineSegment__add_function_to_que(
                "revert_dummies", parameters, params_dict)
Beispiel #11
0
    def make_dummies(self,
                     df,
                     df_features,
                     qualitative_features=[],
                     _feature_values_dict=None,
                     _add_to_que=True):
        """

            Create dummies features of based on qualtative feature data and removes
            the original feature.

            Note
                _feature_values_dict does not need to be init. Used for backend
                resource.

        Args:
            df: pd.Dataframe
                Pandas dataframe.

            df_features: DataFrameTypes from eflow
                DataFrameTypes object.

            qualitative_features: collection of strings
                Feature names to convert the feature data into dummy features.

            _feature_values_dict: dict or None
                Optional cache of each feature's sorted unique values.

            _add_to_que: bool
                Hidden variable to determine if the function should be pushed
                to the pipeline segment.

        Raises:
            UnsatisfiedRequirments:
                If a feature isn't in df_features' string or categorical sets.
        """
        # BUG FIX: a typo ('qualtative_features') assigned the wrapped list
        # to a brand-new variable, so a bare string argument was iterated
        # character by character in the loop below.
        if isinstance(qualitative_features, str):
            qualitative_features = [qualitative_features]

        if not _feature_values_dict:
            _feature_values_dict = dict()

        pd.set_option('mode.chained_assignment', None)

        for cat_feature in qualitative_features:

            if cat_feature not in df_features.string_features(
            ) | df_features.categorical_features():
                raise UnsatisfiedRequirments(
                    f"No feature named '{cat_feature}' in categorical or string features."
                )

            # Cache the sorted unique (non-nan) values for this feature.
            if cat_feature not in _feature_values_dict:
                _feature_values_dict[cat_feature] = df[cat_feature].dropna(
                ).unique()
                _feature_values_dict[cat_feature].sort()
                _feature_values_dict[cat_feature] = _feature_values_dict[
                    cat_feature].tolist()

            # One boolean dummy column per observed value.
            dummy_features = []
            for feature_value in _feature_values_dict[cat_feature]:
                new_feature = cat_feature + f"_{feature_value}"
                bool_array = df[cat_feature] == feature_value
                df[new_feature] = copy.deepcopy(bool_array)
                dummy_features.append(new_feature)

            # Drop the original feature and record the dummy relationship.
            df.drop(columns=[cat_feature], inplace=True)
            df_features.remove_feature(cat_feature)
            df_features.set_feature_to_dummy_encoded(cat_feature,
                                                     dummy_features)

        if _add_to_que:
            params_dict = locals()
            parameters = get_parameters(self.make_dummies)
            self._DataPipelineSegment__add_function_to_que(
                "make_dummies", parameters, params_dict)