Example #1
File: clean.py Project: nperera0/aethos
    def replace_missing_constant(
        self, *list_args, list_of_cols=[], constant=0, col_mapping=None
    ):
        """
        Replaces missing values in every numeric column with a constant.

        If no columns are supplied, missing values will be replaced with the constant in every numeric column.

        If a list of columns is provided, use the list; otherwise use the arguments.

        This function exists in `clean/numeric.py` as `replace_missing_constant`.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        constant : int or float, optional
            Numeric value to replace all missing values with, by default 0

        col_mapping : dict, optional
            Dictionary mapping {'ColumnName': `constant`}, by default None
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_constant(col_mapping={'a': 1, 'b': 2, 'c': 3})
        >>> data.replace_missing_constant('col1', 'col2', constant=2)
        >>> data.replace_missing_constant(['col1', 'col2'], constant=3)
        """

        report_info = technique_reason_repo["clean"]["numeric"]["constant"]

        if col_mapping:
            col_to_constant = col_mapping
        else:
            # If a list of columns is provided, use the list; otherwise use the arguments.
            col_to_constant = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = num.replace_missing_constant(
            x_train=self.x_train,
            x_test=self.x_test,
            col_to_constant=col_to_constant,
            constant=constant,
        )

        if self.report is not None:
            if not col_to_constant:
                self.report.report_technique(report_info, self.x_train.columns)
            else:
                self.report.report_technique(report_info, list(col_to_constant))

        return self.copy()
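
For reference, the core of this technique is a plain constant fill; a minimal pandas sketch of the idea (not the aethos internals):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 2.0, np.nan]})

# Fill the listed columns with a single constant...
df[["a", "b"]] = df[["a", "b"]].fillna(0)

# ...or fill per-column constants, mirroring the `col_mapping` argument.
df2 = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, 2.0]}).fillna({"a": -1, "b": -2})
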
Example #2
    def stem_nltk(self,
                  *list_args,
                  list_of_cols=[],
                  stemmer="porter",
                  new_col_name="_stemmed"):
        """
        Transforms each word in the text to its stem, base or root form.
        For example:
            dogs --> dog
            churches --> church
            abaci --> abacus
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        stemmer : str, optional
            Type of NLTK stemmer to use, by default porter

            Current stemming implementations:
                - porter
                - snowball

            For more information please refer to the NLTK stemming api https://www.nltk.org/api/nltk.stem.html

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_stemmed`
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.stem_nltk('col1')
        >>> data.stem_nltk(['col1', 'col2'], stemmer='snowball')
        """

        report_info = technique_reason_repo["preprocess"]["text"]["stem"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test = text.nltk_stem(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            stemmer=stemmer,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
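
The stemming itself comes from NLTK; a minimal standalone sketch of the two supported stemmers:

from nltk.stem import PorterStemmer, SnowballStemmer

words = ["dogs", "churches", "running"]
print([PorterStemmer().stem(w) for w in words])             # ['dog', 'church', 'run']
print([SnowballStemmer("english").stem(w) for w in words])  # ['dog', 'church', 'run']
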
Example #3
File: clean.py Project: nperera0/aethos
    def replace_missing_indicator(
        self,
        *list_args,
        list_of_cols=[],
        missing_indicator=1,
        valid_indicator=0,
        keep_col=True
    ):
        """
        Adds a new column describing whether data is missing for each record in a column.

        This is useful if the missing data has meaning, i.e. it is not missing at random.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.
            
        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        missing_indicator : int, optional
            Value to indicate missing data, by default 1

        valid_indicator : int, optional
            Value to indicate non-missing data, by default 0

        keep_col : bool, optional
            True to keep column, False to replace it, by default True
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_indicator('col1', 'col2')
        >>> data.replace_missing_indicator(['col1', 'col2'])
        >>> data.replace_missing_indicator(['col1', 'col2'], missing_indicator='missing', valid_indicator='not missing', keep_col=False)
        """

        report_info = technique_reason_repo["clean"]["general"]["indicator"]
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = util.replace_missing_indicator(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            missing_indicator=missing_indicator,
            valid_indicator=valid_indicator,
            keep_col=keep_col,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
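
The indicator itself is a one-liner over a missing-value mask; a minimal pandas sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({"col1": [1.0, np.nan, 3.0]})

# 1 where the value is missing, 0 where it is valid.
df["col1_missing"] = np.where(df["col1"].isna(), 1, 0)
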
Example #4
    def remove_stopwords_nltk(self,
                              *list_args,
                              list_of_cols=[],
                              custom_stopwords=[],
                              new_col_name="_rem_stop"):
        """
        Removes stopwords using the NLTK English stopwords list.

        A list of custom words can be provided as well, usually for domain specific words.

        Stop words are generally the most common words in a language.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        custom_stopwords : list, optional
            Custom list of words to also drop with the stop words, must be LOWERCASE, by default []

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_rem_stop`
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.remove_stopwords_nltk('col1')
        >>> data.remove_stopwords_nltk(['col1', 'col2'])
        """

        report_info = technique_reason_repo["preprocess"]["text"][
            "remove_stopwords"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.nltk_remove_stopwords(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            custom_stopwords=custom_stopwords,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
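
Underneath, this is a filter over NLTK's English stopword list plus any custom words; a standalone sketch (the stopword corpus is downloaded on first use):

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)

stop_words = set(stopwords.words("english")) | {"corp"}  # custom word, lowercase
text = "this is a report from the corp about sales"
print(" ".join(w for w in text.split() if w not in stop_words))
# 'report sales'
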
Example #5
    def split_words_nltk(self,
                         *list_args,
                         list_of_cols=[],
                         regexp="",
                         new_col_name="_tokenized"):
        """
        Splits text into words using the NLTK punkt tokenizer by default.

        The default splits on spaces and punctuation, but if a regex expression is provided, it will be used instead.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        regexp : str, optional
            Regex expression used to define what a word is.

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_tokenized`
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.split_words_nltk('col1')
        >>> data.split_words_nltk(['col1', 'col2'])
        """

        report_info = technique_reason_repo["preprocess"]["text"][
            "split_words"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.nltk_word_tokenizer(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            regexp=regexp,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
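
The two modes map onto NLTK's punkt tokenizer and RegexpTokenizer; a minimal sketch (the punkt model is downloaded on first use):

import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize

nltk.download("punkt", quiet=True)

sentence = "Don't split me, please."
print(word_tokenize(sentence))                     # ['Do', "n't", 'split', 'me', ',', 'please', '.']
print(RegexpTokenizer(r"\w+").tokenize(sentence))  # ['Don', 't', 'split', 'me', 'please']
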
Example #6
File: feature.py Project: nperera0/aethos
    def polynomial_features(self, *list_args, list_of_cols=[], **poly_kwargs):
        """
        Generate polynomial and interaction features.

        Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree.
        
        For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        degree : int
            Degree of the polynomial features, by default 2

        interaction_only : bool, optional
            If True, only interaction features are produced: features that are products of at most `degree` distinct input features (so not x[1] ** 2, x[0] * x[2] ** 3, etc.), by default False
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.polynomial_features('col1', 'col2', 'col3')
        """

        report_info = technique_reason_repo["feature"]["numeric"]["poly"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = num.polynomial_features(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            **poly_kwargs,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
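
The [1, a, b, a^2, ab, b^2] example above can be reproduced directly with scikit-learn's PolynomialFeatures, presumably what backs this method:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2, 3]])  # one sample of the form [a, b]
print(PolynomialFeatures(degree=2).fit_transform(X))
# [[1. 2. 3. 4. 6. 9.]]  ->  [1, a, b, a^2, ab, b^2]
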
Example #7
File: feature.py Project: nperera0/aethos
    def postag_spacy_detailed(self,
                              *list_args,
                              list_of_cols=[],
                              new_col_name="_postagged"):
        """
        Tag documents with their respective "Part of Speech" tag with the spaCy NLP engine and the Penn Treebank PoS tags.
        These tags classify a word as a noun, verb, adjective, etc. A full list and their meaning can be found here:
        https://spacy.io/api/annotation#pos-tagging 

        If a list of columns is provided use the list, otherwise use arguments.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_postagged`

        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.postag_spacy_detailed('col1', 'col2', 'col3')
        """

        report_info = technique_reason_repo["feature"]["text"]["spacy_postag"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.spacy_feature_postag(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            new_col_name=new_col_name,
            method='d',
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
Example #8
File: clean.py Project: nperera0/aethos
    def replace_missing_median(self, *list_args, list_of_cols=[]):
        """
        Replaces missing values in every numeric column with the median of that column.

        If no columns are supplied, missing values will be replaced with the median in every numeric column.

        Median: Middle value of a list of numbers. Equal to the mean if the data follows a normal distribution. Not affected much by anomalies.

        If a list of columns is provided, use the list; otherwise use the arguments.

        This function exists in `clean/numeric.py` as `replace_missing_mean_median_mode`.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            Specific columns to apply this technique to, by default []
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_median('col1', 'col2')
        >>> data.replace_missing_median(['col1', 'col2'])
        """

        report_info = technique_reason_repo["clean"]["numeric"]["median"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = num.replace_missing_mean_median_mode(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            strategy="median",
        )

        if self.report is not None:
            if list_of_cols:
                self.report.report_technique(report_info, list_of_cols)
            else:
                list_of_cols = _numeric_input_conditions(list_of_cols, self.x_train)
                self.report.report_technique(report_info, list_of_cols)

        return self.copy()
Example #9
File: feature.py Project: nperera0/aethos
    def postag_nltk(self,
                    *list_args,
                    list_of_cols=[],
                    new_col_name="_postagged"):
        """
        Tag documents with their respective "Part of Speech" tag with the Textblob package which utilizes the NLTK NLP engine and Penn Treebank tag set.
        These tags classify a word as a noun, verb, adjective, etc. A full list and their meaning can be found here:
        https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

        If a list of columns is provided use the list, otherwise use arguments.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_postagged`

        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.postag_nltk('col1', 'col2', 'col3')
        """

        report_info = technique_reason_repo["feature"]["text"]["nltk_postag"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.nltk_feature_postag(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
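
Since this wraps TextBlob, the tagging can be previewed standalone (TextBlob's corpora must be downloaded first, e.g. python -m textblob.download_corpora):

from textblob import TextBlob

print(TextBlob("The quick brown fox jumps").tags)
# A list of (word, Penn Treebank tag) tuples, e.g. ('fox', 'NN')
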
Example #10
File: feature.py Project: nperera0/aethos
    def nounphrases_spacy(self,
                          *list_args,
                          list_of_cols=[],
                          new_col_name="_phrases"):
        """
        Extract noun phrases from text using the spaCy NLP engine.

        If a list of columns is provided use the list, otherwise use arguments.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_phrases`

        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.nounphrases_spacy('col1', 'col2', 'col3')
        """

        report_info = technique_reason_repo["feature"]["text"]["spacy_np"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.spacy_feature_noun_phrases(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
Example #11
    def normalize_log(self, *list_args, list_of_cols=[], base=1):
        """
        Scales data logarithmically.

        Options are 1 for natural log, 2 for base 2, 10 for base 10.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        base : int, optional
            Base to logarithmically scale by, by default 1 (natural log)
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.normalize_log('col1')
        >>> data.normalize_log(['col1', 'col2'], base=10)
        """

        report_info = technique_reason_repo["preprocess"]["numeric"]["log"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test = num.log_scale(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            base=base,
        )

        if self.report is not None:
            if list_of_cols:
                self.report.report_technique(report_info, list_of_cols)
            else:
                list_of_cols = _numeric_input_conditions(
                    list_of_cols, self.x_train)
                self.report.report_technique(report_info, list_of_cols)

        return self.copy()
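
A minimal NumPy sketch of the scaling itself, matching the three supported bases:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 10.0, 100.0])
print(np.log(s))    # base=1: natural log
print(np.log2(s))   # base=2
print(np.log10(s))  # base=10 -> 0.0, 1.0, 2.0
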
Example #12
File: clean.py Project: nperera0/aethos
    def replace_missing_mostcommon(self, *list_args, list_of_cols=[]):
        """
        Replaces missing values in every numeric column with the most common value of that column.

        Mode: Most common value.

        If a list of columns is provided, use the list; otherwise use the arguments.

        This function exists in `clean/numeric.py` as `replace_missing_mean_median_mode`.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_mostcommon('col1', 'col2')
        >>> data.replace_missing_mostcommon(['col1', 'col2'])
        """

        report_info = technique_reason_repo["clean"]["numeric"]["mode"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = num.replace_missing_mean_median_mode(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            strategy="most_frequent",
        )
        if self.report is not None:
            if list_of_cols:
                self.report.report_technique(report_info, list_of_cols)
            else:
                list_of_cols = _numeric_input_conditions(list_of_cols, self.x_train)
                self.report.report_technique(report_info, list_of_cols)

        return self.copy()
Example #13
    def split_sentences(self,
                        *list_args,
                        list_of_cols=[],
                        new_col_name="_sentences"):
        """
        Splits text data into sentences and saves it into another column for analysis.

        If a list of columns is provided use the list, otherwise use arguments.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_sentences`

        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.split_sentences('col1')
        >>> data.split_sentences(['col1', 'col2'])
        """

        report_info = technique_reason_repo["preprocess"]["text"][
            "split_sentence"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test = text.split_sentences(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
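
Sentence splitting like this is typically NLTK's sent_tokenize; a standalone sketch:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)

print(sent_tokenize("First sentence. Second one! A third?"))
# ['First sentence.', 'Second one!', 'A third?']
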
Example #14
File: clean.py Project: nperera0/aethos
    def drop_duplicate_rows(self, *list_args, list_of_cols=[]):
        """
        Remove rows from the data that are exact duplicates of each other and leave only one.
        This can be used to reduce processing time or improve performance for algorithms where
        duplicates have no effect on the outcome (e.g. DBSCAN).

        If a list of columns is provided, use the list; otherwise use the arguments.

        This function exists in `clean/util.py` as `remove_duplicate_rows`.
       
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []
       
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.drop_duplicate_rows('col1', 'col2') # Only look at columns 1 and 2
        >>> data.drop_duplicate_rows(['col1', 'col2'])
        >>> data.drop_duplicate_rows()
        """

        report_info = technique_reason_repo["clean"]["general"]["remove_duplicate_rows"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = util.remove_duplicate_rows(
            x_train=self.x_train, x_test=self.x_test, list_of_cols=list_of_cols,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
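
The underlying operation is pandas drop_duplicates; a minimal sketch, including the column-restricted variant:

import pandas as pd

df = pd.DataFrame({"col1": [1, 1, 2], "col2": ["a", "a", "b"], "col3": [0.1, 0.9, 0.5]})

print(df.drop_duplicates())                         # nothing dropped: no row is an exact duplicate
print(df.drop_duplicates(subset=["col1", "col2"]))  # second row dropped: its col1/col2 pair repeats
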
Example #15
File: clean.py Project: nperera0/aethos
    def replace_missing_forwardfill(self, *list_args, list_of_cols=[], **extra_kwargs):
        """
        Replaces missing values in a column with the last known data point.

        This is useful when dealing with time series data and you want to replace future missing data with the past.

        For more info view the following link: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.
            
        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_forwardfill('col1', 'col2')
        >>> data.replace_missing_forwardfill(['col1', 'col2'])
        """

        report_info = technique_reason_repo["clean"]["general"]["ffill"]
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = util.replace_missing_fill(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            method="ffill",
            **extra_kwargs
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
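
The fill itself is a pandas forward fill; a minimal sketch:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0, np.nan])
print(s.ffill())  # 1.0, 1.0, 1.0, 4.0, 4.0 -- each NaN takes the last known value
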
Example #16
File: clean.py Project: nperera0/aethos
    def replace_missing_random_discrete(self, *list_args, list_of_cols=[]):
        """
        Replace missing values with a random number drawn from the distribution (number of occurrences)
        of the data.

        For example, if your data was [5, 5, NaN, 1, 2],
        there would be a 50% chance that the NaN would be replaced with a 5, a 25% chance with a 1 and a 25% chance with a 2.

        If a list of columns is provided, use the list; otherwise use the arguments.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.
            
        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_random_discrete('col1', 'col2')
        >>> data.replace_missing_random_discrete(['col1', 'col2'])
        """

        report_info = technique_reason_repo["clean"]["general"]["random_discrete"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = util.replace_missing_random_discrete(
            x_train=self.x_train, x_test=self.x_test, list_of_cols=list_of_cols,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
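
The [5, 5, NaN, 1, 2] example can be reproduced with a weighted draw from the observed value counts; a minimal sketch, not the aethos internals:

import numpy as np
import pandas as pd

s = pd.Series([5.0, 5.0, np.nan, 1.0, 2.0])

# P(5) = 0.5, P(1) = 0.25, P(2) = 0.25, taken from the observed frequencies.
probs = s.dropna().value_counts(normalize=True)
s.loc[s.isna()] = np.random.choice(probs.index, size=s.isna().sum(), p=probs.values)
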
Example #17
File: feature.py Project: nperera0/aethos
    def encode_labels(self, *list_args, list_of_cols=[]):
        """
        Encode categorical values as integers between 0 and n_classes-1.

        Running this function will automatically store the mapping from each encoded number back to the original value.

        Note that this will not work if your test data has labels that your train data does not.

        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.encode_labels('col1', 'col2', 'col3')
        """

        report_info = technique_reason_repo["preprocess"]["categorical"][
            "label_encode"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test, _ = label_encoder(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
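
This kind of encoding matches scikit-learn's LabelEncoder, where the stored mapping corresponds to the classes_ attribute; a minimal sketch:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(["paris", "tokyo", "paris"]))  # [0 1 0]
print(le.classes_)  # ['paris' 'tokyo'] -- index i decodes back to the original label
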
Example #18
File: clean.py Project: nperera0/aethos
    def replace_missing_remove_row(self, *list_args, list_of_cols=[]):
        """
        Remove rows where the value of the given column(s) is missing.

        If a list of columns is provided, use the list; otherwise use the arguments.

        This function exists in `clean/categorical.py` as `replace_missing_remove_row`.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_remove_row('col1', 'col2')
        >>> data.replace_missing_remove_row(['col1', 'col2'])
        """

        report_info = technique_reason_repo["clean"]["categorical"]["remove_rows"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = cat.replace_missing_remove_row(
            x_train=self.x_train, x_test=self.x_test, cols_to_remove=list_of_cols,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
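
The row removal is pandas dropna restricted to the given columns; a minimal sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({"col1": [1.0, np.nan, 3.0], "col2": ["x", "y", None]})

# Drop a row only when col1 is missing; missing col2 values are left alone.
print(df.dropna(subset=["col1"]))
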
Example #19
    def remove_punctuation(
        self,
        *list_args,
        list_of_cols=[],
        regexp="",
        exceptions=[],
        new_col_name="_rem_punct",
    ):
        """
        Removes punctuation from every string entry.

        Defaults to removing all punctuation, but if a regex is provided, it defines what to keep instead.

        An example regex would be:

        (\w+\.|\w+)[^,] - Include all words and words with periods after them, but exclude commas.
        (\w+\.)|(\w+) would also achieve the same result.

        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        regexp : str, optional
            Regex expression used to define what to include.
            
        exceptions : list, optional
            List of punctuation to include in the text, by default []

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_rem_punct`
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.remove_punctuation('col1')
        >>> data.remove_punctuation(['col1', 'col2'])
        >>> data.remove_punctuation('col1', regexp=r'(\w+\.)|(\w+)') # Include all words and words with periods after.
        """

        report_info = technique_reason_repo["preprocess"]["text"][
            "remove_punctuation"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.remove_punctuation(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            regexp=regexp,
            exceptions=exceptions,
            new_col_name=new_col_name,
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
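
The default all-punctuation removal is essentially str.translate over string.punctuation; a minimal sketch:

import string

text = "Hello, world! It's nice."
print(text.translate(str.maketrans("", "", string.punctuation)))
# 'Hello world Its nice'
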
Example #20
File: clean.py Project: nperera0/aethos
    def replace_missing_new_category(
        self, *list_args, list_of_cols=[], new_category=None, col_mapping=None
    ):
        """
        Replaces missing values in a categorical column with their own category. The category can be
        chosen automatically from a default set.

        For numeric categorical columns the default values are: -1, -999, -9999.
        For string categorical columns the default values are: "Other", "Unknown", "MissingDataCategory".

        If a list of columns is provided, use the list; otherwise use the arguments.

        This function exists in `clean/categorical.py` as `replace_missing_new_category`.
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        new_category : str, int, or float, optional
            Category to replace missing values with, by default None

        col_mapping : dict, optional
           Dictionary mapping {'ColumnName': `constant`}, by default None
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_new_category(col_mapping={'col1': "Green", 'col2': "Canada", 'col3': "December"})
        >>> data.replace_missing_new_category('col1', 'col2', 'col3', new_category='Blue')
        >>> data.replace_missing_new_category(['col1', 'col2', 'col3'], new_category='Blue')
        """

        report_info = technique_reason_repo["clean"]["categorical"]["new_category"]

        # If a dictionary mapping is provided, use it; otherwise use the columns.
        if col_mapping:
            col_to_category = col_mapping
        else:
            # If a list of columns is provided, use the list; otherwise use the arguments.
            col_to_category = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = cat.replace_missing_new_category(
            x_train=self.x_train,
            x_test=self.x_test,
            col_to_category=col_to_category,
            constant=new_category,
        )

        if self.report is not None:
            if not col_to_category:
                self.report.report_technique(report_info, self.x_train.columns)
            else:
                self.report.report_technique(report_info, list(col_to_category))

        return self.copy()
Example #21
File: feature.py Project: nperera0/aethos
    def tfidf(self,
              *list_args,
              list_of_cols=[],
              keep_col=True,
              **tfidf_kwargs):
        """
        Creates a matrix of the tf-idf score for every word in the corpus as it pertains to each document.

        The higher the score, the more important a word is to a document; the lower the score (relative to the other scores),
        the less important it is to that document.

        For more information see: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

        If a list of columns is provided use the list, otherwise use arguments.

        This function exists in `feature-extraction/text.py`
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        keep_col : bool, optional
            True if you want to keep the column(s) or False if you want to drop the column(s), by default True

        encoding: str, default=’utf-8’
            If bytes or files are given to analyze, this encoding is used to decode.

        decode_error: {‘strict’, ‘ignore’, ‘replace’} (default=’strict’)
            Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given encoding.
            By default, it is ‘strict’, meaning that a UnicodeDecodeError will be raised.
            Other values are ‘ignore’ and ‘replace’.

        strip_accents: {‘ascii’, ‘unicode’, None} (default=None)
            Remove accents and perform other character normalization during the preprocessing step.
            ‘ascii’ is a fast method that only works on characters that have a direct ASCII mapping.
            ‘unicode’ is a slightly slower method that works on any characters.
            None (default) does nothing.

            Both ‘ascii’ and ‘unicode’ use NFKD normalization from unicodedata.normalize.

        lowercase: bool (default=True)
            Convert all characters to lowercase before tokenizing.

        preprocessor: callable or None (default=None)
            Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps.
            Only applies if analyzer is not callable.

        tokenizer: callable or None (default=None)
            Override the string tokenization step while preserving the preprocessing and n-grams generation steps.
            Only applies if analyzer == 'word'.

        analyzer: str, {‘word’, ‘char’, ‘char_wb’} or callable
            Whether the feature should be made of word or character n-grams
            Option ‘char_wb’ creates character n-grams only from text inside word boundaries;
            n-grams at the edges of words are padded with space.

            If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.

        stop_words: str {‘english’}, list, or None (default=None)
            If a string, it is passed to _check_stop_list and the appropriate stop list is returned.
            ‘english’ is currently the only supported string value.
            There are several known issues with ‘english’ and you should consider an alternative (see Using stop words).

            If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if analyzer == 'word'.

            If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.

        token_pattern: str
            Regular expression denoting what constitutes a “token”, only used if analyzer == 'word'.
            The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
        
        ngram_range: tuple (min_n, max_n), default=(1, 1)
            The lower and upper boundary of the range of n-values for different n-grams to be extracted.
            All values of n such that min_n <= n <= max_n will be used.
            For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
            Only applies if analyzer is not callable.
        
        max_df: float in range [0.0, 1.0] or int (default=1.0)
            When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
            If float, the parameter represents a proportion of documents, integer absolute counts.
            This parameter is ignored if vocabulary is not None.

        min_df: float in range [0.0, 1.0] or int (default=1)
            When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
            This value is also called cut-off in the literature.
            If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

        max_features: int or None (default=None)
            If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

            This parameter is ignored if vocabulary is not None.

        vocabulary: Mapping or iterable, optional (default=None)
            Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an iterable over terms.
            If not given, a vocabulary is determined from the input documents.

        binary: bool (default=False)
            If True, all non-zero term counts are set to 1.
            This does not mean outputs will have only 0/1 values, only that the tf term in tf-idf is binary. (Set idf and normalization to False to get 0/1 outputs).

        dtype: type, optional (default=float64)
            Type of the matrix returned by fit_transform() or transform().

        norm: ‘l1’, ‘l2’ or None, optional (default=’l2’)
            Each output row will have unit norm, either:

            - ‘l2’: Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when the l2 norm has been applied.
            - ‘l1’: Sum of absolute values of vector elements is 1.

        use_idf: bool (default=True)
            Enable inverse-document-frequency reweighting.

        smooth_idf: bool (default=True)
            Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once.
            Prevents zero divisions.

        sublinear_tf: bool (default=False)
            Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.tfidf('col1', 'col2', 'col3')
        >>> data.tfidf('col1', 'col2', 'col3', lowercase=False, smooth_idf=False)
        """

        report_info = technique_reason_repo["feature"]["text"]["tfidf"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test = text.feature_tfidf(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            keep_col=keep_col,
            **tfidf_kwargs,
        )

        if self.report is not None:
            self.report.report_technique(report_info, [])

        return self.copy()
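
A standalone preview of what the underlying TfidfVectorizer produces (feature-name API per scikit-learn 1.x):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the dog sat", "the dog barked"]
vec = TfidfVectorizer()
scores = vec.fit_transform(docs)

print(vec.get_feature_names_out())  # ['barked' 'cat' 'dog' 'sat' 'the']
print(scores.toarray().round(2))    # one row of tf-idf weights per document
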
Example #22
File: feature.py Project: nperera0/aethos
    def onehot_encode(self,
                      *list_args,
                      list_of_cols=[],
                      keep_col=True,
                      **onehot_kwargs):
        """
        Converts categorical columns into a matrix of binary columns of ones and zeros.

        For more info see: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

        If a list of columns is provided use the list, otherwise use arguments.
    
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        keep_col : bool, optional
            Whether to keep the column being transformed, by default True (keep the column)

        categories : ‘auto’ or a list of array-like, default=’auto’
            Categories (unique values) per feature:

                ‘auto’ : Determine categories automatically from the training data.

                list : categories[i] holds the categories expected in the ith column. The passed categories should not mix strings and numeric values within a single feature, and should be sorted in case of numeric values.

            The used categories can be found in the categories_ attribute.

        drop : ‘first’ or an array-like of shape (n_features,), default=None
            Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into a neural network or an unregularized regression.

                None : retain all features (the default).

                ‘first’ : drop the first category in each feature. If only one category is present, the feature will be dropped entirely.

                array : drop[i] is the category in feature X[:, i] that should be dropped.

        sparse : bool, default=True
            Will return a sparse matrix if set True, else will return an array.

        dtype : number type, default=np.float
            Desired dtype of output.

        handle_unknown: {‘error’, ‘ignore’}, default='ignore'
            Whether to raise an error or ignore if an unknown categorical feature is present during transform.
            When this parameter is set to ‘ignore’ and an unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will be all zeros.
            In the inverse transform, an unknown category will be denoted as None.

        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.onehot_encode('col1', 'col2', 'col3')
        >>> data.onehot_encode('col1', 'col2', 'col3', drop='first')
        """

        report_info = technique_reason_repo["feature"]["categorical"][
            "onehotencode"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = cat.feature_one_hot_encode(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            keep_col=keep_col,
            **onehot_kwargs,
        )
        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
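
For a quick feel of the output shape, pandas get_dummies produces the same kind of binary matrix as the OneHotEncoder this wraps:

import pandas as pd

df = pd.DataFrame({"colour": ["red", "green", "red"]})
print(pd.get_dummies(df, columns=["colour"]))
# One indicator column per category: colour_green and colour_red.
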
Example #23
File: clean.py Project: nperera0/aethos
    def replace_missing_interpolate(
        self, *list_args, list_of_cols=[], method="linear", **inter_kwargs
    ):
        """
        Replaces missing values with an interpolation method and possible extrapolation.

        The possible interpolation methods are:
           
            - 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes.
            - 'time': Works on daily and higher resolution data to interpolate given length of interval.
            - 'index', 'values': Use the actual numerical values of the index.
            - 'pad': Fill in NaNs using existing values.
            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', 'barycentric', 'polynomial': Passed to scipy.interpolate.interp1d.
                - These methods use the numerical values of the index. Both 'polynomial' and 'spline' require that you also specify an order (int), e.g. df.interpolate(method='polynomial', order=5).
            - 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima': Wrappers around the SciPy interpolation methods of similar names.
            - 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives which replaces the 'piecewise_polynomial' interpolation method in scipy 0.18.

        For more information see: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.interpolate.html or https://docs.scipy.org/doc/scipy/reference/interpolate.html.

        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.
            
        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        method : str, optional
            Interpolation method, by default 'linear'

        limit : int, optional
            Maximum number of consecutive NaNs to fill. Must be greater than 0.

        limit_area : {None, ‘inside’, ‘outside’}, default None
            If limit is specified, consecutive NaNs will be filled with this restriction.

            - None: No fill restriction.
            - ‘inside’: Only fill NaNs surrounded by valid values (interpolate).
            - ‘outside’: Only fill NaNs outside valid values (extrapolate).
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.replace_missing_interpolate('col1', 'col2')
        >>> data.replace_missing_interpolate(['col1', 'col2'])
        >>> data.replace_missing_interpolate('col1', 'col2', method='pad', limit=3)
        """

        report_info = technique_reason_repo["clean"]["general"]["interpolate"]
        list_of_cols = _input_columns(list_args, list_of_cols)

        (self.x_train, self.x_test,) = util.replace_missing_interpolate(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            method=method,
            **inter_kwargs
        )

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
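
A minimal pandas sketch of the default linear method:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0, np.nan, 5.0])
print(s.interpolate(method="linear"))  # 1.0, 2.0, 3.0, 4.0, 5.0
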
Example #24
    def clean_text(
        self,
        *list_args,
        list_of_cols=[],
        lower=True,
        punctuation=True,
        stopwords=True,
        stemmer=True,
        new_col_name="_clean",
    ):
        """
        Function that takes text and does the following:

        - Casts it to lowercase
        - Removes punctuation
        - Removes stopwords
        - Stems the text
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []
        
        lower : bool, optional
            True to cast all text to lowercase, by default True
        
        punctuation : bool, optional
            True to remove punctuation, by default True
        
        stopwords : bool, optional
            True to remove stop words, by default True
        
        stemmer : bool, optional
            True to stem the data, by default True

        new_col_name : str, optional
            New column name to be created when applying this technique, by default `COLUMN_clean`            
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.clean_text('col1')
        >>> data.clean_text(['col1', 'col2'], lower=False)
        >>> data.clean_text(lower=False, stopwords=False, stemmer=False)
        """

        report_info = technique_reason_repo["preprocess"]["text"]["clean_text"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        for col in list_of_cols:
            # Pass the option flags through so each cleaning step can be toggled.
            self.x_train[col + new_col_name] = [
                text.process_text(txt, lower=lower, punctuation=punctuation,
                                  stopwords=stopwords, stemmer=stemmer)
                for txt in self.x_train[col]
            ]

            if self.x_test is not None:
                self.x_test[col + new_col_name] = [
                    text.process_text(txt, lower=lower, punctuation=punctuation,
                                      stopwords=stopwords, stemmer=stemmer)
                    for txt in self.x_test[col]
                ]

        if self.report is not None:
            self.report.report_technique(report_info, list_of_cols)

        return self.copy()
Example #25
    def normalize_numeric(self,
                          *list_args,
                          list_of_cols=[],
                          **normalize_params):
        """
        Function that normalizes all numeric values between two values to bring features into the same domain.
        
        If `list_of_cols` is not provided, the strategy will be applied to all numeric columns.

        If a list of columns is provided use the list, otherwise use arguments.

        For more info please see: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler 

        This function can be found in `preprocess/numeric.py`     
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        feature_range : tuple(int or float, int or float), optional
            Min and max range to normalize values to, by default (0, 1)

        normalize_params : dict, optional
            Parameters to pass into the MinMaxScaler() constructor from Scikit-Learn
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.normalize_numeric('col1')
        >>> data.normalize_numeric(['col1', 'col2'])
        """

        report_info = technique_reason_repo["preprocess"]["numeric"][
            "standardize"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test = num.scale(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            method="minmax",
            **normalize_params,
        )

        if self.report is not None:
            if list_of_cols:
                self.report.report_technique(report_info, list_of_cols)
            else:
                list_of_cols = _numeric_input_conditions(
                    list_of_cols, self.x_train)
                self.report.report_technique(report_info, list_of_cols)

        return self.copy()
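
The scaling maps onto scikit-learn's MinMaxScaler; a minimal sketch with the default (0, 1) feature_range:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0], [5.0], [10.0]])
print(MinMaxScaler(feature_range=(0, 1)).fit_transform(X))
# [[0.], [0.444...], [1.]] -- the min maps to 0, the max to 1
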
Example #26
    def normalize_quantile_range(self,
                                 *list_args,
                                 list_of_cols=[],
                                 **robust_params):
        """
        Scale features using statistics that are robust to outliers.

        This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range).
        The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).

        Standardization of a dataset is a common requirement for many machine learning estimators.
        Typically this is done by removing the mean and scaling to unit variance.
        However, outliers can often influence the sample mean / variance in a negative way.
        In such cases, the median and the interquartile range often give better results.
        
        If `list_of_cols` is not provided, the strategy will be applied to all numeric columns.

        If a list of columns is provided use the list, otherwise use arguments.

        For more info please see: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler

        This function can be found in `preprocess/numeric.py`     
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        with_centering : boolean, True by default
            If True, center the data before scaling.
            This will cause transform to raise an exception when attempted on sparse matrices,
            because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.
        
        with_scaling : boolean, True by default
            If True, scale the data to interquartile range.

        quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0
            Default: (25.0, 75.0) = (1st quantile, 3rd quantile) = IQR. Quantile range used to calculate scale_.

        robust_params : dict, optional
            Parameters to pass into the RobustScaler() constructor from Scikit-Learn
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.normalize_quantile_range('col1')
        >>> data.normalize_quantile_range(['col1', 'col2'])
        """

        report_info = technique_reason_repo["preprocess"]["numeric"]["robust"]

        list_of_cols = _input_columns(list_args, list_of_cols)

        self.x_train, self.x_test = num.scale(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            method="robust",
            **robust_params,
        )

        if self.report is not None:
            if list_of_cols:
                self.report.report_technique(report_info, list_of_cols)
            else:
                list_of_cols = _numeric_input_conditions(
                    list_of_cols, self.x_train)
                self.report.report_technique(report_info, list_of_cols)

        return self.copy()
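
A minimal sketch of the RobustScaler behaviour on data with an outlier:

import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [1000.0]])  # 1000.0 is an outlier
print(RobustScaler().fit_transform(X))
# Centered on the median and scaled by the IQR, so the inliers keep a sensible scale.
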
Example #27
File: feature.py Project: nperera0/aethos
    def text_hash(self,
                  *list_args,
                  list_of_cols=[],
                  keep_col=True,
                  **hash_kwargs):
        """
        Creates a matrix of how many times a word appears in a document. It can optionally be normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’.

        The premise is that the more times a word appears the more the word represents that document.

        This text vectorizer implementation uses the hashing trick to find the token string name to feature integer index mapping.

        This strategy has several advantages:

            - It is very low memory and scalable to large datasets, as there is no need to store a vocabulary dictionary in memory.
            - It is fast to pickle and un-pickle, as it holds no state besides the constructor parameters.
            - It can be used in a streaming (partial fit) or parallel pipeline, as there is no state computed during fit.

        For more info please see: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

        If a list of columns is provided use the list, otherwise use arguments.

        This function exists in `feature-extraction/text.py`
        
        Parameters
        ----------
        list_args : str(s), optional
            Specific columns to apply this technique to.

        list_of_cols : list, optional
            A list of specific columns to apply this technique to, by default []

        keep_col : bool, optional
            True if you want to keep the column(s) or False if you want to drop the column(s), by default True

        n_features : integer, default=(2 ** 20)
            The number of features (columns) in the output matrices.
            Small numbers of features are likely to cause hash collisions, but large numbers will cause larger coefficient dimensions in linear learners.

        hash_kwargs : dict, optional
            Parameters you would pass into the HashingVectorizer constructor, by default {}
        
        Returns
        -------
        Data:
            Returns a deep copy of the Data object.

        Examples
        --------
        >>> data.text_hash('col1', 'col2', 'col3')
        >>> data.text_hash('col1', 'col2', 'col3', n_features=50)
        """

        report_info = technique_reason_repo["feature"]["text"]["hash"]

        # If a list of columns is provided, use the list; otherwise use the arguments.
        list_of_cols = _input_columns(list_args, list_of_cols)

        (
            self.x_train,
            self.x_test,
        ) = text.feature_hash_vectorizer(
            x_train=self.x_train,
            x_test=self.x_test,
            list_of_cols=list_of_cols,
            keep_col=keep_col,
            **hash_kwargs,
        )

        if self.report is not None:
            self.report.report_technique(report_info, [])

        return self.copy()
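
A standalone sketch of the HashingVectorizer trade-off: a fixed number of hashed columns and no stored vocabulary:

from sklearn.feature_extraction.text import HashingVectorizer

vec = HashingVectorizer(n_features=2 ** 4, norm=None, alternate_sign=False)
counts = vec.fit_transform(["the cat sat", "the cat sat on the mat"])
print(counts.shape)      # (2, 16) -- always 16 columns, regardless of vocabulary size
print(counts.toarray())  # raw token counts hashed into those columns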