Ejemplo n.º 1
0
def mev(X: pd.DataFrame) -> pd.DataFrame:
    ''' Returns the Matrix of Explained Variance (MEV). For every pair of
        columns in X a simple linear regression (OLS) of one column on the
        other is fitted, and its R-squared statistic is stored in a square,
        symmetric matrix. It plays the role of a correlation matrix for
        pairs of continuous variables

        Parameters
        ----------
        X: pandas.DataFrame
            Matrix of attributes

        Returns
        -------
        pandas.DataFrame
            MEV matrix, labelled with the column names of X on both axes
            and with 1 on its main diagonal
    '''

    guards.not_dataframe(X, 'X')

    cols = X.columns.to_list()
    matrix = pd.DataFrame({k: np.nan for k in cols}, index=cols)
    reg = LinearRegression()

    # Walk the upper triangle by position instead of looking up the index
    # of both names for every pair; each R-squared is mirrored into the
    # lower triangle, so every pair is fitted exactly once
    for pos, i in enumerate(cols):
        matrix.at[i, i] = 1
        for j in cols[pos + 1:]:
            # scikit-learn expects a 2-D array of predictors
            x = np.asarray(X[j]).reshape(-1, 1)
            r_sq = reg.fit(x, X[i]).score(x, X[i])
            matrix.at[i, j] = r_sq
            matrix.at[j, i] = r_sq

    return matrix
Ejemplo n.º 2
0
def vif(X: pd.DataFrame) -> pd.Series:
    ''' Computes the Variance Inflation Factor (VIF) of every column in X

        Each column is regressed (OLS) on all the remaining columns of X and
        the resulting R-squared is converted into a VIF by the module
        helper `_vif`

        Parameters
        ----------
        X: pandas.DataFrame
            Matrix of attributes

        Returns
        -------
        pandas.Series
            Series named `vif`, indexed by the column names of X
    '''

    guards.not_dataframe(X, 'X')

    model = LinearRegression()
    factors = {}

    for name in X.columns.to_list():
        target = X[name]
        predictors = X.drop(columns=name)
        # The same estimator object is refitted for every column
        model.fit(predictors, target)
        factors[name] = _vif(model.score(predictors, target))

    return pd.Series(factors, name='vif')
Ejemplo n.º 3
0
    def corr(self, df: pd.DataFrame) -> pd.DataFrame:
        ''' Calculates a correlation matrix for df using the association
            measure of this object (`self.assoc`). It mimics the behavior
            of the `pandas.DataFrame.corr` method

            Parameters
            ----------
            df: pandas.DataFrame
                Matrix for which to calculate all pairwise correlations

            Returns
            -------
            matrix: pandas.DataFrame
                Square matrix corresponding to the correlation matrix of
                every variable (column) in df. It is a symmetric matrix, with
                only 1 on its main diagonal
        '''

        guards.not_dataframe(df, 'df')

        cols = df.columns.to_list()
        matrix = pd.DataFrame({k: np.nan for k in cols}, index=cols)

        # Walk the upper triangle by position instead of looking up the
        # index of both names for every pair; each value is mirrored into
        # the lower triangle, so `assoc` runs once per pair
        for pos, i in enumerate(cols):
            matrix.at[i, i] = 1
            for j in cols[pos + 1:]:
                # Only the first element returned by `assoc` is used here
                r = self.assoc(df[i], df[j])[0]
                matrix.at[i, j] = r
                matrix.at[j, i] = r
        return matrix
Ejemplo n.º 4
0
    def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        apply_cutoffs: t.Optional[bool] = True,
        ranking: t.Optional[t.Union[t.List[str], t.Tuple[str, ...]]] = None
    ) -> pd.DataFrame:
        ''' Applies correlation-based feature selection for an attribute
            matrix X and a response vector y

            The absolute correlation matrix of X is computed once. Candidate
            cut-offs taken from `numpy.linspace(0, self.corr_cutoff)` are
            then tried from the largest to the smallest: X is filtered with
            each one and the first result that satisfies `self._stop` is
            returned

            Parameters
            ----------
            X: pandas.DataFrame
                Matrix of attributes whose columns are to be ranked according
                to their correlation with y

            y: pandas.Series
                Vector of response measures (aka dependent variable or target)
                to use for ranking features

            apply_cutoffs: bool, default True
                Whether to apply the cut-offs associated with the object
                during ranking. It is only used when `ranking` is None

            ranking: Union[List[str], Tuple[str, ...]], default None
                List or tuple of variable names that will be used as a ranking.
                When doing the filtering, this method will always drop the
                variable in the pair that comes AFTER the other one in this
                list. This means that the first item of this list should be the
                most important variable (ranked 1), the second should be the
                second most important variable (ranked 2) and so on. If no
                such list is provided, one is created using the `rank` method
                of this class

            Returns
            -------
            X_new: pandas.DataFrame
                Dataframe with the columns in X which were deemed not
                pathologically correlated with any of the others. It has
                the same number of rows as X. If no candidate cut-off
                satisfies the stop condition, the result for the smallest
                cut-off tried is returned and a RuntimeWarning is issued
        '''

        import warnings

        guards.not_dataframe(X, 'X')
        guards.not_series(y, 'y')

        if ranking is None:
            rk = (self.rank(
                X, y, apply_cutoffs)['rank'].sort_values().index.to_list())
        else:
            # Validate the ranking only when one is supplied, so the
            # default None is never handed to the iterable guard
            guards.not_iterable(ranking, 'ranking')
            rk = ranking

        corr = self.corr(X).abs()
        for r in np.sort(np.linspace(0, self.corr_cutoff))[::-1]:
            X_new = self.filter(X, corr, r, rk)
            if self._stop(X_new):
                return X_new

        # Previously the method fell off the end here and returned None;
        # hand back the most aggressively filtered candidate instead
        warnings.warn(
            'No correlation cut-off satisfied the stop condition; returning '
            'the result for the smallest cut-off tried',
            RuntimeWarning,
            stacklevel=2)
        return X_new
Ejemplo n.º 5
0
    def rank(self,
             X: pd.DataFrame,
             y: pd.Series,
             apply_cutoffs: t.Optional[bool] = True) -> pd.DataFrame:
        ''' Ranks attributes (columns) in X according to their association
            with the target y, as measured by `self.assoc`

            Parameters
            ----------
            X: pandas.DataFrame
                Matrix of attributes whose columns are to be ranked according
                to their correlation with y

            y: pandas.Series
                Vector of response measures (aka dependent variable or target)
                to use for ranking features

            apply_cutoffs: bool, default True
                Whether to apply the cut-offs associated with the object
                during ranking. When True, only variables whose p-value is
                below `self.pval_cutoff` and whose absolute association is
                below `self.too_good_to_be_true` are kept

            Returns
            -------
            df: pandas.DataFrame
                Dataframe indexed by variable name, with 4 columns and at
                most as many rows as there are columns in X. The columns
                are the association measure (`assoc`), its p-value
                (`pvalue`), the negative base 10 logarithm of the p-value
                (`-log10(pvalue)`) and the ranking of the variable (`rank`),
                where 1 indicates the largest absolute association, 2 the
                second largest and so on. Ties share the same rank
        '''

        guards.not_dataframe(X, 'X')
        guards.not_series(y, 'y')

        data = {'assoc': [], 'pvalue': [], '-log10(pvalue)': []}
        cols = X.columns.to_list()

        for var in cols:
            r, pval = self.assoc(X[var], y)
            pval2 = -np.log10(pval)
            data['assoc'].append(r)
            data['pvalue'].append(pval)
            data['-log10(pvalue)'].append(pval2)

        df = pd.DataFrame(data, index=cols)

        if apply_cutoffs:
            pval_filter = df['pvalue'] < self.pval_cutoff
            too_good_filter = df['assoc'].abs() < self.too_good_to_be_true
            # Copy the selection so that the `rank` column added below is
            # written to an independent frame rather than to a slice
            df = df[pval_filter & too_good_filter].copy()

        df['rank'] = df['assoc'].abs().rank(ascending=False, method='dense')

        return df
Ejemplo n.º 6
0
    def filter(
            X: pd.DataFrame, corr: pd.DataFrame, cutoff: float,
            ranking: t.Union[t.List[str], t.Tuple[str, ...]]) -> pd.DataFrame:
        ''' Applies the correlation cut-off to X, according to the correlations
            calculated in the correlation matrix corr. It uses an ordered list
            of feature names to start the cutting from the most important
            variables according to the ranking method. In this list, the first
            element is the most important variable (ranked 1), the second
            the second most important variable (ranked 2) and so on

            Every variable is compared with all the variables ranked after
            it, and each of those whose correlation with it exceeds the
            cut-off is dropped. A variable that has itself been dropped is
            still used for these comparisons

            Parameters
            ----------
            X: pandas.DataFrame
                Matrix of attributes whose columns are to be filtered according
                to their correlation with each other

            corr: pandas.DataFrame
                Correlation matrix used for assessing each pairwise correlation
                of the columns of X. It should be a square matrix with dimension
                n x n, where n is the number of columns in X. Preferably, it
                should be symmetric and have only number 1 on its main diagonal.
                For an example of this matrix, see the result of
                pandas.DataFrame.corr method

            cutoff: float
                Correlation above which the pair of variables are considered
                to be pathologically correlated

            ranking: Union[List[str], Tuple[str, ...]]
                List or tuple of variable names that will be used as a ranking.
                When doing the filtering, this method will always drop the
                variable in the pair that comes AFTER the other one in this
                list. This means that the first item of this list should be the
                most important variable (ranked 1), the second should be the
                second most important variable (ranked 2) and so on

            Returns
            -------
            pandas.DataFrame:
                Dataframe with the columns in X which were deemed not
                pathologically correlated with any of the others, ordered
                as in `ranking`. Columns of X that are not named in
                `ranking` are not returned. It has the same number of rows
                as X
        '''

        guards.not_dataframe(X, 'X')
        guards.not_dataframe(corr, 'corr')
        guards.not_iterable(ranking, 'ranking')

        # Unique names in ranking order; a repeated name keeps the position
        # of its first occurrence
        ordered = list(dict.fromkeys(ranking))

        dropped = set()
        for pos, i in enumerate(ordered):
            for j in ordered[pos + 1:]:
                if corr.at[i, j] > cutoff:
                    dropped.add(j)

        # Select with a list: a set is not a valid pandas indexer and has
        # no defined order
        return X[[name for name in ordered if name not in dropped]]
Ejemplo n.º 7
0
    def rank(self,
             X: pd.DataFrame,
             apply_cutoffs: t.Optional[bool] = False) -> pd.DataFrame:
        ''' Creates a ranking of low variance variables based on the low
            variance index (LVI). The variable with rank equal to 1 is the
            one with the highest LVI among those in X

            Parameters
            ----------
            X: pandas.DataFrame
                Attribute matrix

            apply_cutoffs: bool, default False
                Whether to apply the cut-offs specified during the
                initialization of the object. When True, only variables with
                `pct_distinct` below `self.pct_distinct_cutoff` and
                `freq_ratio` above `self.freq_ratio_cutoff` are kept. It is
                recommended not to apply the cut-offs without first
                evaluating the low variance index (LVI), because variables
                with few values can still be useful as categorical features

            Returns
            -------
            df: pandas.DataFrame
                Dataframe containing 4 columns and n <= k rows, where k is
                the number of columns in X. The first column is the percentage
                of distinct values relative to the sample size (`pct_distinct`).
                The second column is the ratio between the frequencies of the
                two most frequent values (`freq_ratio`). The third is the
                low variance index (`lvi`), which is the logarithm (base 10)
                of the ratio between `freq_ratio` and `pct_distinct`. The
                fourth is the ranking (`rank`) by descending `lvi`, where
                ties share the same rank
        '''

        guards.not_dataframe(X, 'X')

        df = pd.concat([self.pct_distinct(X), self.freq_ratio(X)], axis=1)

        if apply_cutoffs:
            pct_filter = df['pct_distinct'] < self.pct_distinct_cutoff
            freq_filter = df['freq_ratio'] > self.freq_ratio_cutoff
            # Copy the selection so that the columns added below are written
            # to an independent frame rather than to a slice
            df = df[pct_filter & freq_filter].copy()

        df['lvi'] = np.log10(df['freq_ratio'] / df['pct_distinct'])
        df['rank'] = df['lvi'].rank(ascending=False, method='dense')

        return df
Ejemplo n.º 8
0
    def _stop(self, X: pd.DataFrame) -> bool:
        ''' Decides whether the fit algorithm can stop at the candidate X

            Parameters
            ----------
            X: pandas.DataFrame
                Matrix of attributes being evaluated

            Returns
            -------
            bool:
                True when the largest VIF among the variables in X, as
                computed by `metrics.vif`, is below `self.vif_cutoff`
        '''

        guards.not_dataframe(X, 'X')

        largest_vif = metrics.vif(X).max()
        return largest_vif < self.vif_cutoff
Ejemplo n.º 9
0
    def _stop(self, X: pd.DataFrame) -> bool:
        ''' Stop condition of the fit algorithm

            Parameters
            ----------
            X: pandas.DataFrame
                Matrix of attributes being evaluated

            Returns
            -------
            bool:
                Whether the maximum absolute pairwise correlation in X is
                less than the correlation cutoff specified when instantiating
                the object. The main diagonal (each variable against itself)
                and missing correlations are ignored. If no pairwise
                correlation is left to evaluate, True is returned
        '''

        guards.not_dataframe(X, 'X')

        pairwise = self.corr(X).abs().to_numpy(dtype=float, copy=True)

        # The diagonal holds the correlation of each variable with itself
        # (1), which would otherwise dominate the maximum and keep the
        # condition from ever being met for cut-offs up to 1
        np.fill_diagonal(pairwise, np.nan)

        valid = ~np.isnan(pairwise)
        if not valid.any():
            return True

        return bool(pairwise[valid].max() < self.corr_cutoff)
Ejemplo n.º 10
0
    def pct_distinct(X: pd.DataFrame) -> pd.Series:
        ''' Calculates the proportion of distinct values relative to the
            sample size (number of rows) for every column in X. Missing
            values are not counted as a distinct value

            Parameters
            ----------
            X: pandas.DataFrame
                Dataframe to evaluate

            Returns
            -------
            pct: pandas.Series
                Series named `pct_distinct`, indexed by the column names of
                X, with the number of distinct values of each column
                divided by the number of rows of X
        '''

        guards.not_dataframe(X, 'X')

        # `nunique` counts only values that actually occur, whereas the
        # length of `value_counts` also includes unobserved categories of
        # categorical columns
        pct = X.nunique() / X.shape[0]
        pct.name = 'pct_distinct'

        return pct
Ejemplo n.º 11
0
def nan_pct(df: pd.DataFrame, ascending: t.Optional[bool] = False):
    ''' Reports, for every column of df that has at least one missing
        value, the proportion of its values that are missing

        Parameters
        ----------
        df: pandas.DataFrame
            Dataframe with variable values

        ascending: bool, default False
            Whether to sort values in ascending order of proportion of NaNs

        Returns
        -------
        pandas.Series:
            Series indexed by variable name with the corresponding
            proportion of NaNs, sorted according to `ascending`. Columns
            without missing values are left out
    '''

    guards.not_dataframe(df, 'df')

    # Names of the columns holding at least one missing value
    with_gaps = [name for name in df.columns if df[name].isna().sum() > 0]
    missing_share = df[with_gaps].isna().mean()

    return missing_share.sort_values(ascending=ascending)
Ejemplo n.º 12
0
    def freq_ratio(X: pd.DataFrame) -> pd.Series:
        ''' Calculates the ratio between the frequencies of the two most
            frequent values in a variable for every column in X

            Parameters
            ----------
            X: pandas.DataFrame
                Dataframe to evaluate

            Returns
            -------
            pandas.Series
                Series named `freq_ratio`, indexed by the column names of X.
                A column with a single distinct value has no second most
                frequent value and gets an infinite ratio. A column with no
                countable value at all gets NaN
        '''

        guards.not_dataframe(X, 'X')

        freq_ratio_ = {}
        for var in X.columns.to_list():
            # `value_counts` sorts by descending frequency, so the first two
            # entries are the two most frequent values
            vcount = X[var].value_counts()
            if len(vcount) >= 2:
                freq_ratio_[var] = vcount.iloc[0] / vcount.iloc[1]
            elif len(vcount) == 1:
                # Constant column: nothing to divide by
                freq_ratio_[var] = float('inf')
            else:
                # Nothing was counted for this column
                freq_ratio_[var] = float('nan')

        return pd.Series(freq_ratio_, name='freq_ratio')
Ejemplo n.º 13
0
def deletion_diagnostics(
        data: pd.DataFrame,
        y_col: str,
        base_metric: float,
        learner: t.Any,
        fit: t.Callable[[t.Any, ta.Matrix, t.Optional[ta.Vector]], t.Any],
        predict: t.Callable[[t.Any, ta.Matrix], ta.Vector],
        aggfunc: t.Callable[[ta.Vector], float],
        deviation: t.Optional[str] = 'arithmetic') -> pd.Series:
    ''' Performs deletion diagnostics for the specified model. For every
        index label in `data`, the learner is refitted without the
        corresponding observation, predictions are made on the remaining
        data and aggregated with `aggfunc`, and the aggregate is compared
        with `base_metric`

        Parameters
        ----------
        data: pandas.DataFrame
            Information to be used for learning

        y_col: str
            Name of the column in `data` containing the target
            supervising the model

        base_metric: float
            Value of the diagnostics measure before deletion exercises. Used
            for measuring changes

        learner: Any
            Any object corresponding to a learner. It must be already
            initialized with desired hyperparameters values

        fit: Callable[[learner, X, y], learner]
            Fits the learner to the data. It must return the fitted learner

        predict: Callable[[learner, X], Union[pandas.Series, numpy.array]]
            Uses the learner for making predictions. The first argument must be
            the learner, supposedly fitted and ready to predict

        aggfunc: Callable[[Union[pandas.Series, numpy.array]], float]
            Function used to aggregate the predictions made after each
            deletion into a single value. It has no default and must be
            provided (for instance, `numpy.mean`)

        deviation: str, default 'arithmetic'
            Type of deviation to use for calculating diagnostics. If
            'arithmetic' (default), the difference is calculated. If
            'multiplicative', the ratio is calculated. It must be one of
            these two

        Returns
        -------
        pandas.Series
            Series indexed by the index labels of `data`, holding for each
            one the value returned by the module helper `_delta_metric` for
            the aggregate obtained after the deletion, `base_metric` and
            `deviation`
    '''

    guards.not_dataframe(data, 'data')
    guards.not_callable(fit, 'fit')
    guards.not_callable(predict, 'predict')
    guards.not_in_supported_values(deviation, ['arithmetic', 'multiplicative'])

    influences = {}

    for idx in data.index:
        # Drop the observation once and split what is left into
        # attributes and target
        remaining = data.drop(index=idx)
        X = remaining.drop(columns=y_col)
        y = remaining[y_col]

        fitted = fit(learner, X, y)
        y_hat = predict(fitted, X)

        after = aggfunc(y_hat)
        influences[idx] = _delta_metric(after, base_metric, deviation)

    return pd.Series(influences)