def mev(X: pd.DataFrame) -> pd.DataFrame:
    '''
    Returns the Matrix of Explained Variance (MEV). This is a matrix
    constructed by running a simple linear regression (OLS) on every pair of
    variables in the matrix of attributes X. The R-squared statistic of each
    regression is then stored in a square matrix. It is equivalent to a
    correlation matrix for pairs of continuous variables

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes

    Returns
    -------
    pandas.DataFrame
        MEV matrix
    '''
    guards.not_dataframe(X, 'X')
    cols = X.columns.to_list()
    matrix = pd.DataFrame({k: np.nan for k in cols}, index=cols)
    reg = LinearRegression()
    for i in cols:
        matrix.at[i, i] = 1
        for j in cols:
            # Fit each unordered pair only once; with a single predictor the
            # R-squared equals the squared Pearson correlation, so it is
            # symmetric and can be mirrored across the diagonal
            if cols.index(i) < cols.index(j):
                x = np.asarray(X[j]).reshape(-1, 1)
                r_sq = reg.fit(x, X[i]).score(x, X[i])
                matrix.at[i, j] = r_sq
                matrix.at[j, i] = r_sq
    return matrix
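# Illustrative usage sketch for `mev` (not part of the original module). It
# assumes the same module-level imports the function above relies on
# (numpy as np, pandas as pd, scikit-learn's LinearRegression); the data is
# made up.
def _example_mev() -> pd.DataFrame:
    rng = np.random.default_rng(0)
    a = rng.normal(size=200)
    X = pd.DataFrame({
        'a': a,
        'b': a * 2.0 + rng.normal(scale=0.1, size=200),  # strongly related to 'a'
        'c': rng.normal(size=200),                       # unrelated noise
    })
    # Expected: mev_matrix.at['a', 'b'] close to 1, mev_matrix.at['a', 'c'] close to 0
    mev_matrix = mev(X)
    return mev_matrix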
def vif(X: pd.DataFrame) -> pd.Series:
    '''
    Returns the Variance Inflation Factor (VIF) for all variables in X

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes

    Returns
    -------
    pandas.Series
        VIF vector for all columns in X
    '''
    guards.not_dataframe(X, 'X')
    results = {}
    reg = LinearRegression()
    for col in X.columns.to_list():
        # Regress each column on all of the others and convert the resulting
        # R-squared into a VIF value
        others = X.drop(columns=col)
        r_sq = reg.fit(others, X[col]).score(others, X[col])
        results[col] = _vif(r_sq)
    return pd.Series(results, name='vif')
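# Illustrative usage sketch for `vif` (not part of the original module). The
# helper `_vif` is assumed to map an R-squared value to 1 / (1 - R-squared),
# which is the usual VIF definition; the data below is made up.
def _example_vif() -> pd.Series:
    rng = np.random.default_rng(1)
    x1 = rng.normal(size=300)
    x2 = rng.normal(size=300)
    X = pd.DataFrame({
        'x1': x1,
        'x2': x2,
        'x3': x1 + x2 + rng.normal(scale=0.05, size=300),  # nearly collinear
    })
    # Expected: every column shows a large VIF, since each one is almost a
    # linear combination of the other two
    return vif(X)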
def corr(self, df: pd.DataFrame) -> pd.DataFrame:
    '''
    Calculates a correlation matrix for df with the method described in the
    class docstring. It mimics the behavior of the `pandas.DataFrame.corr`
    method

    Parameters
    ----------
    df: pandas.DataFrame
        Matrix for which to calculate all pairwise correlations

    Returns
    -------
    matrix: pandas.DataFrame
        Square matrix corresponding to the correlation matrix of every
        variable (column) in df. It is a symmetric matrix with only 1s on its
        main diagonal
    '''
    guards.not_dataframe(df, 'df')
    cols = df.columns.to_list()
    matrix = pd.DataFrame({k: np.nan for k in cols}, index=cols)
    for i in cols:
        matrix.at[i, i] = 1
        for j in cols:
            # Compute each unordered pair once and mirror it across the diagonal
            if cols.index(i) < cols.index(j):
                r = self.assoc(df[i], df[j])[0]
                matrix.at[i, j] = r
                matrix.at[j, i] = r
    return matrix
def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        apply_cutoffs: t.Optional[bool] = True,
        ranking: t.Optional[t.Union[t.List[str], t.Tuple[str, ...]]] = None
) -> pd.DataFrame:
    '''
    Applies correlation-based feature selection to an attribute matrix X and
    a response vector y

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes whose columns are to be ranked according to
        their correlation with y
    y: pandas.Series
        Vector of response measures (aka dependent variable or target) to use
        for ranking features
    apply_cutoffs: bool, default True
        Whether to apply the cut-offs associated with the object during
        ranking. See the documentation on the class attributes for more
        information on these
    ranking: Union[List[str], Tuple[str, ...]], default None
        List or tuple of variable names that will be used as a ranking. When
        doing the filtering, this method will always drop the variable in the
        pair that comes AFTER the other one in this list. This means that the
        first item of this list should be the most important variable
        (ranked 1), the second should be the second most important variable
        (ranked 2) and so on. If no such list is provided, one is created
        using the `rank` method of this class

    Returns
    -------
    X_new: pandas.DataFrame
        Dataframe with the columns in X which were deemed not pathologically
        correlated with any of the others. It has the same number of rows as X
    '''
    guards.not_dataframe(X, 'X')
    guards.not_series(y, 'y')
    guards.not_iterable(ranking, 'ranking')
    if ranking is None:
        rk = (self.rank(X, y, apply_cutoffs)['rank']
              .sort_values().index.to_list())
    else:
        rk = ranking
    corr = self.corr(X).abs()
    # Tighten the correlation cut-off progressively, from the configured
    # cut-off down towards zero, until the stop condition is satisfied
    for r in np.sort(np.linspace(0, self.corr_cutoff))[::-1]:
        X_new = self.filter(X, corr, r, rk)
        if self._stop(X_new):
            return X_new
    # If no cut-off satisfied the stop condition, return the most aggressively
    # filtered matrix found
    return X_new
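# Illustrative usage sketch for `fit` (not part of the original module). The
# class name `CorrelationFilter` is a hypothetical stand-in for whichever
# selector class these methods belong to; the data is made up.
def _example_fit() -> pd.DataFrame:
    rng = np.random.default_rng(2)
    x1 = rng.normal(size=500)
    X = pd.DataFrame({
        'x1': x1,
        'x2': x1 + rng.normal(scale=0.01, size=500),  # nearly a copy of x1
        'x3': rng.normal(size=500),
    })
    y = pd.Series(3 * x1 + rng.normal(size=500), name='y')
    selector = CorrelationFilter()  # hypothetical constructor
    # Expected: one of the near-duplicate columns ('x1' or 'x2') is dropped
    return selector.fit(X, y)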
def rank(self,
         X: pd.DataFrame,
         y: pd.Series,
         apply_cutoffs: t.Optional[bool] = True) -> pd.DataFrame:
    '''
    Ranks attributes (columns) in X according to their correlation with the
    target y

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes whose columns are to be ranked according to
        their correlation with y
    y: pandas.Series
        Vector of response measures (aka dependent variable or target) to use
        for ranking features
    apply_cutoffs: bool, default True
        Whether to apply the cut-offs associated with the object during
        ranking. See the documentation on the class attributes for more
        information on these

    Returns
    -------
    df: pandas.DataFrame
        Dataframe containing 4 columns and n rows, where n is at most the
        number of columns in X. The first column is the association measure
        (`assoc`), the second is its associated p-value, the third is the
        negative base-10 logarithm of that p-value and the fourth is the
        ranking of the variable, where the number 1 indicates the best one,
        the number 2 the second-best and so on
    '''
    guards.not_dataframe(X, 'X')
    guards.not_series(y, 'y')
    data = {'assoc': [], 'pvalue': [], '-log10(pvalue)': []}
    cols = X.columns.to_list()
    for var in cols:
        r, pval = self.assoc(X[var], y)
        pval2 = -np.log10(pval)
        data['assoc'].append(r)
        data['pvalue'].append(pval)
        data['-log10(pvalue)'].append(pval2)
    df = pd.DataFrame(data, index=cols)
    if apply_cutoffs:
        # Keep variables that are statistically significant but whose
        # association is not so high that it looks too good to be true
        pval_filter = df['pvalue'] < self.pval_cutoff
        too_good_filter = df['assoc'].abs() < self.too_good_to_be_true
        df = df[pval_filter & too_good_filter]
    df['rank'] = df['assoc'].abs().rank(ascending=False, method='dense')
    return df
def filter(
        X: pd.DataFrame,
        corr: pd.DataFrame,
        cutoff: float,
        ranking: t.Union[t.List[str], t.Tuple[str, ...]]) -> pd.DataFrame:
    '''
    Applies the correlation cut-off to X, according to the correlations
    calculated in the correlation matrix corr. It uses an ordered list of
    feature names to start the cutting from the most important variables
    according to the ranking method. In this list, the first element is the
    most important variable (ranked 1), the second is the second most
    important variable (ranked 2) and so on

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes whose columns are to be filtered according to
        their correlation with each other
    corr: pandas.DataFrame
        Correlation matrix used for assessing each pairwise correlation of
        the columns of X. It should be a square matrix with dimension n x n,
        where n is the number of columns in X. Preferably, it should be
        symmetric and have only 1s on its main diagonal. For an example of
        this matrix, see the result of the pandas.DataFrame.corr method
    cutoff: float
        Correlation above which a pair of variables is considered to be
        pathologically correlated
    ranking: Union[List[str], Tuple[str, ...]]
        List or tuple of variable names that will be used as a ranking. When
        doing the filtering, this method will always drop the variable in the
        pair that comes AFTER the other one in this list. This means that the
        first item of this list should be the most important variable
        (ranked 1), the second should be the second most important variable
        (ranked 2) and so on

    Returns
    -------
    pandas.DataFrame:
        Dataframe with the columns in X which were deemed not pathologically
        correlated with any of the others. It has the same number of rows as X
    '''
    guards.not_dataframe(X, 'X')
    guards.not_dataframe(corr, 'corr')
    guards.not_iterable(ranking, 'ranking')
    keep = set(ranking)
    for i in ranking:
        # Drop every lower-ranked variable that is too correlated with i
        keep -= set([
            j for j in ranking
            if ranking.index(i) < ranking.index(j) and corr.at[i, j] > cutoff
        ])
    # Index with a list (pandas does not accept sets as indexers) while
    # preserving the original column order of X
    return X[[col for col in X.columns if col in keep]]
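# Illustrative usage sketch for `filter` (not part of the original module). It
# assumes `filter` is reachable as a plain function, as shown above without
# `self`; if it is actually a static method, call it through its class. The
# correlation matrix comes from pandas' own Pearson correlation and the data
# is made up.
def _example_filter() -> pd.DataFrame:
    rng = np.random.default_rng(3)
    x1 = rng.normal(size=100)
    X = pd.DataFrame({
        'x1': x1,
        'x2': x1 + rng.normal(scale=0.01, size=100),  # almost identical to x1
        'x3': rng.normal(size=100),
    })
    corr_matrix = X.corr().abs()
    # 'x1' outranks 'x2', so 'x2' is dropped, leaving 'x1' and 'x3'
    return filter(X, corr_matrix, cutoff=0.9, ranking=['x1', 'x2', 'x3'])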
def rank(self,
         X: pd.DataFrame,
         apply_cutoffs: t.Optional[bool] = False) -> pd.DataFrame:
    '''
    Creates a ranking of low variance variables, from the ones with the least
    variance to the ones with the most variance. Therefore, the variable with
    rank equal to 1 is the variable with the least variance among those in X

    Parameters
    ----------
    X: pandas.DataFrame
        Attribute matrix
    apply_cutoffs: bool, default False
        Whether to apply the cut-offs specified during the initialization of
        the object. It is recommended not to apply the cut-offs without first
        evaluating the low variance index (LVI), because variables with few
        values can still be useful as categorical features

    Returns
    -------
    df: pandas.DataFrame
        Dataframe containing 4 columns and n <= k rows, where k is the number
        of columns in X. The first column is the percentage of distinct
        values relative to the sample size (`pct_distinct`). The second
        column is the ratio between the frequencies of the two most frequent
        values (`freq_ratio`). The third is the low variance index (LVI),
        which is the logarithm (base 10) of the ratio between `freq_ratio`
        and `pct_distinct`. The fourth is the rank of the variable, where 1
        indicates the least variance
    '''
    guards.not_dataframe(X, 'X')
    df = pd.concat([self.pct_distinct(X), self.freq_ratio(X)], axis=1)
    if apply_cutoffs:
        pct_filter = df['pct_distinct'] < self.pct_distinct_cutoff
        freq_filter = df['freq_ratio'] > self.freq_ratio_cutoff
        df = df[pct_filter & freq_filter]
    df['lvi'] = np.log10(df['freq_ratio'] / df['pct_distinct'])
    df['rank'] = df['lvi'].rank(ascending=False, method='dense')
    return df
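# Illustrative sketch of the low variance index (LVI) computation performed by
# `rank` above (not part of the original module). It assumes the
# `pct_distinct` and `freq_ratio` helpers defined further below are reachable
# as plain functions; the data is made up.
def _example_lvi() -> pd.Series:
    X = pd.DataFrame({
        'near_constant': [0] * 98 + [1, 2],  # almost always the same value
        'spread_out': list(range(100)),      # all values distinct
    })
    lvi = np.log10(freq_ratio(X) / pct_distinct(X))
    # Expected: a much larger LVI for 'near_constant' than for 'spread_out'
    return lvi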
def _stop(self, X: pd.DataFrame) -> bool:
    '''
    Stop condition of the fit algorithm

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes being evaluated

    Returns
    -------
    bool:
        Whether the maximum VIF value for variables in X is less than the
        cut-off specified when instantiating the object
    '''
    guards.not_dataframe(X, 'X')
    return metrics.vif(X).max() < self.vif_cutoff
def _stop(self, X: pd.DataFrame) -> bool:
    '''
    Stop condition of the fit algorithm

    Parameters
    ----------
    X: pandas.DataFrame
        Matrix of attributes being evaluated

    Returns
    -------
    bool:
        Whether the maximum absolute pairwise correlation in X is less than
        the correlation cutoff specified when instantiating the object
    '''
    guards.not_dataframe(X, 'X')
    corr_ = self.corr(X).abs()
    # Ignore the main diagonal (always 1) so that only correlations between
    # distinct variables are compared against the cutoff
    mask = ~np.eye(len(corr_), dtype=bool)
    return corr_.where(mask).max().max() < self.corr_cutoff
def pct_distinct(X: pd.DataFrame) -> pd.Series:
    '''
    Calculates the percentage of distinct values relative to the sample size
    (number of rows) for every column in X

    Parameters
    ----------
    X: pandas.DataFrame
        Dataframe to evaluate

    Returns
    -------
    pct: pandas.Series
        Series of resulting values
    '''
    guards.not_dataframe(X, 'X')
    pct = X.apply(lambda sr: len(sr.value_counts()) / X.shape[0])
    pct.name = 'pct_distinct'
    return pct
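# Illustrative usage sketch for `pct_distinct` (not part of the original
# module); the data is made up.
def _example_pct_distinct() -> pd.Series:
    X = pd.DataFrame({'a': [1, 1, 2, 3], 'b': [5, 5, 5, 5]})
    # Expected: 3 distinct values out of 4 rows for 'a' (0.75) and 1 out of 4
    # for 'b' (0.25)
    return pct_distinct(X)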
def nan_pct(df: pd.DataFrame, ascending: t.Optional[bool] = False) -> pd.Series:
    '''
    Shows feature names and their corresponding percentage of missing values

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe with variable values
    ascending: bool, default False
        Whether to sort values in ascending order of percentage of NaNs

    Returns
    -------
    pandas.Series:
        Series of variable names and their corresponding percentage of NaNs
    '''
    guards.not_dataframe(df, 'df')
    # Only columns with at least one missing value are reported
    with_nan = [var for var in df.columns if df[var].isna().sum() > 0]
    return df[with_nan].isna().mean().sort_values(ascending=ascending)
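# Illustrative usage sketch for `nan_pct` (not part of the original module);
# the data is made up.
def _example_nan_pct() -> pd.Series:
    df = pd.DataFrame({
        'complete': [1.0, 2.0, 3.0, 4.0],
        'half_missing': [1.0, float('nan'), 3.0, float('nan')],
    })
    # Expected: only 'half_missing' is reported, with a value of 0.5
    return nan_pct(df)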
def freq_ratio(X: pd.DataFrame) -> pd.Series:
    '''
    Calculates the ratio between the frequencies of the two most frequent
    values of every column in X

    Parameters
    ----------
    X: pandas.DataFrame
        Dataframe to evaluate

    Returns
    -------
    pandas.Series
        Series of resulting values
    '''
    guards.not_dataframe(X, 'X')
    freq_ratio_ = {}
    for var in X.columns.to_list():
        # value_counts is sorted by frequency, so the first two entries hold
        # the counts of the two most frequent values (columns are assumed to
        # have at least two distinct values)
        vcount = X[var].value_counts()
        freq_ratio_[var] = vcount.iloc[0] / vcount.iloc[1]
    return pd.Series(freq_ratio_, name='freq_ratio')
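# Illustrative usage sketch for `freq_ratio` (not part of the original
# module); the data is made up.
def _example_freq_ratio() -> pd.Series:
    X = pd.DataFrame({
        'skewed': ['a', 'a', 'a', 'a', 'b', 'c'],    # 'a' appears 4x, 'b' 1x
        'balanced': ['x', 'y', 'x', 'y', 'x', 'y'],  # 'x' and 'y' appear 3x each
    })
    # Expected: 4.0 for 'skewed' (4 / 1) and 1.0 for 'balanced' (3 / 3)
    return freq_ratio(X)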
def deletion_diagnostics(
        data: pd.DataFrame,
        y_col: str,
        base_metric: float,
        learner: t.Any,
        fit: t.Callable[[t.Any, ta.Matrix, t.Optional[ta.Vector]], t.Any],
        predict: t.Callable[[t.Any, ta.Matrix], ta.Vector],
        aggfunc: t.Callable[[ta.Vector], float],
        deviation: t.Optional[str] = 'arithmetic') -> pd.Series:
    '''
    Performs deletion diagnostics for the specified model. It refits the
    model once for each observation left out and computes the deviation of
    the aggregated predicted values from the base metric

    Parameters
    ----------
    data: pandas.DataFrame
        Information to be used for learning
    y_col: str
        Name of the column in `data` containing the target supervising the
        model
    base_metric: float
        Value of the diagnostics measure before the deletion exercises. Used
        for measuring changes
    learner: Any
        Any object corresponding to a learner. It must be already initialized
        with the desired hyperparameter values
    fit: Callable[[learner, X, y], learner]
        Fits the learner to the data. It must return the fitted learner
    predict: Callable[[learner, X], Union[pandas.Series, numpy.array]]
        Uses the learner for making predictions. The first argument must be
        the learner, assumed fitted and ready to predict
    aggfunc: Callable[[Union[pandas.Series, numpy.array]], float]
        Function used to aggregate the predictions obtained after each
        deletion
    deviation: str, default 'arithmetic'
        Type of deviation to use for calculating diagnostics. If 'arithmetic'
        (default), the difference is calculated. If 'multiplicative', the
        ratio is calculated. It must be one of these two

    Returns
    -------
    pandas.Series
        Influence value for each index in `data`, measuring how much the
        aggregated predictions deviate from `base_metric` when that
        observation is deleted
    '''
    guards.not_dataframe(data, 'data')
    guards.not_callable(fit, 'fit')
    guards.not_callable(predict, 'predict')
    guards.not_in_supported_values(deviation, ['arithmetic', 'multiplicative'])
    influences = {}
    for idx in data.index:
        # Refit the model without observation idx and aggregate its
        # predictions on the remaining data
        X = data.drop(index=idx, columns=y_col)
        y = data.drop(index=idx)[y_col]
        fitted = fit(learner, X, y)
        y_hat = predict(fitted, X)
        after = aggfunc(y_hat)
        influences[idx] = _delta_metric(after, base_metric, deviation)
    return pd.Series(influences)
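# Illustrative usage sketch for `deletion_diagnostics` (not part of the
# original module). It wires scikit-learn's LinearRegression in through the
# `fit` and `predict` callables; the data and the choice of np.mean as both
# the base metric and the aggregation function are made up.
def _example_deletion_diagnostics() -> pd.Series:
    from sklearn.linear_model import LinearRegression  # local import keeps the sketch self-contained

    rng = np.random.default_rng(4)
    x = rng.normal(size=30)
    data = pd.DataFrame({'x': x, 'y': 2 * x + rng.normal(scale=0.5, size=30)})
    data.loc[0, 'y'] = 50  # an outlier whose deletion should stand out

    base = float(np.mean(data['y']))
    influences = deletion_diagnostics(
        data=data,
        y_col='y',
        base_metric=base,
        learner=LinearRegression(),
        fit=lambda learner, X, y: learner.fit(X, y),
        predict=lambda learner, X: learner.predict(X),
        aggfunc=np.mean,
    )
    # Expected: the outlier row shows the largest absolute influence
    return influences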