def proportion_of_correlated_features_target(X: numpy.array,
                                             y: numpy.array = None,
                                             method: str = 'spearman',
                                             coef: float = 0.5) -> float:
    """
    Proportion of features correlated with target.

    A feature counts as correlated when the absolute value of its
    correlation with ``y`` is strictly greater than ``coef``.

    Parameters
    ----------
    X : numpy.array
        2d-array (or pandas.DataFrame) with features columns.
    y : numpy.array
        Array of response values, one per row of ``X``. Required: the
        ``None`` default is kept only for signature compatibility.
    method : {'pearson', 'kendall', 'spearman'} or callable
        * pearson : standard correlation coefficient
        * kendall : Kendall Tau correlation coefficient
        * spearman : Spearman rank correlation
        * callable: callable with input two 1d ndarrays
          and returning a float.
    coef : float, optional (default=0.5)
        Threshold on the absolute correlation.

    Return
    ------
    float:
        Number of features correlated to target over total number of
        features.

    Raises
    ------
    ValueError
        If ``y`` is None, or ``method`` is neither a supported name nor
        a callable.
    """
    if y is None:
        raise ValueError("y is required to compute feature/target correlation")

    # Work on a DataFrame so that columns can be visited uniformly.
    X = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X)
    y = pandas.Series(y)

    if callable(method):
        target = numpy.asarray(y)

        def strength(column):
            # User callables take two 1d ndarrays and return the coefficient.
            return abs(method(numpy.asarray(column), target))
    else:
        # Local import: only this block needs kendalltau.
        from scipy.stats import kendalltau

        estimators = {
            'spearman': spearmanr,
            'pearson': pearsonr,
            'kendall': kendalltau,
        }
        rho = estimators.get(method) if isinstance(method, str) else None
        if rho is None:
            raise ValueError(
                "method must be 'pearson', 'kendall', 'spearman' or a "
                "callable, got {!r}".format(method)
            )

        def strength(column):
            # Every scipy result is tuple-like with the statistic first;
            # indexing also works where pearsonr returns a plain tuple.
            return abs(rho(column, y)[0])

    # NaN correlations (e.g. constant columns) compare False, i.e. they
    # are counted as not correlated.
    is_corr = X.apply(strength) > coef
    return is_corr.sum() / X.shape[1]
def mean_feature_correlation_target(X: numpy.array,
                                    y: numpy.array,
                                    method: str = 'spearman') -> float:
    """
    Average feature correlation to the output.

    The absolute correlation between every feature column and ``y`` is
    computed, then averaged over the features.

    Parameters
    ----------
    X : numpy.array
        2d-array (or pandas.DataFrame) with features columns.
    y : numpy.array
        Array of response values, one per row of ``X``.
    method : {'pearson', 'kendall', 'spearman'} or callable
        * pearson : standard correlation coefficient
        * kendall : Kendall Tau correlation coefficient
        * spearman : Spearman rank correlation
        * callable: callable with input two 1d ndarrays
          and returning a float.

    Return
    ------
    float:
        Average feature correlation to the output.

    Raises
    ------
    ValueError
        If ``method`` is neither a supported name nor a callable.
    """
    # Work on a DataFrame so that columns can be visited uniformly.
    X = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X)
    y = pandas.Series(y)

    if callable(method):
        target = numpy.asarray(y)

        def strength(column):
            # User callables take two 1d ndarrays and return the coefficient.
            return abs(method(numpy.asarray(column), target))
    else:
        # Local import: only this block needs kendalltau.
        from scipy.stats import kendalltau

        estimators = {
            'spearman': spearmanr,
            'pearson': pearsonr,
            'kendall': kendalltau,
        }
        rho = estimators.get(method) if isinstance(method, str) else None
        if rho is None:
            raise ValueError(
                "method must be 'pearson', 'kendall', 'spearman' or a "
                "callable, got {!r}".format(method)
            )

        def strength(column):
            # Every scipy result is tuple-like with the statistic first;
            # indexing also works where pearsonr returns a plain tuple.
            return abs(rho(column, y)[0])

    # Series.mean skips NaN by default, so features whose correlation is
    # undefined do not enter the average.
    return X.apply(strength).mean()
def proportion_of_binary_features(X: numpy.array, y: numpy.array = None) -> float:
    """
    Share of feature columns that hold exactly two distinct values.

    Parameters
    ----------
    X : numpy.array
        2d-array (or pandas.DataFrame) with features columns.
    y : numpy.array
        Array of response values. Not used by this function.

    Return
    ------
    float:
        Number of binary features over total number of features.
    """
    # Anything that is not already a DataFrame gets wrapped in one.
    frame = pandas.DataFrame(X) if not isinstance(X, pandas.DataFrame) else X

    # Walk the columns positionally and count those with two unique values.
    binary_count = sum(
        1 for _, column in frame.items() if len(numpy.unique(column)) == 2
    )
    return binary_count / frame.shape[1]
def end_tokens(texts: np.ndarray, end_token: str = '\n'):
    r"""
    Append a token (default '\n') to the end of each text.

    Parameters
    ----------
    texts : pandas.Series, numpy.ndarray or sequence of str
        One-dimensional collection of texts.
    end_token : str, optional (default='\n')
        String appended to every text.

    Return
    ------
    Same kind of container as the input for objects exposing ``apply``
    (e.g. a pandas Series, index preserved); otherwise a 1d numpy object
    array with the extended texts.
    """
    if hasattr(texts, 'apply'):
        # pandas-like input: delegate so that index and name are preserved.
        return texts.apply(lambda text: text + end_token)

    # numpy arrays and plain sequences have no ``apply``; build the result
    # element-wise. dtype=object avoids fixed-width string truncation and
    # keeps an empty input from becoming a float array.
    return np.array([text + end_token for text in texts], dtype=object)