Ejemplo n.º 1
0
def proportion_of_correlated_features_target(X: numpy.ndarray,
                                             y: numpy.ndarray = None,
                                             method: str = 'spearman',
                                             coef: float = 0.5) -> float:
    '''
    Proportion of features correlated with target.

    A feature counts as correlated when the absolute value of its
    correlation coefficient with ``y`` is strictly greater than ``coef``.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        2d-array with features columns.
    y : numpy.ndarray
        Array of response values. Required; the ``None`` default is kept
        only for signature compatibility.
    method : {'pearson', 'kendall', 'spearman'} or callable
        * pearson : standard correlation coefficient
        * kendall : Kendall Tau correlation coefficient
        * spearman : Spearman rank correlation
        * callable: callable with input two 1d ndarrays (feature column,
          target) and returning a float.
    coef : float, optional (default=0.5)
        Threshold on the absolute correlation above which a feature is
        considered correlated.

    Returns
    -------
    float:
        Number of features correlated to target over total number of
        features, or NaN when ``X`` has no feature columns.

    Raises
    ------
    ValueError
        If ``y`` is None or ``method`` is not a supported name or callable.
    '''
    if y is None:
        raise ValueError('y is required to compute the correlation with the target')

    # check if is dataframe
    X = (X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X))
    y = pandas.Series(y)

    n_features = X.shape[1]
    if n_features == 0:
        # The proportion is undefined without features.
        return float('nan')

    if callable(method):
        target_values = y.to_numpy()

        def rho(feature):
            return method(feature.to_numpy(), target_values)
    else:
        if method == 'spearman':
            statistic = spearmanr
        elif method == 'pearson':
            statistic = pearsonr
        elif method == 'kendall':
            # Imported locally so the block does not depend on a new
            # module-level import.
            from scipy.stats import kendalltau
            statistic = kendalltau
        else:
            raise ValueError(
                "method must be 'pearson', 'kendall', 'spearman' or a "
                "callable, got {!r}".format(method))

        def rho(feature):
            # Every SciPy result is tuple-like (coefficient, p-value);
            # indexing works on both old plain tuples and new result objects.
            return statistic(feature, y)[0]

    is_corr = X.apply(lambda feature: abs(rho(feature))) > coef

    return is_corr.sum() / n_features
Ejemplo n.º 2
0
def mean_feature_correlation_target(X: numpy.ndarray, y: numpy.ndarray, method: str='spearman') -> float:
    """
    Average feature correlation to the output.

    The absolute correlation coefficient between each feature column and
    ``y`` is computed, and the mean over all features is returned.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        2d-array with features columns.
    y : numpy.ndarray
        Array of response values.
    method : {'pearson', 'kendall', 'spearman'} or callable
        * pearson : standard correlation coefficient
        * kendall : Kendall Tau correlation coefficient
        * spearman : Spearman rank correlation
        * callable: callable with input two 1d ndarrays (feature column,
          target) and returning a float.

    Returns
    -------
    float:
        Average absolute feature correlation to the output, or NaN when
        ``X`` has no feature columns.

    Raises
    ------
    ValueError
        If ``method`` is not a supported name or callable.
    """
    # check if not dataframe
    X = (pandas.DataFrame(X) if not isinstance(X, pandas.DataFrame) else X)

    y = pandas.Series(y)

    if X.shape[1] == 0:
        # The mean over zero features is undefined.
        return float('nan')

    if callable(method):
        target_values = y.to_numpy()

        def rho(feature):
            return method(feature.to_numpy(), target_values)
    else:
        if method == 'spearman':
            statistic = spearmanr
        elif method == 'pearson':
            statistic = pearsonr
        elif method == 'kendall':
            # Imported locally so the block does not depend on a new
            # module-level import.
            from scipy.stats import kendalltau
            statistic = kendalltau
        else:
            raise ValueError(
                "method must be 'pearson', 'kendall', 'spearman' or a "
                "callable, got {!r}".format(method))

        def rho(feature):
            # Every SciPy result is tuple-like (coefficient, p-value);
            # indexing works on both old plain tuples and new result objects.
            return statistic(feature, y)[0]

    return X.apply(lambda feature: abs(rho(feature))).mean()
Ejemplo n.º 3
0
def proportion_of_binary_features(X: numpy.array,
                                  y: numpy.array = None) -> float:
    '''
    Fraction of feature columns that hold exactly two distinct values.

    Parameters
    ----------
    X : numpy.array
        2d-array with features columns (a pandas.DataFrame is used as is).
    y : numpy.array
        Array of response values. Not used by this function.

    Return
    ------
    float:
        Count of columns with exactly two unique values divided by the
        total number of columns.
    '''
    # Wrap anything that is not already a DataFrame so columns can be
    # processed uniformly.
    if not isinstance(X, pandas.DataFrame):
        X = pandas.DataFrame(X)

    # Number of distinct values found in each column.
    distinct_counts = X.apply(lambda column: numpy.unique(column).size)
    binary_flags = distinct_counts == 2
    n_features = len(X.columns)

    return sum(binary_flags) / n_features
Ejemplo n.º 4
0
def end_tokens(texts: 'pandas.Series | np.ndarray', end_token: str = '\n') -> 'pandas.Series | np.ndarray':
    """
    Append a token (default: a newline) to the end of each text.

    Parameters
    ----------
    texts : pandas.Series, numpy.ndarray or sequence of str
        Collection of texts to terminate.
    end_token : str, optional (default=newline)
        String appended to every text.

    Returns
    -------
    pandas.Series or numpy.ndarray
        For inputs exposing ``apply`` (e.g. a pandas Series), the result of
        that ``apply`` call. Otherwise a NumPy string array with the same
        shape as the input.
    """
    # Objects with an ``apply`` method (e.g. pandas Series) keep the
    # original element-wise path.
    if hasattr(texts, 'apply'):
        return texts.apply(lambda text: text + end_token)

    # NumPy arrays and plain sequences have no ``apply``; use NumPy's
    # vectorised string concatenation instead of raising AttributeError.
    return np.char.add(np.asarray(texts, dtype=str), end_token)