def diagnoseForFeatureSelection(X, y, dirResultsMachineLearning,
                                diagnoseModels):
    '''
    Run the diagnostic mode of the requested feature-selection methods
    and save the figures they return into the result directory

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable
    dirResultsMachineLearning : TYPE str
        DESCRIPTION. directory passed to correlationMatrix and used to save
        the diagnostic figures (one '<figure name>.png' per figure)
    diagnoseModels : TYPE list of str
        DESCRIPTION. methods to diagnose; the recognised values are
        'correlation', 'variance', 'lasso', 'PCA' and 'forward stepwise'

    Returns
    -------
    TYPE bool
        DESCRIPTION. always True

    '''
    import os  # local import: only needed to build the figure paths

    def _saveFigures(figures):
        # the selectors return a dictionary {figure name: matplotlib figure};
        # the dataframes some of them also return are not persisted here
        for figName, fig in figures.items():
            fig.savefig(
                os.path.join(dirResultsMachineLearning, figName + '.png'))

    Q = dummyColumns(X)
    Y = pd.concat([Q, y], axis=1)
    plt.close('all')

    # analysis of the correlation among the variables
    correlationMatrix(Y, dirResultsMachineLearning, annotationCell=False)

    # analysis of correlation with the target variable
    if 'correlation' in diagnoseModels:
        _, figures = selectByCorrelation(X, y, 1, diagnose=True)
        _saveFigures(figures)

    # analysis of the variance of the variables
    if 'variance' in diagnoseModels:
        _, figures = selectByVariance(X, 1, diagnose=True)
        _saveFigures(figures)

    # analysis of the selection of the variables by lasso
    if 'lasso' in diagnoseModels:
        _, figures = selectByLassoL1(X, y, 1, diagnose=True)
        _saveFigures(figures)

    # check whether the variance can be expressed by a subset of
    # principal components
    if 'PCA' in diagnoseModels:
        _, figures, _ = PCAplot(len(Q.columns), Q, diagnose=True)
        _saveFigures(figures)

    # build the forward stepwise curve
    if 'forward stepwise' in diagnoseModels:
        _, figures, _ = selectByForwardStepwiseSelection(
            X, y, len(X.columns))
        _saveFigures(figures)

    return True
def selectByTree(X, y):
    '''
    Select the features of the input dataframe X
    using a fitted extra-trees classifier and SelectFromModel

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable

    Returns
    -------
    res : TYPE pandas dataframe
        DESCRIPTION. output dataframe with selected variables

    '''
    # the attributes go through dummyColumns before the classifier is fitted
    encoded = dummyColumns(X)

    forest = ExtraTreesClassifier(n_estimators=50)
    forest.fit(encoded, y)

    # the classifier is already fitted, so the selector is used as prefit
    selector = SelectFromModel(forest, prefit=True)
    reduced = selector.transform(encoded)
    keptColumns = encoded.columns[selector.get_support()]

    return pd.DataFrame(reduced, columns=keptColumns)
# Example #3
# 0
def BootstrapLoop(nboot, model, X, y):
    '''
    Estimate the mean squared error of a model by bootstrap: at each
    repetition the model is fitted on a sample drawn with replacement and
    scored on the rows that were not drawn

    Parameters
    ----------
    nboot : TYPE int
        DESCRIPTION. number of bootstrap repetitions
    model : TYPE estimator
        DESCRIPTION. object exposing fit(X, y) and predict(X)
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe, pandas series or array
        DESCRIPTION. target variable, aligned by position with the rows of X

    Returns
    -------
    scores_stat : TYPE pandas dataframe
        DESCRIPTION. output of describe() on the "MSE" column
        (with the 1%, 5%, 10%, 50%, 95% and 99% percentiles)

    '''

    def _takeRows(data, positions):
        # positional selection, consistent with X.iloc: plain y[positions]
        # would select by label on a series and by column on a dataframe
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    X = dummyColumns(X)  # convert any categorical variable

    scores_names = ["MSE"]
    scores_boot = np.zeros((nboot, len(scores_names)))
    orig_all = np.arange(X.shape[0])
    for boot_i in range(nboot):
        boot_tr = np.random.choice(orig_all, size=len(orig_all), replace=True)
        # rows never drawn in this repetition are used as test set
        boot_te = np.setdiff1d(orig_all, boot_tr, assume_unique=False)
        if boot_te.size == 0:
            # every row was drawn: nothing to score on, leave a missing value
            scores_boot[boot_i, :] = np.nan
            continue
        Xtr, ytr = X.iloc[boot_tr, :], _takeRows(y, boot_tr)
        Xte, yte = X.iloc[boot_te, :], _takeRows(y, boot_te)
        model.fit(Xtr, ytr)
        y_pred = model.predict(Xte).ravel()
        scores_boot[boot_i, :] = metrics.mean_squared_error(yte, y_pred)

    # summary statistics of the bootstrap scores
    scores_boot = pd.DataFrame(scores_boot, columns=scores_names)
    scores_stat = scores_boot.describe(
        percentiles=[.99, .95, .5, .1, .05, 0.01])
    return scores_stat
def selectByCorrelation(X, y, corrThreshold, diagnose=False):
    '''
    Select the features of the input dataframe X
    based on the correlation with y

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable (its first column is used as target)
    corrThreshold : TYPE float
        DESCRIPTION. correlation threshold 0->1; a feature is kept when its absolute correlation with the target is strictly greater than this value
    diagnose : TYPE, optional boolean
        DESCRIPTION. The default is False. if true generates the chart of the number of selected features for each threshold from 1% to 100%

    Returns
    -------
    Z : TYPE pandas dataframe
        DESCRIPTION. dataframe with only the features above the correlation threshold
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary containing the figure 'CorrelationChart' (if diagnose is true)

    '''

    output_figure = {}
    targetVariable = y.columns[0]
    Q = pd.concat([dummyColumns(X), y], axis=1)

    # absolute correlation of each column with the target; the entry of the
    # target with itself (always 1) is dropped so that it is never counted
    # nor selected as a feature
    cor_target = Q.corr()[targetVariable].abs().drop(labels=targetVariable)

    if diagnose:
        # count the number of selected features for each value of correlation
        numFeatSelected = [
            int((cor_target > i / 100).sum()) for i in range(1, 101)
        ]
        fig1 = plt.figure()
        plt.plot(range(1, 101), numFeatSelected)
        plt.title('Correlation graph')
        plt.xlabel('Corr threshold %')
        plt.ylabel('Num. selected features')
        output_figure['CorrelationChart'] = fig1
        plt.close('all')

    # selecting highly correlated features
    relevant_features = list(
        cor_target[cor_target > corrThreshold].index.values)
    Z = Q.loc[:, relevant_features]
    return Z, output_figure
def selectByVariance(X, perc, diagnose=False):
    '''
    Select the features of the input dataframe X
    based on their variance

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    perc : TYPE float
        DESCRIPTION. variance threshold; only the features with a variance greater than this value are kept
    diagnose : TYPE, optional boolean
        DESCRIPTION. The default is False. if true render the plot of the number of selected features for each threshold from 0.01 to 1

    Returns
    -------
    res : TYPE pandas dataframe
        DESCRIPTION. output dataframe with selected variables
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary containing the figure 'VarianceChart' (if diagnose is true)

    Raises
    ------
    ValueError
        raised by VarianceThreshold when no feature has a variance above perc

    '''

    output_figure = {}
    Q = dummyColumns(X)
    if diagnose:
        # The variances are computed once and compared with each threshold.
        # Fitting a VarianceThreshold per threshold gives the same counts but
        # raises as soon as no feature is above the threshold, which made the
        # diagnosis fail on low-variance data (e.g. only 0/1 columns).
        variances = np.nanvar(np.asarray(Q, dtype=float), axis=0)
        numFeatSelected = [
            int(np.sum(variances > i / 100)) for i in range(1, 101)
        ]
        fig1 = plt.figure()
        plt.plot(range(1, 101), numFeatSelected)
        plt.title('Variance graph')
        plt.xlabel('Variance threshold %')
        plt.ylabel('Num. selected features')
        output_figure['VarianceChart'] = fig1
        plt.close('all')

    sel = VarianceThreshold(threshold=(perc))
    Z = sel.fit_transform(Q)
    feature_idx = sel.get_support()
    feature_name = Q.columns[feature_idx]
    res = pd.DataFrame(Z, columns=feature_name)
    return res, output_figure
def lassoPath(XX, y):
    '''
    generate the path of the lasso regression using different alphas

    Parameters
    ----------
    XX : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable

    Returns
    -------
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary with the figure 'LassoPath'

    '''

    output_figure = {}
    X = dummyColumns(XX)

    # one curve is drawn per feature, so the legend labels are the feature names
    columnNames = list(X.columns)

    alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]

    # coefficients of a lasso without intercept for each regularization value
    coefs = []
    for a in alphas:
        lasso = linear_model.Lasso(alpha=a, fit_intercept=False)
        lasso.fit(X, y)
        coefs.append(lasso.coef_)

    # Display results
    fig1 = plt.figure(figsize=(25, 10))
    ax = plt.gca()

    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('alpha')
    plt.ylabel('weights')
    plt.title('Lasso coefficients as a function of the regularization')
    plt.axis('tight')
    # the legend must be attached before the figure is shown, otherwise it
    # is missing from the displayed chart
    ax.legend(columnNames)
    plt.show()
    output_figure['LassoPath'] = fig1
    plt.close('all')
    return output_figure
def selectPreprocessFeature(X, y, value, model):
    '''
    Apply the feature selection method identified by model

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable
    value : TYPE float or int
        DESCRIPTION. parameter of the method (if any): the minimum threshold for 'correlation', 'variance' and 'lasso', the number of features to select for 'forward stepwise', the number of principal components for 'PCA'
    model : TYPE str
        DESCRIPTION. one of 'correlation', 'variance', 'lasso', 'tree', 'forward stepwise', 'PCA'; any other value applies no selection

    Returns
    -------
    TYPE depends on the method
        DESCRIPTION. the output of the selected function, returned as it is:
        (dataframe, figures) for 'correlation', 'variance' and 'lasso',
        a dataframe for 'tree',
        (dataframe, figures, dataframes) for 'forward stepwise' and 'PCA',
        the output of dummyColumns(X) when no method matches

    '''
    if model == 'correlation':
        return selectByCorrelation(X, y, value, False)
    elif model == 'variance':
        return selectByVariance(X, value, False)
    elif model == 'lasso':
        return selectByLassoL1(X, y, value, False)
    elif model == 'tree':
        return selectByTree(X, y)
    elif model == 'forward stepwise':
        # selectByForwardStepwiseSelection has no diagnose argument
        return selectByForwardStepwiseSelection(X, y, value)
    elif model == 'PCA':
        return PCAplot(value, X, False)
    else:  #if no feature selection, the dataset is only returned as numerical (dummy) to code strings
        return dummyColumns(X)
def PCAplot(n_comp, XX, diagnose=False):
    '''
    Apply a principal component analysis to the scaled attributes

    Parameters
    ----------
    n_comp : TYPE int
        DESCRIPTION. number of components of the PCA
    XX : TYPE pandas dataframe
        DESCRIPTION. dataframe to build the PCA on
    diagnose : TYPE, optional boolean
        DESCRIPTION. The default is False. if true returns the coefficients of the components and the plot of the cumulative percentage of variance explained; if in addition n_comp is 2 the 2-dim scatter of the components is returned too

    Returns
    -------
    X_res : TYPE pandas dataframe
        DESCRIPTION. dataframe with the values of the principal components
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary of figures ('PCA_varianceExplanation', 'PCA_plot')
    output_df : TYPE dictionary
        DESCRIPTION. dictionary of dataframes ('PCA': coefficients of the components)

    '''

    output_figure = {}
    output_df = {}

    D_Table = dummyColumns(XX)
    # scale the columns, then apply the PCA
    data_scaled = pd.DataFrame(preprocessing.scale(D_Table),
                               columns=D_Table.columns)
    pca = PCA(n_components=n_comp)
    PC = pca.fit_transform(data_scaled)

    if diagnose:
        # coefficients of the original variables in each component
        components = pd.DataFrame(pca.components_, columns=data_scaled.columns)
        output_df['PCA'] = components

        # cumulative percentage of variance explained
        var = np.cumsum(
            np.round(pca.explained_variance_ratio_, decimals=3) * 100)

        # plot of the variance explanation
        fig = plt.figure(num=None,
                         figsize=(10, 8),
                         dpi=80,
                         facecolor='w',
                         edgecolor='k')
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis')
        plt.ylim(30, 100.5)
        plt.plot(np.arange(1, len(var) + 1), var)
        output_figure['PCA_varianceExplanation'] = fig

        # plot of the components: only possible with exactly two of them
        if n_comp == 2:
            fig1 = plt.figure(num=None,
                              figsize=(10, 8),
                              dpi=80,
                              facecolor='w',
                              edgecolor='k')
            plt.scatter(PC[:, 0], PC[:, 1], color='orange')
            plt.xlabel("PC1 (var=%.2f)" % pca.explained_variance_ratio_[0])
            plt.ylabel("PC2 (var=%.2f)" % pca.explained_variance_ratio_[1])
            plt.axis('equal')
            plt.tight_layout()
            plt.title('Principal Component plot')
            output_figure['PCA_plot'] = fig1

        # close the figures whatever the number of components is, so the
        # variance figure is not left open when n_comp is not 2
        plt.close('all')
    return pd.DataFrame(PC), output_figure, output_df
def selectByForwardStepwiseSelection(XX, y, n_feat):
    '''
    select features using forward stepwise selection

    Parameters
    ----------
    XX : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable
    n_feat : TYPE int
        DESCRIPTION. number of features to select, from 1 to the number of columns returned by dummyColumns(XX)

    Returns
    -------
    X_res : TYPE pandas dataframe
        DESCRIPTION. dataframe with selected features
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary of figures ('ForwardStepwiseSelection')
    output_df : TYPE dictionary
        DESCRIPTION. dictionary of dataframes ('ForwardStepwiseSelection', 'ForwardStepwiseSelection_min')

    Raises
    ------
    ValueError
        if at some step no remaining feature gives a comparable RSS
    KeyError
        if n_feat is outside the number of steps performed

    '''

    output_figure = {}
    output_df = {}

    # convert any categorical variable
    X = dummyColumns(XX)
    k = len(X.columns)

    columnNames = ['Features', 'RSS', 'R_squared']

    remaining_features = list(X.columns.values)
    features = []
    # the leading np.inf entries stand for the step with no features
    numb_features = [np.inf]
    RSS_list, R_squared_list = [np.inf], [np.inf]
    rows = []

    for _ in range(k):
        best_RSS = np.inf
        best_R_squared = None
        best_feature = None

        # try each remaining feature together with the ones already selected
        for candidate in remaining_features:
            # fit_linear_reg result: position 0 is the RSS, position 1 the R squared
            RSS = fit_linear_reg(X[[candidate] + features], y)

            if RSS[0] < best_RSS:
                best_RSS = RSS[0]
                best_R_squared = RSS[1]
                best_feature = candidate

        if best_feature is None:
            # e.g. every candidate returned a NaN or infinite RSS
            raise ValueError(
                'forward stepwise selection: no remaining feature improves the RSS'
            )

        # updating variables for next loop
        features.append(best_feature)
        remaining_features.remove(best_feature)

        # saving values for plotting
        RSS_list.append(best_RSS)
        R_squared_list.append(best_R_squared)
        numb_features.append(len(features))

        # one row per step: selected features, rounded RSS and R squared
        rows.append(
            pd.DataFrame([[
                features.copy(), [round(best_RSS, 3)],
                [round(best_R_squared, 3)]
            ]],
                         columns=columnNames))

    # DataFrame.append does not exist any more in pandas 2: the rows are
    # concatenated once (each row keeps index 0, as it did with append)
    if rows:
        resultFSs = pd.concat(rows)
    else:
        resultFSs = pd.DataFrame(columns=columnNames)

    # one 0/1 column per feature telling at which steps it is selected
    s = resultFSs['Features']
    mlb = preprocessing.MultiLabelBinarizer()
    aaa = pd.DataFrame(mlb.fit_transform(s),
                       columns=mlb.classes_,
                       index=resultFSs.index)
    bbb = pd.concat([resultFSs, aaa], axis=1)

    output_df['ForwardStepwiseSelection'] = bbb

    # store in DataFrame
    df = pd.DataFrame({
        'numb_features': numb_features,
        'RSS': RSS_list,
        'R_squared': R_squared_list
    })
    byNumFeatures = df.groupby('numb_features')
    df_min = df[byNumFeatures['RSS'].transform('min') == df['RSS']]

    output_df['ForwardStepwiseSelection_min'] = df_min

    df['min_RSS'] = byNumFeatures['RSS'].transform('min')
    df['max_R_squared'] = byNumFeatures['R_squared'].transform('max')

    fig = plt.figure(figsize=(16, 6))
    ax = fig.add_subplot(1, 2, 1)

    ax.scatter(df.numb_features, df.RSS, alpha=.2, color='darkblue')
    ax.set_xlabel('# Features')
    ax.set_ylabel('RSS')
    ax.set_title('RSS - Forward Stepwise selection')
    ax.plot(df.numb_features, df.min_RSS, color='r', label='Best subset')
    ax.legend()

    ax = fig.add_subplot(1, 2, 2)
    ax.scatter(df.numb_features, df.R_squared, alpha=.2, color='darkblue')
    ax.plot(df.numb_features, df.max_R_squared, color='r', label='Best subset')
    ax.set_xlabel('# Features')
    ax.set_ylabel('R squared')
    ax.set_title('R_squared - Forward Stepwise selection')
    ax.legend()
    plt.show()
    output_figure['ForwardStepwiseSelection'] = fig
    plt.close('all')

    # after reset_index the step with n features is at position n - 1
    result = resultFSs.reset_index()
    selected_features = result.Features[n_feat - 1]

    X_res = X.loc[:, selected_features]
    return X_res, output_figure, output_df
def selectByLassoL1(X, y, value, diagnose=False):
    '''
    Select the features using lasso regression

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable
    value : TYPE float
        DESCRIPTION. threshold passed to SelectFromModel to select the features from the fitted lasso
    diagnose : TYPE, optional boolean
        DESCRIPTION. The default is False. if true generates the chart of the number of selected features for each threshold from 0.01 to 1

    Returns
    -------
    res : TYPE pandas dataframe
        DESCRIPTION. output dataframe with selected variables
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary containing the figure 'LassoChart' (if diagnose is true)

    '''

    output_figure = {}
    #lasso feature selection works for regression models
    Q = dummyColumns(X)
    clf = linear_model.LassoCV(cv=5)

    # the cross-validated lasso is fitted only once: SelectFromModel fits a
    # clone of clf and exposes it as estimator_
    sfm = SelectFromModel(clf, threshold=value)
    sfm.fit(Q, y)

    if diagnose:
        # Refitting the same model for each threshold gives the same
        # coefficients every time, so the fitted one is reused (prefit) and
        # only the threshold changes.
        fittedLasso = sfm.estimator_
        numFeatSelected = [
            int(
                SelectFromModel(fittedLasso, prefit=True,
                                threshold=i / 100).get_support().sum())
            for i in range(1, 101)
        ]
        fig1 = plt.figure()
        plt.plot(range(1, 101), numFeatSelected)
        plt.title('Lasso graph')
        plt.xlabel('Coeff threshold %')
        plt.ylabel('Num. selected features')
        output_figure['LassoChart'] = fig1
        plt.close('all')

    Z = sfm.transform(Q)
    feature_idx = sfm.get_support()
    feature_name = Q.columns[feature_idx]
    res = pd.DataFrame(Z, columns=feature_name)
    return res, output_figure