Beispiel #1
0
def plot_cumulative_features_importance(features,
                                        threshold=0.90,
                                        plot_size=(12, 8),
                                        feature_type='features',
                                        return_threshold=False):
    """Plot the cumulative importance curve and mark the threshold crossing.

    The curve is drawn against the feature count (1 .. number of rows) and a
    dashed red vertical line marks the smallest count whose cumulative value
    is strictly greater than ``threshold``. The figure is displayed with
    ``plt.show()``.

    :param features: table-like object (e.g. a pandas DataFrame) exposing a
        ``'cumulative_coefficient_frequency'`` column. Presumably the rows
        are already sorted by decreasing importance -- confirm with caller.
    :param threshold: cumulative value that has to be exceeded.
    :param plot_size: figure size forwarded to ``plt.figure(figsize=...)``.
    :param feature_type: noun used in the axis labels, title and legend.
    :param return_threshold: when true, return the required feature count.
    :return: the required number of features when ``return_threshold`` is
        true, otherwise ``None``.
    :raises ValueError: if no cumulative value exceeds ``threshold``.
    """
    cumulative = np.asarray(features['cumulative_coefficient_frequency'])
    above = cumulative > threshold
    # Fail early, before a figure is created, with a readable message instead
    # of numpy's "zero-size array" error on an empty crossing set.
    if not above.any():
        raise ValueError(
            'No cumulative value exceeds threshold={!r}.'.format(threshold))
    # argmax on a boolean array yields the position of the first True.
    required_features = np.argmax(above) + 1
    thr_percentage = 100 * threshold
    legend_label = '{} {} required for \n{:.0f}% cumulative importance.'.format(
        required_features, feature_type, thr_percentage)
    type_title = feature_type.capitalize()

    plt.figure(figsize=plot_size)
    # Use 1-based counts on the x axis so the curve and the threshold marker
    # share the same "number of features" scale.
    counts = np.arange(1, len(cumulative) + 1)
    plt.plot(counts, cumulative, 'b-', label=legend_label)
    plt.xlabel('Number of {}'.format(type_title), fontsize=12, labelpad=20)
    plt.ylabel('Cumulative {} frequency'.format(type_title), fontsize=12, labelpad=20)
    plt.title('Cumulative {} Importance'.format(type_title), fontsize=12, pad=20)
    # Threshold marker at the required feature count.
    plt.vlines(required_features, ymin=0, ymax=1.05, linestyles='--', colors='red')
    plt.legend(loc='lower right', fontsize=10)
    plt.tight_layout()
    plt.show()
    if return_threshold:
        return required_features
Beispiel #2
0
def subplot_feature_importance(features, threshold=0.90, plot_size=(12, 8), return_data=False):
    """Draw the cumulative importance curve next to a bar chart of the top features.

    Left panel: cumulative curve against the feature count (1 .. number of
    rows) with a dashed red line at the smallest count whose cumulative value
    is strictly greater than ``threshold``. Right panel: horizontal bar chart
    of the ``'coefficient'`` column for those first rows, labelled by the
    ``'feature'`` column.

    :param features: DataFrame-like object providing the columns
        ``'cumulative_coefficient_frequency'``, ``'feature'`` and
        ``'coefficient'``; its ``index.name`` is used in the bar chart title
        when set. Presumably sorted by decreasing importance -- confirm with
        caller.
    :param threshold: cumulative value that has to be exceeded.
    :param plot_size: figure size forwarded to ``plt.subplots``.
    :param return_data: when true, return the selected rows instead of the
        figure.
    :return: ``features.head(n)`` for the required count ``n`` when
        ``return_data`` is true, otherwise the matplotlib figure.
    :raises ValueError: if no cumulative value exceeds ``threshold``.
    """
    cumulative = np.asarray(features['cumulative_coefficient_frequency'])
    above = cumulative > threshold
    # Fail early, before a figure is created, with a readable message instead
    # of numpy's "zero-size array" error on an empty crossing set.
    if not above.any():
        raise ValueError(
            'No cumulative value exceeds threshold={!r}.'.format(threshold))
    # argmax on a boolean array yields the position of the first True.
    required_features = np.argmax(above) + 1
    thr_percentage = 100 * threshold
    selected = features.head(required_features)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=plot_size)

    # First plot: cumulative importance curve on a 1-based count axis.
    legend_label = '{} features required for \n{:.0f}% cumulative importance.'.format(
        required_features, thr_percentage)
    counts = np.arange(1, len(cumulative) + 1)
    ax1.plot(counts, cumulative, 'b-', label=legend_label)
    ax1.set_xlabel('Number of Features', fontsize=12, labelpad=20)
    ax1.set_ylabel('Cumulative Coefficient frequency', fontsize=12, labelpad=20)
    ax1.set_title('Cumulative Feature Importance', fontsize=14, pad=20)
    # The marker sits at the required count itself; the count already
    # includes the +1 offset, so it must not be added a second time.
    ax1.vlines(required_features, ymin=0, ymax=1.05, linestyles='--', colors='red')
    ax1.legend(loc='lower right', fontsize=10)

    # Second plot: coefficients of the selected features.
    selected.plot(x='feature', y='coefficient', kind='barh',
                  fontsize=12, figsize=plot_size, ax=ax2)
    # Flip the bar chart explicitly so the first row is at the top, without
    # depending on which axes happens to be "current".
    ax2.invert_yaxis()
    model_label = features.index.name
    # Avoid a literal "None" in the title when the index is unnamed.
    if model_label is None:
        bar_title = f'Top {required_features} Features'
    else:
        bar_title = f'{model_label} Top {required_features} Features'
    ax2.set_title(bar_title, fontsize=14, pad=20)
    ax2.set_xlabel('Coefficients', fontsize=12, labelpad=20)
    ax2.set_ylabel('Features labels', fontsize=12, labelpad=20)
    fig.tight_layout()
    if return_data:
        return selected
    return fig
Beispiel #3
0
def plot_cumulative_features_importance(features, threshold=0.90, plot_size=(12, 8)):
    """Plot the cumulative importance curve and mark the threshold crossing.

    The curve is drawn against the feature count (1 .. number of rows) and a
    dashed red vertical line marks the smallest count whose cumulative value
    is strictly greater than ``threshold``. The figure is displayed with
    ``plt.show()``.

    :param features: a dataframe which contains features data; it must expose
        a ``'cumulative_coefficient_frequency'`` column. Presumably the rows
        are already sorted by decreasing importance -- confirm with caller.
    :param threshold: cumulative value that has to be exceeded.
    :param plot_size: figure size forwarded to ``plt.figure(figsize=...)``.
    :return: None
    :raises ValueError: if no cumulative value exceeds ``threshold``.
    """
    cumulative = np.asarray(features['cumulative_coefficient_frequency'])
    above = cumulative > threshold
    # Fail early, before a figure is created, with a readable message instead
    # of numpy's "zero-size array" error on an empty crossing set.
    if not above.any():
        raise ValueError(
            'No cumulative value exceeds threshold={!r}.'.format(threshold))
    # argmax on a boolean array yields the position of the first True.
    required_features = np.argmax(above) + 1
    thr_percentage = 100 * threshold
    legend_label = '{} features required for {:.0f}% of cumulative importance.'.format(
        required_features, thr_percentage)

    plt.figure(figsize=plot_size)
    # Use 1-based counts on the x axis so the curve and the threshold marker
    # share the same "number of features" scale.
    counts = np.arange(1, len(cumulative) + 1)
    plt.plot(counts, cumulative, 'b-', label=legend_label)
    plt.xlabel('Number of Features', fontsize=12, labelpad=20)
    plt.ylabel('Cumulative Coefficient frequency', fontsize=12, labelpad=20)
    plt.title('Cumulative Feature Importance', fontsize=14, pad=20)
    # Threshold marker at the required feature count.
    plt.vlines(required_features, ymin=0, ymax=1.05, linestyles='--', colors='red')
    plt.legend(loc='lower right', fontsize=12)
    plt.tight_layout()
    plt.show()