Example No. 1
def features_correlation(df, cols, target, fig_size=(6, 6), path=None):
    """
    Correlation of variables in the dataframe with respect to the target

    Parameters
    ----------
    df       : pd.DataFrame
               dataframe with the data to calculate the correlation
    cols     : array
               columns to be correlated with the target
    target   : str
               target name
    fig_size : tuple
               figure size
    path     : str
               path where the graphics will be saved

    Returns
    -------
    None
    """
    f, ax = plt.subplots(1, figsize=fig_size)
    ax.set_xlabel("Feature Correlation")
    # Draw on the axes created above so the saved figure contains the plot
    visualizer = FeatureCorrelation(ax=ax, labels=list(cols))
    visualizer.fit(df[cols], df[target])
    f.tight_layout()
    if path is not None:
        f.savefig(path + '/features_correlation.png')
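A minimal usage sketch for the helper above; the DataFrame, column names, and output directory are hypothetical, and the matplotlib / Yellowbrick imports the function relies on are assumed to be present in the module:

import pandas as pd
import matplotlib.pyplot as plt
from yellowbrick.target import FeatureCorrelation

# Hypothetical data: two numeric features and a numeric target
df = pd.DataFrame({
    "age": [25, 32, 47, 51, 62],
    "bmi": [21.0, 24.5, 27.3, 30.1, 26.8],
    "charge": [1200, 1500, 2100, 2600, 2400],
})

# Correlate the feature columns with the target and save ./features_correlation.png
features_correlation(df, cols=["age", "bmi"], target="charge", path=".")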
    def test_feature_correlation_labels(self):
        """
        Test labels as feature labels
        """
        viz = FeatureCorrelation(labels=self.labels)
        viz.fit(self.X, self.y)

        npt.assert_array_equal(viz.features_, self.labels)
    def test_feature_correlation_select_feature_by_index_out_of_range(self):
        """
        Test selecting feature by feature index but index is out of range
        """
        e = 'Feature index is out of range'
        with pytest.raises(YellowbrickValueError, match=e):
            viz = FeatureCorrelation(feature_index=[0, 2, 10])
            viz.fit(self.X, self.y)
    def test_feature_correlation_select_feature_by_index(self):
        """
        Test selecting feature by index
        """
        viz = FeatureCorrelation(feature_index=[0, 2, 3])
        viz.fit(self.X, self.y)

        assert viz.scores_.shape[0] == 3
    def test_feature_correlation_labels_from_index(self):
        """
        Test getting feature labels from index
        """
        viz = FeatureCorrelation()
        viz.fit(self.X, self.y)

        npt.assert_array_equal(viz.features_, np.arange(self.X.shape[1]))
    def test_feature_correlation_sort(self):
        """
        Test sorting of correlation
        """
        viz = FeatureCorrelation(sort=True)
        viz.fit(self.X, self.y)

        assert np.all(viz.scores_[:-1] <= viz.scores_[1:])
    def test_feature_correlation_integrated_mutual_info_regression(self):
        """
        Test FeatureCorrelation visualizer with mutual information regression
        """
        viz = FeatureCorrelation(method='mutual_info-regression')
        viz.fit(self.X, self.y, random_state=23456)
        viz.poof()

        self.assert_images_similar(viz)
def feature_correlation_pearson(
        path="images/feature_correlation_pearson.png"):
    data = datasets.load_diabetes()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])

    visualizer = FeatureCorrelation(labels=feature_names)
    visualizer.fit(X, y)
    visualizer.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_select_feature_by_name(self):
        """
        Test selecting feature by feature names
        """
        feature_names = ["age", "sex", "bp", "s5"]

        viz = FeatureCorrelation(labels=self.labels, feature_names=feature_names)
        viz.fit(self.X, self.y)

        npt.assert_array_equal(viz.features_, feature_names)
    def test_feature_correlation_select_feature_by_name_no_labels(self):
        """
        Test selecting feature by feature names with labels is not supplied
        """
        feature_names = ["age"]

        e = "age not in labels"
        with pytest.raises(YellowbrickValueError, match=e):
            viz = FeatureCorrelation(feature_names=feature_names)
            viz.fit(self.X, self.y)
    def test_feature_correlation_labels_from_dataframe(self):
        """
        Test getting feature labels from DataFrame
        """
        X_pd = pd.DataFrame(self.X, columns=self.labels)

        viz = FeatureCorrelation()
        viz.fit(X_pd, self.y)

        npt.assert_array_equal(viz.features_, self.labels)
    def test_feature_correlation_integrated_pearson(self):
        """
        Test FeatureCorrelation visualizer with pearson correlation
        coefficient
        """
        viz = FeatureCorrelation()
        viz.fit(self.X, self.y)
        viz.poof()

        self.assert_images_similar(viz)
Example No. 21
def target_visualizer(self,
                      classes=None,
                      params={'BalancedBinningReference': {
                          'bins': 5
                      }}):
    LOGGER.info('Initializing target visualizer')
    if not os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')):
        os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))
    visualizers = []
    y = self.y.squeeze()
    try:
        LOGGER.info('Visualizer BalancedBinningReference')
        visualizer = BalancedBinningReference()
        if visualizer.__class__.__name__ in params:
            visualizer = BalancedBinningReference(
                **params[visualizer.__class__.__name__])
        visualizer.fit(y)
        visualizer.show(outpath=os.path.join(
            os.getcwd(),
            f"visualizer/{visualizer.__class__.__name__}.png"))
        plt.cla()
    except Exception:
        LOGGER.warning('ERROR BalancedBinning')
    try:
        LOGGER.info('Visualizer ClassBalance')
        visualizer = ClassBalance()
        if visualizer.__class__.__name__ in params:
            visualizer = ClassBalance(
                **params[visualizer.__class__.__name__])
        visualizer.fit(y)
        visualizer.show(outpath=os.path.join(
            os.getcwd(),
            f"visualizer/{visualizer.__class__.__name__}.png"))
        plt.cla()
    except Exception:
        LOGGER.warning('ERROR ClassBalance')
    try:
        LOGGER.info('Visualizer Feature Correlation')
        visualizer = FeatureCorrelation(
            method='mutual_info-classification',
            feature_names=self.X.columns.tolist(),
            sort=True)
        if visualizer.__class__.__name__ in params:
            visualizer = FeatureCorrelation(
                **params[visualizer.__class__.__name__])
        visualizer.fit(self.X, y)
        visualizer.show(outpath=os.path.join(
            os.getcwd(),
            f"visualizer/{visualizer.__class__.__name__}.png"))
        plt.cla()
    except Exception:
        LOGGER.warning('ERROR FeatureCorrelation')
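A hedged sketch of how the params mapping above overrides a visualizer's keyword arguments; reporter is a hypothetical instance of whatever class defines target_visualizer, with X and y already set:

# Keys are Yellowbrick class names, values are the kwargs used to
# re-instantiate that visualizer before fitting.
reporter.target_visualizer(
    params={
        'BalancedBinningReference': {'bins': 3},
        'FeatureCorrelation': {'method': 'pearson', 'sort': True},
    }
)
# Each figure is written to <cwd>/visualizer/<ClassName>.png

Note that a params entry for FeatureCorrelation replaces the default construction entirely, so the feature_names drawn from self.X are no longer passed.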
    def test_feature_correlation_integrated_mutual_info_classification(self):
        """
        Test FeatureCorrelation visualizer with mutual information
        on wine dataset (classification)
        """
        data = datasets.load_wine()
        X, y = data['data'], data['target']

        viz = FeatureCorrelation(method='mutual_info-classification')
        viz.fit(X, y, random_state=12345)
        viz.poof()

        self.assert_images_similar(viz)
def feature_correlation_mutual_info_regression(
        path="images/feature_correlation_mutual_info_regression.png"):
    data = datasets.load_diabetes()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])

    discrete_features = [False for _ in range(len(feature_names))]
    discrete_features[1] = True

    visualizer = FeatureCorrelation(method='mutual_info-regression',
                                    labels=feature_names, sort=True)
    visualizer.fit(X, y, discrete_features=discrete_features, random_state=0)
    visualizer.poof(outpath=path, clear_figure=True)
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    data = datasets.load_wine()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])
    X_pd = pd.DataFrame(X, columns=feature_names)

    feature_to_plot = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']

    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=feature_to_plot)
    visualizer.fit(X_pd, y, random_state=0)
    visualizer.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_select_feature_by_index_and_name(self):
        """
        Test selecting feature warning when both index and names are provided
        """
        feature_index = [0, 2, 3]
        feature_names = ['age']

        e = ('Both feature_index and feature_names are specified. '
             'feature_names is ignored')
        with pytest.warns(YellowbrickWarning, match=e):
            viz = FeatureCorrelation(feature_index=feature_index,
                                     feature_names=feature_names)
            viz.fit(self.X, self.y)
            assert viz.scores_.shape[0] == 3
    def test_feature_correlation_method_not_implemented(self):
        """
        Test FeatureCorrelation visualizer with unknown method
        """
        method = "foo"
        e = "Method foo not implement; choose from *"
        with pytest.raises(YellowbrickValueError, match=e):
            FeatureCorrelation(method=method)
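For reference, the method values exercised across these examples are 'pearson' (the default), 'mutual_info-regression', and 'mutual_info-classification'; anything else raises YellowbrickValueError at construction time, as the test above checks. A minimal sketch:

from yellowbrick.target import FeatureCorrelation
from yellowbrick.exceptions import YellowbrickValueError

# These three construct without error
for method in ('pearson', 'mutual_info-regression', 'mutual_info-classification'):
    FeatureCorrelation(method=method)

try:
    FeatureCorrelation(method='foo')
except YellowbrickValueError as exc:
    print(exc)  # message lists the supported methods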
Example No. 28
def showCorrelation(gripperjack_nr, part):
    data = DynamicCsvConverter(gripperjack_nr, part, '5min', 'max',
                               pd.read_csv(
                                   'C:\\Users\\Lukassen\\PycharmProjects\\GelredomeVeldErrorVoorspellen\\Recources\\Volledige_Gelredome_Data_CSV.csv',
                                   index_col=False))
    data = data.make_file()

    # To see the correlation with the prediction target, remove 'to_be_predicted'
    # from the drop columns and pass it to data.pop() instead (as done below)
    data = data.drop(columns=['Timestamp'])
    data = data.dropna()
    y = data.pop('to_be_predicted')
    X = data

    # Create a list of the feature names
    features = np.array(data.columns)

    # Create a list of the discrete features
    discrete = [False for _ in range(len(features))]
    discrete[1] = True

    # Instantiate the visualizer
    visualizer = FeatureCorrelation(labels=features, size=(1200, 700))
    visualizer.title = part
    visualizer.fit(X, y)
    # 'values' is assumed to be a list defined at module level elsewhere in this script
    values.append(visualizer.scores_)
    visualizer.show()
def create_correlation_matrix(data):
    encoded = encode_data(data)

    kendall = encoded.corr(method='kendall')['class'].to_frame()
    pearson = encoded.corr(method='pearson')['class'].to_frame()
    spearman = encoded.corr(method='spearman')['class'].to_frame()
    kendall.columns = ['kendall']
    pearson.columns = ['pearson']
    spearman.columns = ['spearman']

    correlation_matrix = pd.concat([kendall, pearson, spearman], axis=1, sort=False)

    X, y = encoded.drop(columns=['class']), encoded['class']
    visualizer = FeatureCorrelation(method='mutual_info-classification', labels=X.columns)
    visualizer.fit(X, y)

    correlation_matrix = correlation_matrix.drop('class', axis=0)
    correlation_matrix['mutual_info-classification'] = visualizer.scores_.tolist()

    return correlation_matrix
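A hedged usage sketch for create_correlation_matrix; encode_data is defined elsewhere in the original project, so the label-encoding stand-in and the toy frame below are assumptions made only so the snippet runs end to end:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_data(data):
    # Hypothetical stand-in: label-encode every non-numeric column,
    # including the 'class' target, so .corr() can be computed
    encoded = data.copy()
    for col in encoded.select_dtypes(include='object').columns:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

toy = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'green', 'blue', 'red', 'green', 'blue'],
    'size':  [1, 2, 3, 4, 5, 6, 7, 8],
    'class': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b'],
})
print(create_correlation_matrix(toy))  # one row per feature, one column per method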
Example No. 30
def feature_correlation_pearson(path="images/feature_correlation_pearson.png"):
    data = datasets.load_diabetes()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])

    visualizer = FeatureCorrelation(labels=feature_names)
    visualizer.fit(X, y)
    visualizer.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_integrated_mutual_info_regression(self):
        """
        Test FeatureCorrelation visualizer with mutual information regression
        """
        viz = FeatureCorrelation(method="mutual_info-regression")
        viz.fit(self.X, self.y, random_state=23456)
        viz.finalize()

        self.assert_images_similar(viz)
    def test_feature_correlation_integrated_pearson(self):
        """
        Test FeatureCorrelation visualizer with pearson correlation
        coefficient
        """
        viz = FeatureCorrelation()
        viz.fit(self.X, self.y)
        viz.finalize()

        self.assert_images_similar(viz)
Example No. 33
def mutual_info_classification(classes, feature_names, X, y):
    from sklearn import datasets
    from yellowbrick.target import FeatureCorrelation

    # X, y and feature_names are provided by the caller; no dataset is loaded here

    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=feature_names,
                                    sort=True)
    visualizer.fit(X, y, random_state=0)
    visualizer.poof()
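A hedged usage sketch for the helper above, assuming the wine dataset used by the other classification examples in this collection; X is wrapped in a DataFrame so the feature_names selection can be resolved against column labels (see the 'not in labels' test earlier):

import pandas as pd
from sklearn import datasets

data = datasets.load_wine()
X_pd = pd.DataFrame(data['data'], columns=data['feature_names'])

mutual_info_classification(classes=data['target_names'],
                           feature_names=['alcohol', 'hue', 'proline'],
                           X=X_pd, y=data['target'])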
Example No. 34
def pearson_correlation(classes, fetures, X, Y):
    from sklearn import datasets
    from yellowbrick.target import FeatureCorrelation

    # Load the regression data set
    # data = datasets.load_diabetes()
    # X, y = data['data'], data['target']
    # feature_names = np.array(data['feature_names'])

    visualizer = FeatureCorrelation(labels=fetures)
    visualizer.fit(X, Y)
    visualizer.poof()
    def report(self, pipeline: AbstractPipeline):

        folder = get_cache_path()
        path = pkg_resources.resource_filename(
            'crcdal', 'cache/' + folder + '/' + self.sub_folder + '/')
        pkg_resources.ensure_directory(path)

        feature_names = list(pipeline.train.columns())
        visualizer = FeatureCorrelation(labels=feature_names)
        visualizer.fit(pipeline.train, pipeline.train_y)
        visualizer.poof(outpath=path + pipeline.dataset_tag +
                        '_model_feature_correlation_report.png')
Example No. 36
def mutual_info_regress(classes, feature_names, X, y):
    from sklearn import datasets
    from yellowbrick.target import FeatureCorrelation

    # X, y and feature_names are provided by the caller; no dataset is loaded here

    discrete_features = [False for _ in range(len(feature_names))]
    discrete_features[1] = True

    visualizer = FeatureCorrelation(method='mutual_info-regression',
                                    labels=feature_names)
    visualizer.fit(X, y, discrete_features=discrete_features, random_state=0)
    visualizer.poof()
Example No. 37
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    data = datasets.load_wine()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])
    X_pd = pd.DataFrame(X, columns=feature_names)

    feature_to_plot = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']

    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=feature_to_plot)
    visualizer.fit(X_pd, y, random_state=0)
    visualizer.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_integrated_mutual_info_classification(self):
        """
        Test FeatureCorrelation visualizer with mutual information
        on wine dataset (classification)
        """
        data = datasets.load_wine()
        X, y = data["data"], data["target"]

        viz = FeatureCorrelation(method="mutual_info-classification")
        viz.fit(X, y, random_state=12345)
        viz.finalize()

        self.assert_images_similar(viz)
Example No. 39
def feature_correlation_mutual_info_regression(
        path="images/feature_correlation_mutual_info_regression.png"):
    data = datasets.load_diabetes()
    X, y = data['data'], data['target']
    feature_names = np.array(data['feature_names'])

    discrete_features = [False for _ in range(len(feature_names))]
    discrete_features[1] = True

    visualizer = FeatureCorrelation(method='mutual_info-regression',
                                    labels=feature_names,
                                    sort=True)
    visualizer.fit(X, y, discrete_features=discrete_features, random_state=0)
    visualizer.poof(outpath=path, clear_figure=True)
Example No. 40
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.target import FeatureCorrelation

dataset = pd.read_csv('house_prices.csv')

# Remove attributes that will not be used in the analysis
dataset.drop(labels=['id', 'date', 'sqft_living15', 'sqft_lot15'],
             axis=1,
             inplace=True)

print(dataset.columns)

grafico = FeatureCorrelation(labels=dataset.columns[1:])
# Use all remaining columns so X has one column per label above
grafico.fit(dataset.iloc[:, 1:].values, dataset.iloc[:, 0].values)
grafico.show()
Example No. 41
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
visualizer = Rank2D()
visualizer.fit_transform(X)
visualizer.poof()

# %%
visualizer = FeatureCorrelation()
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = RadViz(classes=class_names)
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
Example No. 42
            correlated_features.add(colname)

## yellowbrick 

from sklearn import datasets
from yellowbrick.target import FeatureCorrelation

# Load the regression dataset
data = datasets.load_diabetes()
X, y = data['data'], data['target']

# Create a list of the feature names
features = np.array(data['feature_names'])

# Instantiate the visualizer
visualizer = FeatureCorrelation(labels=features)

visualizer.fit(X, y)        # Fit the data to the visualizer
visualizer.show() 

## PCA - Principal Component Analysis https://www.kaggle.com/ryanholbrook/principal-component-analysis
from sklearn.decomposition import PCA

# Create principal components
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Convert to dataframe
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)
Example No. 43
def featcorr():
    data = load_diabetes()

    oz = FeatureCorrelation(ax=newfig())
    oz.fit(data.data, data.target)
    savefig(oz, "feature_correlation")
from matplotlib import rcParams as rc
from yellowbrick.target import FeatureCorrelation

rc['xtick.labelsize'] = 15.0
rc['ytick.labelsize'] = 15.0
rc['xtick.direction'] = 'out'
rc['axes.labelsize'] = 15.0
rc['axes.titlesize'] = 18.0
rc['savefig.format'] = 'png'
rc['savefig.dpi'] = 600
rc['legend.fontsize'] = 15

x = df.drop('Death_Event', axis=1)
y = df['Death_Event']
fig = plt.figure(figsize=(8, 6))
corr = FeatureCorrelation(method='pearson', labels=x.columns,
                          sort=True).fit(x, y)
corr.finalize()  # draw the title and axis labels before saving
plt.savefig('../../outputs/visuals/correlations')
corr.show()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True, square=False, ax=ax)
ax.set_title('Correlations between features')
plt.savefig('../../outputs/visuals/correlations_all')
plt.show()

# Age distribution of Patients
fig, ax = plt.subplots(figsize=(8, 6))
sns.kdeplot(df['Age'], legend=False, shade=True, ax=ax)
ax.set_title('Age Distribution of Patients')
plt.savefig('../../outputs/visuals/age_distribution')
Example No. 45
                                                cv=cv)
clf = scores['estimator'][np.argmax(scores['test_score'])]
print(np.max(scores['test_score']))

# %%
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(Xv)

# %%
shap.summary_plot(shap_values, Xv, plot_type="bar")

# %%
feat = feature_names[feat][np.mean(abs(shap_values), axis=0) > 0.55]
print(feat)
X = X[feat]

# %%
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
# This step doesn't always produce the same result: the mutual information
# estimate is randomized, so pass random_state to fit() for reproducible scores.
feat = visualizer.features_[visualizer.scores_ > 0.04]
X = X[feat]

# %%
# Our final 10 features:
# [263, 268, 287, 288, 300, 302, 307, 308, 313, 315]
print(feat)
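Since the mutual information estimate is randomized, forwarding a fixed random_state through fit() makes the scores reproducible, as the test snippets earlier in this collection do; a minimal sketch on the wine dataset:

from sklearn.datasets import load_wine
from yellowbrick.target import FeatureCorrelation

X, y = load_wine(return_X_y=True)

visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y, random_state=0)  # forwarded to mutual_info_classif
print(visualizer.scores_)             # identical across runs with the same seed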