Ejemplo n.º 1
0
def showCorrelation(gripperjack_nr, part):
    """Fit and show a feature-correlation plot against ``to_be_predicted``.

    The full Gelredome CSV is read and handed to ``DynamicCsvConverter``
    together with ``gripperjack_nr``, ``part``, ``'5min'`` and ``'max'``;
    the result of ``make_file()`` is the data set used for the plot.
    The ``Timestamp`` column is dropped and ``dropna()`` is applied before
    the ``to_be_predicted`` column is split off as the target.

    Parameters
    ----------
    gripperjack_nr
        Forwarded unchanged to ``DynamicCsvConverter``.
    part
        Forwarded to ``DynamicCsvConverter`` and used as the plot title.

    Returns
    -------
    None
        The visualizer's scores are appended to ``values``, a name that is
        not defined in this function and must exist in an enclosing scope.
    """
    data = DynamicCsvConverter(gripperjack_nr, part, '5min', 'max',
                               pd.read_csv(
                                   'C:\\Users\\Lukassen\\PycharmProjects\\GelredomeVeldErrorVoorspellen\\Recources\\Volledige_Gelredome_Data_CSV.csv',
                                   index_col=False))
    data = data.make_file()

    # To inspect the correlation with another column, pass that column's
    # name to ``data.pop`` below instead of 'to_be_predicted'.
    data = data.drop(columns=['Timestamp'])
    data = data.dropna()
    y = data.pop('to_be_predicted')
    X = data

    # Feature names are taken after the target column has been popped
    features = np.array(data.columns)

    # Instantiate the visualizer
    visualizer = FeatureCorrelation(labels=features, size=(1200, 700))
    visualizer.title = part
    visualizer.fit(X, y)
    values.append(visualizer.scores_)
    visualizer.show()
Ejemplo n.º 2
0
 def features_correlation(df, cols, target, fig_size=(6, 6), path=None):
     """
     Correlation of variables in the dataframe with respect to the target

     Parameters
     ----------
     df       : pd.DataFrame
                dataframe with the data to calculate the correlation
     cols     : array
                columns to be correlated with the target
     target   : str
                target name
     fig_size : tuple
                figure size
     path     : str
                directory where the figure is saved as
                'features_correlation.png'; nothing is saved when None

     Returns
     -------
     None
     """
     f, ax = plt.subplots(1, figsize=fig_size)
     ax.set_xlabel("Feature Correlation")
     # NOTE(review): the visualizer is not given ``ax`` explicitly; it is
     # presumably expected to draw on the axes created above - confirm.
     visualizer = FeatureCorrelation(labels=list(cols))
     visualizer.fit(df[cols], df[target])
     f.tight_layout()
     # Identity comparison is the correct test for None
     if path is not None:
         f.savefig(path + '/features_correlation.png')
    def test_feature_correlation_sort(self):
        """
        With ``sort=True`` the fitted scores must be in non-decreasing order
        """
        visualizer = FeatureCorrelation(sort=True)
        visualizer.fit(self.X, self.y)

        # Compare every score with its right-hand neighbour
        scores = visualizer.scores_
        lower, upper = scores[:-1], scores[1:]
        assert np.all(lower <= upper)
Ejemplo n.º 4
0
def feature_correlation_pearson(path="images/feature_correlation_pearson.png"):
    """Fit FeatureCorrelation on the diabetes data set and write the plot to ``path``."""
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    correlation_plot = FeatureCorrelation(labels=names)
    correlation_plot.fit(features, target)
    correlation_plot.poof(outpath=path, clear_figure=True)
 def test_feature_correlation_select_feature_by_index_out_of_range(self):
     """
     Selecting features by index must fail when an index is out of range
     """
     expected_message = "Feature index is out of range"
     selected = [0, 2, 10]
     with pytest.raises(YellowbrickValueError, match=expected_message):
         visualizer = FeatureCorrelation(feature_index=selected)
         visualizer.fit(self.X, self.y)
    def test_feature_correlation_labels(self):
        """
        Labels passed to the constructor must become the fitted feature names
        """
        expected = self.labels
        visualizer = FeatureCorrelation(labels=expected)
        visualizer.fit(self.X, self.y)

        npt.assert_array_equal(visualizer.features_, expected)
    def test_feature_correlation_sort(self):
        """
        With ``sort=True`` the fitted scores must be in non-decreasing order
        """
        visualizer = FeatureCorrelation(sort=True)
        visualizer.fit(self.X, self.y)

        # Compare every score with its right-hand neighbour
        scores = visualizer.scores_
        lower, upper = scores[:-1], scores[1:]
        assert np.all(lower <= upper)
    def test_feature_correlation_labels_from_index(self):
        """
        Without labels, the feature names must be the column indices of X
        """
        visualizer = FeatureCorrelation()
        visualizer.fit(self.X, self.y)

        n_columns = self.X.shape[1]
        expected = np.arange(n_columns)
        npt.assert_array_equal(visualizer.features_, expected)
 def test_feature_correlation_select_feature_by_index_out_of_range(self):
     """
     Selecting features by index must fail when an index is out of range
     """
     expected_message = 'Feature index is out of range'
     selected = [0, 2, 10]
     with pytest.raises(YellowbrickValueError, match=expected_message):
         visualizer = FeatureCorrelation(feature_index=selected)
         visualizer.fit(self.X, self.y)
    def test_feature_correlation_select_feature_by_index(self):
        """
        Selecting three feature indices must yield exactly three scores
        """
        selected = [0, 2, 3]
        visualizer = FeatureCorrelation(feature_index=selected)
        visualizer.fit(self.X, self.y)

        n_scores = visualizer.scores_.shape[0]
        assert n_scores == 3
    def test_feature_correlation_labels(self):
        """
        Labels passed to the constructor must become the fitted feature names
        """
        expected = self.labels
        visualizer = FeatureCorrelation(labels=expected)
        visualizer.fit(self.X, self.y)

        npt.assert_array_equal(visualizer.features_, expected)
    def test_feature_correlation_select_feature_by_index(self):
        """
        Selecting three feature indices must yield exactly three scores
        """
        selected = [0, 2, 3]
        visualizer = FeatureCorrelation(feature_index=selected)
        visualizer.fit(self.X, self.y)

        n_scores = visualizer.scores_.shape[0]
        assert n_scores == 3
    def test_feature_correlation_labels_from_index(self):
        """
        Without labels, the feature names must be the column indices of X
        """
        visualizer = FeatureCorrelation()
        visualizer.fit(self.X, self.y)

        n_columns = self.X.shape[1]
        expected = np.arange(n_columns)
        npt.assert_array_equal(visualizer.features_, expected)
    def test_feature_correlation_integrated_mutual_info_regression(self):
        """
        Image-comparison test of the mutual information (regression) method
        """
        # A fixed seed is passed through fit
        fit_options = {"random_state": 23456}

        visualizer = FeatureCorrelation(method="mutual_info-regression")
        visualizer.fit(self.X, self.y, **fit_options)
        visualizer.finalize()

        self.assert_images_similar(visualizer)
    def test_feature_correlation_integrated_mutual_info_regression(self):
        """
        Image-comparison test of the mutual information (regression) method
        """
        # A fixed seed is passed through fit
        fit_options = {'random_state': 23456}

        visualizer = FeatureCorrelation(method='mutual_info-regression')
        visualizer.fit(self.X, self.y, **fit_options)
        visualizer.poof()

        self.assert_images_similar(visualizer)
def feature_correlation_pearson(
        path="images/feature_correlation_pearson.png"):
    """Fit FeatureCorrelation on the diabetes data set and write the plot to ``path``."""
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    correlation_plot = FeatureCorrelation(labels=names)
    correlation_plot.fit(features, target)
    correlation_plot.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_integrated_pearson(self):
        """
        Image-comparison test of FeatureCorrelation built with its default
        arguments
        """
        visualizer = FeatureCorrelation()
        visualizer.fit(self.X, self.y)
        visualizer.finalize()

        self.assert_images_similar(visualizer)
    def test_feature_correlation_select_feature_by_name_no_labels(self):
        """
        Selecting features by name without supplying labels must raise
        """
        requested = ["age"]
        expected_message = "age not in labels"

        with pytest.raises(YellowbrickValueError, match=expected_message):
            visualizer = FeatureCorrelation(feature_names=requested)
            visualizer.fit(self.X, self.y)
    def test_feature_correlation_labels_from_dataframe(self):
        """
        Feature names must be picked up from the columns of a DataFrame input
        """
        frame = pd.DataFrame(self.X, columns=self.labels)

        visualizer = FeatureCorrelation()
        visualizer.fit(frame, self.y)

        npt.assert_array_equal(visualizer.features_, self.labels)
    def test_feature_correlation_select_feature_by_name_no_labels(self):
        """
        Selecting features by name without supplying labels must raise
        """
        requested = ['age']
        expected_message = 'age not in labels'

        with pytest.raises(YellowbrickValueError, match=expected_message):
            visualizer = FeatureCorrelation(feature_names=requested)
            visualizer.fit(self.X, self.y)
    def test_feature_correlation_select_feature_by_name(self):
        """
        Features selected by name must be exactly the fitted feature names
        """
        wanted = ["age", "sex", "bp", "s5"]

        visualizer = FeatureCorrelation(
            labels=self.labels,
            feature_names=wanted,
        )
        visualizer.fit(self.X, self.y)

        npt.assert_array_equal(visualizer.features_, wanted)
    def test_feature_correlation_integrated_pearson(self):
        """
        Image-comparison test of FeatureCorrelation built with its default
        arguments
        """
        visualizer = FeatureCorrelation()
        visualizer.fit(self.X, self.y)
        visualizer.poof()

        self.assert_images_similar(visualizer)
    def test_feature_correlation_labels_from_dataframe(self):
        """
        Feature names must be picked up from the columns of a DataFrame input
        """
        frame = pd.DataFrame(self.X, columns=self.labels)

        visualizer = FeatureCorrelation()
        visualizer.fit(frame, self.y)

        npt.assert_array_equal(visualizer.features_, self.labels)
    def test_feature_correlation_select_feature_by_name(self):
        """
        Features selected by name must be exactly the fitted feature names
        """
        wanted = ['age', 'sex', 'bp', 's5']

        visualizer = FeatureCorrelation(
            labels=self.labels,
            feature_names=wanted,
        )
        visualizer.fit(self.X, self.y)

        npt.assert_array_equal(visualizer.features_, wanted)
Ejemplo n.º 25
0
def mutual_info_classification(classes, feature_names, X, y):
    """Fit and display a sorted mutual-information (classification) plot.

    Parameters
    ----------
    classes
        Not used by this function; kept so existing callers keep working.
    feature_names
        Passed to ``FeatureCorrelation`` as its ``feature_names`` argument.
        NOTE(review): no ``labels`` are supplied alongside it - confirm
        that ``X`` carries its own column names.
    X, y
        Data and target handed to ``fit``.

    Returns
    -------
    None
    """
    from yellowbrick.target import FeatureCorrelation

    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=feature_names,
                                    sort=True)
    # Fixed seed passed through fit
    visualizer.fit(X, y, random_state=0)
    visualizer.poof()
Ejemplo n.º 26
0
def pearson_correlation(classes, fetures, X, Y):
    """Fit and display a feature-correlation plot with default settings.

    Parameters
    ----------
    classes
        Not used by this function; kept so existing callers keep working.
    fetures
        Feature labels passed to ``FeatureCorrelation`` as ``labels``
        (the parameter name is kept as-is for keyword callers).
    X, Y
        Data and target handed to ``fit``.

    Returns
    -------
    None
    """
    from yellowbrick.target import FeatureCorrelation

    visualizer = FeatureCorrelation(labels=fetures)
    visualizer.fit(X, Y)
    visualizer.poof()
    def report(self, pipeline: AbstractPipeline):
        """Fit a feature-correlation plot on the training data and save it.

        The output directory is resolved inside the 'crcdal' package under
        ``cache/<cache path>/<self.sub_folder>/`` and created if necessary.
        """
        cache_dir = pkg_resources.resource_filename(
            'crcdal', 'cache/' + get_cache_path() + '/' + self.sub_folder + '/')
        pkg_resources.ensure_directory(cache_dir)

        column_labels = list(pipeline.train.columns())
        visualizer = FeatureCorrelation(labels=column_labels)
        visualizer.fit(pipeline.train, pipeline.train_y)

        # NOTE(review): the file name ends in '.csv' although poof() is asked
        # to write a figure - confirm the intended extension.
        target_file = (cache_dir + pipeline.dataset_tag +
                       '_model_feature_correlation_report.csv')
        visualizer.poof(outpath=target_file)
Ejemplo n.º 28
0
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    """Plot mutual information for five wine features and write it to ``path``."""
    wine = datasets.load_wine()
    column_names = np.array(wine['feature_names'])
    target = wine['target']
    # A DataFrame is used so the features can be selected by column name
    frame = pd.DataFrame(wine['data'], columns=column_names)

    selected = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']

    correlation_plot = FeatureCorrelation(
        method='mutual_info-classification',
        feature_names=selected,
    )
    correlation_plot.fit(frame, target, random_state=0)
    correlation_plot.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_integrated_mutual_info_classification(self):
        """
        Image-comparison test of the mutual information (classification)
        method on the wine dataset
        """
        wine = datasets.load_wine()
        features = wine["data"]
        target = wine["target"]

        visualizer = FeatureCorrelation(method="mutual_info-classification")
        visualizer.fit(features, target, random_state=12345)
        visualizer.finalize()

        self.assert_images_similar(visualizer)
    def test_feature_correlation_integrated_mutual_info_classification(self):
        """
        Image-comparison test of the mutual information (classification)
        method on the wine dataset
        """
        wine = datasets.load_wine()
        features = wine['data']
        target = wine['target']

        visualizer = FeatureCorrelation(method='mutual_info-classification')
        visualizer.fit(features, target, random_state=12345)
        visualizer.poof()

        self.assert_images_similar(visualizer)
Ejemplo n.º 31
0
def mutual_info_regress(classes, feature_names, X, y):
    """Fit and display a mutual-information (regression) plot.

    Parameters
    ----------
    classes
        Not used by this function; kept so existing callers keep working.
    feature_names
        Feature labels; also determines the length of the
        ``discrete_features`` mask.
    X, y
        Data and target handed to ``fit``.

    Returns
    -------
    None
    """
    from yellowbrick.target import FeatureCorrelation

    # Only the feature at index 1 is flagged as discrete.
    # NOTE(review): the index is hard-coded - confirm it fits every caller.
    discrete_features = [False for _ in range(len(feature_names))]
    discrete_features[1] = True

    visualizer = FeatureCorrelation(method='mutual_info-regression',
                                    labels=feature_names)
    visualizer.fit(X, y, discrete_features=discrete_features, random_state=0)
    visualizer.poof()
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    """Plot mutual information for five wine features and write it to ``path``."""
    wine = datasets.load_wine()
    column_names = np.array(wine['feature_names'])
    target = wine['target']
    # A DataFrame is used so the features can be selected by column name
    frame = pd.DataFrame(wine['data'], columns=column_names)

    selected = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']

    correlation_plot = FeatureCorrelation(
        method='mutual_info-classification',
        feature_names=selected,
    )
    correlation_plot.fit(frame, target, random_state=0)
    correlation_plot.poof(outpath=path, clear_figure=True)
def feature_correlation_mutual_info_regression(
        path="images/feature_correlation_mutual_info_regression.png"):
    """Plot sorted mutual information on the diabetes data and write it to ``path``."""
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    # Mask with only the feature at index 1 flagged as discrete
    discrete_mask = [False] * len(names)
    discrete_mask[1] = True

    correlation_plot = FeatureCorrelation(
        method='mutual_info-regression',
        labels=names,
        sort=True,
    )
    correlation_plot.fit(
        features, target, discrete_features=discrete_mask, random_state=0)
    correlation_plot.poof(outpath=path, clear_figure=True)
Ejemplo n.º 34
0
def feature_correlation_mutual_info_regression(
        path="images/feature_correlation_mutual_info_regression.png"):
    """Plot sorted mutual information on the diabetes data and write it to ``path``."""
    diabetes = datasets.load_diabetes()
    features = diabetes['data']
    target = diabetes['target']
    names = np.array(diabetes['feature_names'])

    # Mask with only the feature at index 1 flagged as discrete
    discrete_mask = [False] * len(names)
    discrete_mask[1] = True

    correlation_plot = FeatureCorrelation(
        method='mutual_info-regression',
        labels=names,
        sort=True,
    )
    correlation_plot.fit(
        features, target, discrete_features=discrete_mask, random_state=0)
    correlation_plot.poof(outpath=path, clear_figure=True)
    def test_feature_correlation_select_feature_by_index_and_name(self):
        """
        Supplying both feature_index and feature_names must raise the
        YellowbrickWarning announcing that feature_names is ignored
        """
        selected_index = [0, 2, 3]
        selected_names = ['age']

        expected_message = ('Both feature_index and feature_names are specified. '
                            'feature_names is ignored')
        with pytest.raises(YellowbrickWarning, match=expected_message):
            visualizer = FeatureCorrelation(
                feature_index=selected_index,
                feature_names=selected_names,
            )
            visualizer.fit(self.X, self.y)
            # NOTE(review): if the warning is raised by one of the two
            # statements above, this assertion is never reached - confirm.
            assert visualizer.scores_.shape[0] == 3
    def test_feature_correlation_select_feature_by_index_and_name(self):
        """
        Supplying both feature_index and feature_names must raise the
        YellowbrickWarning announcing that feature_names is ignored
        """
        selected_index = [0, 2, 3]
        selected_names = ['age']

        expected_message = ('Both feature_index and feature_names are specified. '
                            'feature_names is ignored')
        with pytest.raises(YellowbrickWarning, match=expected_message):
            visualizer = FeatureCorrelation(
                feature_index=selected_index,
                feature_names=selected_names,
            )
            visualizer.fit(self.X, self.y)
            # NOTE(review): if the warning is raised by one of the two
            # statements above, this assertion is never reached - confirm.
            assert visualizer.scores_.shape[0] == 3
def create_correlation_matrix(data):
    """Tabulate how every encoded feature relates to the 'class' column.

    The returned frame has one row per feature and the columns 'kendall',
    'pearson', 'spearman' (taken from ``DataFrame.corr``) and
    'mutual_info-classification' (scores from ``FeatureCorrelation``).
    """
    encoded = encode_data(data)

    # One single-column frame per correlation method, in a fixed order
    per_method = [
        encoded.corr(method=method_name)['class'].to_frame(name=method_name)
        for method_name in ('kendall', 'pearson', 'spearman')
    ]
    correlation_matrix = pd.concat(per_method, axis=1, sort=False)

    features = encoded.drop(columns=['class'])
    target = encoded['class']
    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    labels=features.columns)
    visualizer.fit(features, target)

    # Remove the target's row before attaching the per-feature scores
    correlation_matrix = correlation_matrix.drop('class', axis=0)
    correlation_matrix['mutual_info-classification'] = visualizer.scores_.tolist()

    return correlation_matrix
Ejemplo n.º 38
0
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from yellowbrick.target import FeatureCorrelation

dataset = pd.read_csv('house_prices.csv')

# Drop the attributes that will not be used in the analysis
dataset.drop(labels=['id', 'date', 'sqft_living15', 'sqft_lot15'],
             axis=1,
             inplace=True)

print(dataset.columns)

# Column 0 is the target and every remaining column is a feature. The
# features are sliced with an open-ended range so X always has exactly one
# column per label in ``dataset.columns[1:]``; a hard-coded upper bound can
# leave the labels and the data with different lengths.
grafico = FeatureCorrelation(labels=dataset.columns[1:])
grafico.fit(dataset.iloc[:, 1:].values, dataset.iloc[:, 0].values)
plt.show()
                plt.xticks(fontsize=14)
                plt.yticks(fontsize=12)
                locationFileNameRFC = os.path.join('/home/ak/Documents/Research/Papers/figures',str(symbols[symbolIdx]) \
                                                   +'_label_'+str(labelsIdx)+ '_idx_'+str(idx)+str('Date')+str(dateIdx)+'_RandForImp.png')
                vizRFC.fit(X, y)
                vizRFC.show(outpath=locationFileNameRFC)
                plt.show()

                plt.figure()

                # Instantiate the visualizer
                visualizerFC = FeatureCorrelation(labels=features,
                                                  color="rebeccapurple",
                                                  title=' ')

                visualizerFC.fit(X, y)
                locationFileNameFC = os.path.join('/home/ak/Documents/Research/Papers/figures',str(symbols[symbolIdx])+'_idx_'+str(idx) \
                                                  +'_label_'+str(labelName)+'_date_'+str(dateIdx)+'_label_'+str(labelsIdx)+'_FeatureCorrelation_w_depn_var.png')
                plt.xlabel('', fontsize=11)
                plt.xticks(fontsize=14)
                plt.yticks(fontsize=12)
                visualizerFC.show(outpath=locationFileNameFC)
                plt.show()
                #
                # # Instantiate the visualizer

                set_palette('yellowbrick')
                plt.figure()
                classes = np.array([0, 1.])
                plt.xticks(fontsize=9)
                visualizerRadViz = RadViz(classes=classes,
Ejemplo n.º 40
0
def featcorr():
    """Fit FeatureCorrelation on the diabetes data and save it as "feature_correlation"."""
    diabetes = load_diabetes()
    features, target = diabetes.data, diabetes.target

    visualizer = FeatureCorrelation(ax=newfig())
    visualizer.fit(features, target)
    savefig(visualizer, "feature_correlation")
Ejemplo n.º 41
0
## yellowbrick 

from sklearn import datasets
from yellowbrick.target import FeatureCorrelation

# Load the regression dataset and split it into features and target
data = datasets.load_diabetes()
X = data['data']
y = data['target']

# Feature names as an array, used to label the plot
features = np.array(data['feature_names'])

# Build the visualizer, fit it on the data, then display the figure
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show()

## PCA - Principal Component Analysis https://www.kaggle.com/ryanholbrook/principal-component-analysis
from sklearn.decomposition import PCA

# Create principal components
# NOTE(review): X_scaled is not defined in this excerpt - presumably a
# standardised copy of X; confirm before running.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Convert to dataframe, naming the columns PC1, PC2, ... (one per component)
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)


## Target Encoding
Ejemplo n.º 42
0
                                                cv=cv)
# Pick the estimator whose test score is highest and print that score.
# ``scores`` is produced by the call that ends just above this line.
clf = scores['estimator'][np.argmax(scores['test_score'])]
print(np.max(scores['test_score']))

# %%
# Compute SHAP values for the selected estimator on Xv
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(Xv)

# %%
shap.summary_plot(shap_values, Xv, plot_type="bar")

# %%
# Keep the features whose mean absolute SHAP value (along axis 0) exceeds 0.55.
# NOTE(review): `feat` on the right-hand side must already be defined by
# earlier code that is not visible here - confirm.
feat = feature_names[feat][np.mean(abs(shap_values), axis=0) > 0.55]
print(feat)
X = X[feat]

# %%
visualizer = FeatureCorrelation(method='mutual_info-classification')
# The mutual information estimate depends on a random state, so an unseeded
# fit can give different scores from run to run. Pin the seed (fit forwards
# it to the estimator) so the threshold below selects the same features
# every time.
visualizer.fit(X, y, random_state=0)
visualizer.poof()

# %%
# Keep only the features whose mutual information score exceeds 0.04
feat = visualizer.features_[visualizer.scores_ > 0.04]
X = X[feat]

# %%
# Final 10 features recorded from an earlier, unseeded run:
# [263, 268, 287, 288, 300, 302, 307, 308, 313, 315]
# NOTE(review): re-check this list against the seeded output.
print(feat)
Ejemplo n.º 43
0
 def draw_feature_correlation(self):
     """Fit a sorted mutual-information (classification) plot on the training data and display it."""
     feature_labels = self.get_feature_labels()
     visualizer = FeatureCorrelation(
         method='mutual_info-classification',
         labels=feature_labels,
         sort=True,
     )
     visualizer.fit(self.training_data, self.training_labels)
     visualizer.poof()