def test_pairplot_iris():
    """Check the axes grid that ``pairplot`` builds for the iris data."""
    iris_df = data_df_from_bunch(load_iris())
    grid = pairplot(iris_df, target_col='target')

    n_features = 4
    assert grid.shape == (n_features, n_features)

    # The diagonal axes are duplicated so the grid can use sharex / sharey,
    # which adds one extra axes object per feature to the current figure.
    figure_axes = plt.gcf().get_axes()
    assert len(figure_axes) == n_features * n_features + n_features
def test_plot_classification_continuous():
    """Check figure/axes counts and labels of the classification plots.

    Uses the MiceProtein OpenML dataset, first with ``plot_pairwise=False``
    on several column subsets, then with the default settings and a fixed
    ``random_state``.
    """
    bunch = fetch_openml('MiceProtein')
    frame = data_df_from_bunch(bunch)

    # Only univariate plots: one figure showing the top 10 axes.
    univariate_only = plot_classification_continuous(
        frame, target_col='target', plot_pairwise=False)
    assert len(univariate_only) == 1
    assert len(univariate_only[0].get_axes()) == 10

    # Six is the minimum number of features for histograms
    # (the last column is the target).
    six_features = plot_classification_continuous(
        frame.iloc[:, -7:], target_col='target', plot_pairwise=False)
    assert len(six_features) == 1
    assert len(six_features[0].get_axes()) == 6

    # For 5 features a full pairplot is expected; its diagonal has twin axes.
    five_features = plot_classification_continuous(
        frame.iloc[:, -6:], target_col='target', plot_pairwise=False)
    assert len(five_features) == 1
    assert len(five_features[0].get_axes()) == 5 * 5 + 5

    # Also do pairwise plots.
    all_figures = plot_classification_continuous(
        frame, target_col='target', random_state=42)
    assert len(all_figures) == 4

    # Known results per figure: (number of axes, xlabel, ylabel of axes[0]).
    known_results = [
        (10, "SOD1_N", ""),        # univariate; bar plot never has ylabel
        (4, "SOD1_N", 'S6_N'),     # pairwise
        (4, "PCA 1", 'PCA 5'),     # PCA
        (4, "LDA 0", 'LDA 1'),     # LDA
    ]
    for figure, (n_axes, xlabel, ylabel) in zip(all_figures, known_results):
        figure_axes = figure.get_axes()
        assert len(figure_axes) == n_axes
        assert figure_axes[0].get_xlabel() == xlabel
        assert figure_axes[0].get_ylabel() == ylabel
def test_data_df_from_bunch():
    """Check that ``data_df_from_bunch`` keeps all rows and the class labels.

    The MiceProtein bunch is converted to a DataFrame; the frame must have
    as many rows as the bunch's data, and the unique values of its
    ``target`` column must match the expected class names in order.
    """
    data_bunch = fetch_openml('MiceProtein')
    data = data_df_from_bunch(data_bunch)
    assert len(data) == len(data_bunch.data)

    expected_classes = [
        'c-CS-m', 'c-SC-m', 'c-CS-s', 'c-SC-s',
        't-CS-m', 't-SC-m', 't-CS-s', 't-SC-s',
    ]
    # Compare plain lists rather than ``all(unique() == [...])``: the
    # element-wise comparison errors out instead of failing the assertion
    # when the number of classes differs from the expected list.
    assert list(data.target.unique()) == expected_classes
def test_regression_boston():
    """Fit ``SimpleRegressor`` on Boston data as a DataFrame and as arrays."""
    # NOTE(review): load_boston was removed in scikit-learn 1.2 - confirm the
    # scikit-learn version this test is expected to run against.
    boston = load_boston()

    # DataFrame input, with the target given by column name
    frame = data_df_from_bunch(boston)
    SimpleRegressor().fit(frame, target_col='target')

    # numpy array input, with features and target passed separately
    SimpleRegressor().fit(boston.data, boston.target)
def test_digits_type_hints():
    """Check that ``clean`` keeps all 65 columns with 'continuous' hints.

    Every feature of the digits data is hinted as 'continuous'; the cleaned
    frame is expected to have 65 columns.
    """
    data_bunch = load_digits()

    # Fall back to generated names when the bunch has no ``feature_names``.
    try:
        names = data_bunch.feature_names
    except AttributeError:
        n_columns = data_bunch.data.shape[1]
        hinted_features = ['x%d' % i for i in range(n_columns)]
    else:
        hinted_features = names

    data = data_df_from_bunch(data_bunch)
    hints = dict.fromkeys(hinted_features, 'continuous')
    data_clean = clean(data, type_hints=hints)
    assert data_clean.shape[1] == 65
def test_plot_target_low_card_int():
    """Smoke test: ``plot`` runs on digits with 'target' as target column."""
    digits_df = data_df_from_bunch(load_digits())
    # Keep only every tenth row of the frame for plotting.
    subsample = digits_df[::10]
    plot(subsample, target_col='target')
def test_digits_type_hints():
    """Check that ``clean`` keeps 65 columns when x0..x63 are 'continuous'."""
    data = data_df_from_bunch(load_digits())
    # One 'continuous' hint for each of the columns "x0" through "x63".
    column_names = ("x{}".format(i) for i in range(64))
    hints = dict.fromkeys(column_names, 'continuous')
    data_clean = clean(data, type_hints=hints)
    assert data_clean.shape[1] == 65
def test_pairplot_iris():
    """Smoke test: ``pairplot`` runs on iris with 'target' as target column."""
    iris_bunch = load_iris()
    iris_df = data_df_from_bunch(iris_bunch)
    pairplot(iris_df, target_col='target')
""" Wine Classification Dataset Visualization ========================================== """ import matplotlib.pyplot as plt from sklearn.datasets import load_wine from dabl import plot_supervised from dabl.utils import data_df_from_bunch wine_bunch = load_wine() wine_df = data_df_from_bunch(wine_bunch) plot_supervised(wine_df, 'target') plt.show()
# NOTE(review): `wine`, `explore`, `X_train`, `X_test`, `y_train` and `y_test`
# are not defined in this fragment - presumably set up earlier in the file.

# Print the shape of the feature matrix. A bare `wine.data.shape` expression
# has no effect when run as a script, so it is printed explicitly.
print(wine.data.shape)

# Print the wine data features (top 5 records)
print(wine.data[0:5])
print(wine.target)

# Count the number of observations in each class. The target is converted to
# a list once instead of once per class.
targets = list(wine.target)
for i in set(targets):
    print('Class', i, ' -> ', targets.count(i))

explore()

# Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB as GNB

# Train the model using the training sets
# and predict the response for the test dataset
y_pred = GNB().fit(X_train, y_train).predict(X_test)

# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model accuracy: how often is the classifier correct?
print(f"Accuracy: {100*metrics.accuracy_score(y_test, y_pred):.3f}%")

import matplotlib.pyplot as plt
from dabl import plot
from dabl.utils import data_df_from_bunch

# Plot the wine data against its 'target' column and display the figures.
plot(data_df_from_bunch(wine), 'target')
plt.show()