import numpy as np
import pandas as pd
import pytest

from dabl import clean, plot
from dabl.datasets import load_titanic


def test_dirty_float_target_regression():
    # make_dirty_float is a helper from the surrounding test module that
    # builds a mostly numeric column containing a few unparsable strings
    titanic_data = load_titanic()
    data = pd.DataFrame({'one': np.repeat(np.arange(50), 2)})
    dirty = make_dirty_float()
    data['target'] = dirty
    with pytest.warns(UserWarning,
                      match="Discarding dirty_float targets that "
                            "cannot be converted to float."):
        clean(data, target_col="target")
    with pytest.warns(UserWarning,
                      match="Discarding dirty_float targets that "
                            "cannot be converted to float."):
        plot(data, target_col="target")
    # check that plotting also works for non-dirty_float targets
    plot(titanic_data, 'survived')
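# --- Standalone sketch (not part of the test suite above) ---
# Illustrates what the test exercises: a "dirty float" target is a mostly
# numeric column polluted with strings that cannot be parsed as floats.
# The test asserts that dabl.clean discards such a target with a UserWarning;
# the column construction below is an illustrative assumption standing in for
# the make_dirty_float helper.
import numpy as np
import pandas as pd
from dabl import clean

rng = np.random.RandomState(0)
sketch = pd.DataFrame({'one': np.arange(100)})
dirty = pd.Series(rng.uniform(size=100).astype(str))
dirty.iloc[::20] = "not a number"   # sprinkle in a few unparsable entries
sketch['target'] = dirty
clean(sketch, target_col="target")  # expected to warn and drop the target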
""" Adult Census Dataset Visualization ==================================== """ # sphinx_gallery_thumbnail_number = 2 from dabl import plot from dabl.datasets import load_adult import matplotlib.pyplot as plt # load the adult census housing dataset # returns a plain dataframe data = load_adult() plot(data, 'income', scatter_alpha=.1) plt.show()
""" Diamonds Dataset Visualization ========================================== Regression on the classical diamond dataset. """ # sphinx_gallery_thumbnail_number = 2 import matplotlib.pyplot as plt from sklearn.datasets import fetch_openml from dabl import plot X, y = fetch_openml('diamonds', as_frame=True, return_X_y=True) plot(X, y) plt.show()
# In[ ]:
import dabl


# In[ ]:
feature_df = bdf[final_feature_list]


# In[ ]:
dabl_data = dabl.clean(feature_df)


# In[ ]:
dabl.plot(dabl_data, target_col='save_within_48')


# In[ ]:
X = dabl_data.drop("save_within_48", axis=1)
Y = dabl_data.save_within_48


# In[ ]:
preprocessor = dabl.EasyPreprocessor()
X_trans = preprocessor.fit_transform(X)


# In[ ]:
fc = dabl.SimpleClassifier(random_state=0).fit(X_trans, Y)
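# In[ ]:
# Possible follow-up cell (not in the original notebook): SimpleClassifier
# follows the scikit-learn estimator API, so the fitted model can score and
# predict on preprocessed data. Reusing the training data here is only for
# illustration; a held-out split would be needed for a real evaluation.
print(fc.score(X_trans, Y))
print(fc.predict(X_trans)[:10])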
""" Ames Housing Dataset Visualization ==================================== """ from dabl import plot from dabl.datasets import load_ames import matplotlib.pyplot as plt # load the ames housing dataset # returns a plain dataframe data = load_ames() plot(data, 'SalePrice') plt.show()
# number of unique values in each categorical column
data.select_dtypes('object').nunique()

# percentage of missing data in each column
no_of_rows = data.shape[0]
percentage_of_missing_data = data.isnull().sum() / no_of_rows
print(percentage_of_missing_data)

# comparison of all other attributes with respect to math marks
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col='math score')

# comparison of all other attributes with respect to reading marks
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col='reading score')

# comparison of all other attributes with respect to writing marks
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col='writing score')

# Inferential Statistics
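# A minimal sketch of an inferential-statistics step to follow the heading
# above (not part of the original analysis): a Pearson correlation test
# between two of the score columns. Assumes scipy is available; the column
# names are the ones used in the dabl.plot calls above.
from scipy import stats

r, p_value = stats.pearsonr(data['reading score'], data['writing score'])
print(f"Pearson r = {r:.3f}, p-value = {p_value:.3g}")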
# print the shape of the feature data
wine.data.shape

# print the first five records of the wine features and the targets
print(wine.data[0:5])
print(wine.target)

# count the number of observations in each class
for i in set(wine.target):
    print('Class', i, ' -> ', list(wine.target).count(i))

# explore() is assumed to be a helper defined earlier in the notebook
explore()

# import the Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB as GNB

# train the model on the training set and predict on the test set
# (X_train, X_test, y_train, y_test are assumed to come from an earlier
# train/test split cell)
y_pred = GNB().fit(X_train, y_train).predict(X_test)

# import the scikit-learn metrics module for the accuracy calculation
from sklearn import metrics

# model accuracy: how often is the classifier correct?
print(f"Accuracy: {100 * metrics.accuracy_score(y_test, y_pred):.3f}%")

import matplotlib.pyplot as plt

from dabl import plot
from dabl.utils import data_df_from_bunch

plot(data_df_from_bunch(wine), 'target')
plt.show()
""" Wine Classification Dataset Visualization ========================================== """ import matplotlib.pyplot as plt from sklearn.datasets import load_wine from dabl import plot from dabl.utils import data_df_from_bunch wine_bunch = load_wine() wine_df = data_df_from_bunch(wine_bunch) plot(wine_df, 'target') plt.show()