def test_load_wine():
    """Check shape/size invariants of load_wine and its return_X_y option."""
    res = load_wine()
    assert_equal(res.data.shape, (178, 13))
    assert_equal(res.target.size, 178)
    assert_equal(res.target_names.size, 3)
    assert_true(res.DESCR)

    # test return_X_y option: the tuple must mirror the Bunch already loaded
    # above (the original reloaded the dataset a redundant third time).
    X_y_tuple = load_wine(return_X_y=True)
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], res.data)
    assert_array_equal(X_y_tuple[1], res.target)
def test_load_wine():
    """Check shape/size invariants of load_wine and its return_X_y option."""
    res = load_wine()
    assert_equal(res.data.shape, (178, 13))
    assert_equal(res.target.size, 178)
    assert_equal(res.target_names.size, 3)
    assert_true(res.DESCR)

    # test return_X_y option; partial(load_wine) bound no arguments, so the
    # loader can be passed directly.
    check_return_X_y(res, load_wine)
def test_feature_correlation_integrated_mutual_info_classification(self):
    """
    Test FeatureCorrelation visualizer with mutual information
    on wine dataset (classification)
    """
    data = datasets.load_wine()
    X, y = data['data'], data['target']

    # Score each feature by mutual information against the class labels.
    viz = FeatureCorrelation(method='mutual_info-classification')
    # random_state pins the MI estimator's RNG so the rendered image is
    # reproducible for the baseline comparison below.
    viz.fit(X, y, random_state=12345)
    viz.poof()

    # Compare the rendered figure against the stored baseline image.
    self.assert_images_similar(viz)
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    """Render a mutual-information feature-correlation plot for the wine data
    and save it to *path*."""
    bunch = datasets.load_wine()
    labels = bunch['target']
    columns = np.array(bunch['feature_names'])
    frame = pd.DataFrame(bunch['data'], columns=columns)

    # Restrict the plot to a hand-picked subset of features.
    selected = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']
    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=selected)
    visualizer.fit(frame, labels, random_state=0)
    visualizer.poof(outpath=path, clear_figure=True)
def load_wine():
    """KMeans wine data: return (full DataFrame incl. target, scaled features)."""
    bunch = datasets.load_wine()
    frame = pd.DataFrame(bunch.data)
    frame.columns = bunch.feature_names
    frame['target'] = bunch.target
    # Min-max scale the feature columns only (target column excluded).
    features_only = frame.drop('target', axis=1)
    scaled = MinMaxScaler().fit_transform(features_only)
    return frame, scaled
def Wine(training_size, test_size, n, PLOT_DATA):
    """Prepare PCA-reduced, rescaled wine data for a classifier demo.

    training_size/test_size: samples per class; n: output feature count
    (number of qubits); PLOT_DATA: show a 2-D scatter of the reduced data.
    Returns (sample_train, training_input, test_input, class_labels).
    """
    class_labels = [r'A', r'B', r'C']
    # Use the keyword form: the positional boolean for return_X_y was
    # deprecated in scikit-learn 0.23 and removed in 1.1.
    data, target = datasets.load_wine(return_X_y=True)
    sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=0.1, random_state=7)

    # Now we standarize for gaussian around 0 with unit variance
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Now reduce number of features to number of qubits
    pca = PCA(n_components=n).fit(sample_train)
    sample_train = pca.transform(sample_train)
    sample_test = pca.transform(sample_test)

    # Scale to the range (-1,+1); the scaler is fitted on train+test combined.
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick training size number of samples from each distro
    training_input = {key: (sample_train[label_train == k, :])[:training_size]
                      for k, key in enumerate(class_labels)}
    # NOTE(review): test_input is sliced from the *training* pool (rows
    # [training_size : training_size+test_size], disjoint from training_input)
    # rather than from sample_test — confirm this is intentional.
    test_input = {key: (sample_train[label_train == k, :])[training_size:(
        training_size+test_size)] for k, key in enumerate(class_labels)}

    if PLOT_DATA:
        for k in range(0, 3):
            plt.scatter(sample_train[label_train == k, 0][:training_size],
                        sample_train[label_train == k, 1][:training_size])
        plt.title("PCA dim. reduced Wine dataset")
        plt.show()

    return sample_train, training_input, test_input, class_labels
# plt.show() if it % (iterations / 1000) == 0: temp = nearest_neighbors(scale(A, input), label) if correct < temp: correct = temp A_optimal = A print('Iteration', it, 'Nearest neighbors on nca data:') print('Got', correct, 'correct out of', input.shape[0]) else: print('Iteration', it, 'Nearest neighbors on nca data:') print('Got', temp, 'correct out of', input.shape[0]) return A_optimal if __name__ == "__main__": X, y = load_wine(return_X_y=True) # X = np.array([[0, 0, 0.1], [0, 0.1, 0.1], [0.9, 0.6, 0.8], [0.9, 0.5, 0.7]]) # y = np.array([0, 0, 1, 1]) print('Nearest neighbors on raw data:') print('Got', nearest_neighbors(X, y), 'correct out of', X.shape[0]) A = scaling_matrix(X) print('A\n', A) print('Nearest neighbors on scaled data:') print('Got', nearest_neighbors(scale(A, X), y), 'correct out of', X.shape[0]) A = neighborhood_components_analysis(X, y, A, 100000, 0.001) print('A\n', A) print('Nearest neighbors on nca data:') print('Got', nearest_neighbors(scale(A, X), y), 'correct out of',
def test_data():
    """Make sure dataset is not Null"""
    X, y = load_wine(return_X_y=True)
    # The original `assert X and y == numpy.ndarray` was broken: the truth
    # value of an array is ambiguous, and it compared y to the *type* object.
    assert isinstance(X, numpy.ndarray)
    assert isinstance(y, numpy.ndarray)
    # Both arrays must be non-empty.
    assert X.size > 0 and y.size > 0
def get_wine():
    """Return the wine dataset as a DataFrame with a 'class' label column."""
    bunch = datasets.load_wine()
    frame = pd.DataFrame(bunch['data'])
    frame['class'] = bunch['target']
    return frame
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets

# A single decision tree always splits on maximum information gain, which is
# fairly rigid. Extremely-randomized trees (Extra Trees): random samples and
# random split conditions.
print('--wine--')
X, y = datasets.load_wine(return_X_y=True)
clf = DecisionTreeClassifier()
# cv relies on (Stratified)KFold.
print(cross_val_score(clf, X, y, cv=6, scoring='accuracy').mean())
forest = RandomForestClassifier()
print(cross_val_score(forest, X, y, cv=6, scoring='accuracy').mean())
extra = ExtraTreesClassifier()
print(cross_val_score(extra, X, y, cv=6, scoring='accuracy').mean())

# Same comparison on the iris dataset.
print('--鸢尾花--')
X, y = datasets.load_iris(return_X_y=True)
clf = DecisionTreeClassifier()
# cv relies on (Stratified)KFold.
print(cross_val_score(clf, X, y, cv=6, scoring='accuracy').mean())
forest = RandomForestClassifier()
print(cross_val_score(forest, X, y, cv=6, scoring='accuracy').mean())
extra = ExtraTreesClassifier()
print(cross_val_score(extra, X, y, cv=6, scoring='accuracy').mean())
from sklearn import datasets, metrics # 如果是分類問題,請使用 DecisionTreeClassifier,若為回歸問題,請使用 DecisionTreeRegressor from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.model_selection import train_test_split # 讀取wine資料集 wine = datasets.load_wine() # 切分訓練集/測試集 x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25, random_state=4) # 建立模型 # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html # DecisionTreeClassifier(criterion=’gini’ # , splitter=’best’ # , max_depth=None # , min_samples_split=2 # , min_samples_leaf=1 # , min_weight_fraction_leaf=0.0 # , max_features=None # , random_state=None # , max_leaf_nodes=None # , min_impurity_decrease=0.0 # , min_impurity_split=None # , class_weight=None # , presort=False) clf = DecisionTreeClassifier()
from itertools import product
from collections import defaultdict
from sklearn.metrics.pairwise import pairwise_distances
import warnings
warnings.filterwarnings('ignore')

# Import 7.2. Toy datasets from scikit-learn Library
# https://scikit-learn.org/stable/datasets/index.html
# NOTE(review): pd and np are used below but not imported in this chunk —
# confirm they are imported earlier in the file.
from sklearn.datasets import load_digits
data_digits = load_digits()
X1, Y1 = pd.DataFrame(data_digits["data"]), pd.Series(data_digits["target"])
Dataset = "digits"

from sklearn.datasets import load_wine
data_wine = load_wine()
#X1, Y1 = pd.DataFrame(data_wine["data"],columns=data_wine.feature_names), pd.Series(data_wine["target"])
#Dataset = "wine"


def pairwiseDistCorr(X1, X2):
    """Correlation between the pairwise-distance matrices of X1 and X2."""
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]


# Run RCA
# Output dimensionalities to sweep: 2, 5, 8, ... plus the full dimension.
dims = list(np.arange(2, (X1.shape[1] - 1), 3))
dims.append(X1.shape[1])
tmp = defaultdict(dict)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
from matplotlib.font_manager import *

# Font for CJK labels on Windows.
# NOTE(review): the path is not a raw string; '\W', '\F', '\s' are not escape
# sequences today, but r'...' would be safer — confirm.
myfont = FontProperties(fname='C:\Windows\Fonts\simfang.ttf')

RANDOM_STATE = 42
FIG_SIZE = (10, 7)

features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.30, random_state=RANDOM_STATE)

# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with PCA')
#!/usr/bin/env python """Test xgboost integration for classification task.""" from sklearn.datasets import load_wine from sklearn.model_selection import train_test_split import wandb from wandb.integration.xgboost import wandb_callback from xgboost import XGBClassifier X, y = load_wine(return_X_y=True, as_frame=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) model = XGBClassifier(use_label_encoder=False, eval_metric=['mlogloss', 'auc'], seed=42, n_estimators=50) wandb.init(project="wine-xgboost") model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=40, callbacks=[wandb_callback()], verbose=False)
plt.ylim(-1, n_features)
plot_feature_importances_dataset(model)
plt.show()
'''
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

dataset = load_wine()
x = dataset.data
y = dataset.target

df = pd.DataFrame(x, columns=dataset.feature_names)
# Drop six features by name; NOTE(review): df1 appears unused below (df2 is
# built from the *full* frame) — confirm intent.
df1 = df.drop(['magnesium', 'alcalinity_of_ash', 'nonflavanoid_phenols', 'total_phenols', 'alcohol', 'ash'],axis=1)
df2 = df.to_numpy()
print(df1.shape)

# 1. Data
# dataset = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    df2, dataset.target, train_size=0.8, random_state=44
)
def main():
    """Interactive EDA report: pick a response column from the wine data,
    plot predictor/response relationships, run per-feature regressions and
    mean-square-difference tables, compute feature importances, and write
    everything to output.html."""
    # create path for html output
    if not os.path.exists("html"):
        os.makedirs("html")

    # import data to pandas
    data = load_wine()
    input_df = pd.DataFrame(data.data)

    # find which columns are predictors and which is response
    cols = input_df.columns.to_list()
    print(input_df.head())
    check = False
    while not check:
        response = input(f"Which column is the response? \n {cols}? \n")
        if response in cols:
            check = True
        elif int(response) in cols:
            response = int(response)
            check = True
        else:
            print("Incorrect user input.")
    # NOTE(review): this `else` binds to the `while` loop, so it runs on every
    # normal loop exit and overwrites the user's choice with 1 — confirm intent.
    else:
        response = 1
    predictors = [x for x in cols if x != response]

    # determine which columns are categorical and which are continuous
    bool_dict = {response: bool_check(input_df[response])}
    plot_dict = {}
    for predictor in predictors:
        bool_dict[predictor] = bool_check(input_df[predictor])

    # generate plots if response is categorical
    if bool_dict[response]:
        for predictor in predictors:
            if bool_dict[predictor]:
                # heat plot (categorical predictor vs categorical response)
                df = input_df[[response, predictor]].copy()
                hist_2d = px.density_heatmap(df, x=predictor, y=response)
                hist_2d.update_xaxes(title=predictor)
                hist_2d.update_yaxes(title=response)
                hist_2d.show()
                plot_loc = f"html/{predictor}_plot.html"
                hist_2d.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc
            else:
                # violin plot (continuous predictor grouped by response)
                df = input_df[[response, predictor]].copy()
                violin = px.violin(df, y=predictor, color=response, violinmode="overlay")
                violin.update_layout(
                    title_text=f"violin plot of {predictor} grouped by {response}",
                )
                violin.update_yaxes(title_text=predictor)
                violin.show()
                plot_loc = f"html/{predictor}_plot.html"
                violin.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc
    # generate plots if response is continuous
    else:
        for predictor in predictors:
            if bool_dict[predictor]:
                # histogram plot (categorical predictor, continuous response)
                df = input_df[[response, predictor]].copy()
                fig = px.histogram(
                    df,
                    x=response,
                    y=response,
                    color=predictor,
                    marginal="box",
                    hover_data=df.columns,
                )
                fig.show()
                plot_loc = f"html/{predictor}_plot.html"
                fig.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc
            else:
                # scatter plot with trend line (continuous vs continuous)
                df = input_df[[response, predictor]].copy()
                scatter = px.scatter(df, x=predictor, y=response, trendline="ols")
                scatter.update_layout(title_text=f"{predictor} v. {response}")
                scatter.update_xaxes(ticks="inside", title_text=predictor)
                scatter.update_yaxes(ticks="inside", title_text=response)
                scatter.show()
                plot_loc = f"html/{predictor}_plot.html"
                scatter.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc

    # generate stats data inputs
    X_cols = input_df.drop(response, axis=1).columns.to_list()
    X = input_df.drop(response, axis=1).to_numpy()
    y = input_df[response].to_numpy()
    t_val, p_val, stat_plots = {}, {}, {}
    # linear regression stats if response is continuous
    if not bool_dict[response]:
        for idx, column in enumerate(X.T):
            column = X[:, idx]
            feature_name = X_cols[idx]
            predictor = statsmodels.api.add_constant(column)
            linear_regression_model = statsmodels.api.OLS(y, predictor, missing="drop")
            linear_regression_fitted = linear_regression_model.fit()
            print(linear_regression_fitted.summary())
            # NOTE(review): names look swapped — p_value is read from
            # .tvalues and t_value from .pvalues; confirm intent.
            p_value = round(linear_regression_fitted.tvalues[1], 4)
            t_value = "{:.6e}".format(linear_regression_fitted.pvalues[1])
            p_val[feature_name], t_val[feature_name] = t_value, p_value
            fig = px.scatter(x=column, y=y, trendline="ols")
            fig.update_layout(
                title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
                xaxis_title=f"Variable: {feature_name}",
                yaxis_title=f"Response: {response}",
            )
            fig.show()
            plot_loc = f"html/{feature_name}_stats_plot.html"
            fig.write_html(
                file=plot_loc,
                include_plotlyjs="cdn",
            )
            stat_plots[feature_name] = plot_loc
    # logistic regression stats if response is boolean
    else:
        for idx, column in enumerate(X.T):
            column = X[:, idx]
            feature_name = X_cols[idx]
            predictor = statsmodels.api.add_constant(column)
            logistic_regression_model = statsmodels.api.Logit(y, predictor, missing="drop")
            logistic_regression_fitted = logistic_regression_model.fit()
            print(logistic_regression_fitted.summary())
            # NOTE(review): same apparent p/t swap as the OLS branch above.
            p_value = round(logistic_regression_fitted.tvalues[1], 4)
            t_value = "{:.6e}".format(logistic_regression_fitted.pvalues[1])
            p_val[feature_name], t_val[feature_name] = t_value, p_value
            fig = px.scatter(x=column, y=y, trendline="ols")
            fig.update_layout(
                title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
                xaxis_title=f"Variable: {feature_name}",
                yaxis_title=f"Response: {response}",
            )
            fig.show()
            plot_loc = f"html/{feature_name}_stats_plot.html"
            fig.write_html(
                file=plot_loc,
                include_plotlyjs="cdn",
            )
            stat_plots[feature_name] = plot_loc

    # mean square difference setup: 10 equal-width bins per feature
    msd_plots, msd_tables = {}, {}
    for feature in X_cols:
        data = input_df[feature].to_list()
        data.sort()
        data_range = max(data) - min(data)
        step = data_range / 10
        table = pd.DataFrame(columns=[
            "lower bin",
            "upper bin",
            "median",
            "count",
            "bin mean",
            "population mean",
            "mean square diff",
        ])
        weighted_table = pd.DataFrame(columns=[
            "lower bin",
            "upper bin",
            "median",
            "count",
            "bin mean",
            "population mean",
            "mean square diff",
            "pop proportion",
            "weighted MSD",
        ])
        # mean square unweighted table
        for n in range(10):
            low, high = min(data) + (step * n), min(data) + (step * (n + 1))
            # Last bin is closed on both ends so the maximum is included.
            if n == 9:
                b = [y for y in data if low <= y <= high]
            else:
                b = [y for y in data if low <= y < high]
            if not b:
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": 0,
                    "count": 0,
                    "bin mean": 0,
                    "population mean": np.nanmean(data),
                    "mean square diff": 0,
                }
            else:
                med, count, mean = (
                    statistics.median(b),
                    int(len(b)),
                    statistics.mean(b),
                )
                pop_mean = np.nanmean(data)
                mean_sq_diff = abs((mean - pop_mean)**2)
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": med,
                    "count": count,
                    "bin mean": mean,
                    "population mean": pop_mean,
                    "mean square diff": mean_sq_diff,
                }
            table = table.append(new_row, ignore_index=True)
        msd_tables[feature] = html_write(table, feature, "unweighted")
        # mean square weighted table
        for n in range(10):
            low, high = min(data) + (step * n), min(data) + (step * (n + 1))
            if n == 9:
                b = [y for y in data if low <= y <= high]
            else:
                b = [y for y in data if low <= y < high]
            if not b:
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": 0,
                    "count": 0,
                    "bin mean": 0,
                    "population mean": np.nanmean(data),
                    "mean square diff": 0,
                    "pop proportion": 0,
                    "weighted MSD": 0,
                }
            else:
                med, count, mean = (
                    statistics.median(b),
                    int(len(b)),
                    statistics.mean(b),
                )
                # Weight each bin's MSD by its share of the population.
                pop_prop = count / len(data)
                pop_mean = np.nanmean(data)
                mean_sq_diff = abs((mean - pop_mean)**2)
                weighted_msd = mean_sq_diff * pop_prop
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": med,
                    "count": count,
                    "bin mean": mean,
                    "population mean": pop_mean,
                    "mean square diff": mean_sq_diff,
                    "pop proportion": pop_prop,
                    "weighted MSD": weighted_msd,
                }
            weighted_table = weighted_table.append(new_row, ignore_index=True)
        table = weighted_table
        # NOTE(review): this overwrites the "unweighted" entry stored above
        # for the same feature — confirm intent.
        msd_tables[feature] = html_write(table, feature, "weighted")
        # plot from table
        msd_plots[feature] = plot_msd(table, feature, response)

    # feature importance calculations (random forest, type chosen by response)
    y = input_df[response].values
    X = input_df.drop(response, axis=1)
    if bool_dict[response]:
        rf = RandomForestClassifier()
        rf.fit(X, y)
        feature_importance = rf.feature_importances_
    else:
        rf = RandomForestRegressor()
        rf.fit(X, y)
        feature_importance = rf.feature_importances_
    feature_importance_dict = {
        predictors[i]: feature_importance[i]
        for i in range(len(predictors))
    }

    # generate final output: one HTML section per result dictionary
    output_list = [
        {i: bool_dict[i] for i in bool_dict if i != response},
        plot_dict,
        p_val,
        t_val,
        stat_plots,
        msd_tables,
        msd_plots,
        feature_importance_dict,
    ]
    output_names = [
        "boolean",
        "plots",
        "p values",
        "t values",
        "statistics plots",
        "msd table",
        "msd plots",
        "feature importances",
    ]
    html = ""
    for i in range(len(output_list)):
        df = pd.DataFrame.from_dict(
            output_list[i],
            orient="index",
        )
        try:
            # Turn html file paths into clickable links.
            if df[0].str.contains("html").any():
                df = df.style.format(make_clickable).render()
            else:
                df = df.style.render()
        except AttributeError:
            df = df.style.set_precision(4).render()
        html = html + "\n<br><br>" + output_names[i] + "\n" + df
    with open("output.html", "w") as f:
        f.write(html)
        # Redundant: the with-block already closes the file.
        f.close()
from IPython.core.display import display
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
import time
from sklearn.metrics import accuracy_score

# Load the wine dataset.
raw_data = datasets.load_wine()
#print(raw_data)

# Hold out 20% of the samples for testing.
data_train, data_test, label_train, label_test = train_test_split(
    raw_data['data'], raw_data['target'], test_size=0.2)
print(
    len(data_train), ' samples in training data\n',
    len(data_test), ' samples in test data\n',
)

# Candidate classifier families compared later in the script.
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
kmeanModel = KMeans(n_clusters=k).fit(X) kmeanModel.fit(X) distortions.append( sum( np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0]) # Plot the elbow plt.plot(K, distortions, 'bx-') plt.xlabel('k') plt.ylabel('Distortion') plt.title('The Elbow Method showing the optimal k') plt.show() X = load_wine().data y = load_wine().target scaler = StandardScaler() scaler.fit(X) X = scaler.transform(X) transformer = random_projection.GaussianRandomProjection(n_components=2) dr_X = transformer.fit_transform(X) #obtain elbow plot plot_elbow(dr_X) #pick three clusters, and view a few groupings km = KMeans(n_clusters=2, random_state=0).fit(dr_X)
from sklearn.datasets import load_wine
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

"""
Simple Decision Tree Classifier that identifies types of Wine.
@Author Afaq Anwar
@Version 02/25/2019
"""

# Load the dataset once (the original called load_wine() three times).
wine = load_wine()

# Sets up the data as a DataFrame in order to easier work with data.
df = pd.DataFrame(wine.data)
df.columns = wine.feature_names
df['type'] = wine.target

# X = Features, y = labels
X = df.drop('type', axis=1)
y = df['type']

# Splits the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
print(accuracy_score(y_test, predictions))
plt.ylabel('X1 [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# Learning curve for the Gaussian NB estimator fitted earlier in the file.
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = gnb
plot_learning_curve(estimator, X, Y, title, cv=cv, n_jobs=4)
plt.show()
del gnb

# Wine
print("Wine Test:")
wine_dataset = datasets.load_wine()
X = wine_dataset.data
# Randomly pick two feature columns so 2-D decision plots are possible.
indice = sorted(np.random.choice(X.shape[1], 2, replace=False))
X = X[:, indice]
# print("X:", X)
Y = wine_dataset.target
# print("Y:", Y)
# print("Class lables:", np.unique(Y))

# Stratified 70/30 split keeps class proportions in both sets.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)
sc = StandardScaler()
# Copyright (c) 2020, Anders Lervik. # Distributed under the MIT License. See LICENSE for more info. """ Residual variance ================= This example will show the residual variance from a `principal component analysis <https://en.wikipedia.org/wiki/Principal_component_analysis>`_ as a function of the number of principal components considered. """ from matplotlib import pyplot as plt import pandas as pd from sklearn.datasets import load_wine from sklearn.preprocessing import scale from sklearn.decomposition import PCA from psynlig import pca_residual_variance plt.style.use('ggplot') data_set = load_wine() data = pd.DataFrame(data_set['data'], columns=data_set['feature_names']) data = scale(data) pca = PCA() pca.fit_transform(data) pca_residual_variance(pca, marker='o', markersize=16, alpha=0.8) plt.show()
# -*- coding: utf-8 -*- # http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html # wine数据集 from sklearn.datasets import load_wine wine = load_wine() X = wine.data y = wine.target # train_test_split # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html from sklearn.model_selection import train_test_split X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split( X, y, test_size=0.33, shuffle=True, random_state=33) print("Train_Test_Split", "TRAIN:", X_train_sp.shape[0], "TEST:", X_test_sp.shape[0]) # 原始K-fold # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html # 输入一个样本数为n的数据(使用X即可,因X,y样本数相同),返回分割后的 索引向量(生成器形式,需要用for依次获得每次分割结果) # 参数:shuffle打乱; random_state随机种子; kf.get_n_splits(X)获得折数 from sklearn.model_selection import KFold kf = KFold(n_splits=5, shuffle=True, random_state=33) kf_count = 0 for train_index, test_index in kf.split(X): X_train_kf, X_test_kf = X[train_index], X[test_index] y_train_kf, y_test_kf = y[train_index], y[test_index] print("KFold Num:", kf_count, "TRAIN:", train_index.shape[0], "TEST:", test_index.shape[0]) kf_count += 1
from sklearn import datasets
import numpy as np
import pandas
from classifiers.knn import knn

########################################
# Load and organize data into different arrays
########################################
# NOTE(review): despite the variable name, this loads the *wine* dataset,
# not iris.
iris_dataset = datasets.load_wine()  # load
data = iris_dataset['data']  # data values
target = iris_dataset['target']  # its targets
target_names = iris_dataset['target_names']

# split data index
split_80 = int(len(data)*0.8)  # *0.8 = get 80%

# 80% for train
# NOTE(review): the data is sliced without shuffling; if samples are ordered
# by class the test set will be dominated by one class — confirm intent.
train_data = data[0:split_80]
train_target = target[0:split_80]

# 20% for test
test_data = data[split_80:]
test_target = target[split_80:]

########################################
# Create a confusion Matrix
########################################
classes_count = len(target_names)
# rows: actual class, cols: predicted class
confusion_matrix = np.zeros((classes_count, classes_count), dtype=int)

k = 3  # k nearest neighbors#
# for each example in test data
def load_dataset():
    """
    Load the wine dataset.
    :return: the sklearn Bunch returned by datasets.load_wine()
    """
    return datasets.load_wine()
# -*- coding:utf-8 -*-
#@Time : 2020/4/11 16:47
#@Author: Kevin.Liu
#@File : extremeForest.py

# Extremely randomized forests (Extra-Trees)
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

# Wine dataset. Keyword form required: passing return_X_y positionally was
# deprecated in scikit-learn 0.23 and removed in 1.1.
X, y = datasets.load_wine(return_X_y=True)
# Decision tree
clf = DecisionTreeClassifier()
print(cross_val_score(clf, X, y, cv=6, scoring='accuracy').mean())
# Random forest
forest = RandomForestClassifier(n_estimators=100)
print(cross_val_score(forest, X, y, cv=6, scoring='accuracy').mean())
# Extra trees
extra = ExtraTreesClassifier(n_estimators=100)
print(cross_val_score(extra, X, y, cv=6, scoring='accuracy').mean())

# Iris dataset: only 4 features, simpler than the wine data.
X, y = datasets.load_iris(return_X_y=True)
# -*- coding: utf-8 -*- """ Created on Tue Mar 17 13:35:36 2020 @author: casti """ from sklearn.datasets import load_wine import pandas as pd d = load_wine() print(d['DESCR']) df = pd.DataFrame(d['data'], columns=d['feature_names']) y = d['target'] # cultivator
def download(output_dir: str):
    """Write the wine dataset (features plus a 'target' column) to
    <output_dir>/data.csv, creating the directory if needed."""
    bunch = load_wine(as_frame=True)
    frame = bunch['data']
    frame['target'] = bunch['target']
    os.makedirs(output_dir, exist_ok=True)
    frame.to_csv(os.path.join(output_dir, "data.csv"), index=False)
@author: sandra_chang """ from sklearn import datasets, metrics # 如果是分類問題,請使用 DecisionTreeClassifier,若為回歸問題,請使用 DecisionTreeRegressor from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.model_selection import train_test_split from sklearn import linear_model import warnings warnings.filterwarnings('ignore') wineData = datasets.load_wine() x_train, x_test, y_train, y_test = train_test_split(wineData.data, wineData.target, test_size=0.2, random_state=4) DTC = DecisionTreeClassifier() DTC.fit(x_train, y_train) y_pred = DTC.predict(x_test) acc = metrics.accuracy_score(y_test, y_pred) print("Decision Tree Acuuracy: ", acc)
def wineTest():
    """Run advTest on the wine dataset with a linear kernel."""
    # 130: dataset-specific parameter forwarded to advTest — TODO confirm meaning.
    advTest(load_wine(), 130, kernel.linear)


def irisTest():
    """Run advTest on the iris dataset with a degree-3 polynomial kernel."""
    advTest(load_iris(), 100, kernel.make_poly_kernel(3))


# Main method. Makes calls based on parameters.
if __name__ == '__main__':
    print("Args:", str(sys.argv[1:]))
    if "multi" in sys.argv:
        # Multi-class test path.
        if "wine" in sys.argv:
            multi_test.run(load_wine(), 130)
        else:
            multi_test.run(load_iris(), 100)
    else:
        # Numbered unit tests, else dataset-specific tests (iris default).
        if "1" in sys.argv:
            testOne()
        elif "2" in sys.argv:
            testTwo()
        elif "3" in sys.argv:
            testThree()
        elif "wine" in sys.argv:
            wineTest()
        else:
            irisTest()
# # pca_example.py # import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from sklearn import decomposition from sklearn import datasets # wczytajmy dane dotyczace win wines = datasets.load_wine() # same punkty (178 punktow 13 wymiarowych) sa w .data points = wines.data # klasy wines_types = wines.target # nazwy klas wines_names = wines.target_names # pca 3d pca = decomposition.PCA(n_components=3) points_reduced=points; pca.fit(points_reduced) points_reduced = pca.transform(points_reduced) fig = plt.figure()
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline

print(__doc__)

# Code source: Tyler Lanigan <*****@*****.**>
#              Sebastian Raschka <*****@*****.**>
# License: BSD 3 clause

RANDOM_STATE = 42
FIG_SIZE = (10, 7)

# NOTE(review): train_test_split, PCA, GaussianNB and StandardScaler are used
# below but not imported in this chunk — confirm they are imported elsewhere.
features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.30, random_state=RANDOM_STATE)

# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.datasets as data
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Explore the wine bunch object interactively.
wine = data.load_wine()
type(wine)
wine.keys()
print(wine.DESCR)
wine.data
wine.feature_names

df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['wine_type'] = wine.target
wine.target
df.info()
df.head()

sns.pairplot(df)
# plt.show()

reg = LinearRegression()
reg

X = df['proline']
y = df['alcohol']
X  # here X is Pandas's series; so it's like 1D; it's wrong, we need to convert data frame.
# Deliberate error demo: fitting with a 1-D Series raises a ValueError
# (use df[['proline']] for a 2-D frame).
reg.fit(X, y)
# Cause of error: sklearn requirements is X should be a 2D array, or Pandas DataFrame, or a Matrix;
# it should be 2D tabel of data
# y can be 1D array or 1D DataFrame
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
# Needed below: datasets.load_wine() raised NameError without this import.
from sklearn import datasets

from util import plot_classifier


def effect_of_removing_examples(X, y):
    """Show that a linear SVM trained only on its support vectors produces
    the same decision boundary as one trained on the full data."""
    # Train a linear SVM
    svm = SVC(kernel="linear")
    svm.fit(X, y)
    plot_classifier(X, y, svm, lims=(11, 15, 0, 6))

    # Make a new data set keeping only the support vectors
    print("Number of original examples", len(X))
    print("Number of support vectors", len(svm.support_))
    X_small = X[svm.support_]
    y_small = y[svm.support_]

    # Train a new SVM using only the support vectors
    svm_small = SVC(kernel="linear")
    svm_small.fit(X_small, y_small)
    plot_classifier(X_small, y_small, svm_small, lims=(11, 15, 0, 6))


# Use only the first two wine features so the boundary can be drawn in 2-D.
df = datasets.load_wine()
X = df.data[:, [0, 1]]
y = df.target
effect_of_removing_examples(X, y)
def wineTest():
    """Run advTest on the wine dataset with a linear kernel."""
    # 130: dataset-specific parameter forwarded to advTest — TODO confirm meaning.
    advTest(load_wine(), 130, kernel.linear)
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.datasets import load_wine

wine_set = load_wine(
)  #dict_keys(['feature_names', 'data', 'target_names', 'target', 'DESCR'])

X_train, X_test, y_train, y_test = train_test_split(wine_set['data'], wine_set['target'], random_state=0)

# 1-nearest-neighbour classifier.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
score = knn.score(X_test, y_test)

# Classify a single hand-crafted sample (13 wine feature values).
X_new = np.array([[
    13.2, 2.77, 2.51, 18.5, 96.6, 1.04, 2.55, 0.57, 1.47, 6.2, 1.05, 3.33,
    820
]])
c = knn.predict(X_new)
print(c)
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

__author__ = 'wangj'
__date__ = '2018/01/04 00:52'
__doc__ = ''' 使用RandomForest选择特征 '''

if __name__ == '__main__':
    # Feature selection with a RandomForest on the wine dataset.
    wine = datasets.load_wine()
    x = wine.data
    y = wine.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    stdsc = StandardScaler()
    x_train_std = stdsc.fit_transform(x_train)
    x_test_std = stdsc.transform(x_test)
    feature_names = wine.feature_names

    forest = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1)
    forest.fit(x_train_std, y_train)
    importances = forest.feature_importances_
    # Feature indices sorted by importance, descending.
    indices = np.argsort(importances)[::-1]
    for f in range(x_train_std.shape[1]):
        # BUG FIX: look the name up through indices[f] as well — the original
        # printed feature_names[f] next to the *sorted* importances, pairing
        # wrong names with wrong scores.
        print('{0:<3}{1:30}{2}'.format(f + 1, feature_names[indices[f]], importances[indices[f]]))

    plt.title('Feature Importance')
    plt.bar(range(x_train_std.shape[1]), importances[indices], color='lightblue', align='center')
    # BUG FIX: tick labels must follow the same importance-sorted order as
    # the bars (the original labelled them in the unsorted order).
    plt.xticks(range(x_train_std.shape[1]), [feature_names[i] for i in indices], rotation=90)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine

# (The triple-quoted strings below are the author's original notes, kept
# verbatim; English translations follow each one as comments.)
'''聚类是一种无监督学习,它允许我们找到相似对象的组,这些对象彼此之间的相关性比与其他组中的对象更相关。业务用例的示例包括根据内容对文档、音乐和电影进行分组,或者根据购买行为查找客户群,作为推荐引擎的基础。'''
# Translation: clustering is unsupervised learning that finds groups of
# similar objects; use cases include grouping documents/music/movies by
# content, or segmenting customers by purchase behaviour for recommenders.
'''最流行的聚类算法之一是k-means。假设有 n 个数据点,算法工作如下: 步骤 1:初始化- 选择 k 个随机点作为聚类中心,称为质心步骤 2:聚类分配- 根据与每个质心的距离将每个数据点分配到其最近的质心, 并形成 k 个集群 第 3 步:质心更新- 对于每个新集群,通过取分配给集群的所有点的平均值来计算其质心 第 4 步:重复第2 步和第 3 步,直到没有任何集群分配发生变化,或者达到最大迭代次数'''
# Translation: k-means — 1) pick k random centroids; 2) assign each point to
# its nearest centroid, forming k clusters; 3) recompute each centroid as the
# mean of its cluster; 4) repeat 2-3 until assignments stop changing or the
# maximum iteration count is reached.

'''使用numpy计算2点间的距离'''
# Translation: compute the distance between two points with numpy.
x1 = np.array([0, 1])
x2 = np.array([2, 0])
print(np.sqrt(((x1 - x2)**2).sum()))  # 2.23606797749979
print(np.sqrt(5))  # 2.23606797749979

'''计算葡萄酒分类:每种葡萄酒有 13 个特征,如果我们可以将所有的葡萄酒分成 3 组,那么它将 13 维空间缩减为 3 维空间。'''
# Translation: wine classification — each wine has 13 features; grouping all
# wines into 3 clusters reduces the 13-dimensional space to 3 groups.
data = load_wine()
wine = pd.DataFrame(data.data, columns=data.feature_names)
print(wine.shape)
print(wine.columns)
print(wine.iloc[:, :3].describe())  # summary stats for the first 3 columns

'''pd.plotting.scatter_matrix():显示沿对角线的直方图和对角线外每对属性的散点图'''
# Translation: scatter_matrix() shows histograms along the diagonal and
# pairwise scatter plots off the diagonal.
from pandas.plotting import scatter_matrix
scatter_matrix(wine.iloc[:, :])
plt.savefig("plot_win_scatter_matrix.png")
plt.show()

'''k的数量(子组)需要通过观察散点图进行主观判断(瞎猜)'''
# Translation: the number of clusters k must be judged subjectively from the
# scatter plots (a guess).
'''对数据进行标准化处理: z = (x - mean) / std 其中 x 是原始数据,mean 和 std 是 x 的平均值和标准差,z 是缩放后的 x,使得它以 0 为中心并且具有单位标准差。使用 sklearn.preprocessing 的StandardScaler'''
# Translation: standardize with z = (x - mean) / std so the data is centred
# at 0 with unit standard deviation, using sklearn's StandardScaler.
from sklearn.preprocessing import StandardScaler  # standardization utility
X = wine[['alcohol', 'total_phenols']]
scale = StandardScaler()  # instantiate the scaler
from sklearn.cluster import KMeans

from SA import simulated_annealing
from GRASP import grasp
from AG import genetic
from comons import evaluate_clusters, objective_function

matplotlib.use('TkAgg')

# load #############################################################################################################
max_time = 1

# Each dataset is turned into a list of {'id': index, 'coord': row} records.
iris = load_iris()['data']
iris = [{'id': x, 'coord': y} for x, y in zip(range(len(iris)), iris)]
k_I = [2, 4, 8, 11, 15, 17, 23, 28, 32, 50]  # candidate cluster counts (iris)

wine = load_wine()['data']
wine = [{'id': x, 'coord': y} for x, y in zip(range(len(wine)), wine)]
k_W = [3, 5, 13, 15, 20, 23, 25, 30, 41, 45]  # candidate cluster counts (wine)

# Ionosphere data fetched from the UCI repository; first 34 columns are features.
ionosphere = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data')
ionosphere = np.asarray(ionosphere.iloc[:,:34])
ionosphere = [{'id': x, 'coord': y} for x, y in zip(range(len(ionosphere)), ionosphere)]
k_H = [2, 3, 5, 10, 15, 20, 25, 30, 40, 50]

# utils ############################################################################################################

# kmeans
def kmeans(dataset, k):
    # Extract raw coordinates and time the KMeans fit.
    tmp = [i['coord'] for i in dataset]
    start = time.process_time()
    kmeans = KMeans(n_clusters=k).fit(tmp)
    # (function continues beyond this chunk)
# sklearn库 决策树分类器 wine数据集 from sklearn.datasets import load_wine # 引入数据集,sklearn包含众多数据集 from sklearn.model_selection import train_test_split # 将数据分为测试集和训练集 from sklearn import tree # 利用邻近点方式训练数据\ import matplotlib.pyplot as plt # 引入数据 wine=load_wine() # 引入wine数据 X_train,X_test,y_train,y_test=train_test_split(wine.data,wine.target,test_size=0.3) # 利用train_test_split进行将训练集和测试集进行分开,test_size占30% # 训练数据 clf=tree.DecisionTreeClassifier(random_state=0) # 引入训练方法 clf.fit(X_train,y_train) # 进行填充测试数据进行训练 # 预测数据 print(clf.predict(X_test)) result=clf.score(X_test,y_test) print('score:',result) plt.figure(figsize=(15,9)) tree.plot_tree(clf ,filled=True ,feature_names=wine.feature_names ,class_names=wine.target_names ) # sklearn库 泰坦尼克号生存者预测 import pandas as pd from sklearn.tree import DecisionTreeClassifier import matplotlib.pyplot as plt from sklearn.model_selection import GridSearchCV data=pd.read_csv("E:/data/titannic_data.csv")