コード例 #1
0
ファイル: test_base.py プロジェクト: TaihuaLi/scikit-learn
def test_load_wine():
    """Check the wine loader's shapes and its return_X_y variant."""
    bunch = load_wine()
    # 178 samples, 13 features, 3 target classes.
    assert_equal(bunch.data.shape, (178, 13))
    assert_equal(bunch.target.size, 178)
    assert_equal(bunch.target_names.size, 3)
    assert_true(bunch.DESCR)

    # return_X_y=True must yield a (data, target) tuple matching the bunch.
    as_tuple = load_wine(return_X_y=True)
    reference = load_wine()
    assert_true(isinstance(as_tuple, tuple))
    assert_array_equal(as_tuple[0], reference.data)
    assert_array_equal(as_tuple[1], reference.target)
コード例 #2
0
def test_load_wine():
    """Validate wine dataset dimensions and the return_X_y contract."""
    dataset = load_wine()
    assert_equal(dataset.data.shape, (178, 13))
    assert_equal(dataset.target.size, 178)
    assert_equal(dataset.target_names.size, 3)
    assert_true(dataset.DESCR)

    # Delegate the return_X_y check to the shared helper.
    check_return_X_y(dataset, partial(load_wine))
コード例 #3
0
    def test_feature_correlation_integrated_mutual_info_classification(self):
        """
        FeatureCorrelation with mutual information on the wine
        dataset (classification) should produce the expected image.
        """
        dataset = datasets.load_wine()
        features, labels = dataset['data'], dataset['target']

        visualizer = FeatureCorrelation(method='mutual_info-classification')
        visualizer.fit(features, labels, random_state=12345)
        visualizer.poof()

        self.assert_images_similar(visualizer)
コード例 #4
0
def feature_correlation_mutual_info_classification(
        path="images/feature_correlation_mutual_info_classification.png"):
    """Render a mutual-information feature-correlation plot for selected wine
    features and write it to *path*."""
    dataset = datasets.load_wine()
    features, labels = dataset['data'], dataset['target']
    names = np.array(dataset['feature_names'])
    frame = pd.DataFrame(features, columns=names)

    # Restrict the plot to a handful of interpretable columns.
    selected = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols']

    visualizer = FeatureCorrelation(method='mutual_info-classification',
                                    feature_names=selected)
    visualizer.fit(frame, labels, random_state=0)
    visualizer.poof(outpath=path, clear_figure=True)
コード例 #5
0
ファイル: hw3.py プロジェクト: jezlax/python
def load_wine():
    """Load the sklearn wine data as a labeled DataFrame plus a
    min-max-scaled feature matrix (for k-means)."""
    raw = datasets.load_wine()

    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame['target'] = raw.target

    # Scale only the feature columns, leaving the label untouched.
    features_only = frame.drop('target', axis=1)
    scaled = MinMaxScaler().fit_transform(features_only)

    return frame, scaled
コード例 #6
0
def Wine(training_size, test_size, n, PLOT_DATA):
    """Prepare the wine dataset for a quantum-ML style experiment.

    Standardizes the features, reduces them to ``n`` PCA components,
    rescales everything to (-1, +1), and builds per-class training/test
    dictionaries.

    Parameters
    ----------
    training_size : int
        Number of samples per class used for training.
    test_size : int
        Number of samples per class reserved for testing.
    n : int
        Number of PCA components (qubits) to keep.
    PLOT_DATA : bool
        When true, scatter-plot the first two reduced dimensions.

    Returns
    -------
    tuple
        (sample_train, training_input, test_input, class_labels)
    """
    class_labels = [r'A', r'B', r'C']

    # Fix: the positional boolean for return_X_y was deprecated and later
    # removed from sklearn's dataset loaders; pass it by keyword.
    data, target = datasets.load_wine(return_X_y=True)
    sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=0.1,
                                                                          random_state=7)

    # Now we standardize for gaussian around 0 with unit variance
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Now reduce number of features to number of qubits
    pca = PCA(n_components=n).fit(sample_train)
    sample_train = pca.transform(sample_train)
    sample_test = pca.transform(sample_test)

    # Scale to the range (-1,+1)
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick training_size samples from each class for training.
    # NOTE(review): test_input is also sliced from sample_train (not
    # sample_test), as in the upstream example — verify this is intended.
    training_input = {key: (sample_train[label_train == k, :])[:training_size] for k, key in enumerate(class_labels)}
    test_input = {key: (sample_train[label_train == k, :])[training_size:(
        training_size+test_size)] for k, key in enumerate(class_labels)}

    if PLOT_DATA:
        for k in range(0, 3):
            plt.scatter(sample_train[label_train == k, 0][:training_size],
                        sample_train[label_train == k, 1][:training_size])

        plt.title("PCA dim. reduced Wine dataset")
        plt.show()

    return sample_train, training_input, test_input, class_labels
コード例 #7
0
        # plt.show()
        if it % (iterations / 1000) == 0:
            temp = nearest_neighbors(scale(A, input), label)
            if correct < temp:
                correct = temp
                A_optimal = A
                print('Iteration', it, 'Nearest neighbors on nca data:')
                print('Got', correct, 'correct out of', input.shape[0])
            else:
                print('Iteration', it, 'Nearest neighbors on nca data:')
                print('Got', temp, 'correct out of', input.shape[0])
    return A_optimal


if __name__ == "__main__":
    X, y = load_wine(return_X_y=True)
    # X = np.array([[0, 0, 0.1], [0, 0.1, 0.1], [0.9, 0.6, 0.8], [0.9, 0.5, 0.7]])
    # y = np.array([0, 0, 1, 1])
    print('Nearest neighbors on raw data:')
    print('Got', nearest_neighbors(X, y), 'correct out of', X.shape[0])

    A = scaling_matrix(X)
    print('A\n', A)
    print('Nearest neighbors on scaled data:')
    print('Got', nearest_neighbors(scale(A, X), y), 'correct out of',
          X.shape[0])

    A = neighborhood_components_analysis(X, y, A, 100000, 0.001)
    print('A\n', A)
    print('Nearest neighbors on nca data:')
    print('Got', nearest_neighbors(scale(A, X), y), 'correct out of',
コード例 #8
0
def test_data():
    """Make sure the dataset loads as non-empty numpy arrays."""
    X, y = load_wine(return_X_y=True)
    # Fix: the original `assert X and y == numpy.ndarray` raised ValueError
    # (truth value of a multi-element array is ambiguous) and compared y
    # against the *type object* instead of checking its type.
    assert isinstance(X, numpy.ndarray)
    assert isinstance(y, numpy.ndarray)
    assert X.size > 0 and y.size > 0
コード例 #9
0
def get_wine():
    """Return the wine dataset as a DataFrame with a 'class' label column."""
    bunch = datasets.load_wine()
    frame = pd.DataFrame(bunch['data'])
    frame['class'] = bunch['target']
    return frame
コード例 #10
0
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets

# Decision trees split on maximum information gain, which is fairly rigid.
# Extremely randomized trees: random sample subsets and random split conditions.

print('--wine--')
X, y = datasets.load_wine(return_X_y=True)
clf = DecisionTreeClassifier()
print(cross_val_score(clf, X, y, cv=6,
                      scoring='accuracy').mean())  # cv uses (Stratified)KFold

forest = RandomForestClassifier()
print(cross_val_score(forest, X, y, cv=6, scoring='accuracy').mean())

extra = ExtraTreesClassifier()
print(cross_val_score(extra, X, y, cv=6, scoring='accuracy').mean())

print('--鸢尾花--')
X, y = datasets.load_iris(return_X_y=True)
clf = DecisionTreeClassifier()
print(cross_val_score(clf, X, y, cv=6,
                      scoring='accuracy').mean())  # cv uses (Stratified)KFold

forest = RandomForestClassifier()
print(cross_val_score(forest, X, y, cv=6, scoring='accuracy').mean())

extra = ExtraTreesClassifier()
print(cross_val_score(extra, X, y, cv=6, scoring='accuracy').mean())
コード例 #11
0
from sklearn import datasets, metrics

# For classification use DecisionTreeClassifier; for regression use DecisionTreeRegressor.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Load the wine dataset.
wine = datasets.load_wine()

# Split into training / test sets.
x_train, x_test, y_train, y_test = train_test_split(wine.data,
                                                    wine.target,
                                                    test_size=0.25,
                                                    random_state=4)

# Build the model with default hyperparameters.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# DecisionTreeClassifier(criterion='gini'
# , splitter='best'
# , max_depth=None
# , min_samples_split=2
# , min_samples_leaf=1
# , min_weight_fraction_leaf=0.0
# , max_features=None
# , random_state=None
# , max_leaf_nodes=None
# , min_impurity_decrease=0.0
# , min_impurity_split=None
# , class_weight=None
# , presort=False)
clf = DecisionTreeClassifier()
コード例 #12
0
from itertools import product
from collections import defaultdict
from sklearn.metrics.pairwise import pairwise_distances

import warnings
warnings.filterwarnings('ignore')

# Import 7.2. Toy datasets from scikit-learn Library
#https://scikit-learn.org/stable/datasets/index.html
from sklearn.datasets import load_digits
data_digits = load_digits()
X1, Y1 = pd.DataFrame(data_digits["data"]), pd.Series(data_digits["target"])
Dataset = "digits"

from sklearn.datasets import load_wine
data_wine = load_wine()
# Uncomment the two lines below to run the experiment on wine instead of digits.
#X1, Y1 = pd.DataFrame(data_wine["data"],columns=data_wine.feature_names), pd.Series(data_wine["target"])
#Dataset = "wine"


def pairwiseDistCorr(X1, X2):
    """Correlation between the pairwise-distance matrices of two
    representations of the same samples."""
    assert X1.shape[0] == X2.shape[0]
    # Flatten both distance matrices and correlate them.
    dist_a = pairwise_distances(X1).ravel()
    dist_b = pairwise_distances(X2).ravel()
    return np.corrcoef(dist_a, dist_b)[0, 1]


# Run RCA
# Candidate projection dimensionalities: every 3rd value from 2 up to the
# feature count, plus the full dimensionality itself.
dims = list(np.arange(2, (X1.shape[1] - 1), 3))
dims.append(X1.shape[1])
tmp = defaultdict(dict)
コード例 #13
0
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
from matplotlib.font_manager import *

# NOTE(review): non-raw Windows path — '\W', '\F', '\s' are invalid escape
# sequences; a raw string r'C:\Windows\Fonts\simfang.ttf' would be safer.
myfont = FontProperties(fname='C:\Windows\Fonts\simfang.ttf') 
RANDOM_STATE = 42
FIG_SIZE = (10, 7)
features, target = load_wine(return_X_y=True)
# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
test_size=0.30,
random_state=RANDOM_STATE)
# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)
# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)
# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))
print('\nPrediction accuracy for the standardized test dataset with PCA')
コード例 #14
0
#!/usr/bin/env python
"""Test xgboost integration for classification task."""

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import wandb
from wandb.integration.xgboost import wandb_callback
from xgboost import XGBClassifier

# Load the wine features/labels as pandas objects.
X, y = load_wine(return_X_y=True, as_frame=True)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

# Multiclass XGBoost classifier tracking both log-loss and AUC.
model = XGBClassifier(use_label_encoder=False,
                      eval_metric=['mlogloss', 'auc'],
                      seed=42,
                      n_estimators=50)

wandb.init(project="wine-xgboost")

# Log eval metrics to Weights & Biases per boosting round; stop early when
# the eval metric fails to improve for 40 rounds.
model.fit(X_train,
          y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          early_stopping_rounds=40,
          callbacks=[wandb_callback()],
          verbose=False)
コード例 #15
0
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
plt.show()
'''

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

# Load the wine data.
dataset = load_wine()
x = dataset.data
y = dataset.target

df = pd.DataFrame(x, columns=dataset.feature_names) 
df1 = df.drop(['magnesium', 'alcalinity_of_ash', 'nonflavanoid_phenols', 'total_phenols', 'alcohol', 'ash'],axis=1)
# NOTE(review): df2 is built from the *full* df, so the reduced frame df1 is
# only printed and never used for training — confirm intent.
df2 = df.to_numpy()
print(df1.shape)


# 1. Data
# dataset = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    df2, dataset.target, train_size=0.8, random_state=44
)
コード例 #16
0
def main():
    """Profile a dataset interactively: per-predictor plots, regression
    statistics, mean-square-difference tables, and random-forest feature
    importances, all written to HTML files and a combined output.html.

    NOTE(review): relies on helpers defined elsewhere in this file
    (bool_check, html_write, plot_msd, make_clickable) and prompts on stdin.
    """
    # create path for html output
    if not os.path.exists("html"):
        os.makedirs("html")

    # import data to pandas
    data = load_wine()
    input_df = pd.DataFrame(data.data)

    # find which columns are predictors and which is response
    cols = input_df.columns.to_list()
    print(input_df.head())
    check = False
    while not check:
        response = input(f"Which column is the response? \n {cols}? \n")
        if response in cols:
            check = True
        elif int(response) in cols:
            response = int(response)
            check = True
        else:
            print("Incorrect user input.")
    # NOTE(review): this is a while/else — the else branch executes whenever
    # the loop exits normally (which is always here, since there is no
    # break), so the user's chosen response is unconditionally overwritten
    # with 1. Likely a bug.
    else:
        response = 1
    predictors = [x for x in cols if x != response]

    # determine which columns are categorical and which are continuous
    bool_dict = {response: bool_check(input_df[response])}
    plot_dict = {}
    for predictor in predictors:
        bool_dict[predictor] = bool_check(input_df[predictor])

    # generate plots if response is categorical
    if bool_dict[response]:
        for predictor in predictors:
            if bool_dict[predictor]:
                # heat plot
                df = input_df[[response, predictor]].copy()
                hist_2d = px.density_heatmap(df, x=predictor, y=response)
                hist_2d.update_xaxes(title=predictor)
                hist_2d.update_yaxes(title=response)
                hist_2d.show()
                plot_loc = f"html/{predictor}_plot.html"
                hist_2d.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc
            else:
                # violin plot
                df = input_df[[response, predictor]].copy()
                violin = px.violin(df,
                                   y=predictor,
                                   color=response,
                                   violinmode="overlay")
                violin.update_layout(
                    title_text=
                    f"violin plot of {predictor} grouped by {response}", )
                violin.update_yaxes(title_text=predictor)
                violin.show()
                plot_loc = f"html/{predictor}_plot.html"
                violin.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc

    # generate plots if response is continuous
    else:
        for predictor in predictors:
            if bool_dict[predictor]:
                # histogram plot
                df = input_df[[response, predictor]].copy()
                fig = px.histogram(
                    df,
                    x=response,
                    y=response,
                    color=predictor,
                    marginal="box",
                    hover_data=df.columns,
                )
                fig.show()
                plot_loc = f"html/{predictor}_plot.html"
                fig.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc
            else:
                # scatter plot with trend line
                df = input_df[[response, predictor]].copy()
                scatter = px.scatter(df,
                                     x=predictor,
                                     y=response,
                                     trendline="ols")
                scatter.update_layout(title_text=f"{predictor} v. {response}")
                scatter.update_xaxes(ticks="inside", title_text=predictor)
                scatter.update_yaxes(ticks="inside", title_text=response)
                scatter.show()
                plot_loc = f"html/{predictor}_plot.html"
                scatter.write_html(
                    file=plot_loc,
                    include_plotlyjs="cdn",
                )
                plot_dict[predictor] = plot_loc

    # generate stats data inputs
    X_cols = input_df.drop(response, axis=1).columns.to_list()
    X = input_df.drop(response, axis=1).to_numpy()
    y = input_df[response].to_numpy()
    t_val, p_val, stat_plots = {}, {}, {}

    # linear regression stats if response is continuous
    if not bool_dict[response]:
        for idx, column in enumerate(X.T):
            column = X[:, idx]
            feature_name = X_cols[idx]
            predictor = statsmodels.api.add_constant(column)
            linear_regression_model = statsmodels.api.OLS(y,
                                                          predictor,
                                                          missing="drop")
            linear_regression_fitted = linear_regression_model.fit()
            print(linear_regression_fitted.summary())
            # NOTE(review): names appear swapped — tvalues[1] is stored as
            # p_value and pvalues[1] as t_value (and again into p_val/t_val).
            p_value = round(linear_regression_fitted.tvalues[1], 4)
            t_value = "{:.6e}".format(linear_regression_fitted.pvalues[1])
            p_val[feature_name], t_val[feature_name] = t_value, p_value
            fig = px.scatter(x=column, y=y, trendline="ols")
            fig.update_layout(
                title=
                f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
                xaxis_title=f"Variable: {feature_name}",
                yaxis_title=f"Response: {response}",
            )
            fig.show()
            plot_loc = f"html/{feature_name}_stats_plot.html"
            fig.write_html(
                file=plot_loc,
                include_plotlyjs="cdn",
            )
            stat_plots[feature_name] = plot_loc

    # logistic regression stats if response is boolean
    else:
        for idx, column in enumerate(X.T):
            column = X[:, idx]
            feature_name = X_cols[idx]
            predictor = statsmodels.api.add_constant(column)
            logistic_regression_model = statsmodels.api.Logit(y,
                                                              predictor,
                                                              missing="drop")
            logistic_regression_fitted = logistic_regression_model.fit()
            print(logistic_regression_fitted.summary())
            # NOTE(review): same p/t swap as in the linear branch above.
            p_value = round(logistic_regression_fitted.tvalues[1], 4)
            t_value = "{:.6e}".format(logistic_regression_fitted.pvalues[1])
            p_val[feature_name], t_val[feature_name] = t_value, p_value
            fig = px.scatter(x=column, y=y, trendline="ols")
            fig.update_layout(
                title=
                f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
                xaxis_title=f"Variable: {feature_name}",
                yaxis_title=f"Response: {response}",
            )
            fig.show()
            plot_loc = f"html/{feature_name}_stats_plot.html"
            fig.write_html(
                file=plot_loc,
                include_plotlyjs="cdn",
            )
            stat_plots[feature_name] = plot_loc

    # mean square difference setup: bin each feature into 10 equal-width bins
    msd_plots, msd_tables = {}, {}
    for feature in X_cols:
        data = input_df[feature].to_list()
        data.sort()
        data_range = max(data) - min(data)
        step = data_range / 10
        table = pd.DataFrame(columns=[
            "lower bin",
            "upper bin",
            "median",
            "count",
            "bin mean",
            "population mean",
            "mean square diff",
        ])
        weighted_table = pd.DataFrame(columns=[
            "lower bin",
            "upper bin",
            "median",
            "count",
            "bin mean",
            "population mean",
            "mean square diff",
            "pop proportion",
            "weighted MSD",
        ])

        # mean square unweighted table
        for n in range(10):
            low, high = min(data) + (step * n), min(data) + (step * (n + 1))
            # last bin is closed on the right so the max value is included
            if n == 9:
                b = [y for y in data if low <= y <= high]
            else:
                b = [y for y in data if low <= y < high]
            if not b:
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": 0,
                    "count": 0,
                    "bin mean": 0,
                    "population mean": np.nanmean(data),
                    "mean square diff": 0,
                }
            else:
                med, count, mean = (
                    statistics.median(b),
                    int(len(b)),
                    statistics.mean(b),
                )
                pop_mean = np.nanmean(data)
                mean_sq_diff = abs((mean - pop_mean)**2)
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": med,
                    "count": count,
                    "bin mean": mean,
                    "population mean": pop_mean,
                    "mean square diff": mean_sq_diff,
                }
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # pd.concat is the modern replacement.
            table = table.append(new_row, ignore_index=True)
        # NOTE(review): this unweighted entry is overwritten by the weighted
        # table below — only the weighted table survives in msd_tables.
        msd_tables[feature] = html_write(table, feature, "unweighted")

        # mean square weighted table
        for n in range(10):
            low, high = min(data) + (step * n), min(data) + (step * (n + 1))
            if n == 9:
                b = [y for y in data if low <= y <= high]
            else:
                b = [y for y in data if low <= y < high]
            if not b:
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": 0,
                    "count": 0,
                    "bin mean": 0,
                    "population mean": np.nanmean(data),
                    "mean square diff": 0,
                    "pop proportion": 0,
                    "weighted MSD": 0,
                }
            else:
                med, count, mean = (
                    statistics.median(b),
                    int(len(b)),
                    statistics.mean(b),
                )
                pop_prop = count / len(data)
                pop_mean = np.nanmean(data)
                mean_sq_diff = abs((mean - pop_mean)**2)
                weighted_msd = mean_sq_diff * pop_prop
                new_row = {
                    "lower bin": low,
                    "upper bin": high,
                    "median": med,
                    "count": count,
                    "bin mean": mean,
                    "population mean": pop_mean,
                    "mean square diff": mean_sq_diff,
                    "pop proportion": pop_prop,
                    "weighted MSD": weighted_msd,
                }
            weighted_table = weighted_table.append(new_row, ignore_index=True)
        table = weighted_table
        msd_tables[feature] = html_write(table, feature, "weighted")
        # plot from table
        msd_plots[feature] = plot_msd(table, feature, response)

    # feature importance calculations via a random forest fit on all predictors
    y = input_df[response].values
    X = input_df.drop(response, axis=1)
    if bool_dict[response]:
        rf = RandomForestClassifier()
        rf.fit(X, y)
        feature_importance = rf.feature_importances_
    else:
        rf = RandomForestRegressor()
        rf.fit(X, y)
        feature_importance = rf.feature_importances_
    feature_importance_dict = {
        predictors[i]: feature_importance[i]
        for i in range(len(predictors))
    }

    # generate final output: each section rendered as an HTML table
    output_list = [
        {i: bool_dict[i]
         for i in bool_dict if i != response},
        plot_dict,
        p_val,
        t_val,
        stat_plots,
        msd_tables,
        msd_plots,
        feature_importance_dict,
    ]
    output_names = [
        "boolean",
        "plots",
        "p values",
        "t values",
        "statistics plots",
        "msd table",
        "msd plots",
        "feature importances",
    ]
    html = ""
    for i in range(len(output_list)):
        df = pd.DataFrame.from_dict(
            output_list[i],
            orient="index",
        )
        try:
            # Hyperlink cells that reference generated html files.
            if df[0].str.contains("html").any():
                df = df.style.format(make_clickable).render()
            else:
                df = df.style.render()
        except AttributeError:
            df = df.style.set_precision(4).render()
        html = html + "\n<br><br>" + output_names[i] + "\n" + df
    with open("output.html", "w") as f:
        f.write(html)
        f.close()
コード例 #17
0
from IPython.core.display import display
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
import time
from sklearn.metrics import accuracy_score

# Load the wine dataset bunch (data, target, feature_names, ...).
raw_data = datasets.load_wine()
#print(raw_data)

# Hold out 20% of the samples for testing.
data_train, data_test, label_train, label_test = train_test_split(
    raw_data['data'], raw_data['target'], test_size=0.2)
print(
    len(data_train),
    ' samples in training data\n',
    len(data_test),
    ' samples in test data\n',
)

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
コード例 #18
0
        kmeanModel = KMeans(n_clusters=k).fit(X)
        kmeanModel.fit(X)
        distortions.append(
            sum(
                np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'),
                       axis=1)) / X.shape[0])

    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()


X = load_wine().data
y = load_wine().target

# Standardize features before the random projection.
scaler = StandardScaler()
scaler.fit(X)

X = scaler.transform(X)
transformer = random_projection.GaussianRandomProjection(n_components=2)
dr_X = transformer.fit_transform(X)

#obtain elbow plot
plot_elbow(dr_X)

#pick three clusters, and view a few groupings
# NOTE(review): the comment above says three clusters, but n_clusters=2 here.
km = KMeans(n_clusters=2, random_state=0).fit(dr_X)
コード例 #19
0
from sklearn.datasets import load_wine
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

"""
Simple Decision Tree Classifier that identifies types of Wine.
@Author Afaq Anwar
@Version 02/25/2019
"""

# Sets up the data as a DataFrame in order to easier work with data.
# Improvement: load the dataset once instead of three separate load_wine()
# calls (each call re-reads and re-parses the bundled CSV).
wine_bunch = load_wine()
df = pd.DataFrame(wine_bunch.data)
df.columns = wine_bunch.feature_names
df['type'] = wine_bunch.target

# X = Features, y = labels
X = df.drop('type', axis=1)
y = df['type']

# Splits the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

# Accuracy on the held-out third of the samples.
print(accuracy_score(y_test, predictions))
コード例 #20
0
    plt.ylabel('X1 [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

    title = "Learning Curves (Naive Bayes)"
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    estimator = gnb
    plot_learning_curve(estimator, X, Y, title, cv=cv, n_jobs=4)
    plt.show()

    del gnb

    # Wine
    print("Wine Test:")
    wine_dataset = datasets.load_wine()
    X = wine_dataset.data
    indice = sorted(np.random.choice(X.shape[1], 2, replace=False))
    X = X[:, indice]
    # print("X:", X)
    Y = wine_dataset.target
    # print("Y:", Y)
    # print("Class lables:", np.unique(Y))

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=Y)

    sc = StandardScaler()
コード例 #21
0
# Copyright (c) 2020, Anders Lervik.
# Distributed under the MIT License. See LICENSE for more info.
"""
Residual variance
=================

This example will show the residual variance from a
`principal component analysis
<https://en.wikipedia.org/wiki/Principal_component_analysis>`_
as a function of the number of principal components considered.
"""
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from psynlig import pca_residual_variance
plt.style.use('ggplot')

# Load the wine data and standardize each feature column.
data_set = load_wine()
data = pd.DataFrame(data_set['data'], columns=data_set['feature_names'])
data = scale(data)

# Fit PCA on the scaled data (fit_transform used only for its fit side effect).
pca = PCA()
pca.fit_transform(data)

# Plot residual variance vs. number of components.
pca_residual_variance(pca, marker='o', markersize=16, alpha=0.8)

plt.show()
コード例 #22
0
# -*- coding: utf-8 -*-

# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html
# Wine dataset.
from sklearn.datasets import load_wine
wine = load_wine()

X = wine.data
y = wine.target

# train_test_split
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split
X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=33)
print("Train_Test_Split", "TRAIN:", X_train_sp.shape[0], "TEST:",
      X_test_sp.shape[0])

# Plain K-fold cross validation.
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# Given data with n samples (X alone suffices since X and y have the same
# length), split() yields train/test index vectors as a generator — iterate
# with for to obtain each fold.
# Parameters: shuffle randomizes; random_state seeds; kf.get_n_splits(X)
# returns the number of folds.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=33)
kf_count = 0
for train_index, test_index in kf.split(X):
    X_train_kf, X_test_kf = X[train_index], X[test_index]
    y_train_kf, y_test_kf = y[train_index], y[test_index]
    print("KFold Num:", kf_count, "TRAIN:", train_index.shape[0], "TEST:",
          test_index.shape[0])
    kf_count += 1
コード例 #23
0
from sklearn import datasets
import numpy as np
import pandas
from classifiers.knn import knn

########################################
# Load and organize data into different arrays
########################################
# NOTE(review): variable is named iris_dataset but it loads the *wine* data.
iris_dataset = datasets.load_wine()  # load
data = iris_dataset['data']  # data values
target = iris_dataset['target']  # its targets
target_names = iris_dataset['target_names']
# split data index
split_80 = int(len(data)*0.8)  # *0.8 = get 80%
# 80% for train
train_data = data[0:split_80]
train_target = target[0:split_80]
# 20% for test
# NOTE(review): the split is not shuffled — if the samples are ordered by
# class, the test slice will be dominated by the last class. Verify.
test_data = data[split_80:]
test_target = target[split_80:]


########################################
# Create a confusion Matrix
########################################
classes_count = len(target_names)
# rows: actual class, cols: predicted class
confusion_matrix = np.zeros((classes_count, classes_count), dtype=int)
k = 3  # k nearest neighbors#

# for each example in test data
コード例 #24
0
ファイル: fcm_exc.py プロジェクト: finepix/ITbegin_course
def load_dataset():
    """
        Load the wine data.
    :return: the wine dataset as returned by sklearn's datasets.load_wine()
    """
    return datasets.load_wine()
コード例 #25
0
ファイル: extremeForest.py プロジェクト: liuqi123456/python
# -*- coding:utf-8 -*-
#@Time  : 2020/4/11 16:47
#@Author: Kevin.Liu
#@File  : extremeForest.py

# 极限森林

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

# Wine dataset.
# Fix: pass return_X_y by keyword — the positional boolean form was
# deprecated and later removed from sklearn's dataset loaders.
X, y = datasets.load_wine(return_X_y=True)

# Decision tree baseline.
clf = DecisionTreeClassifier()
print(cross_val_score(clf, X, y, cv=6, scoring='accuracy').mean())

# Random forest.
forest = RandomForestClassifier(n_estimators=100)
print(cross_val_score(forest, X, y, cv=6, scoring='accuracy').mean())

# Extremely randomized trees.
extra = ExtraTreesClassifier(n_estimators=100)
print(cross_val_score(extra, X, y, cv=6, scoring='accuracy').mean())

# Iris data: only 4 features, simpler than the wine data.
X, y = datasets.load_iris(return_X_y=True)
コード例 #26
0
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 13:35:36 2020

@author: casti
"""

from sklearn.datasets import load_wine
import pandas as pd

# Load the wine bunch, show its description, and build a feature DataFrame.
wine_bunch = load_wine()
print(wine_bunch['DESCR'])
df = pd.DataFrame(wine_bunch['data'], columns=wine_bunch['feature_names'])
y = wine_bunch['target']  # cultivator label for each sample
コード例 #27
0
def download(output_dir: str):
    """Fetch the sklearn wine dataset and write it to <output_dir>/data.csv.

    The CSV contains the feature columns plus an appended 'target' column;
    the output directory is created if it does not exist.
    """
    bunch = load_wine(as_frame=True)
    frame = bunch['data']
    frame['target'] = bunch['target']
    os.makedirs(output_dir, exist_ok=True)
    frame.to_csv(os.path.join(output_dir, "data.csv"), index=False)
コード例 #28
0
@author: sandra_chang
"""

from sklearn import datasets, metrics

# Use DecisionTreeClassifier for classification problems,
# DecisionTreeRegressor for regression problems.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import warnings

warnings.filterwarnings('ignore')

# Wine classification with a decision tree: 80/20 split, fit, score.
wineData = datasets.load_wine()

x_train, x_test, y_train, y_test = train_test_split(wineData.data,
                                                    wineData.target,
                                                    test_size=0.2,
                                                    random_state=4)

DTC = DecisionTreeClassifier()

DTC.fit(x_train, y_train)

y_pred = DTC.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)
# FIX: corrected typo in the printed label ("Acuuracy" -> "Accuracy").
print("Decision Tree Accuracy: ", acc)
コード例 #29
0
ファイル: test_svm.py プロジェクト: jw-develop/cs394-projects

def wineTest():
    # Run the project's SVM test harness on the wine dataset with a linear
    # kernel; 130 is presumably the training-set size — TODO confirm advTest.
    advTest(load_wine(), 130, kernel.linear)


def irisTest():
    # Run the project's SVM test harness on the iris dataset with a
    # degree-3 polynomial kernel; 100 is presumably the training-set
    # size — TODO confirm against advTest.
    advTest(load_iris(), 100, kernel.make_poly_kernel(3))


# Main entry point: dispatch to a test routine based on command-line flags.
if __name__ == '__main__':

    print("Args:", str(sys.argv[1:]))

    if "multi" in sys.argv:
        # Multi-class run: the "wine" flag picks dataset and train size.
        dataset, size = (load_wine(), 130) if "wine" in sys.argv \
            else (load_iris(), 100)
        multi_test.run(dataset, size)
    elif "1" in sys.argv:
        testOne()
    elif "2" in sys.argv:
        testTwo()
    elif "3" in sys.argv:
        testThree()
    elif "wine" in sys.argv:
        wineTest()
    else:
        irisTest()
コード例 #30
0
ファイル: pca_example2.py プロジェクト: sjwilczynski/Studia
#
# pca_example.py
#

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets

# Load the wine data
wines = datasets.load_wine()

# the raw points (178 samples, 13 features each) live in .data
points = wines.data

# class labels
wines_types = wines.target

# class names
wines_names = wines.target_names


# Project the data onto the first 3 principal components.
# FIX: replaced the redundant `points_reduced=points;` alias (with its stray
# trailing semicolon) and separate fit/transform calls by fit_transform.
pca = decomposition.PCA(n_components=3)
points_reduced = pca.fit_transform(points)


fig = plt.figure()
コード例 #31
0
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
# FIX: the names below were used in this snippet but never imported,
# which made the script fail with NameError at runtime.
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
print(__doc__)

# Code source: Tyler Lanigan <*****@*****.**>
#              Sebastian Raschka <*****@*****.**>

# License: BSD 3 clause

RANDOM_STATE = 42
FIG_SIZE = (10, 7)


features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)

# Fit to data and predict using pipelined GNB and PCA (no scaling).
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, PCA, then GNB.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)
コード例 #32
0
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.datasets as data
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Explore the wine dataset, then fit a one-feature linear regression
# predicting alcohol from proline.
wine = data.load_wine()
type(wine)
wine.keys()
print(wine.DESCR)
wine.data
wine.feature_names
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['wine_type'] = wine.target
wine.target
df.info()
df.head()
sns.pairplot(df)
# plt.show()

reg = LinearRegression()
reg
# FIX: sklearn requires X to be 2-D (n_samples, n_features). Selecting with
# a single label (df['proline']) yields a 1-D Series and made reg.fit raise
# a ValueError; selecting with a list of labels yields a 2-D DataFrame.
X = df[['proline']]
y = df['alcohol']  # the target may stay 1-D
X
reg.fit(X, y)
コード例 #33
0
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from util import plot_classifier


def effect_of_removing_examples(X, y):
    """Fit a linear SVM, then refit it using only its support vectors,
    plotting both decision boundaries for comparison."""
    # Fit a linear SVM on the full data and plot the boundary.
    full_model = SVC(kernel="linear")
    full_model.fit(X, y)
    plot_classifier(X, y, full_model, lims=(11, 15, 0, 6))

    # Build a reduced dataset containing only the support vectors.
    print("Number of original examples", len(X))
    print("Number of support vectors", len(full_model.support_))
    sv_X = X[full_model.support_]
    sv_y = y[full_model.support_]

    # Refit on the reduced set and plot its (matching) boundary.
    reduced_model = SVC(kernel="linear")
    reduced_model.fit(sv_X, sv_y)
    plot_classifier(sv_X, sv_y, reduced_model, lims=(11, 15, 0, 6))


# FIX: `datasets` is used below but never imported in this snippet;
# without this import the next line raises NameError at runtime.
from sklearn import datasets

df = datasets.load_wine()
X = df.data[:, [0, 1]]  # keep only the first two features (2-D plotting)
y = df.target
effect_of_removing_examples(X, y)
コード例 #34
0
ファイル: test_svm.py プロジェクト: jw-develop/cs394-projects
def wineTest():
    # Linear-kernel SVM test on the wine data. This definition duplicates
    # the one that appears earlier in the file (the scrape captured the
    # snippet twice); 130 is presumably the training-set size — TODO confirm.
    advTest(load_wine(), 130, kernel.linear)
コード例 #35
0
ファイル: c33.py プロジェクト: leonsharp2015/untitled
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.datasets import load_wine

# Train a 1-NN classifier on the wine data, then classify one new sample.
# The Bunch exposes 'feature_names', 'data', 'target_names', 'target', 'DESCR'.
wine_set = load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine_set['data'], wine_set['target'], random_state=0)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
score = knn.score(X_test, y_test)  # held-out accuracy (computed, not printed)
# One unseen sample with all 13 wine features, in dataset column order.
X_new = np.array([[13.2, 2.77, 2.51, 18.5, 96.6, 1.04, 2.55,
                   0.57, 1.47, 6.2, 1.05, 3.33, 820]])
c = knn.predict(X_new)
print(c)
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

__author__ = 'wangj'
__date__ = '2018/01/04 00:52'
__doc__ = '''
使用RandomForest选择特征
'''

# Rank the wine features by random-forest importance and plot them.
if __name__ == '__main__':
    wine = datasets.load_wine()
    x = wine.data
    y = wine.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    stdsc = StandardScaler()
    x_train_std = stdsc.fit_transform(x_train)
    x_test_std = stdsc.transform(x_test)
    feature_names = wine.feature_names
    forest = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1)
    forest.fit(x_train_std, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]  # feature indices, most important first
    # BUG FIX: the original printed feature_names[f] (dataset order) next to
    # importances[indices[f]] (importance order), mislabelling every row.
    # The name must be looked up through the same `indices` permutation.
    for f in range(x_train_std.shape[1]):
        print('{0:<3}{1:30}{2}'.format(f + 1, feature_names[indices[f]], importances[indices[f]]))
    plt.title('Feature Importance')
    plt.bar(range(x_train_std.shape[1]), importances[indices], color='lightblue', align='center')
    # Same fix for the tick labels: reorder the names to match the sorted bars.
    plt.xticks(range(x_train_std.shape[1]), [feature_names[i] for i in indices], rotation=90)
コード例 #37
0
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
# NOTE: the triple-quoted strings below are no-op string literals the
# original author used as commentary (in Chinese); they are preserved
# verbatim, with English summaries added as '#' comments.
# Summary: clustering is unsupervised learning that finds groups of similar
# objects (e.g. grouping documents/music/movies, or customer segmentation).
'''聚类是一种无监督学习,它允许我们找到相似对象的组,这些对象彼此之间的相关性比与其他组中的对象更相关。业务用例的示例包括根据内容对文档、音乐和电影进行分组,或者根据购买行为查找客户群,作为推荐引擎的基础。'''
# Summary: the k-means algorithm — pick k random centroids, assign each
# point to its nearest centroid, recompute centroids as cluster means,
# repeat until assignments stop changing or max iterations is reached.
'''最流行的聚类算法之一是k-means。假设有 n 个数据点,算法工作如下: 步骤 1:初始化- 选择 k 个随机点作为聚类中心,称为质心步骤 2:聚类分配- 根据与每个质心的距离将每个数据点分配到其最近的质心, 并形成 k 个集群 第 3 步:质心更新- 对于每个新集群,通过取分配给集群的所有点的平均值来计算其质心 第 4 步:重复第2 步和第 3 步,直到没有任何集群分配发生变化,或者达到最大迭代次数'''
# Summary: Euclidean distance between two points using numpy.
'''使用numpy计算2点间的距离'''
x1 = np.array([0, 1])
x2 = np.array([2, 0])
print(np.sqrt(((x1 - x2)**2).sum()))
# 2.23606797749979
print(np.sqrt(5))
# 2.23606797749979
# Summary: each wine has 13 features; clustering all wines into 3 groups
# reduces the 13-dimensional space to 3 groups.
'''计算葡萄酒分类:每种葡萄酒有 13 个特征,如果我们可以将所有的葡萄酒分成 3 组,那么它将 13 维空间缩减为 3 维空间。'''
data = load_wine()
wine = pd.DataFrame(data.data, columns=data.feature_names)
print(wine.shape)
print(wine.columns)
print(wine.iloc[:, :3].describe())  # summary statistics of the first 3 columns
# Summary: scatter_matrix shows per-feature histograms on the diagonal and
# pairwise scatter plots off the diagonal.
'''pd.plotting.scatter_matrix():显示沿对角线的直方图和对角线外每对属性的散点图'''
from pandas.plotting import scatter_matrix

scatter_matrix(wine.iloc[:, :])
plt.savefig("plot_win_scatter_matrix.png")
plt.show()
# Summary: the number of clusters k is chosen subjectively from the plots.
'''k的数量(子组)需要通过观察散点图进行主观判断(瞎猜)'''
# Summary: standardise with z = (x - mean) / std so each feature is
# zero-centred with unit standard deviation (via StandardScaler).
'''对数据进行标准化处理: z = (x - mean) / std 其中 x 是原始数据,mean 和 std 是 x 的平均值和标准差,z 是缩放后的 x,使得它以 0 为中心并且具有单位标准差。使用 sklearn.preprocessing 的StandardScaler'''
from sklearn.preprocessing import StandardScaler  # standardisation helper

X = wine[['alcohol', 'total_phenols']]
scale = StandardScaler()  # instantiate the scaler
コード例 #38
0
ファイル: test.py プロジェクト: leandrofturi/mochila_MT
# Load the three benchmark datasets used by the metaheuristic experiments
# (simulated annealing / GRASP / genetic) plus candidate cluster counts.
from sklearn.cluster import KMeans
from SA import simulated_annealing
from GRASP import grasp
from AG import genetic
from comons import evaluate_clusters, objective_function
# NOTE(review): `matplotlib`, `load_iris`, `load_wine`, `pd` and `np` are
# used here but not imported in this excerpt — presumably imported earlier
# in the file; verify.
matplotlib.use('TkAgg')


# load #############################################################################################################
max_time = 1  # time budget; units defined by the solvers that consume it

# Each dataset is reshaped into a list of {'id': ..., 'coord': ...} records.
iris = load_iris()['data']
iris = [{'id': x, 'coord': y} for x, y in zip(range(len(iris)), iris)]
k_I = [2, 4, 8, 11, 15, 17, 23, 28, 32, 50]  # cluster counts tried for iris

wine = load_wine()['data']
wine = [{'id': x, 'coord': y} for x, y in zip(range(len(wine)), wine)]
k_W = [3, 5, 13, 15, 20, 23, 25, 30, 41, 45]  # cluster counts tried for wine

# Downloads the ionosphere data over HTTP at import time.
# NOTE(review): the UCI file has no header row but read_csv defaults to
# header=0, so the first sample is consumed as column names — confirm
# whether header=None was intended.
ionosphere = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data')
ionosphere = np.asarray(ionosphere.iloc[:,:34])
ionosphere = [{'id': x, 'coord': y} for x, y in zip(range(len(ionosphere)), ionosphere)]
k_H =  [2, 3, 5, 10, 15, 20, 25, 30, 40, 50]  # cluster counts for ionosphere


# utils ############################################################################################################
# kmeans
def kmeans(dataset, k):
    tmp = [i['coord'] for i in dataset]
    start = time.process_time()
    kmeans = KMeans(n_clusters=k).fit(tmp)
コード例 #39
0
ファイル: try0.py プロジェクト: GuoJia563/sklearn0
# Decision-tree classifier on the sklearn wine dataset, with a tree plot.
from sklearn.datasets import load_wine  # wine dataset loader
from sklearn.model_selection import train_test_split  # train/test splitting
from sklearn import tree  # decision-tree estimator and plotting helpers
import matplotlib.pyplot as plt

# Load the data and hold out 30% of it for testing.
wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.3)

# Fit the tree on the training split.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Predict and score on the held-out split.
print(clf.predict(X_test))
result = clf.score(X_test, y_test)
print('score:', result)

# Render the fitted tree with feature and class names.
plt.figure(figsize=(15, 9))
tree.plot_tree(clf,
               filled=True,
               feature_names=wine.feature_names,
               class_names=wine.target_names)



# sklearn: Titanic survivor prediction (decision tree + grid search)
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
# NOTE(review): hard-coded absolute Windows path (and "titannic" spelling);
# the file must exist at this exact location for the script to run.
data=pd.read_csv("E:/data/titannic_data.csv")